In [6]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Pre-trained "bert-base-uncased" checkpoint wrapped for sequence
# classification; num_labels=3 configures three target classes.
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

# Matching tokenizer: breaks text down into individual units of meaning (tokens).
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

#model.summary()

In [7]:
# We create a training dataset and a validation
# dataset from our "separatedText" directory with an 80/20 split.
import tensorflow as tf
import pandas as pd

# Folder handed to text_dataset_from_directory for both subsets.
DATA_DIR = 'C:\\Users\\Viktorija\\Desktop\\JT\\SV2\\separatedText'


def _load_split(subset):
    """Load one side of the 80/20 split ('training' or 'validation').

    The same seed is used for both subsets, exactly as in the original calls.
    """
    return tf.keras.preprocessing.text_dataset_from_directory(
        DATA_DIR, batch_size=4000, validation_split=0.2,
        subset=subset, seed=123)


def _collect_batches(dataset):
    """Gather every batch of `dataset` into two numpy arrays (texts, labels).

    The previous version used `dataset.take(1)`, which silently discarded all
    samples beyond the first batch of 4000. Iterating over the whole dataset
    keeps every sample; when the split fits into one batch the result is the
    same as before.
    """
    feature_batches, label_batches = [], []
    for features, labels in dataset:
        feature_batches.append(features)
        label_batches.append(labels)
    # .numpy() -> converts the concatenated tensors into numpy arrays
    return (tf.concat(feature_batches, axis=0).numpy(),
            tf.concat(label_batches, axis=0).numpy())


def _to_frame(features, labels):
    """Build a DataFrame in the format: nr_of_tweet tweet sentiment_value.

    DATA_COLUMN holds the tweet text (decoded from UTF-8 bytes) and
    LABEL_COLUMN holds the sentiment value.
    """
    frame = pd.DataFrame([features, labels]).T
    frame.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
    frame['DATA_COLUMN'] = frame['DATA_COLUMN'].str.decode("utf-8")
    return frame


# train_feat / test_feat: arrays of the tweets; train_lab / test_lab: their ratings.
train_feat, train_lab = _collect_batches(_load_split('training'))
train = _to_frame(train_feat, train_lab)
#train.head() #returns the first N (N=5) entries in the train DataFrame

test_feat, test_lab = _collect_batches(_load_split('validation'))
test = _to_frame(test_feat, test_lab)
#test.head()

In [8]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
    """Turn the train and test tables into collections of InputExamples.

    Args:
        train: pandas DataFrame with the training rows.
        test: pandas DataFrame with the validation rows.
        DATA_COLUMN: name (string) of the column holding the text.
        LABEL_COLUMN: name (string) of the column holding the label.

    Returns:
        A pair (train_InputExamples, validation_InputExamples); each element
        is the result of applying the row converter to every row (axis=1)
        of the corresponding DataFrame.
    """

    def _row_to_example(row):
        # text_a is the tweet, label is the sentiment value;
        # guid and text_b are not used.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    train_InputExamples, validation_InputExamples = (
        frame.apply(_row_to_example, axis=1) for frame in (train, test)
    )
    return train_InputExamples, validation_InputExamples

    #train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, 'DATA_COLUMN', 'LABEL_COLUMN')
    
    
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize InputExamples and expose them as a tf.data.Dataset.

    Args:
        examples: list of InputExamples; `text_a` (the text) and `label`
            are read from each one.
        tokenizer: tokenizer providing `encode_plus`.
        max_length: length every encoded input is padded or truncated to.

    Returns:
        A tf.data.Dataset built from a generator that yields
        `(inputs, label)` pairs, where `inputs` is a dict with the keys
        "input_ids", "attention_mask" and "token_type_ids" (declared as
        int32) and `label` is declared as an int64 scalar.
    """
    features = []  # -> InputFeatures, one per example

    for e in examples:
        # encode_plus encodes the text as the numerical inputs of the model:
        #  - special tokens mark the start and the end of the encoded text
        #  - token_type_ids and attention_mask are returned with the input;
        #    the attention mask tells the model which positions are real
        #    tokens and which are padding that should be ignored
        #  - every input is padded to max_length and truncated if longer
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; padding="max_length"
            # is its replacement and belongs to the same API generation as
            # the `truncation` argument used below.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield (model inputs, label) in the structure declared below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Names of the text column and the label column in the train/test DataFrames.
DATA_COLUMN, LABEL_COLUMN = 'DATA_COLUMN', 'LABEL_COLUMN'

In [9]:
# Step 1: turn the train and test tables into collections of InputExamples.
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train, test, DATA_COLUMN, LABEL_COLUMN
)

# Step 2: turn the InputExamples into tf.data.Datasets.
# Training data is shuffled (buffer of 100), batched by 32 and repeated twice.
train_data = (
    convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation data is only batched.
validation_data = (
    convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
    .batch(32)
)

In [None]:
# Training configuration: Adam with gradient-norm clipping, sparse categorical
# cross-entropy computed from logits, and sparse categorical accuracy
# reported under the name 'accuracy'.
model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=3e-5,
        epsilon=1e-08,
        clipnorm=1.0,
    ),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)
#model.summary()

# NOTE(review): train_data is already built with .repeat(2), so each of the
# two epochs below iterates over the training examples twice - confirm that
# four passes in total are intended.
model.fit(
    train_data,
    epochs=2,
    validation_data=validation_data,
    verbose=True,
)