In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") #breaks down text into individual units of meaning

#model.summary()

  from .autonotebook import tqdm as notebook_tqdm
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
import tensorflow as tf
import pandas as pd

train = tf.keras.preprocessing.text_dataset_from_directory(
    'C:\\Users\\Viktorija\\Desktop\\JT\\SV2\\separatedText', batch_size=4000, validation_split=0.2, 
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'C:\\Users\\Viktorija\\Desktop\\JT\\SV2\\separatedText', batch_size=4000, validation_split=0.2, 
    subset='validation', seed=123)

#.numpy() -> converts an array-like object into a numpy array

for i in train.take(1):
    train_feat = i[0].numpy() #array of the tweets
    train_lab = i[1].numpy() #array of their ratings

#DataFrame contains labeled axes (rows and columns). Can be thought of as a dict-like container.
#puts the data in the format: nr_of_tweet tweet sentiment_value
train = pd.DataFrame([train_feat, train_lab]).T

#in this case, data is the tweet, and label is the sentiment value
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
#train.head() #returns the first N (N=5) entries in the train DataFrame

for j in test.take(1):
    test_feat = j[0].numpy()
    test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
#test.head()

Found 4869 files belonging to 3 classes.
Using 3896 files for training.
Found 4869 files belonging to 3 classes.
Using 973 files for validation.


In [3]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
    
    #train and test are pandas DataFrames, the other two are strings
    
    #.apply -> Apply the function x on the columns (axis = 1) of train. 
    #Converts the train table into a collection of InputExamples for BERT processing.
    #text_a is the tweet, label is the sentiment value.
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, text_a = x[DATA_COLUMN], text_b = None, label = x[LABEL_COLUMN]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, text_a = x[DATA_COLUMN], text_b = None, label = x[LABEL_COLUMN]), axis = 1)
  
    return train_InputExamples, validation_InputExamples

    #train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, 'DATA_COLUMN', 'LABEL_COLUMN')
    
    
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    
    #examples is a list of InputExamples
    
    features = [] # -> will hold InputFeatures to be converted later

    #for each InputExample
    for e in examples:
        
        #encode_plus encodes a text input as a set of numerical inputs that can be used as input to a model.
        #text_a is the input text to be encoded
        #special tokens are used to indicate the start and the end of the encoded input text
        #max_length is the maximum length of the encoded input
        #token_type_ids should be returned along with the encoded input
        #attention_mask is a binary mask that indicates which input tokens should be attended to by the model, and which ones should be ignored. An attention mask is needed, because most inputs are padded and the added zeroes (usually at the end) shouldn't be attended to by the model.
        #the encoded input should be padded with special padding tokens to ensure that all inputs have the same length
        #input should be truncated if it exceeds max_length
        
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )
        
        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])
        
        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )
    
    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [4]:
#Convert the train and test tables into collections of InputExamples for BERT processing.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

#Convert the InputExamples into tf.data.Datasets

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



In [5]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
#model.summary()
model.fit(train_data, epochs=2, validation_data=validation_data, verbose=True)

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "D:\Anaconda\envs\bert\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "D:\Anaconda\envs\bert\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "D:\Anaconda\envs\bert\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
      app.start()
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\kernelapp.py", line 711, in start
      self.io_loop.start()
    File "D:\Anaconda\envs\bert\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "D:\Anaconda\envs\bert\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "D:\Anaconda\envs\bert\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "D:\Anaconda\envs\bert\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "D:\Anaconda\envs\bert\lib\site-packages\ipykernel\zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "D:\Anaconda\envs\bert\lib\site-packages\IPython\core\interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "D:\Anaconda\envs\bert\lib\site-packages\IPython\core\interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "D:\Anaconda\envs\bert\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "D:\Anaconda\envs\bert\lib\site-packages\IPython\core\interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "D:\Anaconda\envs\bert\lib\site-packages\IPython\core\interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "D:\Anaconda\envs\bert\lib\site-packages\IPython\core\interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Viktorija\AppData\Local\Temp\ipykernel_9784\1013356108.py", line 5, in <module>
      model.fit(train_data, epochs=2, validation_data=validation_data, verbose=True)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "D:\Anaconda\envs\bert\lib\site-packages\transformers\modeling_tf_utils.py", line 1529, in train_step
      loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\losses.py", line 272, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\losses.py", line 2084, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "D:\Anaconda\envs\bert\lib\site-packages\keras\backend.py", line 5630, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
Received a label value of 2 which is outside the valid range of [0, 2).  Label values: 0 0 1 1 1 1 2 0 2 2 0 2 0 0 1 2 1 2 0 0 1 1 2 1 1 0 2 2 2 2 1 1
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_31762]