## Step 1 – Preparing Our Data, Model, And Tokenizer

In [1]:
# Importing necessary tools
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import load_dataset
import tensorflow as tf
import numpy as np

# Loading our dataset
tweet_dataset = load_dataset(path="tweet_eval", name="emotion")

Reusing dataset tweet_eval (C:\Users\tigra\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Instantiating our DistilBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

## Step 2 – Data Preprocessing

### Inspecting the Dataset

In [3]:
# Inspecting our dataset
print(tweet_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})


In [4]:
# Inspecting the train set
print(tweet_dataset["train"])

Dataset({
    features: ['text', 'label'],
    num_rows: 3257
})


In [5]:
# Inspecting train text and labels
print(f"Sequence samples:\n {tweet_dataset['train']['text'][:2]}\n")
print(f"Label samples:\n {tweet_dataset['train']['label'][:2]}")

Sequence samples:
 ["“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", "My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs"]

Label samples:
 [2, 0]


In [6]:
# Making a dictionary with class names for conversion
class_names = {0: "anger", 1: "joy", 2: "optimism", 3: "sadness"}

In [7]:
# A function for finding the length of the longest sequence in the data
def find_max_length(dataset):
    return len(max(dataset, key=lambda x: len(x.split())).split())

In [8]:
# Obtaining the length of the longest sequences in our data splits
train_max_length = find_max_length(tweet_dataset["train"]["text"])
val_max_length = find_max_length(tweet_dataset["validation"]["text"])
test_max_length = find_max_length(tweet_dataset["test"]["text"])

# Inspecting the length of the longest sequences
print(f"Longest sequence in train set has {train_max_length} words")
print(f"Longest sequence in val set has {val_max_length} words")
print(f"Longest sequence in test set has {test_max_length} words")

Longest sequence in train set has 33 words
Longest sequence in val set has 32 words
Longest sequence in test set has 36 words


### Filtering, Padding, and Tokenizing Our Dataset

In [9]:
# A function for discarding sequences beyond a specified length
def filter_dataset(dataset, num_words):    
    return dataset.filter(lambda x: len(x["text"].split()) <= num_words)

In [10]:
# Specifying the max length for sequences
num_words = 36

# Dropping sequences longer than the specified number from the dataset
filtered_dataset = filter_dataset(tweet_dataset, num_words)

Loading cached processed dataset at C:\Users\tigra\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-8b5b5191685fa0b7.arrow
Loading cached processed dataset at C:\Users\tigra\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-91006f42c4d168c7.arrow
Loading cached processed dataset at C:\Users\tigra\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-f7c7993d24aa9038.arrow


In [11]:
# Inspecting the shortened dataset
print(filtered_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})


In [12]:
# A function for tokenizing our dataset
def tokenize_dataset(examples):
    return tokenizer(examples["text"], padding="max_length", 
                     truncation=True, max_length=36)

In [13]:
# Tokenizing our dataset
tokenized_dataset = filtered_dataset.map(tokenize_dataset)

Loading cached processed dataset at C:\Users\tigra\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-3fc95a2e602f9734.arrow
Loading cached processed dataset at C:\Users\tigra\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-b03aab1360817548.arrow


  0%|          | 0/374 [00:00<?, ?ex/s]

In [14]:
# Inspecting the tokenized dataset
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 374
    })
})


In [15]:
# Inspecting a training sample
print(tokenized_dataset["train"][0])

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", 'label': 2, 'input_ids': [101, 1523, 4737, 2003, 1037, 2091, 7909, 2006, 1037, 3291, 2017, 2089, 2196, 2031, 1005, 1012, 11830, 11527, 1012, 1001, 14354, 1001, 4105, 1001, 4737, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


### Preparing Features and Labels

In [16]:
# Removing "text" and "label" columns from our data splits to craft features for the model
train_features = tokenized_dataset["train"].remove_columns(["text", "label"]).with_format("tensorflow")
val_features = tokenized_dataset["validation"].remove_columns(["text", "label"]).with_format("tensorflow")
test_features = tokenized_dataset["test"].remove_columns(["text", "label"]).with_format("tensorflow")

# Converting our features to TF Tensors
train_features = {x: train_features[x].to_tensor() for x in tokenizer.model_input_names}
val_features = {x: val_features[x].to_tensor() for x in tokenizer.model_input_names}
test_features = {x: test_features[x].to_tensor() for x in tokenizer.model_input_names}

In [17]:
# Inspecting expected model input names
print(tokenizer.model_input_names)

['input_ids', 'attention_mask']


In [18]:
# Inspecting our Tensors
print(train_features)

{'input_ids': <tf.Tensor: shape=(3257, 36), dtype=int64, numpy=
array([[  101,  1523,  4737, ...,     0,     0,     0],
       [  101,  2026, 18328, ...,     0,     0,     0],
       [  101,  2053,  2021, ...,     0,     0,     0],
       ...,
       [  101,  1030,  5310, ...,     0,     0,     0],
       [  101,  2017,  2031, ...,     0,     0,     0],
       [  101,  1030,  5310, ...,     0,     0,     0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(3257, 36), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>}


In [19]:
# Importing the function for one-hot encoding
from tensorflow.keras.utils import to_categorical

# Creating labels for each of the data splits
train_labels = to_categorical(tokenized_dataset["train"]["label"])
val_labels = to_categorical(tokenized_dataset["validation"]["label"])
test_labels = to_categorical(tokenized_dataset["test"]["label"])

In [20]:
# Inspecting training labels
print(train_labels[:5])

[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]


### Creating Datasets for Training, Validation, and Testing

In [21]:
# Importing the TF Dataset class
from tensorflow.data import Dataset

# Creating TF Datasets for each of our data splits
train_dataset = Dataset.from_tensor_slices((train_features, train_labels))
val_dataset = Dataset.from_tensor_slices((val_features, val_labels))
test_dataset = Dataset.from_tensor_slices((test_features, test_labels))

# Shuffling and batching our data
train_dataset = train_dataset.shuffle(len(train_features), seed=2).batch(8)
val_dataset = val_dataset.shuffle(len(train_features), seed=2).batch(8)
test_dataset = test_dataset.shuffle(len(train_features), seed=2).batch(8)

## Step 3 – Setting Up Model Hyperparameters

### Freezing DistilBERT Weights

In [22]:
# Inspecting the model's architecture
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  3076      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,956,548
Trainable params: 66,956,548
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Freezing the DistilBERT block
model.layers[0].trainable = False

In [24]:
# Inspecting the model again to see the differences in trainable params
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  3076      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,956,548
Trainable params: 593,668
Non-trainable params: 66,362,880
_________________________________________________________________


### Making A Learning Rate Schedule

In [25]:
# A function defining our learning rate schedule
def lr_decay(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * np.exp(-0.1 * epoch)   

In [26]:
# Instantiating our learning rate scheduler callback
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=lr_decay, verbose=1)

### Selecting Performance Metrics and Compiling Our Model

In [27]:
# Setting some hyperparameters and compiling the model
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001), 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              metrics=tf.keras.metrics.CategoricalAccuracy())



## Step 4 – Training, Validation, and Testing

In [28]:
# Training the model
history = model.fit(train_dataset, validation_data=val_dataset, 
                    epochs=15, callbacks=[lr_scheduler])

Epoch 1/15

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/15

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/15

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 4/15

Epoch 00

In [29]:
# Evaluating our model on the test set
model.evaluate(test_dataset)



[0.7024242877960205, 0.7276566028594971]

## Step 5 – Inference

In [30]:
# Performing inference with our model
predictions = model.predict(test_features)



In [31]:
# Inspecting our predictions
print(predictions)

TFSequenceClassifierOutput(loss=None, logits=array([[-1.370903  , -4.7255187 , -0.5999131 ,  3.622602  ],
       [ 1.3311445 , -1.4883113 , -0.37181637, -0.26223233],
       [-1.1938188 , -1.2004251 , -4.824672  ,  4.268196  ],
       ...,
       [ 0.6109057 , -1.8429809 , -2.5267117 ,  1.897653  ],
       [ 3.3085601 , -2.588659  , -2.983421  , -0.5218434 ],
       [-3.7007992 ,  3.6551635 , -0.12156612, -1.4982461 ]],
      dtype=float32), hidden_states=None, attentions=None)


In [32]:
# Converting predicted logits to probabilities
predictions = tf.nn.softmax(predictions.logits)

# Extracting the indices with the highest probabilities
predictions = tf.argmax(predictions, axis=1).numpy()

In [33]:
# Converting numerical labels to their corresponding class names
predictions = [class_names[prediction] for prediction in predictions]

In [34]:
# Inspecting predicted class names
print(predictions[:10])

['sadness', 'anger', 'sadness', 'joy', 'sadness', 'anger', 'sadness', 'sadness', 'sadness', 'anger']


In [35]:
# A function containing the transformation steps from above
def logits_to_class_names(predictions):
    predictions = tf.nn.softmax(predictions.logits)
    predictions = tf.argmax(predictions, axis=1).numpy()
    predictions = [class_names[prediction] for prediction in predictions]
    
    return predictions

# Retrieving a single test batch
test_batch = next(iter(test_dataset))[0]

# Obtaining predicted class names
sample_predictions = logits_to_class_names(model(test_batch))

# Printing sequences and corresponding labels
for i in range(len(test_batch["input_ids"])):      
    print(f"Tweet: {tokenizer.decode(test_batch['input_ids'][i])}")
    print(f"Predicted class: {sample_predictions[i]}\n")

Tweet: [CLS] @ user interesting choice of words... are you confirming that governments fund # terrorism? bit of an open door, but still... [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]
Predicted class: anger

Tweet: [CLS] my visit to hospital for care triggered # trauma from accident 20 + yrs ago and image of my dead brother in it. feeling symptoms of # depression [SEP] [PAD] [PAD] [PAD] [PAD]
Predicted class: sadness

Tweet: [CLS] @ user welcome to # mpsvt! we are delighted to have you! # grateful # mpsvt # relationships [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Predicted class: joy

Tweet: [CLS] what makes you feel # joyful? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Predicted class: sadness

Tweet: [CLS] # deppression is real. partners w / # depressed people truly dont understand the depth in which they affect us. add in # anxiety & amp ; makes [SEP]
P