<p style="font-size:44px; text-align:center; font-weight:bold">
    Sentiment Analysis BERT Model
</p>

<p style="font-size:20px">
    Initialize system for consistency
</p>

In [1]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


<p style="font-size:35px; text-align:center; font-weight:bold">
    Pre-training Setup
</p>

<p style="font-size:20px">
    Import reberta-base model
</p>

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(name)
model = RobertaForSequenceClassification.from_pretrained(name, num_labels=6)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<p style="font-size:20px">
    Read in training dataset
</p>

In [None]:
from datasets import Dataset, load_dataset

# Sentences connected to general emotions
hf_diar = load_dataset("dair-ai/emotion")

<p style="font-size:20px">
    Tokenize the different sets used in training, evaluating, and training
</p>

In [None]:
# Got this logic from https://github.com/clulab/gentlenlp/blob/main/notebooks/chap13_classification_bert.ipynb
def tokenize(dataset):
    return tokenizer(dataset['text'], truncation=True, padding='max_length')

# Training set
train_set = hf_diar['train'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)

# Evaluation set
valid_set = hf_diar['validation'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)

# Testing set
test_set = hf_diar['test'].map(
    tokenize, batched=True,
    remove_columns=['text'],
)

train_set.to_pandas()

Unnamed: 0,label,input_ids,attention_mask
0,0,"[0, 118, 46405, 619, 32386, 2, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,"[0, 118, 64, 213, 31, 2157, 98, 24418, 7, 98, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,"[0, 757, 16004, 10, 2289, 7, 618, 939, 619, 34...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,2,"[0, 118, 524, 655, 2157, 28055, 59, 5, 24672, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,3,"[0, 118, 524, 2157, 22970, 17414, 2, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
15995,0,"[0, 118, 95, 56, 10, 182, 4315, 86, 11, 5, 232...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15996,0,"[0, 118, 524, 122, 3408, 8, 939, 619, 31790, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15997,1,"[0, 118, 619, 670, 8, 205, 1374, 2, 1, 1, 1, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
15998,3,"[0, 118, 619, 101, 42, 21, 215, 10, 21820, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


<p style="font-size:44px; text-align:center; font-weight:bold">
    Training the model
</p>

<p style="font-size:20px">
    Use training method from Ch. 13 
</p>

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    '''
    Compute accuracy of predictions for each half epoch, or every 500 training steps
    '''
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

# Found that these parameters worked well
epochs = 5
batch_size = 16
weight_decay = 0.01
model_name = f"{name}-sentiments"

# Set up Trainer
train_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
)

trainer = Trainer(
    model=model,
    args=train_args,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=valid_set,
)

<p style="font-size:20px">
    Train and save
</p>

In [11]:
trainer.train()

Step,Training Loss
500,0.1602
1000,0.1748
1500,0.1419
2000,0.1379
2500,0.1251
3000,0.1136
3500,0.0988
4000,0.1009
4500,0.0567
5000,0.0787


TrainOutput(global_step=5000, training_loss=0.11887846412658691, metrics={'train_runtime': 1603.4232, 'train_samples_per_second': 49.893, 'train_steps_per_second': 3.118, 'total_flos': 2.104964038656e+16, 'train_loss': 0.11887846412658691, 'epoch': 5.0})

In [12]:
trainer.save_model(model_name)
tokenizer.save_pretrained(model_name)

('roberta-base-sentiments/tokenizer_config.json',
 'roberta-base-sentiments/special_tokens_map.json',
 'roberta-base-sentiments/vocab.json',
 'roberta-base-sentiments/merges.txt',
 'roberta-base-sentiments/added_tokens.json')

<p style="font-size:20px">
    Test the model on the testing set from the original dataset
</p>

In [13]:
test_set.to_pandas()
output = trainer.predict(test_set)
output

PredictionOutput(predictions=array([[ 8.301279 , -1.3490793, -2.0233526, -1.474672 , -1.9915973,
        -1.0724182],
       [ 8.343268 , -1.4154687, -2.1028678, -1.28346  , -1.8736242,
        -1.2749711],
       [ 8.292639 , -1.4031063, -2.071614 , -1.4970298, -1.8666624,
        -1.078144 ],
       ...,
       [-1.4039474,  8.339422 , -0.9823971, -2.3669279, -1.6808301,
        -1.7045876],
       [-1.3483928,  8.335436 , -0.9637408, -2.4880412, -1.5925306,
        -1.7056941],
       [-2.182048 , -2.6905575, -2.7304044, -1.730922 ,  4.7788363,
         4.7394814]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 4]), metrics={'test_loss': 0.29583653807640076, 'test_accuracy': 0.9265, 'test_runtime': 13.0137, 'test_samples_per_second': 153.684, 'test_steps_per_second': 9.605})

In [14]:
from sklearn.metrics import classification_report

labels = ["sad", "joy", "love", "anger", "fear", "surprise"]
y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = labels
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         sad       0.97      0.97      0.97       581
         joy       0.95      0.94      0.94       695
        love       0.79      0.81      0.80       159
       anger       0.94      0.91      0.92       275
        fear       0.87      0.93      0.90       224
    surprise       0.79      0.73      0.76        66

    accuracy                           0.93      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.93      0.93      0.93      2000

