In [29]:
#Sentiment Analysis with IMDB dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import os
from datetime import datetime
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertConfig
from sklearn.utils.class_weight import compute_class_weight


In [30]:
#variables
# Seed used for dataset shuffling and for the global RNGs seeded at the end of this cell.
seed_val = 1

# Number of shuffled examples kept from the train / test splits.
train_subset = 3000
test_subset = 500

# Optimisation hyper-parameters forwarded to TrainingArguments.
num_epochs = 4
batch_size = 48   # used for both the per-device train and eval batch size
lr = 3e-5         # learning rate
wd = 0.01         # weight decay

# Value passed as hidden_dropout_prob in the BERT config.
dp = 0.3

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label ids handed to compute_class_weight as the set of classes.
class_list = np.array([0, 1])

# Maximum number of checkpoints kept on disk (save_total_limit).
sv_limit = 2

# Seed the global NumPy and PyTorch generators so that random work done outside
# the dataset shuffles (e.g. initialising the new classification head) is
# repeatable after a kernel restart.
np.random.seed(seed_val)
torch.manual_seed(seed_val)


In [31]:
#load data
dataset = load_dataset('imdb')
print(dataset)

# Only the label column is needed here (it feeds the class-weight computation),
# so read that column directly instead of materialising every row, including
# the review text, with `[:]`. list() keeps the result a plain Python list.
train_labels = list(dataset["train"]["label"])


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [32]:
#Tokenize data
# Pretrained uncased BERT vocabulary; shared by tokenize_func below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_func(data):
    """Encode ``data["text"]`` with the module-level ``tokenizer``.

    Every sequence is padded to ``max_length`` and truncated so that the
    encoded length is exactly 128 tokens.

    Args:
        data: Mapping with a "text" entry (used with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(data["text"], **encode_options)

# Tokenize every split in batches, then rename the target column to "labels".
tokenized_datasets = (
    dataset
    .map(tokenize_func, batched=True)
    .rename_column("label", "labels")
)

# Expose only the listed columns, returned as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format("torch", columns=tensor_columns)

# Sanity check: show the first formatted training example.
print(tokenized_datasets["train"][0])

{'labels': tensor(0), 'input_ids': tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
         2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
         2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
         2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
         1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
         2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
         6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
         1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
         5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
        14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
         1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
         2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
        25430, 14728,  2245, 

In [33]:
#build model
# Shuffle each split with the shared seed and keep only the first N examples.
trained_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=seed_val)
    .select(range(train_subset))
)
test_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=seed_val)
    .select(range(test_subset))
)

# 3 labels are typical for sentiment analysis, but the imdb dataset only has 2 labels.
bert_config = BertConfig.from_pretrained(
    "bert-base-uncased",
    hidden_dropout_prob=dp,
    num_labels=2,
)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=bert_config,
)

# "balanced" class weights, converted to a float tensor on the chosen device.
# NOTE(review): train_labels holds the labels of the full train split, not of
# the train_subset sample selected above.
balanced_weights = compute_class_weight("balanced", classes=class_list, y=train_labels)
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)

def comp_loss(model, inputs, return_outputs=False):
    """Compute a class-weighted cross-entropy loss for one batch.

    NOTE(review): the Trainer set up in the visible cells does not reference
    this function, so it only takes effect if it is wired in elsewhere.

    Args:
        model: Callable invoked as ``model(**inputs)``; the returned object
            must expose a ``logits`` attribute.
        inputs: Mapping of keyword arguments for the model. Must contain a
            "labels" entry holding the target class ids.
        return_outputs: When True, return the model outputs alongside the loss.

    Returns:
        The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
    """
    labels = inputs.get("labels")
    outputs = model(**inputs)

    # The logits are produced by the forward pass; the model object itself
    # has no ``logits`` attribute.
    logits = outputs.logits

    # Keep the weight tensor on the same device as the logits so the loss works
    # even if the model was moved after class_weights was created.
    loss_func = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
    loss = loss_func(logits, labels)

    if return_outputs:
        return (loss, outputs)
    return loss

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
#training model

# output_dir="bert_imdb/bert_results" takes up a lot of space on a laptop
def comp_scores(eval_pred):
    """Turn an evaluation ``(logits, labels)`` pair into classification metrics.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``; the predicted class
            is the argmax of ``logits`` along axis 1.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    # zero_division=0 matches the final evaluation cell and avoids
    # undefined-metric warnings when only one class is predicted.
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }

# Evaluation and checkpointing both happen once per epoch, so no step-based
# save interval is configured (a save_steps value is not used with the
# "epoch" save strategy).
training_args = TrainingArguments(
    output_dir="bert_imdb/bert_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=wd,
    logging_dir="bert_imdb/bert_logs",
    logging_steps=10,
    #fp16=True,
    # Reload the checkpoint with the best "f1" once training finishes and keep
    # at most sv_limit checkpoints on disk.
    load_best_model_at_end=True,
    save_total_limit=sv_limit,
    metric_for_best_model="f1"
)

# NOTE(review): eval_dataset is the test subset, so the best checkpoint is
# selected on the same data that is later reported as the test result.
# NOTE(review): comp_loss is not passed to the Trainer here, so training uses
# the loss the model computes itself.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trained_dataset,
    eval_dataset=test_dataset,
    compute_metrics=comp_scores
)
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4136,0.347405,0.856,0.858921,0.844898,0.851852
2,0.369,0.290852,0.866,0.87395,0.84898,0.861284
3,0.2616,0.300044,0.866,0.856,0.873469,0.864646
4,0.2736,0.325272,0.872,0.854902,0.889796,0.872


TrainOutput(global_step=252, training_loss=0.38129079673025346, metrics={'train_runtime': 502.0126, 'train_samples_per_second': 23.904, 'train_steps_per_second': 0.502, 'total_flos': 789333166080000.0, 'train_loss': 0.38129079673025346, 'epoch': 4.0})

In [35]:
#evaluate model
# Run the trainer's evaluation loop on its configured eval dataset and show
# the resulting metric dictionary.
test_results = trainer.evaluate()

print("Test results: {}".format(test_results))


Test results: {'eval_loss': 0.3252720832824707, 'eval_accuracy': 0.872, 'eval_precision': 0.8549019607843137, 'eval_recall': 0.889795918367347, 'eval_f1': 0.872, 'eval_runtime': 6.3917, 'eval_samples_per_second': 78.226, 'eval_steps_per_second': 1.721, 'epoch': 4.0}


In [36]:
#save model
# Timestamp suffix: date, hour+minute, then microseconds (no seconds field).
curr_date = datetime.now().strftime("%Y%m%d_%H%M_%f")

# The model and the tokenizer are written to separate timestamped directories.
model_path = f"models/model_{curr_date}"
tokenizer_path = f"tokenizers/tokenizer_{curr_date}"

# Best effort: a failed save is reported but does not stop the notebook.
try:
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)
except Exception as e:
    print(f"Error with saving model and tokenizer: {e}")

In [37]:
#predictions


#used to prevent errors from both model and input not being in the same place (cuda or cpu)
model.to(device)
model.eval()

print("Beginning prediction")

# test_dataset is already tokenized and torch-formatted, so it is passed to the
# trainer directly and predicted exactly once.
predictions = trainer.predict(test_dataset)

# Despite the name, pred_prob holds the raw model outputs (presumably logits,
# not normalised probabilities); the predicted class is their argmax.
pred_prob = predictions.predictions
true_labels = predictions.label_ids
pred_classes = np.argmax(pred_prob, axis=1)

pred_list = list(pred_classes)
true_label = list(true_labels)

np_pred = np.array(pred_list)
np_label = np.array(true_label)

print("Ending prediction. Beginning evaluation")

if pred_list and true_label:
    try:
        accuracy = accuracy_score(np_label, np_pred)
        precision = precision_score(np_label, np_pred, zero_division=0)
        recall = recall_score(np_label, np_pred, zero_division=0)
        f1 = f1_score(np_label, np_pred, zero_division=0)
    except Exception as e:
        print(f"Error with calculating scores: {e}")
    else:
        # Only report the scores when all of them were computed; otherwise the
        # names would be undefined or stale from an earlier run.
        print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1: {f1}")
    print("Ending evaluation")
    cm = confusion_matrix(np_label, np_pred)
    print(f"Confusion matrix:\n{cm}")
else:
    print("Skipping evaluation")


Beginning prediction


Ending prediction. Beginning evaluation
Accuracy: 0.872
Precision: 0.8549019607843137
Recall: 0.889795918367347
F1: 0.872
Ending evaluation
Confusion matrix:
[[218  37]
 [ 27 218]]
