In [None]:
from google.colab import drive

# Mount Google Drive so the CSV files and checkpoints under /content/drive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Standard library
import re

# Third-party
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

# Report whether a CUDA device is visible to PyTorch in this runtime.
torch.cuda.is_available()

True

In [None]:
# Load the training and test splits from Drive, in that order.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/TA/A1/train.csv',
        '/content/drive/MyDrive/TA/A1/test.csv',
    )
)

In [None]:
# Binary target: 1 where sentiment is exactly "positive", 0 for anything else.
train_dataset['label'] = train_dataset['sentiment'].eq("positive").astype('int64')
test_dataset['label'] = test_dataset['sentiment'].eq("positive").astype('int64')

In [None]:
# The textual sentiment column is redundant once the numeric label exists.
train_dataset = train_dataset.drop(columns=['sentiment'])
test_dataset = test_dataset.drop(columns=['sentiment'])

In [None]:
# Preview the first rows of the training frame after label encoding.
train_dataset.head(n=5)

Unnamed: 0,review,label
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,0
1,"If you want mindless action, hot chicks and a ...",1
2,"""The Woman in Black"" is easily one of the cree...",1
3,I can barely find the words to describe how mu...,0
4,What's in here ?! Let me tell you. It's the pr...,0


In [None]:
# Preview the first rows of the test frame after label encoding.
test_dataset.head(n=5)

Unnamed: 0,review,label
0,Steven Rea plays a forensic scientist thrust o...,1
1,As the first of the TV specials offered on the...,1
2,There may something poetically right in seeing...,0
3,all i can say about this film is to read the b...,0
4,I thought it was a pretty good movie and shoul...,1


In [None]:
def cleaning(s):
    """Normalize one raw review into plain text before tokenization.

    Args:
        s: The review. Any value is accepted; it is coerced with ``str`` first.

    Returns:
        str: The review with HTML line breaks, digits, a few symbols and
        redundant whitespace removed.
    """
    s = str(s)
    # HTML line breaks carry no meaning for the classifier.
    s = s.replace("<br />", " ")
    # A whitespace character followed by a non-word character becomes one space.
    s = re.sub(r'\s\W', ' ', s)
    # A non-word character followed by a comma and whitespace becomes one space.
    s = re.sub(r'\W,\s', ' ', s)
    # Drop every run of digits.
    s = re.sub(r'\d+', '', s)
    # Drop a handful of symbols outright.
    s = re.sub(r'[!@#$_]', '', s)
    # The former `s.replace("co", "")` step is intentionally gone: it removed
    # the letters "co" from inside ordinary words ("comedy" -> "medy").
    s = s.replace("https", "")
    # NOTE(review): this is a literal match of the four characters `[\w*`, not
    # a regex; kept as in the original, confirm whether a pattern was intended.
    s = s.replace(r"[\w*", " ")
    # ".:" is what remains of values such as "1.37:1" once digits are removed.
    s = s.replace(".:", " ")
    # Collapse whitespace last so spaces inserted by the steps above are merged too.
    s = re.sub(r'\s+', ' ', s)
    return s

# Clean the review text of both frames (test first, then train).
for reviews_frame in (test_dataset, train_dataset):
    reviews_frame['review'] = reviews_frame['review'].apply(cleaning)

In [None]:
# Preview the training frame again to eyeball the effect of cleaning.
train_dataset.head(n=5)

Unnamed: 0,review,label
0,SAPS AT SEA Aspect ratio: Sound format: Mono...,0
1,"If you want mindless action, hot chicks and a ...",1
2,"""The Woman in Black"" is easily one of the cree...",1
3,I can barely find the words to describe how mu...,0
4,What's in here Let me tell you. It's the pres...,0


In [None]:
# Separate model inputs (review text) from targets (label) for each split.
train_texts = train_dataset['review']
train_labels = train_dataset['label']
test_texts = test_dataset['review']
test_labels = test_dataset['label']

In [None]:
# Renumber both text series 0..n-1 in place, discarding the previous index.
for text_series in (train_texts, test_texts):
    text_series.reset_index(drop=True, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation partition is the same on every run.
SPLIT_SEED = 42

# Hold out 20% of the training data for validation. Stratifying keeps the
# positive/negative ratio the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=.2,
    random_state=SPLIT_SEED,
    stratify=train_labels,
)

# The split keeps the original row labels; renumber 0..n-1 so later positional
# lookups line up. Rebinding (instead of inplace=True) avoids mutating objects
# derived from the source frame.
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)

In [None]:
# Sanity check: texts and labels must have equal length within each split.
for split_texts, split_labels in (
    (train_texts, train_labels),
    (test_texts, test_labels),
    (val_texts, val_labels),
):
    print(len(split_texts), len(split_labels))

24000 24000
20000 20000
6000 6000


In [None]:
from transformers import DistilBertTokenizerFast

# Tokenizer that matches the DistilBERT checkpoint used for fine-tuning.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# The texts are a pandas Series; they are converted with list(...) for the tokenizer call.
tuple(type(obj) for obj in (train_texts, list(train_texts)))

(pandas.core.series.Series, list)

In [None]:
def _encode_texts(texts):
    """Tokenize a collection of reviews with truncation and padding enabled."""
    return tokenizer(list(texts), truncation=True, padding=True)


train_encodings = _encode_texts(train_texts)
test_encodings = _encode_texts(test_texts)
val_encodings = _encode_texts(val_texts)

In [None]:
import torch

class IMDBdataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            with one entry per example.
        labels: One label per example. A pandas Series, NumPy array, tensor or
            plain sequence is accepted.

    Raises:
        AssertionError: If any encoding field has a different number of
            entries than ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store labels as a plain list so ``self.labels[idx]`` is always
        # positional. Indexing a pandas Series with an int is label-based and
        # fails (or returns the wrong row) when its index is not 0..n-1.
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

        # Ensure all encodings and labels have consistent lengths
        assert all(len(val) == len(self.labels) for val in self.encodings.values()), "Encodings and labels lengths do not match."

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [None]:
# Wrap each split's encodings and labels in a torch Dataset.
# Note: train_dataset / test_dataset are rebound here from DataFrames to IMDBdataset objects.
train_dataset = IMDBdataset(encodings=train_encodings, labels=train_labels)
test_dataset = IMDBdataset(encodings=test_encodings, labels=test_labels)
val_dataset = IMDBdataset(encodings=val_encodings, labels=val_labels)

In [None]:
from torch.utils.data import DataLoader
# AdamW is taken from PyTorch: the `transformers.AdamW` class is deprecated and
# is not available in recent transformers releases. Note that PyTorch's default
# weight_decay (0.01) differs from the old transformers default (0.0).
from torch.optim import AdamW
from transformers import DistilBertForSequenceClassification

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the pretrained checkpoint with a sequence-classification head and place it on `device`.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Switch the model to training mode; the cell output is the module summary.
model.train(mode=True)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
# The test loader previously used batch_size=1, i.e. one forward pass per review.
# The prediction loops extend their result lists per batch, so a larger batch
# only cuts the number of forward passes. No gradients are kept at test time,
# so memory use is no higher than during validation.
TEST_BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep dataset order so
# predictions can be paired with labels by position.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)

In [None]:
# Optimizer over all model parameters.
LEARNING_RATE = 5e-5
optim = AdamW(model.parameters(), lr=LEARNING_RATE)



In [None]:
from tqdm import tqdm
import time

# Single source of truth for the number of epochs (loop bound and log message).
NUM_EPOCHS = 3


def _batch_loss(batch):
    """Move one batch to `device`, run the model with labels and return the loss tensor."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    # With labels supplied, the first output element is used as the loss.
    return outputs[0]


total_start_time = time.time()  # Start timing for the total training time

for epoch in range(NUM_EPOCHS):
    epoch_start_time = time.time()  # Start timing for the epoch

    # Training phase
    model.train()  # Enable training-mode behaviour (e.g. dropout)
    train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
        optim.zero_grad()  # Clear gradients left over from the previous step
        loss = _batch_loss(batch)
        loss.backward()
        optim.step()
        train_loss += loss.item()
    train_loss /= len(train_dataloader)  # Mean loss over all training batches

    # Validation phase
    model.eval()  # Switch to evaluation mode
    val_loss = 0
    with torch.no_grad():  # No gradients needed: faster and uses less memory
        for batch in val_dataloader:
            val_loss += _batch_loss(batch).item()
    val_loss /= len(val_dataloader)  # Mean loss over all validation batches

    epoch_end_time = time.time()  # End timing for the epoch (training + validation)

    # Print the losses and the time taken for the epoch
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}: Train loss = {train_loss:.4f}, Validation loss = {val_loss:.4f}")
    print(f"Time taken for epoch {epoch+1}: {epoch_end_time - epoch_start_time:.2f} seconds")

total_end_time = time.time()  # End timing for the total training time
print(f"Total training time: {total_end_time - total_start_time:.2f} seconds")

100%|██████████| 1500/1500 [05:54<00:00,  4.23it/s]


Epoch 1/3: Train loss = 0.2642, Validation loss = 0.2017
Time taken for epoch 1: 383.64 seconds


100%|██████████| 1500/1500 [05:51<00:00,  4.27it/s]


Epoch 2/3: Train loss = 0.1457, Validation loss = 0.2502
Time taken for epoch 2: 380.19 seconds


100%|██████████| 1500/1500 [05:51<00:00,  4.26it/s]


Epoch 3/3: Train loss = 0.0771, Validation loss = 0.2307
Time taken for epoch 3: 380.85 seconds
Total training time: 1144.70 seconds


In [None]:
# Drive folder for the fine-tuned tokenizer/model checkpoint; a later cell reloads from it.
save_directory = "/content/drive/MyDrive/TA/A1/FT_DistilBERT1"

In [None]:
# Persist the tokenizer and the fine-tuned model. The next cell reloads both
# from `save_directory`, so without this step a fresh run would either fail or
# evaluate a checkpoint left over from an earlier session.
# NOTE: this overwrites any checkpoint already stored in `save_directory`.
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [None]:
# NOTE(review): TFAutoModel is imported but not used in this cell.
from transformers import (
    AutoTokenizer,
    DistilBertForSequenceClassification,
    TFAutoModel,
)

# Reload the tokenizer and classifier from `save_directory`.
# return_dict=False is passed through unchanged; the evaluation cell reads outputs[0].
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = DistilBertForSequenceClassification.from_pretrained(
    save_directory,
    return_dict=False,
)

In [None]:
# Move the reloaded model to `device`; the cell output is the module summary.
model.to(device=device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm
model.eval()  # Ensure the model is in evaluation mode

# Predicted class ids and the matching ground-truth labels, in loader order.
predictions = []
actuals = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # Only the model inputs need to live on the model's device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        # No labels are passed, so the first output element is taken as the logits;
        # argmax over the class dimension gives the predicted label.
        logits = outputs[0]
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

        # Labels are only recorded, never fed to the model, so they stay on the
        # CPU. Collecting them here pairs them with the predictions of the same batch.
        actuals.extend(batch['labels'].numpy())

100%|██████████| 20000/20000 [02:27<00:00, 135.68it/s]


In [None]:
# Confirm the container type of the collected predictions.
type(predictions)

list

In [None]:
# Collect ground-truth labels in loader order. This matches the order of
# `predictions` because the test loader is built with shuffle=False.
# Labels are read directly on the CPU: no model is involved, so neither a
# device transfer nor torch.no_grad() is needed.
actual_labels = []
for batch in tqdm(test_dataloader):
    actual_labels.extend(batch['labels'].numpy())



100%|██████████| 20000/20000 [00:10<00:00, 1843.86it/s]


In [None]:
# Accuracy = share of positions where prediction and ground truth agree.
is_match = [pred == actual for pred, actual in zip(predictions, actual_labels)]
correct_predictions = sum(is_match)
accuracy = correct_predictions / len(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")



Accuracy: 91.94%


In [None]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    matthews_corrcoef,
)

# Every metric takes (ground truth, predictions) in that order.
metric_args = (actual_labels, predictions)
precision = precision_score(*metric_args)
recall = recall_score(*metric_args)
f1 = f1_score(*metric_args)
conf_matrix = confusion_matrix(*metric_args)
mcc = matthews_corrcoef(*metric_args)

# ROC-AUC is not reported: roc_auc_score needs positive-class scores, and the
# prediction loop keeps only the argmax class ids.

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Matthews Correlation Coefficient: {mcc:.2f}")


Precision: 0.93
Recall: 0.91
F1 Score: 0.92
Confusion Matrix:
[[9255  680]
 [ 933 9132]]
Matthews Correlation Coefficient: 0.84


# Raw Model Performance

In [None]:
# Baseline: the pretrained checkpoint without fine-tuning. Its classification
# head is newly initialized when loaded this way (see the warning printed by
# from_pretrained), so this measures an untrained classifier.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
raw_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
raw_model = raw_model.to(device=device)
raw_model.eval()


# Predicted class ids and the matching ground-truth labels, in loader order.
raw_predictions = []
raw_actuals = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # Only the model inputs need to live on the model's device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = raw_model(input_ids, attention_mask=attention_mask)

        # No labels are passed, so the first output element is taken as the logits;
        # argmax over the class dimension gives the predicted label.
        logits = outputs[0]
        raw_predicted_labels = torch.argmax(logits, dim=1)
        raw_predictions.extend(raw_predicted_labels.cpu().numpy())

        # Labels are only recorded, never fed to the model, so they stay on the CPU.
        raw_actuals.extend(batch['labels'].numpy())

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20000/20000 [02:28<00:00, 134.70it/s]


In [None]:
# Collect ground-truth labels in loader order. This matches the order of
# `raw_predictions` because the test loader is built with shuffle=False.
# Labels are read directly on the CPU: no model is involved, so neither a
# device transfer nor torch.no_grad() is needed.
raw_actual_labels = []
for batch in tqdm(test_dataloader):
    raw_actual_labels.extend(batch['labels'].numpy())



100%|██████████| 20000/20000 [00:08<00:00, 2302.04it/s]


In [None]:
# Accuracy of the baseline model = share of positions where prediction and label agree.
raw_is_match = [pred == actual for pred, actual in zip(raw_predictions, raw_actual_labels)]
raw_correct_predictions = sum(raw_is_match)
raw_accuracy = raw_correct_predictions / len(raw_predictions)
print(f"Accuracy: {raw_accuracy * 100:.2f}%")

Accuracy: 50.33%


In [None]:
# Same metrics as for the fine-tuned model, computed on the baseline predictions.
# Every metric takes (ground truth, predictions) in that order.
raw_metric_args = (raw_actual_labels, raw_predictions)
precision = precision_score(*raw_metric_args)
recall = recall_score(*raw_metric_args)
f1 = f1_score(*raw_metric_args)
conf_matrix = confusion_matrix(*raw_metric_args)
mcc = matthews_corrcoef(*raw_metric_args)

# ROC-AUC is not reported: roc_auc_score needs positive-class scores, and the
# prediction loop keeps only the argmax class ids.

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Matthews Correlation Coefficient: {mcc:.2f}")


Precision: 0.50
Recall: 1.00
F1 Score: 0.67
Confusion Matrix:
[[    2  9933]
 [    1 10064]]
Matthews Correlation Coefficient: 0.00
