In [None]:
!pip install transformers
!pip install torchmetrics

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
Col

## Load the libraries

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertConfig
from tqdm import tqdm
import torch
import pickle
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

## A function to pre-process each line

In [None]:
def preprocess(x):
    """Normalize one raw text string before tokenization.

    Applied in order: every ``<...>`` tag is replaced by a space, every run
    starting with ``http`` up to the next whitespace is replaced by a space,
    whitespace runs are collapsed to a single space, and the result is
    lower-cased and stripped.

    Args:
        x: Raw text of a single example.

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw strings are required here: in a plain literal `\S` and `\s` are
    # invalid Python escape sequences and raise a Deprecation/SyntaxWarning.
    x = re.sub(r'<.*?>', ' ', x)
    x = re.sub(r'http\S+', ' ', x)
    x = re.sub(r'\s+', ' ', x)
    return x.lower().strip()

## Helper functions to save and load pickle files

In [None]:
def save_pickle_file(object, file_name):
    """Serialize ``object`` to ``file_name`` using pickle.

    Args:
        object: Any picklable Python object.
        file_name: Destination path; an existing file is overwritten.
    """
    # The context manager closes the handle even if pickle.dump raises
    # (e.g. on an unpicklable object), which the manual open/close did not.
    with open(file_name, "wb") as fp:
        pickle.dump(object, fp)


def load_pickle_file(file_name):
    """Load and return the object stored in the pickle file ``file_name``.

    Args:
        file_name: Path of a file previously written with pickle.

    Returns:
        The deserialized Python object.
    """
    # SECURITY: unpickling can execute arbitrary code — only load files that
    # were produced by a trusted source.
    # The context manager closes the handle even if pickle.load raises.
    with open(file_name, "rb") as fp:
        return pickle.load(fp)

## Convert an input dataframe into a transformer-usable format

In [None]:
def pipeline(dataframe, max_len=140):
    """Turn a dataframe with ``text`` and ``label`` columns into BERT input tensors.

    Each text is cleaned with ``preprocess``, prefixed with ``[CLS]``, tokenized
    with the ``bert-base-uncased`` tokenizer, truncated so that the separator
    token fits, terminated with the separator token, converted to vocabulary
    ids, and right-padded with zeros up to ``max_len``.

    The input dataframe is not modified.

    Args:
        dataframe: pandas DataFrame providing ``text`` (strings) and ``label``
            columns.
        max_len: Fixed number of token ids per example, including the leading
            ``[CLS]`` and the trailing separator. Defaults to 140.
            NOTE(review): values above the model's position-embedding size
            (512 for bert-base-uncased) would not be usable downstream.

    Returns:
        A tuple ``(ids, labels, attention_masks)`` of torch tensors. ``ids`` and
        ``attention_masks`` have one row of ``max_len`` entries per example;
        the mask is 1.0 where the id is non-zero and 0.0 on padding.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    # Clean into a local Series instead of writing back into the caller's
    # dataframe, so calling this function has no side effect on its input.
    texts = dataframe['text'].apply(preprocess)

    # Pre-pend the CLS token to each sentence
    sentences = ["[CLS] " + s for s in texts.values]

    # Extract labels
    labels = dataframe['label'].values

    # Tokenize each sentence
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tokenized = [tokenizer.tokenize(s) for s in tqdm(sentences)]

    # Truncate to leave room for one separator, then append the tokenizer's real
    # separator token. The bare string 'SEP' is not a vocabulary entry and would
    # be converted to the unknown-token id.
    sep_token = tokenizer.sep_token
    tokenized = [t[:max_len - 1] + [sep_token] for t in tokenized]

    # Generate the id of each token and zero-pad sequences shorter than max_len
    ids = [tokenizer.convert_tokens_to_ids(t) for t in tqdm(tokenized)]
    ids = np.array([np.pad(i, (0, max_len - len(i)), mode='constant') for i in ids])

    # Attention mask: 1.0 for real tokens, 0.0 for padding (id 0), so the model
    # does not attend to padded positions. Computed in one vectorized step.
    amasks = (ids > 0).astype(float)

    return torch.tensor(ids), torch.tensor(labels), torch.tensor(amasks)

## Load the training and validation datasets

In [None]:
# Folder holding the IMDB CSV splits. Change this single constant to point the
# notebook at another location.
# NOTE(review): this path assumes Google Drive is already mounted at
# /content/drive; no mount step is visible in this notebook — confirm.
DATA_DIR = "/content/drive/MyDrive/IMDB Dataset"

df_train = pd.read_csv(f"{DATA_DIR}/Train.csv")
# Fail fast if the columns consumed by pipeline() are missing.
assert {"text", "label"} <= set(df_train.columns), "Train.csv must provide 'text' and 'label' columns"
display(df_train.head())

df_val = pd.read_csv(f"{DATA_DIR}/Valid.csv")
assert {"text", "label"} <= set(df_val.columns), "Valid.csv must provide 'text' and 'label' columns"
display(df_val.head())

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


Unnamed: 0,text,label
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0
2,The Guidelines state that a comment must conta...,0
3,This movie is a muddled mish-mash of clichés f...,0
4,Before Stan Laurel became the smaller half of ...,0


## Clean the data and convert it to a BERT-usable format

In [None]:
# Convert both splits into (token ids, labels, attention masks) tensors.
ids_train, labels_train, amasks_train = pipeline(df_train)
ids_val, labels_val, amasks_val = pipeline(df_val)

# Report the tensor shapes of each split: ids, attention masks, labels.
print(*(tensor.shape for tensor in (ids_train, amasks_train, labels_train)))
print(*(tensor.shape for tensor in (ids_val, amasks_val, labels_val)))

  0%|          | 0/40000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1295 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 40000/40000 [01:04<00:00, 622.67it/s]
100%|██████████| 40000/40000 [00:03<00:00, 12471.77it/s]
100%|██████████| 40000/40000 [00:02<00:00, 17949.93it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (729 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 5000/5000 [00:05<00:00, 853.43it/s]
100%|██████████| 5000/5000 [00:00<00:00, 8271.89it/s]
100%|██████████| 5000/5000 [00:00<00:00, 10968.13it/s]


torch.Size([40000, 140]) torch.Size([40000, 140]) torch.Size([40000])
torch.Size([5000, 140]) torch.Size([5000, 140]) torch.Size([5000])


## Generate the data loaders

In [None]:
# Bundle each split as (ids, attention mask, label) triples.
train_set = TensorDataset(ids_train, amasks_train, labels_train)
val_set = TensorDataset(ids_val, amasks_val, labels_val)

# Batches of 32: the training split is reshuffled by the loader, the
# validation split is iterated in its stored order.
train_dataloader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
val_dataloader = DataLoader(dataset=val_set, batch_size=32, shuffle=False)

## Create the model

In [None]:
# Terminology:
#   * Fine-tuning: load a pre-trained model and train it with a small learning rate.
#   * Training from scratch: build the model from a config only, which gives a
#     BERT network without the pre-trained weights.

# Variant without pre-trained weights (kept for reference):
# config = BertConfig.from_pretrained("bert-base-uncased")
# print(config)
# model = BertForSequenceClassification(config)

# Variant used here: the pre-trained network.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

print(model)
# The final classification layer can be swapped for a custom one to match
# your classification task:
# model.classifier = torch.nn.Linear(768, 10)
# print("\n\nNew BertModel:\n", model)
model = model.to(device)

# Layers of BERT can be frozen. Freezing everything except the classification
# layer and then training is known as transfer learning.
# Here only the classifier and encoder layers 9 and 10 remain trainable.
# NOTE(review): encoder layer 11 (the last of the 12 printed layers) stays
# frozen while 9 and 10 are trained — confirm this is intended.
for name, param in model.named_parameters():
    param.requires_grad = (
        "classifier" in name
        or "bert.encoder.layer.9" in name
        or "bert.encoder.layer.10" in name
    )

# Number of scalar parameters that will receive gradient updates.
total_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(total_params)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Train and validate the model

In [None]:
# Train for a fixed number of epochs; after every epoch evaluate on the
# validation loader and print the mean loss and the accuracy of that epoch.
epochs = 10
# NOTE(review): lr=3e-4 is well above the 2e-5..5e-5 range commonly used when
# fine-tuning BERT, and the recorded train loss plateaus around 0.58 — consider
# lowering it.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003, eps=1e-8)
criterion = torch.nn.CrossEntropyLoss()
train_acc, val_acc = Accuracy(task="binary", num_classes=2).to(device), Accuracy(task="binary", num_classes=2).to(device)

# We will train the model for the specified number of epochs
for epoch in range(epochs):
    train_loss, val_loss = list(), list()
    print("\n\nEpoch:", epoch, "\n-----------------------\n")
    # torchmetrics accumulates state across update() calls until reset();
    # clear it so the printed accuracies describe this epoch only rather than
    # a running average over all epochs so far.
    train_acc.reset()
    val_acc.reset()
    # Make sure model is in training mode
    model.train()
    # For each batch of data
    for x_ids, x_masks, x_labels in tqdm(train_dataloader):
        # Clear previous gradients
        optimizer.zero_grad()
        # Move the batch to the selected device
        x_ids, x_masks, x_labels = x_ids.to(device), x_masks.to(device), x_labels.to(device)
        # Perform predictions
        preds = model(x_ids, attention_mask=x_masks)
        # Accumulate this batch into the epoch's training accuracy
        train_acc.update(torch.argmax(preds.logits, dim=1), x_labels)
        # Get the loss
        loss = criterion(preds.logits, x_labels)
        train_loss.append(loss.item())
        # Calculate the gradients
        loss.backward()
        # Update the parameters with the calculated gradients
        optimizer.step()

    # After each epoch, evaluate the model on the validation set
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building the graph and saves memory and time.
    with torch.no_grad():
        for x_ids, x_masks, x_labels in tqdm(val_dataloader):
            x_ids, x_masks, x_labels = x_ids.to(device), x_masks.to(device), x_labels.to(device)
            preds = model(x_ids, attention_mask=x_masks)
            loss = criterion(preds.logits, x_labels)
            val_loss.append(loss.item())
            val_acc.update(torch.argmax(preds.logits, dim=1), x_labels)

    # Finally print out the average train and val losses
    print("Train Loss =", sum(train_loss)/len(train_loss), "\tVal Loss =", sum(val_loss)/len(val_loss))
    # As well as the train and val accuracies
    print("Train Acc =", train_acc.compute().item(), "\tVal Acc =", val_acc.compute().item())



Epoch: 0 
-----------------------



100%|██████████| 1250/1250 [05:10<00:00,  4.03it/s]
100%|██████████| 157/157 [00:39<00:00,  3.93it/s]


Train Loss = 0.6061540453672409 	Val Loss = 0.5256230523631831
Train Acc = 0.671999990940094 	Val Acc = 0.7373999953269958


Epoch: 1 
-----------------------



100%|██████████| 1250/1250 [05:28<00:00,  3.80it/s]
100%|██████████| 157/157 [00:40<00:00,  3.87it/s]


Train Loss = 0.5830809671878815 	Val Loss = 0.5272037870944686
Train Acc = 0.6835874915122986 	Val Acc = 0.7366999983787537


Epoch: 2 
-----------------------



100%|██████████| 1250/1250 [05:29<00:00,  3.80it/s]
100%|██████████| 157/157 [00:40<00:00,  3.90it/s]


Train Loss = 0.5790172069787979 	Val Loss = 0.5278258102524812
Train Acc = 0.6895166635513306 	Val Acc = 0.7350000143051147


Epoch: 3 
-----------------------



100%|██████████| 1250/1250 [05:29<00:00,  3.80it/s]
100%|██████████| 157/157 [00:40<00:00,  3.89it/s]


Train Loss = 0.5808473482847214 	Val Loss = 0.5628499269105827
Train Acc = 0.6918187737464905 	Val Acc = 0.7281500101089478


Epoch: 4 
-----------------------



100%|██████████| 1250/1250 [05:29<00:00,  3.80it/s]
100%|██████████| 157/157 [00:40<00:00,  3.90it/s]


Train Loss = 0.5832845384120942 	Val Loss = 0.5090947737739344
Train Acc = 0.6930750012397766 	Val Acc = 0.7317600250244141


Epoch: 5 
-----------------------



100%|██████████| 1250/1250 [05:29<00:00,  3.80it/s]
100%|██████████| 157/157 [00:40<00:00,  3.89it/s]


Train Loss = 0.5875871228218079 	Val Loss = 0.5028937628884225
Train Acc = 0.6936416625976562 	Val Acc = 0.7355666756629944


Epoch: 6 
-----------------------



  6%|▌         | 70/1250 [00:18<05:15,  3.74it/s]


KeyboardInterrupt: ignored