<a href="https://colab.research.google.com/github/tinywizzard/HAAI_Codes/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Packages

In [1]:
!pip install transformers   # from huggingface - pretrained transformer models
!pip install torchmetrics
!pip install datasets

Collecting torchmetrics
  Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.4.2-py3-none-any.whl (869 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m869.2/869.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.7-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.7 torchmetrics-1.4.2
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-many

## Load up the libraries

In [2]:
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertConfig
from tqdm import tqdm
import torch
import pickle
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")   # Run on GPU if available, else run on CPU

## A function to pre-process each line

In [3]:
# Clean data of html tags and leave only text
def preprocess(x):
    x = re.sub('<.*?>', ' ', x)     # remove all opening and closing tags
    x = re.sub('http\S+', ' ', x)   # remove all http or https tags
    x = re.sub('\s+', ' ', x)       # remove multiple 's' values
    return x.lower().strip()        # convert the entire remaining text to lower case and remove all preceding or following spaces or newlines, and return the data

## Helper functions to save and load pickle files

In [4]:
# Save the preprocessed data from the above step as a binary file
def save_pickle_file(object, file_name):
    fp = open(file_name, "wb")
    pickle.dump(object, fp)
    fp.close()

# Load preprocessed (binary) file for use in later runs - saves time from download and preprocessing
def load_picke_file(file_name):
    fp = open(file_name, "rb")
    data = pickle.load(fp)
    fp.close()
    return data

## This function converts input dataframe to transformer usable format

In [5]:
def pipeline(dataframe):
    # Pre-process the sentences
    dataframe['text'] = dataframe['text'].apply(lambda x: preprocess(x))

    # Pre-pend CLS token to each sentence
    sentences = ["[CLS] " + s for s in dataframe.text.values]

    # Extract labels
    labels = dataframe.label.values

    # # Tokenize each
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)  # Initialized the tokenizer
    tokenized = [tokenizer.tokenize(s) for s in tqdm(sentences)]    # tqdm for visualization of progress in processing

    # # Append the SEP token and also set a threshold for the number of tokens in a sentence
    MAX_LEN_TRAIN, MAX_LEN_TEST = 140, 140      # processing only 140 tokens as the important information will be available in the beginning of a review and this provides a good trade off between performance and speed
    tokenized = [t[:(MAX_LEN_TRAIN-1)] + ['SEP'] for t in tokenized] # truncate tokens larger than 140 and padd tokens smaller than 140 with zeros

    # # Generate IDs of each token and add padding to sentences smaller than given threshold
    ids = [tokenizer.convert_tokens_to_ids(t) for t in tqdm(tokenized)]
    ids = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)), mode='constant') for i in ids])

    # Also generate Attention masks. An attention mask is a binary tensor
    # that indicates the position of padded indices so that the model does not attend to them
    amasks = np.asarray([[float(i>0) for i in seq] for seq in tqdm(ids)])

    # Tokenize each
    # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    # tokenized = [tokenizer.tokenize(s) for s in tqdm(sentences)]

    # # Append the SEP token and also set a threshold for the number of tokens in a sentence
    # MAX_LEN_TRAIN, MAX_LEN_TEST = 140, 140
    # tokenized = [t[:(MAX_LEN_TRAIN-1)] + ['SEP'] for t in tokenized]

    # # Generate IDs of each token and add padding to sentences smaller than given threshold
    # ids = [tokenizer.convert_tokens_to_ids(t) for t in tqdm(tokenized)]

    return torch.tensor(ids), torch.tensor(labels), torch.tensor(amasks)

## Load the training and validation datasets

In [6]:
dataset = load_dataset('imdb')
df_train = dataset['train'].to_pandas()
display(df_train.head())

df_val = dataset['test'].to_pandas()
display(df_val.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


## Clean the data and store in BERT usable format

In [7]:
ids_train, labels_train, amasks_train = pipeline(df_train)
ids_val, labels_val, amasks_val = pipeline(df_val)

print(ids_train.shape, labels_train.shape, amasks_train.shape)
print(ids_val.shape, labels_val.shape, amasks_val.shape)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/25000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (687 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 25000/25000 [00:26<00:00, 931.16it/s]
100%|██████████| 25000/25000 [00:01<00:00, 16242.94it/s]
100%|██████████| 25000/25000 [00:01<00:00, 20607.15it/s]
  0%|          | 0/25000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 25000/25000 [00:26<00:00, 938.95it/s] 
100%|██████████| 25000/25000 [00:01<00:00, 12889.74it/s]
100%|██████████| 25000/25000 [00:01<00:00, 15191.54it/s]


torch.Size([25000, 140]) torch.Size([25000]) torch.Size([25000, 140])
torch.Size([25000, 140]) torch.Size([25000]) torch.Size([25000, 140])


## Generate the data loaders

In [8]:
train_set = TensorDataset(ids_train, amasks_train, labels_train)
train_dataloader = DataLoader(train_set, batch_size=32, shuffle=True)

val_set = TensorDataset(ids_val, amasks_val, labels_val)
val_dataloader = DataLoader(val_set, batch_size=32, shuffle=False)

## Now Create the model

In [17]:
# If you load a pre-trained model and train it using a small learning rate
# then it is known as fine-tuning.
# If you provide a config file then the BERT model is loaded without the
# pretrained weights. Training this model is known as training from scratch

# Loading your model this way loads a network without the pre-trained weights
# config = BertConfig.from_pretrained("bert-base-uncased")
# print(config)
# model = BertForSequenceClassification(config)
# # model = BertModel(config)
# print(model)

# Loading your model this way loads the pre-trained network
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

print(model)
# You can also replace the last classification layer with your own layer
# Based on your classification needs
# model.classifier = torch.nn.Linear(768, 10)
# print("\n\nNew BertModel:\n", model)
model = model.to(device)

# You can also freeze some of the layers in Bert
# If you freeze all the layers except the classification layer
# and train the model, then this is known as transfer learning
for idx, (name, param) in enumerate(model.named_parameters()):
    # print(idx, name)
    # if "classifier" in name or "bert.encoder.layer.9" in name or "bert.encoder.layer.10" in name:
    if "classifier" in name or "bert.encoder.layer.9" in name or "bert.encoder.layer.10" in name or "bert.encoder.layer.8" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

total_params = 0
for param in model.parameters():
    if param.requires_grad:
        total_params+= param.numel()
print(total_params)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Train and Test the model

In [None]:
epochs = 10
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003, eps=1e-8)
criterion = torch.nn.CrossEntropyLoss()
train_acc, val_acc = Accuracy(task="binary", num_classes=2).to(device), Accuracy(task="binary", num_classes=2).to(device)

# We will train the model for the specified number of epochs
for epoch in range(epochs):
    train_loss, val_loss = list(), list()
    print("\n\nEpoch:", epoch, "\n-----------------------\n")
    # Make sure model is in training mode
    model.train()
    # For each batch of data
    for idx, (x_ids, x_masks, x_labels) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        # Clear previous gradients
        optimizer.zero_grad()
        # Move the batch to the GPU
        x_ids, x_masks, x_labels = x_ids.to(device), x_masks.to(device), x_labels.to(device)
        # Perform predictions
        preds = model(x_ids, attention_mask=x_masks)
        # Save the current iteration's accuracy
        train_acc.update(torch.argmax(preds.logits, dim=1), x_labels)
        # Get the loss
        loss = criterion(preds.logits, x_labels)
        train_loss.append(loss.item())
        # Calculate the gradients
        loss.backward()
        # Update the parameters with the calculated gradients
        optimizer.step()

    # After each epoch, test the model
    model.eval()
    for idx, (x_ids, x_masks, x_labels) in tqdm(enumerate(val_dataloader), total=len(val_dataloader)):
        x_ids, x_masks, x_labels = x_ids.to(device), x_masks.to(device), x_labels.to(device)
        preds = model(x_ids, attention_mask=x_masks)
        loss = criterion(preds.logits, x_labels)
        val_loss.append(loss.item())
        val_acc.update(torch.argmax(preds.logits, dim=1), x_labels)

    # Finally print out the average train and val losses
    print("Train Loss =", sum(train_loss)/len(train_loss), "\tVal Loss =", sum(val_loss)/len(val_loss))
    # As well as the train and val accuracies
    print("Train Acc =", train_acc.compute().item(), "\tVal Acc =", val_acc.compute().item())



Epoch: 0 
-----------------------



100%|██████████| 782/782 [04:32<00:00,  2.87it/s]
100%|██████████| 782/782 [03:10<00:00,  4.11it/s]


Train Loss = 0.38887189286749074 	Val Loss = 0.3101799376928212
Train Acc = 0.8275200128555298 	Val Acc = 0.8640000224113464


Epoch: 1 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:10<00:00,  4.12it/s]


Train Loss = 0.3325956350721209 	Val Loss = 0.3311920837091897
Train Acc = 0.8428599834442139 	Val Acc = 0.8590400218963623


Epoch: 2 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]


Train Loss = 0.30460379509460134 	Val Loss = 0.30625899772033516
Train Acc = 0.852733314037323 	Val Acc = 0.862559974193573


Epoch: 3 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]


Train Loss = 0.28879409965575503 	Val Loss = 0.3175544942612462
Train Acc = 0.8596400022506714 	Val Acc = 0.863319993019104


Epoch: 4 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:10<00:00,  4.12it/s]


Train Loss = 0.2786368028548977 	Val Loss = 0.3024611115560431
Train Acc = 0.8646240234375 	Val Acc = 0.8642640113830566


Epoch: 5 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]


Train Loss = 0.26981904460093403 	Val Loss = 0.3313488500726307
Train Acc = 0.8684066534042358 	Val Acc = 0.864026665687561


Epoch: 6 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]


Train Loss = 0.2635537685631105 	Val Loss = 0.3478122784379784
Train Acc = 0.8712000250816345 	Val Acc = 0.8645371198654175


Epoch: 7 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]


Train Loss = 0.25914641845104336 	Val Loss = 0.31791383427713077
Train Acc = 0.8740749955177307 	Val Acc = 0.8650500178337097


Epoch: 8 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]


Train Loss = 0.24972503380302122 	Val Loss = 0.29816329774573025
Train Acc = 0.8766888976097107 	Val Acc = 0.8658888936042786


Epoch: 9 
-----------------------



100%|██████████| 782/782 [04:28<00:00,  2.91it/s]
100%|██████████| 782/782 [03:09<00:00,  4.12it/s]

Train Loss = 0.23742238366309448 	Val Loss = 0.3537437310692428
Train Acc = 0.8792999982833862 	Val Acc = 0.86490797996521



