In [3]:
%mkdir data
%cd data
!unzip /content/drive/MyDrive/dataset/IMDB_Dataset.csv.zip -d /content/data/

mkdir: cannot create directory ‘data’: File exists
/content/data
Archive:  /content/drive/MyDrive/dataset/IMDB_Dataset.csv.zip
replace /content/data/IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [99]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, AdamW, BertTokenizer

import pandas as pd
import numpy as np

import random

In [100]:
# Read the extracted IMDB reviews CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/content/data/IMDB Dataset.csv")
print(df.iloc[:5])

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [101]:
# Sanity check: list the distinct values of the label column.
print(set(df.sentiment))

{'negative', 'positive'}


In [102]:
# Encode the string labels as integer class ids.
class_to_id = {
    'negative': 0,
    'positive': 1,
}
# A plain `.map(class_to_id)` turns every value into NaN when this cell is run
# a second time, because the column then already holds 0/1, which are not keys
# of the mapping. Falling back to the current value keeps the cell re-runnable.
df['sentiment'] = df['sentiment'].map(
    lambda label: class_to_id.get(label, label)
)

In [103]:
# Sanity check after encoding: the label column should now hold only class ids.
print(set(df.sentiment))

{0, 1}


In [104]:
# Pull the review texts and their encoded labels out of the frame as arrays.
text = df['review'].values
labels = df['sentiment'].values

In [105]:
# Preprocessing: load the lower-casing tokenizer of the bert-base-uncased
# checkpoint; it is used below to inspect how a review gets tokenized.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [106]:
# Show how the review at index 10 is split into tokens.
sample_tokens = tokenizer.tokenize(text[10])
sample_tokens

['phil',
 'the',
 'alien',
 'is',
 'one',
 'of',
 'those',
 'qui',
 '##rky',
 'films',
 'where',
 'the',
 'humour',
 'is',
 'based',
 'around',
 'the',
 'odd',
 '##ness',
 'of',
 'everything',
 'rather',
 'than',
 'actual',
 'punch',
 '##lines',
 '.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'at',
 'first',
 'it',
 'was',
 'very',
 'odd',
 'and',
 'pretty',
 'funny',
 'but',
 'as',
 'the',
 'movie',
 'progressed',
 'i',
 'didn',
 "'",
 't',
 'find',
 'the',
 'jokes',
 'or',
 'odd',
 '##ness',
 'funny',
 'anymore',
 '.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'its',
 'a',
 'low',
 'budget',
 'film',
 '(',
 'that',
 '##s',
 'never',
 'a',
 'problem',
 'in',
 'itself',
 ')',
 ',',
 'there',
 'were',
 'some',
 'pretty',
 'interesting',
 'characters',
 ',',
 'but',
 'eventually',
 'i',
 'just',
 'lost',
 'interest',
 '.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'i',
 'imagine',
 'this',
 'film',
 'would',
 'appeal',
 'to',
 'a',
 'stone',
 '##r',
 'who',
 'i

In [107]:
# Tokenize the review at index 10, then map each token to its vocabulary id.
sample_tokens = tokenizer.tokenize(text[10])
tokenizer.convert_tokens_to_ids(sample_tokens)

[6316,
 1996,
 7344,
 2003,
 2028,
 1997,
 2216,
 21864,
 15952,
 3152,
 2073,
 1996,
 17211,
 2003,
 2241,
 2105,
 1996,
 5976,
 2791,
 1997,
 2673,
 2738,
 2084,
 5025,
 8595,
 12735,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 2012,
 2034,
 2009,
 2001,
 2200,
 5976,
 1998,
 3492,
 6057,
 2021,
 2004,
 1996,
 3185,
 12506,
 1045,
 2134,
 1005,
 1056,
 2424,
 1996,
 13198,
 2030,
 5976,
 2791,
 6057,
 4902,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 2049,
 1037,
 2659,
 5166,
 2143,
 1006,
 2008,
 2015,
 2196,
 1037,
 3291,
 1999,
 2993,
 1007,
 1010,
 2045,
 2020,
 2070,
 3492,
 5875,
 3494,
 1010,
 2021,
 2776,
 1045,
 2074,
 2439,
 3037,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 1045,
 5674,
 2023,
 2143,
 2052,
 5574,
 2000,
 1037,
 2962,
 2099,
 2040,
 2003,
 2747,
 2112,
 15495,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 2005,
 2242,
 2714,
 2021,
 2488,
 3046,
 1000,
 2567,
 2013,
 2178,
 4774,

In [108]:
class BERTDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for a BERT classifier.

  Args:
    texts: Indexable, sized collection of raw texts; each item is passed
      through ``str`` before tokenization.
    targets: Indexable, sized collection of integer class labels aligned
      with ``texts``.
    max_len: Every item is padded or truncated to exactly this many token ids.
    tokenizer: Optional tokenizer callable to reuse. When ``None`` (default)
      the "bert-base-uncased" ``BertTokenizer`` is loaded, as before.

  Raises:
    ValueError: If ``texts`` and ``targets`` differ in length.
  """

  def __init__(self, texts, targets, max_len=32, tokenizer=None):
    if len(texts) != len(targets):
      raise ValueError(
          "texts and targets must have the same length, got {} and {}".format(
              len(texts), len(targets)))
    self.data = texts
    self.targets = targets
    if tokenizer is None:
      # Default kept for backward compatibility with existing callers.
      tokenizer = BertTokenizer.from_pretrained(
          "bert-base-uncased",
          do_lower_case = True
      )
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.data)

  def __getitem__(self, idx):
    """Tokenize item ``idx`` and return it as a dict of ``torch.long`` tensors.

    Returns:
      dict with keys "ids", "mask" and "token_type_ids" (each built from the
      tokenizer output padded/truncated to ``max_len``) and "targets" (the
      label of the item).
    """
    text = str(self.data[idx])
    encoded = self.tokenizer(
        text,
        None,  # no second segment: single-sentence classification
        add_special_tokens = True,
        max_length = self.max_len,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        # Requested explicitly because "token_type_ids" is read below.
        return_token_type_ids = True,
    )

    return {
        "ids": torch.tensor(encoded['input_ids'], dtype=torch.long),
        "mask": torch.tensor(encoded['attention_mask'], dtype=torch.long),
        "token_type_ids": torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        "targets": torch.tensor(self.targets[idx], dtype=torch.long),
    }

In [109]:
import torch.nn as nn

class Base_Model(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_config: Name or path handed to ``BertModel.from_pretrained``.
        n_class: Number of output classes (size of each logit vector).
        drop_out: Dropout probability applied to the pooled encoder output
            before the classification head.
    """

    def __init__(self, bert_config='bert-base-uncased',
                 n_class=2, drop_out=0.2):
        super().__init__()
        self.bert_config = bert_config
        self.bert = BertModel.from_pretrained(self.bert_config)
        self.bert_drop = nn.Dropout(drop_out)
        # Take the head's input width from the loaded checkpoint rather than
        # hard-coding 768, so other BERT sizes work as well.
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)

    def forward(self, ids, mask, token_type_ids):
        """Return the class logits for a batch of token ids.

        ``ids``, ``mask`` and ``token_type_ids`` are forwarded to the encoder
        as input ids, attention mask and token type ids respectively.
        """
        output = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids)
        # The dropout layer was built but never applied, so `drop_out` had no
        # effect. Apply it to the pooled tensor (not to the whole output).
        pooled = self.bert_drop(output['pooler_output'])
        return self.out(pooled)

In [110]:
# Wrap the review/label arrays in the tokenizing dataset.
# NOTE(review): max_len=32 together with truncation keeps only the first 32
# token ids of each review - confirm this is intended.
data = BERTDataset(text, labels, max_len=32)

In [111]:
# Seeded random split: shuffle all positions once, hold out the last 20%.
torch.manual_seed(1)
n_samples = len(data)
indices = torch.randperm(n_samples).tolist()
split_idx = int(0.2 * n_samples)

# The last `split_idx` shuffled positions become validation, the rest training.
train_indices, val_indices = indices[:-split_idx], indices[-split_idx:]
trainset = torch.utils.data.Subset(data, train_indices)
valset = torch.utils.data.Subset(data, val_indices)
print("Total no.", n_samples, len(trainset), len(valset))

Total no. 50000 40000 10000


In [119]:
# Training hyper-parameters used by the loader, optimizer and training cells.
epochs = 5          # full passes over the training split
batch_size = 40     # examples per batch in both data loaders
learn_r = 3e-5      # learning rate handed to the optimizer
# NOTE(review): the scheduler cell passes num_warmup_steps = 0, so this value
# is not used by any visible cell - confirm whether warm-up was intended.
pretrain_warmup_steps = 30

In [120]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
# Validation only aggregates loss and accuracy over the whole split, so the
# order is irrelevant; a fixed order keeps evaluation deterministic.
val_loader = DataLoader(valset, batch_size=batch_size, shuffle=False)

In [121]:
# Build the two-class classifier on top of bert-base-uncased and move it to
# the GPU. The trailing bare expression makes the notebook echo the module tree.
model = Base_Model(n_class=2, bert_config='bert-base-uncased')
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Base_Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [122]:
# One scheduler step is taken per training batch. len(train_loader) counts a
# final partial batch, whereas int(len(trainset) / batch_size * epochs)
# undercounts whenever the split size is not a multiple of batch_size, which
# would let the learning rate reach zero before training ends.
num_training_steps = len(train_loader) * epochs

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the old class's defaults (1e-6 and 0.0) so the
# optimisation itself is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learn_r, eps=1e-6, weight_decay=0.0
)

# Linear decay of the learning rate to zero over all training steps.
# NOTE(review): pretrain_warmup_steps is defined with the other
# hyper-parameters but warm-up is 0 here - confirm which is intended.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

loss_fun = torch.nn.CrossEntropyLoss()

In [123]:
# Device the batches are moved to; falls back to the CPU when no GPU is visible.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score


def _run_batch(batch):
    """Move one batch to `device`, run the model, return (logits, targets, loss)."""
    ids = batch["ids"].to(device)
    mask = batch["mask"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)
    targets = batch["targets"].to(device)
    # The training path used to call ids.squeeze(1) while validation did not;
    # both paths now feed the ids to the model unchanged.
    logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    return logits, targets, loss_fun(logits, targets)


# Best validation accuracy seen so far. It has to live OUTSIDE the epoch loop:
# resetting it every epoch made each epoch count as "best", so best_model.pt
# simply ended up holding the last epoch.
val_acc_track = 0.0

for epoch in range(1, epochs+1):
    # ---- training pass ----
    current_loss = 0.0
    model.train()

    with tqdm(train_loader, unit="batch") as tloader:
        tloader.set_description("Epoch {}".format(epoch))
        for batch in tloader:
            optimizer.zero_grad()

            _, _, loss = _run_batch(batch)
            loss.backward()
            optimizer.step()
            scheduler.step()  # the schedule advances once per batch

            current_loss += loss.item()
            tloader.set_postfix(loss=loss.item())

    train_loss = current_loss/len(train_loader)
    print("Training Epoch {}, Training Loss {}".format(epoch, train_loss))

    # ---- validation pass ----
    model.eval()
    current_loss = 0.0

    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        with tqdm(val_loader, unit="batch") as tloader:
            tloader.set_description("Epoch {}".format(epoch))
            for batch in tloader:
                logits, targets, loss = _run_batch(batch)
                current_loss += loss.item()
                tloader.set_postfix(loss=loss.item())

                # Predicted class = index of the largest logit.
                pred = logits.argmax(dim=1)

                fin_targets.append(targets.cpu().numpy())
                fin_outputs.append(pred.cpu().numpy())

    val_loss = current_loss/len(val_loader)
    print("val epoch {}, val Loss {}".format(epoch, val_loss))
    target = np.concatenate(fin_targets)
    predicted = np.concatenate(fin_outputs)
    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the conventional order avoids surprises if the metric is swapped.
    accuracy = accuracy_score(target, predicted)
    print("Accuracy {}".format(accuracy))

    # Keep only the checkpoint with the best validation accuracy so far.
    if accuracy > val_acc_track:
        ckpt = {
            'model_dict': model.state_dict(),
            'optim_dict': optimizer.state_dict(),
            'eval_loss': val_loss,
            'accuracy': accuracy
        }

        save_path = 'best_model.pt'
        torch.save(ckpt, save_path)
        val_acc_track = accuracy

In [127]:
# Reference links, printed one per line so they appear in the cell output.
print("\n".join([
    "https://www.kaggle.com/code/jaskaransingh/bert-fine-tuning-with-pytorch",
    "https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2",
    "https://www.kaggle.com/code/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews/notebook",
]))

https://www.kaggle.com/code/jaskaransingh/bert-fine-tuning-with-pytorch
https://luv-bansal.medium.com/fine-tuning-bert-for-text-classification-in-pytorch-503d97342db2
https://www.kaggle.com/code/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews/notebook
