In [1]:
# https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
import pandas as pd
import numpy as np

In [2]:
data_path = "C:\\Users\\arthur\\Documents\\Data Science Club\\cxc-2022\\NLP\\IMDB Dataset.csv"

raw_ds = pd.read_csv(data_path)
raw_ds.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
raw_ds["sentiment"].value_counts()

# perfectly balanced as all things should be

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [4]:
sentiment_dict = {
    "positive": 1,
    "negative": 0
}

raw_ds["sentiment_num"] = raw_ds["sentiment"].map(sentiment_dict)
raw_ds.head()

Unnamed: 0,review,sentiment,sentiment_num
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [5]:
import re

def cleanup(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]', '', sentence)
    return sentence

cleanup(raw_ds["review"].iloc[1])

'a wonderful little production br br the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece br br the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life br br the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [6]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, AutoModelForSequenceClassification

MAX_SENT_LEN = 50
MODEL_NAME = "bert-base-uncased"

class SentimentDataset(Dataset):
    def __init__(self, isTest=False):
        self.raw_ds = pd.read_csv(data_path)
        sentiment_dict = {
            "positive": 1,
            "negative": 0
        }
        self.MAX_LEN = MAX_SENT_LEN
        self.tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
        
        
        self.raw_ds["sentiment_num"] = self.raw_ds["sentiment"].map(sentiment_dict)
        
        train_ds, test_ds = train_test_split(self.raw_ds, test_size=0.2, random_state=42)
        if isTest:
            self.ds = test_ds
        else:
            self.ds = train_ds

    def __getitem__(self,idx):
        x = self.tokenizer(
            self.ds["review"].iloc[idx], 
            truncation=True, 
            max_length=self.MAX_LEN, 
            padding='max_length'
        )
        y = self.ds["sentiment_num"].iloc[idx]
        return np.array(x["input_ids"]), np.array(x["attention_mask"]), y
    
    def __len__(self):
        return len(self.ds)
    
train_sent = SentimentDataset()
test_sent = SentimentDataset(False)

train_iter = DataLoader(train_sent, batch_size=40, shuffle=True)
test_iter = DataLoader(test_sent, batch_size=40)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.classifier

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Linear(in_features=768, out_features=2, bias=True)

In [8]:
EPOCHS = 3
LR = 0.001

In [10]:
from transformers import AdamW, get_linear_schedule_with_warmup

torch.cuda.manual_seed_all(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_iter)*EPOCHS)

In [12]:
train_acc = []
train_loss = []

test_acc = []
test_loss = []


for ep_idx in range(EPOCHS):
    model.train()
    corrects = 0
    total = 0
    losses = []
    for input_ids, attention_mask, y in train_iter:
        input_ids, attention_mask, y = input_ids.to(device), attention_mask.to(device), y.to(device)
        outputs = model(input_ids.type(torch.long), attention_mask)
        logits = outputs["logits"]
        
        logits = torch.squeeze(logits[:, 0])
        loss = loss_fn(logits, y.float())
        losses.append(loss.cpu().detach().numpy())
        preds = logits.cpu().detach().numpy() > 0.5
        y_np = y.cpu().detach().numpy()
        
        corrects += np.sum(preds==y_np)
        total += len(input_ids)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    train_acc.append(corrects/total)
    train_loss.append(np.mean(losses))
    
    scheduler.step()
    
    model.eval()
    corrects = 0
    total = 0
    losses = []
    with torch.no_grad():
        for input_ids, attention_mask, y in test_iter:
            input_ids, attention_mask, y = input_ids.to(device), attention_mask.to(device), y.to(device)
            outputs = model(input_ids.type(torch.long), attention_mask)
            logits = outputs["logits"]
            logits = torch.squeeze(logits[:, 0])
            loss = loss_fn(logits, y.float())
            losses.append(loss.cpu().detach().numpy())
            preds = logits.cpu().detach().numpy() > 0.5
            y_np = y.cpu().detach().numpy()

            corrects += np.sum(preds==y_np)
            total += len(input_ids)
            
    test_acc.append(corrects/total)
    test_loss.append(np.mean(losses))
    
    print("---EPOCH {}---".format(ep_idx))
    print("Train Acc {} || Loss {}".format(train_acc[-1], train_loss[-1]))
    print("Test Acc {} || Loss {}".format(test_acc[-1], test_loss[-1]))

KeyboardInterrupt: 

In [None]:
torch.save(model, './saved_transformer.ckpt')

In [13]:
model = torch.load('./saved_transformer.ckpt')