In [1]:
# https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

import nltk
import pandas as pd

In [2]:
data_path = "C://Users//alvin//DATA//dsc//cxc-2022//data//imdb_ds.csv"

raw_ds = pd.read_csv(data_path)
raw_ds.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
raw_ds["sentiment"].value_counts()

# perfectly balanced as all things should be

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [4]:
sentiment_dict = {
    "positive": 1,
    "negative": 0
}

raw_ds["sentiment_num"] = raw_ds["sentiment"].map(sentiment_dict)
raw_ds.head()

Unnamed: 0,review,sentiment,sentiment_num
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [5]:
import re

def cleanup(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]', '', sentence)
    return sentence

cleanup(raw_ds["review"].iloc[1])

'a wonderful little production br br the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece br br the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life br br the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [6]:
import numpy as np

glove_path = "C://Users//alvin//DATA//dsc//cxc-2022//NLP//glove.6B.50d.txt"
glove_dict = dict()
with open(glove_path,'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], np.float32)
        glove_dict[word] = vector

In [7]:
words, mats = glove_dict.keys(), np.stack(list(glove_dict.values()))

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
mats_10 = pca.fit_transform(mats)

In [9]:
glove_dict_10 = dict()
for i, word in enumerate(words):
    glove_dict_10[word] = mats_10[i]

In [10]:
def vectorize_glove(word):
    if word in glove_dict.keys():
        return glove_dict_10[word]
    else:
        return np.random.random((10))
    
print("hello: {} \n\nkjnkejnv: {}".format(vectorize_glove("hello"), vectorize_glove("kjnkejnv")))

hello: [ 1.7819184   0.9185053   0.27074027 -1.556231    1.1623983  -0.28482863
 -1.002999    0.7611644   0.51761925 -0.3000517 ] 

kjnkejnv: [0.90342017 0.40820559 0.18307293 0.70557698 0.18682414 0.55792556
 0.09812458 0.40294489 0.83655341 0.34016838]


In [11]:
MAX_SENT_LEN = 50

def vectorize_sentence(sentence):
    sent_vec = []
    words = list(reversed(nltk.word_tokenize(sentence)))
    for i in range(MAX_SENT_LEN):
        if i < len(words):
            sent_vec.append(vectorize_glove(words[i]))
        else:
            sent_vec.append(np.zeros(10))
            
    return np.stack(sent_vec)

vectorize_sentence(cleanup("I love maths"))

array([[ 1.46019065,  0.10174885,  0.2349833 ,  0.13381507,  0.13139261,
        -0.35644269,  0.13025667,  0.88861185, -0.53457463,  0.28422403],
       [ 3.91869497,  2.20386267,  0.37218884, -1.4228698 ,  0.74477488,
         0.23443703, -0.41543913,  0.64290857,  0.8048414 , -0.85233623],
       [ 4.72919226,  3.22979999, -0.10978653, -1.31802189,  0.91205353,
        -0.45005161,  0.00574686,  0.87387037,  0.3075569 , -0.8312102 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        

In [12]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    def __init__(self, isTest=False):
        self.raw_ds = pd.read_csv(data_path)
        sentiment_dict = {
            "positive": 1,
            "negative": 0
        }

        self.raw_ds["sentiment_num"] = self.raw_ds["sentiment"].map(sentiment_dict)
        
        train_ds, test_ds = train_test_split(self.raw_ds, test_size=0.2, random_state=42)
        if isTest:
            self.ds = test_ds
        else:
            self.ds = train_ds

    def __getitem__(self,idx):
        x = self.ds["review"].iloc[idx]
        y = self.ds["sentiment_num"].iloc[idx]
        return vectorize_sentence(cleanup(x)).flatten(), y
    
    def __len__(self):
        return len(self.ds)
    
train_sent = SentimentDataset()
test_sent = SentimentDataset(False)

train_iter = DataLoader(train_sent, batch_size=40, shuffle=True)
test_iter = DataLoader(test_sent, batch_size=40)

In [13]:
import torch
import torch.nn as nn

class LinearNet(nn.Module):
    def __init__(self, input_shape):
        super(LinearNet, self).__init__()
        self.fc1 = nn.Linear(input_shape,128)
        self.fc2 = nn.Linear(128,32)
        self.drop = nn.Dropout(0.2)
        self.fc3 = nn.Linear(32,1)
        
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.drop(self.fc3(x)))
        return x

In [14]:
EPOCHS = 60
LR = 0.01

In [15]:
torch.cuda.manual_seed_all(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LinearNet(50 * 10)
model = model.to(device)

loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=LR)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [16]:
train_acc = []
train_loss = []

test_acc = []
test_loss = []


for ep_idx in range(EPOCHS):
    model.train()
    corrects = 0
    total = 0
    losses = []
    for x, y in train_iter:
        x, y = x.to(device), y.to(device)
        outputs = model(x.float())
        outputs = torch.squeeze(outputs)
        loss = loss_fn(outputs, y.float())
        losses.append(loss.detach().numpy())
        preds = outputs.detach().numpy() > 0.5
        y_np = y.numpy()
        
        corrects += np.sum(preds==y_np)
        total += len(x)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    train_acc.append(corrects/total)
    train_loss.append(np.mean(losses))
    
    scheduler.step()
    
    model.eval()
    corrects = 0
    total = 0
    losses = []
    with torch.no_grad():
        for x, y in test_iter:
            x, y = x.to(device), y.to(device)
            outputs = model(x.float())
            outputs = torch.squeeze(outputs)
            loss = loss_fn(outputs, y.float())
            losses.append(loss.detach().numpy())
            preds = outputs.detach().numpy() > 0.5
            y_np = y.numpy()

            corrects += np.sum(preds==y_np)
            total += len(x)
            
    test_acc.append(corrects/total)
    test_loss.append(np.mean(losses))
    
    print("---EPOCH {}---".format(ep_idx))
    print("Train Acc {} || Loss {}".format(train_acc[-1], train_loss[-1]))
    print("Test Acc {} || Loss {}".format(test_acc[-1], test_loss[-1]))

---EPOCH 0---
Train Acc 0.5111 || Loss 0.6924773454666138
Test Acc 0.55065 || Loss 0.6890760660171509
---EPOCH 1---
Train Acc 0.535875 || Loss 0.6877182722091675
Test Acc 0.5778 || Loss 0.6816287040710449
---EPOCH 2---
Train Acc 0.5539 || Loss 0.6814061403274536
Test Acc 0.597925 || Loss 0.6723319292068481
---EPOCH 3---
Train Acc 0.5656 || Loss 0.6778176426887512
Test Acc 0.60515 || Loss 0.6665642857551575
---EPOCH 4---
Train Acc 0.572475 || Loss 0.6729364991188049
Test Acc 0.590025 || Loss 0.6683647036552429
---EPOCH 5---
Train Acc 0.57615 || Loss 0.6705861687660217
Test Acc 0.61105 || Loss 0.6593706011772156
---EPOCH 6---
Train Acc 0.57995 || Loss 0.6690576672554016
Test Acc 0.6055 || Loss 0.6619959473609924
---EPOCH 7---
Train Acc 0.581175 || Loss 0.6683685183525085
Test Acc 0.599075 || Loss 0.6617356538772583
---EPOCH 8---
Train Acc 0.58735 || Loss 0.6666016578674316
Test Acc 0.61645 || Loss 0.6555406451225281
---EPOCH 9---
Train Acc 0.5838 || Loss 0.6666612029075623
Test Acc 0.613

KeyboardInterrupt: 