In [25]:
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from gensim.models import Word2Vec, Phrases
import numpy as np
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
import gensim

In [3]:
folder = './aclImdb'
labels = {'pos': 1, 'neg': 0}

In [4]:
df_train = pd.DataFrame(columns = ['review', 'sentiment', 'rating'])
df_test = pd.DataFrame(columns = ['review', 'sentiment', 'rating'])


f = 'train'   
for l in ('pos', 'neg'):
    path = os.path.join(folder, f, l)
    for file in os.listdir(path) :
        with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
            txt = infile.read()
        df_train.loc[df_train.shape[0],:] = [txt, labels[l],file[:-4].split('_')[1]]

f = 'test'   
for l in ('pos', 'neg'):
    path = os.path.join(folder, f, l)
    for file in os.listdir(path) :
        with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
            txt = infile.read()
        df_test.loc[df_test.shape[0],:] = [txt, labels[l],file[:-4].split('_')[1]]


In [5]:
df_test.head()

Unnamed: 0,review,sentiment,rating
0,"Based on an actual story, John Boorman shows t...",1,9
1,This is a gem. As a Film Four production - the...,1,9
2,"I really like this show. It has drama, romance...",1,9
3,This is the best 3-D experience Disney has at ...,1,10
4,"Of the Korean movies I've seen, only three had...",1,10


In [6]:
df_train.head()

Unnamed: 0,review,sentiment,rating
0,For a movie that gets no respect there sure ar...,1,9
1,Bizarre horror movie filled with famous faces ...,1,8
2,"A solid, if unremarkable film. Matthau, as Ein...",1,7
3,It's a strange feeling to sit alone in a theat...,1,8
4,"You probably all already know this by now, but...",1,10


In [81]:
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'<[^>]+>',' ',text, re.UNICODE)
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df_train['Processed_Reviews'] = df_train.review.apply(lambda x: clean_text(x))
df_test['Processed_Reviews'] = df_test.review.apply(lambda x: clean_text(x))


[nltk_data] Downloading package stopwords to /Users/artem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/artem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [82]:
df_train.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews,encoded
0,For a movie that gets no respect there sure ar...,1,9,movie get respect sure lot memorable quote lis...,"[1.0, 7.0, 858.0, 168.0, 65.0, 813.0, 1652.0, ..."
1,Bizarre horror movie filled with famous faces ...,1,8,bizarre horror movie fill famous face steal cr...,"[1015.0, 106.0, 1.0, 568.0, 671.0, 236.0, 556...."
2,"A solid, if unremarkable film. Matthau, as Ein...",1,7,solid unremarkable film matthau einstein wa wo...,"[988.0, 9369.0, 2.0, 2920.0, 5434.0, 0.0, 294...."
3,It's a strange feeling to sit alone in a theat...,1,8,strange feel sit alone theater occupy parent r...,"[546.0, 60.0, 307.0, 536.0, 502.0, 4219.0, 648..."
4,"You probably all already know this by now, but...",1,10,probably already know 5 additional episode nev...,"[142.0, 377.0, 24.0, 622.0, 4624.0, 181.0, 47...."


In [83]:
df_test.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews
0,"Based on an actual story, John Boorman shows t...",1,9,base actual story john boorman show struggle a...
1,This is a gem. As a Film Four production - the...,1,9,gem film four production anticipate quality w...
2,"I really like this show. It has drama, romance...",1,9,really like show ha drama romance comedy roll ...
3,This is the best 3-D experience Disney has at ...,1,10,best 3d experience disney ha themeparks certai...
4,"Of the Korean movies I've seen, only three had...",1,10,korean movie ive see three really stick first ...


In [54]:
# embedding_vector_size = 256
# w2v = Word2Vec(
#     sentences = df_train['Processed_Reviews'].str.split(),
#     vector_size = embedding_vector_size,
#     min_count=10, window=3, workers=4)
# w2v.save('w2vmodel.pth')

In [58]:
w2v = Word2Vec.load('w2vmodel.pth')
w2v.build_vocab([["UNK"]], update=True)

unk_vector = w2v.wv.vectors.mean(axis=0)
w2v.wv["UNK"] = unk_vector

In [59]:
w2v.wv.most_similar(positive=['bad'])[0]

('awful', 0.6712296009063721)

In [60]:
w2v.wv.vectors.shape

(18749, 256)

In [64]:
word2idx = {word: idx for idx, word in enumerate(w2v.wv.index_to_key)}

In [69]:
word2idx['wa']

0

In [96]:
def encode(sen):
    return np.array([word2idx[word] if word in word2idx else word2idx['UNK'] for word in sen.split()],dtype='int')
def add_pads(nested):
    max_length = max(len(x) for x in nested)
    padded_array = np.array([np.pad(x, (0, max_length - len(x)), mode='constant') for x in nested])
    return padded_array

In [97]:
df_train['encoded'] = df_train.Processed_Reviews.apply(lambda x: encode(x))
df_test['encoded'] = df_test.Processed_Reviews.apply(lambda x: encode(x))

In [98]:
df_train.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews,encoded
0,For a movie that gets no respect there sure ar...,1,9,movie get respect sure lot memorable quote lis...,"[1, 7, 858, 168, 65, 813, 1652, 721, 1233, 601..."
1,Bizarre horror movie filled with famous faces ...,1,8,bizarre horror movie fill famous face steal cr...,"[1015, 106, 1, 568, 671, 236, 556, 9036, 1650,..."
2,"A solid, if unremarkable film. Matthau, as Ein...",1,7,solid unremarkable film matthau einstein wa wo...,"[988, 9369, 2, 2920, 5434, 0, 294, 355, 62, 41..."
3,It's a strange feeling to sit alone in a theat...,1,8,strange feel sit alone theater occupy parent r...,"[546, 60, 307, 536, 502, 4219, 648, 17639, 144..."
4,"You probably all already know this by now, but...",1,10,probably already know 5 additional episode nev...,"[142, 377, 24, 622, 4624, 181, 47, 700, 229, 1..."


In [99]:
df_test.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews,encoded
0,"Based on an actual story, John Boorman shows t...",1,9,base actual story john boorman show struggle a...,"[306, 626, 17, 201, 14930, 19, 725, 184, 822, ..."
1,This is a gem. As a Film Four production - the...,1,9,gem film four production anticipate quality w...,"[1233, 2, 586, 266, 3677, 364, 0, 845, 497, 94..."
2,"I really like this show. It has drama, romance...",1,9,really like show ha drama romance comedy roll ...,"[18, 5, 19, 8, 375, 743, 117, 866, 3, 6058, 57..."
3,This is the best 3-D experience Disney has at ...,1,10,best 3d experience disney ha themeparks certai...,"[53, 3415, 369, 718, 8, 18748, 330, 59, 115, 2..."
4,"Of the Korean movies I've seen, only three had...",1,10,korean movie ive see three really stick first ...,"[2809, 1, 163, 6, 196, 18, 514, 29, 223, 106, ..."


In [100]:
nested = df_train['encoded'].to_numpy()
nested_test = df_test['encoded'].to_numpy()

In [106]:
X = torch.tensor(add_pads(nested),dtype=torch.long)
y = torch.FloatTensor(df_train['sentiment'])
X_test = torch.tensor(add_pads(nested_test),dtype=torch.long)
y_test = torch.FloatTensor(df_test['sentiment'])


In [107]:
w2v.wv.most_similar(positive=['movie'])

[('film', 0.8062955737113953),
 ('flick', 0.6898990273475647),
 ('movie.', 0.637246310710907),
 ('sequel', 0.6307896971702576),
 ('movies.', 0.6298549175262451),
 ('movie,', 0.5976845622062683),
 ('product', 0.590249240398407),
 ('anyway', 0.5744838118553162),
 ('however', 0.5639763474464417),
 ('rubbish', 0.5577197670936584)]

In [140]:
device = "cuda" if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [141]:
from torch.utils.data import Dataset,DataLoader

class ImdbDataset(Dataset):
    def __init__(self,X,y):
        self.X = X.to(device)
        self.y = y.to(device)
        self.n_samples = self.X.shape[0]

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [142]:
dataset = ImdbDataset(X,y)
test_dataset = ImdbDataset(X_test,y_test)

In [143]:
dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=256)

In [144]:
w2v.vector_size
pretrained_embeddings = torch.FloatTensor(w2v.wv.vectors)

In [145]:
class CNNModel(nn.Module):
    def __init__(self,embedding_matrix, hidden_size):
        super().__init__()
        self.embeddings = nn.Embedding.from_pretrained(embedding_matrix)
        self.cnn = nn.Sequential(
            nn.Conv1d(embedding_matrix.shape[1], hidden_size, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Conv1d(hidden_size,hidden_size,kernel_size=3,padding=1, stride=2),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten()
        )
        self.clf = nn.Sequential(
            nn.Linear(hidden_size,20),
            nn.Linear(20,1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embeddings(x)
        x = x.permute(0,2,1)
        x = self.cnn(x)
        predictions = self.clf(x)
        return predictions

In [146]:
model = CNNModel(pretrained_embeddings,50).to(device)
loss = nn.BCELoss()
op = optim.Adam(model.parameters(), lr=1e-2)
n_epoch = 10

In [137]:
from tqdm.notebook import trange, tqdm
def training(model, loss, op, num_epochs, train_dataloader, test_dataloader, max_grad_norm=2):
  for epoch in trange(num_epochs):
      model.train()
      num_iter=0
      for X,y in tqdm(train_dataloader,total=len(train_dataloader)):
        op.zero_grad()
        y.unsqueeze_(1)
        y_pred = model.forward(X)
        curr_loss = loss(y_pred,y)
        curr_loss.backward()
        if max_grad_norm is not None:
           torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        op.step()
        num_iter+=1
      valid_loss = 0
      valid_acc = 0
      num_iter = 0
      model.eval()
      with torch.no_grad():
          correct = 0
          num_obj = 0
          for X,y in tqdm(test_dataloader,total=len(test_dataloader)):            
            y.unsqueeze_(1)
            y_pred = model.forward(X)
            valid_loss += loss(y_pred,y)
            correct += (y == y_pred.argmax(-1).float().mean()).sum()
            num_obj = X.shape[0]
            num_iter+=1
      print(f"Valid loss: {valid_loss/num_iter}, accuracy: {correct/num_obj}")



In [138]:
training(model,loss,op,n_epoch,dataloader,test_dataloader)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Valid loss: 0.4120892584323883, accuracy: 74.4047622680664
