In [None]:
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from gensim.models import Word2Vec, Phrases
import numpy as np
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
import gensim

In [None]:
!ls

sample_data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip drive/MyDrive/data/Архив.zip

In [None]:
!ls

aclImdb  drive	__MACOSX  sample_data  w2vmodel.pth


In [None]:
# Root directory of the extracted dataset; the loading cell below joins it with
# a split name ('train'/'test') and a class directory name ('pos'/'neg').
folder = './aclImdb'
# Maps the class directory name to the binary sentiment label stored in the frames.
labels = {'pos': 1, 'neg': 0}

In [None]:
def load_reviews(split):
    """Read every review of one dataset split into a DataFrame.

    Walks ``<folder>/<split>/pos`` and ``<folder>/<split>/neg`` (``folder`` and
    ``labels`` are the notebook-level settings defined above) and collects one
    record per ``.txt`` file.

    Args:
        split: Name of the split sub-directory, e.g. ``'train'`` or ``'test'``.

    Returns:
        DataFrame with columns ``review`` (raw text), ``sentiment`` (value from
        ``labels``) and ``rating`` (int parsed from the file name).
    """
    records = []
    for label_name in ('pos', 'neg'):
        label_dir = os.path.join(folder, split, label_name)
        for file_name in os.listdir(label_dir):
            # Skip stray non-review files (e.g. archive metadata) instead of
            # trying to parse them as reviews.
            if not file_name.endswith('.txt'):
                continue
            with open(os.path.join(label_dir, file_name), 'r', encoding='utf-8') as infile:
                review = infile.read()
            # The second '_'-separated field of the file stem is used as the rating.
            rating = int(os.path.splitext(file_name)[0].split('_')[1])
            records.append((review, labels[label_name], rating))
    # Build the frame once: appending row by row with df.loc is quadratic.
    return pd.DataFrame(records, columns=['review', 'sentiment', 'rating'])


df_train = load_reviews('train')
df_test = load_reviews('test')


In [None]:
df_test.head()

Unnamed: 0,review,sentiment,rating
0,"In this unlikely love triangle, set in 19th ce...",1,10
1,"Shame, is a Swedish film in Swedish with Engli...",1,7
2,When I tuned in to my local PBS station last n...,1,9
3,"I love horror films, but I think they work way...",1,9
4,The first time I ever saw this movie was when ...,1,8


In [None]:
df_train.head()

Unnamed: 0,review,sentiment,rating
0,"Utterly tactical, strange (watch for the kinky...",1,7
1,"First things first, I was never once scared of...",1,10
2,This movie is one of the masterpieces from Mr....,1,10
3,Jack Lemmon and Walter Matthau began and endin...,1,7
4,Jörg Buttgereit goes a bit too far with his mo...,1,7


In [None]:
# Fetch the NLTK corpora used by the stop-word list and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# A set gives constant-time membership tests in clean_text's per-token filter.
stop_words = set(stopwords.words("english")) 
# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps: replace HTML tags with a space, drop every character that is
    neither a word character nor whitespace, lowercase, split on any
    whitespace, remove stop words, lemmatize each token as a noun and then
    as a verb, and remove stop words again.

    Relies on the notebook-level ``lemmatizer`` and ``stop_words``.

    Args:
        text: Raw review text (may contain HTML such as ``<br />``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # `flags=` must be passed by keyword: the 4th positional argument of
    # re.sub is `count`, so passing re.UNICODE there silently limited the
    # substitution to the first 32 matches.
    text = re.sub(r'<[^>]+>', ' ', text, flags=re.UNICODE)
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    # split() without arguments also splits on newlines/tabs and never
    # yields empty tokens, unlike split(" ").
    tokens = text.lower().split()
    # Filter stop words BEFORE lemmatizing, otherwise the noun lemmatizer
    # turns e.g. "was" into "wa", which no longer matches the stop list.
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(token), "v") for token in tokens]
    # Lemmatization can map a token onto a stop word, so filter once more.
    tokens = [token for token in tokens if token not in stop_words]
    return " ".join(tokens)

# Store the cleaned text next to the raw review in both splits.
for frame in (df_train, df_test):
    frame['Processed_Reviews'] = frame['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
df_train.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews
0,"Utterly tactical, strange (watch for the kinky...",1,7,utterly tactical strange watch kinky moment dr...
1,"First things first, I was never once scared of...",1,10,first thing first wa never scar underrate gem ...
2,This movie is one of the masterpieces from Mr....,1,10,movie one masterpiece mr antonioni youth distr...
3,Jack Lemmon and Walter Matthau began and endin...,1,7,jack lemmon walter matthau begin end career to...
4,Jörg Buttgereit goes a bit too far with his mo...,1,7,jörg buttgereit go bite far movie theme time e...


In [None]:
df_test.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews
0,"In this unlikely love triangle, set in 19th ce...",1,10,unlikely love triangle set 19th century italy ...
1,"Shame, is a Swedish film in Swedish with Engli...",1,7,shame swedish film swedish english subtitle fi...
2,When I tuned in to my local PBS station last n...,1,9,tune local pb station last night watch war mak...
3,"I love horror films, but I think they work way...",1,9,love horror film think work way better hide dr...
4,The first time I ever saw this movie was when ...,1,8,first time ever saw movie wa wa four year old ...


In [None]:
# Dimensionality of the word vectors learned below.
embedding_vector_size = 256
# Train Word2Vec on the whitespace-tokenized cleaned training reviews only.
# Words occurring fewer than 10 times are dropped (min_count); context window is 3.
# NOTE(review): `size=` is the gensim<4 spelling of this keyword; newer gensim
# names it `vector_size` — confirm against the installed version.
w2v = Word2Vec(
    sentences = df_train['Processed_Reviews'].str.split(),
    size = embedding_vector_size,
    min_count=10, window=3, workers=4)
# w2v.save('w2vmodel.pth')

In [None]:
# w2v = Word2Vec.load('w2vmodel.pth')
# Ask the model to extend its vocabulary with an "UNK" placeholder token.
# NOTE(review): "UNK" occurs once here while the model was built with
# min_count=10 — whether this call alone registers the token depends on the
# gensim version; TODO confirm.
w2v.build_vocab([["UNK"]], update=True)

# Use the mean of all current word vectors as the embedding for unknown words.
unk_vector = w2v.wv.vectors.mean(axis=0)
w2v.wv["UNK"] = unk_vector

In [None]:
w2v.wv.most_similar(positive=['bad'])[0]

('awful', 0.667884349822998)

In [None]:
w2v.wv.vectors.shape

(18749, 256)

In [None]:
# Token -> row index lookup, numbered in the order of the model's word list
# (starting at 0), so indices line up with rows of w2v.wv.vectors.
# NOTE(review): `index2word` is the gensim<4 attribute name — confirm version.
word2idx = {word: idx for idx, word in enumerate(w2v.wv.index2word)}

In [None]:
word2idx['wa']

0

In [None]:
def encode(sen):
    """Convert a whitespace-separated sentence into an integer array of vocabulary ids.

    Tokens missing from the notebook-level ``word2idx`` are mapped to the id of
    the ``'UNK'`` entry.
    """
    ids = []
    for token in sen.split():
        # Fall back to the placeholder key only for out-of-vocabulary tokens.
        key = token if token in word2idx else 'UNK'
        ids.append(word2idx[key])
    return np.array(ids, dtype='int')
def add_pads(nested, pad_value=0):
    """Right-pad variable-length 1-D sequences into one rectangular 2-D array.

    Args:
        nested: Iterable of 1-D integer sequences (e.g. the arrays produced
            by ``encode``).
        pad_value: Value written after the end of each shorter sequence.
            Defaults to 0, as before. NOTE: ``word2idx`` numbers real words
            starting at 0, so with the default the padding is
            indistinguishable from the word at index 0; pass a dedicated
            index to avoid that.

    Returns:
        Array of shape ``(len(nested), longest_sequence_length)``. An empty
        input yields an empty ``(0, 0)`` array instead of raising.
    """
    rows = [np.asarray(row) for row in nested]
    if not rows:
        return np.empty((0, 0), dtype=int)
    max_length = max(len(row) for row in rows)
    # Promote over the distinct row dtypes only (a handful at most), so the
    # result dtype matches what stacking the rows would give.
    dtype = np.result_type(*{row.dtype for row in rows})
    # Allocate once and copy each row in, instead of one np.pad allocation per row.
    padded_array = np.full((len(rows), max_length), pad_value, dtype=dtype)
    for i, row in enumerate(rows):
        padded_array[i, :len(row)] = row
    return padded_array

In [None]:
# Turn every cleaned review into its array of vocabulary ids, for both splits.
for frame in (df_train, df_test):
    frame['encoded'] = frame['Processed_Reviews'].apply(encode)

In [None]:
df_train.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews,encoded
0,"Utterly tactical, strange (watch for the kinky...",1,7,utterly tactical strange watch kinky moment dr...,"[1040, 18748, 546, 12, 8598, 173, 18748, 1335,..."
1,"First things first, I was never once scared of...",1,10,first thing first wa never scar underrate gem ...,"[29, 41, 29, 0, 47, 1244, 1807, 1231, 144, 48,..."
2,This movie is one of the masterpieces from Mr....,1,10,movie one masterpiece mr antonioni youth distr...,"[1, 3, 836, 447, 5798, 1670, 5516, 2543, 9254,..."
3,Jack Lemmon and Walter Matthau began and endin...,1,7,jack lemmon walter matthau begin end career to...,"[589, 3707, 1843, 2907, 123, 33, 526, 227, 222..."
4,Jörg Buttgereit goes a bit too far with his mo...,1,7,jörg buttgereit go bite far movie theme time e...,"[18748, 12134, 11, 99, 143, 1, 425, 10, 14, 10..."


In [None]:
df_test.head()

Unnamed: 0,review,sentiment,rating,Processed_Reviews,encoded
0,"In this unlikely love triangle, set in 19th ce...",1,10,unlikely love triangle set 19th century italy ...,"[2023, 37, 5331, 91, 4685, 973, 2784, 797, 239..."
1,"Shame, is a Swedish film in Swedish with Engli...",1,7,shame swedish film swedish english subtitle fi...,"[809, 3347, 2, 3347, 555, 1942, 2, 527, 232, 1..."
2,When I tuned in to my local PBS station last n...,1,9,tune local pb station last night watch war mak...,"[1557, 511, 8886, 1367, 128, 219, 12, 217, 4, ..."
3,"I love horror films, but I think they work way...",1,9,love horror film think work way better hide dr...,"[37, 106, 2, 16, 49, 39, 59, 659, 781, 1324, 4..."
4,The first time I ever saw this movie was when ...,1,8,first time ever saw movie wa wa four year old ...,"[29, 10, 56, 120, 1, 0, 0, 587, 54, 79, 222, 3..."


In [None]:
# Pull the per-review id arrays out of the frames so they can be padded.
# (presumably object arrays holding one variable-length int array per review)
nested = df_train['encoded'].to_numpy()
nested_test = df_test['encoded'].to_numpy()

In [None]:
# Pad each split to its own longest review and convert to long tensors, the
# index dtype used for embedding lookups. add_pads takes the maximum length
# of the sequences it is given, so X and X_test may differ in width.
X = torch.tensor(add_pads(nested),dtype=torch.long)
X_test = torch.tensor(add_pads(nested_test),dtype=torch.long)



In [None]:
w2v.wv.most_similar(positive=['movie'])

[('film', 0.8296576738357544),
 ('flick', 0.7181382179260254),
 ('movie.', 0.6490473747253418),
 ('sequel', 0.636115550994873),
 ('movies.', 0.6224300861358643),
 ('movie,', 0.6095897555351257),
 ('rubbish', 0.599772036075592),
 ('anyway', 0.5797839760780334),
 ('blockbuster', 0.5771593451499939),
 ('however', 0.5702927708625793)]

In [None]:
from torch.utils.data import Dataset,DataLoader

class ImdbDataset(Dataset):
    """In-memory dataset of (token-id tensor, target) pairs.

    Both tensors are moved to the target device once, at construction time,
    so ``__getitem__`` only indexes tensors that already live there.

    Args:
        X: Tensor whose first dimension indexes samples.
        y: Tensor of targets with the same first-dimension length as ``X``.
        device: Device to hold the data on. When omitted, CUDA is used if
            available and the CPU otherwise, so the class no longer depends
            on a notebook-level ``device`` variable defined in a later cell.

    Raises:
        ValueError: If ``X`` and ``y`` contain a different number of samples.
    """

    def __init__(self, X, y, device=None):
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}"
            )
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.X = X.to(device)
        self.y = y.to(device)
        self.n_samples = self.X.shape[0]

    def __len__(self):
        """Number of samples."""
        return self.n_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
# NOTE(review): bare attribute access; it is not the last expression of the
# cell, so nothing is displayed and the line has no effect.
w2v.vector_size
# Copy the Word2Vec weight matrix into a float tensor; the CNN models below
# load it into their embedding layers.
pretrained_embeddings = torch.FloatTensor(w2v.wv.vectors)

# Sentiment classification

In [None]:
# Binary sentiment labels as float tensors (the loss used below is nn.BCELoss,
# which works on float targets).
y = torch.FloatTensor(df_train['sentiment'])
y_test = torch.FloatTensor(df_test['sentiment'])

In [None]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Wrap the padded id tensors and sentiment labels; ImdbDataset moves both to `device`.
dataset = ImdbDataset(X,y)
test_dataset = ImdbDataset(X_test,y_test)

In [None]:
# Mini-batches of 256 reviews; only the training loader shuffles.
dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=256)

In [None]:
class CNNModel1(nn.Module):
    """1-D CNN binary classifier on top of frozen pretrained word embeddings.

    Pipeline: embedding lookup -> two stride-2 convolutions with ReLU ->
    global average pooling -> two-layer MLP ending in a sigmoid, giving one
    probability per input sequence.

    Args:
        hidden_size: Number of output channels of both convolutions.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Embedding table comes from the notebook-level `pretrained_embeddings`
        # and is not updated during training (freeze=True).
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        embedding_dim = pretrained_embeddings.shape[1]

        feature_layers = [
            nn.Conv1d(embedding_dim, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.ReLU(),
            # Collapse the remaining sequence axis to a single averaged position.
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*feature_layers)

        head_layers = [
            nn.Linear(hidden_size, 30),
            nn.ReLU(),
            nn.Linear(30, 1),
            nn.Sigmoid(),
        ]
        self.clf = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a batch of token-id sequences to one probability per sequence."""
        embedded = self.embeddings(x)
        # Conv1d convolves over the last axis, so move the embedding axis
        # into the channel position.
        channels_first = embedded.permute(0, 2, 1)
        features = self.cnn(channels_first)
        return self.clf(features)

In [None]:
# Sentiment classifier with 50 convolution channels, placed on the selected device.
model1 = CNNModel1(50).to(device)
# BCELoss takes probabilities, which CNNModel1 produces through its final Sigmoid.
loss = nn.BCELoss()
op = optim.Adam(model1.parameters(), lr=0.01)
# Number of passes over the training data.
n_epoch = 10

In [None]:
from tqdm.notebook import trange, tqdm
def training(model, loss, op, num_epochs, train_dataloader, test_dataloader, max_grad_norm=2):
    """Train a binary classifier and print validation metrics after each epoch.

    Args:
        model: Module mapping a batch ``X`` to probabilities of shape ``(batch, 1)``.
        loss: Loss callable ``loss(y_pred, y_true)``; assumed to return the
            mean over the batch (the default reduction of ``nn.BCELoss``).
        op: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Sized iterable of ``(X, y)`` batches with ``y`` of shape ``(batch,)``.
        test_dataloader: Iterable of ``(X, y)`` batches used for validation.
        max_grad_norm: Gradient-norm clipping threshold; ``None`` disables clipping.

    Prints one line per epoch: sample-weighted mean validation loss, accuracy
    at a 0.5 threshold, number of correct predictions and number of samples.
    """
    for epoch in trange(num_epochs):
        model.train()
        for X, y in tqdm(train_dataloader, total=len(train_dataloader)):
            op.zero_grad()
            # Out-of-place unsqueeze: never mutate tensors owned by the loader.
            targets = y.unsqueeze(1)
            curr_loss = loss(model(X), targets)
            curr_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            op.step()

        model.eval()
        loss_sum = 0.0
        correct = 0
        num_obj = 0
        with torch.no_grad():
            for X, y in test_dataloader:
                targets = y.unsqueeze(1)
                y_pred = model(X)
                batch_size = targets.shape[0]
                # Weight each batch mean by its size so a smaller final batch
                # does not skew the epoch average.
                loss_sum += loss(y_pred, targets).item() * batch_size
                correct += (targets == (y_pred >= 0.5).long()).sum().item()
                num_obj += batch_size
        # An empty validation loader reports NaN instead of dividing by zero.
        valid_loss = loss_sum / num_obj if num_obj else float("nan")
        accuracy = correct / num_obj if num_obj else float("nan")
        print(f"epoch:{epoch} Valid loss: {valid_loss}, accuracy: {accuracy}", correct, num_obj)



In [None]:
training(model1,loss,op,n_epoch,dataloader,test_dataloader)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

epoch:0 Valid loss: 0.44028592109680176, accuracy: 0.8033199906349182 20083 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:1 Valid loss: 0.3954625427722931, accuracy: 0.8205599784851074 20514 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:2 Valid loss: 0.40746790170669556, accuracy: 0.8078399896621704 20196 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:3 Valid loss: 0.3485051989555359, accuracy: 0.8445199728012085 21113 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:4 Valid loss: 0.3917153775691986, accuracy: 0.8259999752044678 20650 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:5 Valid loss: 0.34992870688438416, accuracy: 0.8461199998855591 21153 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:6 Valid loss: 0.3573841452598572, accuracy: 0.842199981212616 21055 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:7 Valid loss: 0.3480674922466278, accuracy: 0.8489999771118164 21225 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:8 Valid loss: 0.34759533405303955, accuracy: 0.8495599627494812 21239 25000


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:9 Valid loss: 0.35607433319091797, accuracy: 0.8469600081443787 21174 25000


In [None]:
# checkpoint = {
#     "epoch": n_epoch,
#     "model state": model.state_dict(),
#     "optim_state": op.state_dict()
# }
torch.save(model1.state_dict(),"model1.pth")

In [None]:
from google.colab import files
files.download('model1.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
x = """The definition of a vampire is an inhumane corpse supposed to leave its grave at night to drink the blood of the living. Bakjwi nearly nails this concept on the head minus the cliché of pointy fangs and neck biting. Being an R rated movie, I knew this was actually going to pertain to vampires actually being vampires. Which means that the characters in the movie are going to do what vampires actually do without restraint and rightfully lack any glamorous moments in comparison to Twilight. Having viewed Chan-wook Park's preceding Oldboy, I had very high expectations of Bakjwi.<br /><br />I anticipated some awkward plot sequences with our anti-hero, known as Priest Sang-hyeon, and was very impressed by his performance as a holy-man who is forced into this quandary of being humane and obeying his thirst as a vampire. (SPOILER) After the initial premise of him surviving the defective blood transfusion, he starts to crave blood and discovers his super strength and his flying ability. The screen shots do his transition phase without overbearing on exposition. He starts drinking the blood of the dying and those who wish to be euthanized for moral reasons. The oft tragic and dysfunctional love affair the priest has with the manipulative Tae-joo is very riveting as they are played by The Host's Kanh-ho Song and actress OK-vin Kim. The special effects are properly placed in the backdrop and while it doesn't offer anything new in the ways of stunts and CGI, it didn't impose itself into the plot driven and character developed premise. The story and the pivotal plot points are very perverse and grotesque yet very original in its own Korean style. <br /><br />There aren't many negatives I can say about Bakjwi. Sometimes I ask myself if the priests transition phase could have showed more of the priest having an emotional crisis with his transformation, but then again this would have made the movie 3 hours long. The movie was long to begin with. 
On the same token, vampires really don't have much in the way of expressing emotions to begin with. As mentioned before, this movie is very tragic, so don't expect anything hopeful while watching this. <br /><br />Overall, Bakjwi is delightfully dark, morbid and original. I strongly recommend this movie for serious viewers who are past the teenage phase of Twilight. This is definitely the Korean answer to the Swedish Let The Right One In, which is also a good movie."""
# Run the raw review through the same cleaning and encoding used for training,
# then add a batch dimension of 1.
x = clean_text(x)
x = encode(x).reshape(1,-1)
x = torch.tensor(x,dtype=torch.long).to(device)
print(x.shape)
# The trained sentiment classifier is `model1`; no variable named `model` is
# defined in this notebook, so `model.eval()` fails on a fresh kernel.
model1.eval()
with torch.no_grad():
  # Same 0.5 decision threshold (inclusive) as the accuracy computation in training().
  display((model1(x)>=0.5).long())
x = "This movie is great. 80's sleazy slasher movie about three kids born during an eclipse, so they kill everyone they see. The reason they kill makes practically no sense, but it just adds to the charm of this movie. And dang, these kids are crazy, especially Curtis. If you've seen the movie, you know who I am talking about. That kid's vicous! Although the movie doesn't have much gore, it is entertaining, and for some reason you kind of care about the characters. It also has some nice nudity. Has some decent acting as well, really a decent 80's slasher movie, it's worth a look if you ever get the chance to see it. You'll have nightmares about those darn kids though, I guarantee you!"
# Run the raw review through the same cleaning and encoding used for training,
# then add a batch dimension of 1.
x = clean_text(x)
x = encode(x).reshape(1,-1)
x = torch.tensor(x,dtype=torch.long).to(device)
print(x.shape)
# The trained sentiment classifier is `model1`; no variable named `model` is
# defined in this notebook, so `model.eval()` fails on a fresh kernel.
model1.eval()
with torch.no_grad():
  # Same 0.5 decision threshold (inclusive) as the accuracy computation in training().
  display((model1(x)>=0.5).long())

torch.Size([1, 220])


tensor([[0]], device='cuda:0')

torch.Size([1, 68])


tensor([[1]], device='cuda:0')

# Rating regression

In [None]:
# Regression targets: the rating column cast to int and stored as float tensors.
# This rebinds `y` / `y_test`, which previously held the sentiment labels.
y = torch.tensor(df_train['rating'].astype(int),dtype=torch.float)
y_test = torch.tensor(df_test['rating'].astype(int),dtype=torch.float)

In [None]:
# Same device selection as in the sentiment section (repeated so this section
# can be run without that cell).
device = "cuda" if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Rebuild datasets and loaders with the rating targets; the inputs X / X_test
# are unchanged. These names replace the sentiment datasets/loaders.
dataset = ImdbDataset(X,y)
test_dataset = ImdbDataset(X_test,y_test)
dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=256)

In [None]:
class CNNModel2(nn.Module):
    """1-D CNN rating regressor on top of frozen pretrained word embeddings.

    Same feature extractor layout as the sentiment model; the head ends in a
    sigmoid whose output is multiplied by 10, so predictions lie between 0 and 10.

    Args:
        hidden_size: Number of output channels of both convolutions.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Embedding table comes from the notebook-level `pretrained_embeddings`
        # and is kept frozen.
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        embedding_dim = pretrained_embeddings.shape[1]

        feature_layers = [
            nn.Conv1d(embedding_dim, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.ReLU(),
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1, stride=2),
            nn.ReLU(),
            # Collapse the remaining sequence axis to a single averaged position.
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*feature_layers)

        head_layers = [
            nn.Linear(hidden_size, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.Sigmoid(),
        ]
        self.clf = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a batch of token-id sequences to one predicted rating per sequence."""
        embedded = self.embeddings(x)
        # Conv1d convolves over the last axis, so move the embedding axis
        # into the channel position.
        features = self.cnn(embedded.permute(0, 2, 1))
        # Stretch the sigmoid's unit interval to the 0..10 range.
        return self.clf(features) * 10

In [None]:
# Rating regressor with 50 convolution channels, placed on the selected device.
model2 = CNNModel2(50).to(device)
# Mean squared error between predicted and true rating; rebinds `loss`, `op`
# and `n_epoch` from the sentiment section.
loss = nn.MSELoss()
op = optim.Adam(model2.parameters(), lr=0.01)
n_epoch = 15

In [None]:
from tqdm.notebook import trange, tqdm
def training(model, loss, op, num_epochs, train_dataloader, test_dataloader, max_grad_norm=2):
    """Train a regression model and print the validation loss after each epoch.

    NOTE: this redefines (and therefore shadows) the ``training`` function
    defined earlier in the notebook; this variant reports no accuracy.

    Args:
        model: Module mapping a batch ``X`` to predictions of shape ``(batch, 1)``.
        loss: Loss callable ``loss(y_pred, y_true)``; assumed to return the
            mean over the batch (the default reduction of ``nn.MSELoss``).
        op: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Sized iterable of ``(X, y)`` batches with ``y`` of shape ``(batch,)``.
        test_dataloader: Iterable of ``(X, y)`` batches used for validation.
        max_grad_norm: Gradient-norm clipping threshold; ``None`` disables clipping.
    """
    for epoch in trange(num_epochs):
        model.train()
        for X, y in tqdm(train_dataloader, total=len(train_dataloader)):
            op.zero_grad()
            # Out-of-place unsqueeze: never mutate tensors owned by the loader.
            targets = y.unsqueeze(1)
            curr_loss = loss(model(X), targets)
            curr_loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            op.step()

        model.eval()
        loss_sum = 0.0
        num_obj = 0
        with torch.no_grad():
            for X, y in test_dataloader:
                targets = y.unsqueeze(1)
                batch_size = targets.shape[0]
                # Weight each batch mean by its size so a smaller final batch
                # does not skew the epoch average.
                loss_sum += loss(model(X), targets).item() * batch_size
                num_obj += batch_size
        # An empty validation loader reports NaN instead of dividing by zero.
        valid_loss = loss_sum / num_obj if num_obj else float("nan")
        print(f"epoch:{epoch} Valid loss: {valid_loss}")

In [None]:
training(model2,loss,op,n_epoch,dataloader,test_dataloader)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

epoch:0 Valid loss: 8.539665222167969


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:1 Valid loss: 5.963172435760498


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:2 Valid loss: 5.401767730712891


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:3 Valid loss: 5.674729347229004


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:4 Valid loss: 4.968267440795898


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:5 Valid loss: 4.98557710647583


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:6 Valid loss: 5.340519905090332


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:7 Valid loss: 4.833836555480957


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:8 Valid loss: 4.867684841156006


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:9 Valid loss: 5.6326904296875


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:10 Valid loss: 5.1831159591674805


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:11 Valid loss: 5.400252342224121


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:12 Valid loss: 5.581233978271484


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:13 Valid loss: 5.6558661460876465


  0%|          | 0/98 [00:00<?, ?it/s]

epoch:14 Valid loss: 5.537932395935059


In [None]:
# Predict the rating of a single raw review with the trained regressor.
model2.eval()
with torch.no_grad():
  x = """The definition of a vampire is an inhumane corpse supposed to leave its grave at night to drink the blood of the living. Bakjwi nearly nails this concept on the head minus the cliché of pointy fangs and neck biting. Being an R rated movie, I knew this was actually going to pertain to vampires actually being vampires. Which means that the characters in the movie are going to do what vampires actually do without restraint and rightfully lack any glamorous moments in comparison to Twilight. Having viewed Chan-wook Park's preceding Oldboy, I had very high expectations of Bakjwi.<br /><br />I anticipated some awkward plot sequences with our anti-hero, known as Priest Sang-hyeon, and was very impressed by his performance as a holy-man who is forced into this quandary of being humane and obeying his thirst as a vampire. (SPOILER) After the initial premise of him surviving the defective blood transfusion, he starts to crave blood and discovers his super strength and his flying ability. The screen shots do his transition phase without overbearing on exposition. He starts drinking the blood of the dying and those who wish to be euthanized for moral reasons. The oft tragic and dysfunctional love affair the priest has with the manipulative Tae-joo is very riveting as they are played by The Host's Kanh-ho Song and actress OK-vin Kim. The special effects are properly placed in the backdrop and while it doesn't offer anything new in the ways of stunts and CGI, it didn't impose itself into the plot driven and character developed premise. The story and the pivotal plot points are very perverse and grotesque yet very original in its own Korean style. <br /><br />There aren't many negatives I can say about Bakjwi. Sometimes I ask myself if the priests transition phase could have showed more of the priest having an emotional crisis with his transformation, but then again this would have made the movie 3 hours long. The movie was long to begin with. 
On the same token, vampires really don't have much in the way of expressing emotions to begin with. As mentioned before, this movie is very tragic, so don't expect anything hopeful while watching this. <br /><br />Overall, Bakjwi is delightfully dark, morbid and original. I strongly recommend this movie for serious viewers who are past the teenage phase of Twilight. This is definitely the Korean answer to the Swedish Let The Right One In, which is also a good movie."""
  # Same cleaning and encoding as the training data, plus a batch dimension of 1.
  x = clean_text(x)
  x = encode(x).reshape(1,-1)
  x = torch.tensor(x,dtype=torch.long).to(device)
  res = torch.round(model2(x).to('cpu'))
  # The model outputs sigmoid * 10, so rounding can produce 0; clamp both ends
  # rather than only the upper one.
  # NOTE(review): assumes the valid rating scale is 1..10 — confirm against the dataset.
  display(torch.clamp(res, min=1, max=10))

tensor([[4.]])

In [None]:

torch.save(model2.state_dict(),"model2.pth")
from google.colab import files
files.download('model2.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
torch.save(pretrained_embeddings,"embeddings.pth")
files.download('embeddings.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>