In [None]:
import numpy as np
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.utils.data import random_split, DataLoader, Dataset
from torch.autograd import Variable

import pandas as pd 
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re

In [None]:
# Location of the pre-trained GloVe text file (50-dimensional vectors, going by the file name).
glove_path = "glove/glove.6B.50d.txt"


def load_glove_embeddings(file_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace. Besides the words found in the file,
    the result contains a padding entry under the key ``' '`` whose vector is
    all zeros.

    Args:
        file_path: path to the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each word (and the ``' '`` padding key) to a 1-D
        ``float32`` numpy array. The padding entry is the first key.
    """
    vectors = {}
    embedding_dim = None

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            vector = np.asarray(values[1:], dtype='float32')
            if embedding_dim is None:
                # The first vector fixes the dimensionality used for the padding row.
                embedding_dim = vector.shape[0]
            vectors[values[0]] = vector

    if embedding_dim is None:
        # Empty file: keep the historical 50-dimensional padding vector.
        embedding_dim = 50

    # The padding entry is sized to match the file instead of being hard-coded to 50,
    # so 100d/200d/300d GloVe files produce a consistent table.
    embedded_words = {' ': np.zeros(embedding_dim, dtype='float32')}
    embedded_words.update(vectors)
    return embedded_words

# Load the word -> vector table once; the vocabulary index and the embedding matrix are built from it.
embedded_words = load_glove_embeddings(glove_path)


In [None]:
# Assign every word an integer id following the sorted order of the keys.
# NOTE(review): sentence_to_index_tokens pads with id 0, which presumably is the ' '
# padding key because it sorts before every GloVe token — confirm for the file in use.
words_index = {word:index for index, word in enumerate(sorted(embedded_words.keys()))}

In [None]:
# Read the IMDB reviews and derive a binary target: 1 for 'positive', 0 for anything else.
df = pd.read_csv('./data/IMDB Dataset.csv')
df['label'] = df['sentiment'].eq('positive').astype('int64')
df.head()

In [None]:
# Plain Python lists of the review texts and their 0/1 labels, in row order.
sentences, labels = df['review'].tolist(), df['label'].tolist()


In [None]:
# Make NLTK search the local ./stopwords directory and download the stopword corpus into it.
nltk.data.path.append("./stopwords")
nltk.download('stopwords', download_dir="./stopwords")
# English stopwords as a set; sentence_to_index_tokens uses it for membership tests.
stop_words = set(stopwords.words('english'))

def sentence_to_index_tokens(sentence, words_index, max_len = 15000):
    """Turn raw text into a fixed-length list of vocabulary ids.

    The text is lower-cased; apostrophes, short angle-bracket tags (at most
    five characters between ``<`` and ``>``) and every character that is not a
    letter or whitespace are replaced by spaces. The result is split on
    whitespace, and words that are in the module-level ``stop_words`` set or
    missing from ``words_index`` are dropped.

    Args:
        sentence: text to encode.
        words_index: mapping from word to integer id.
        max_len: length of the returned list.

    Returns:
        list of ids, right-padded with 0 or truncated so that it has
        ``max_len`` entries.
    """
    cleaned = re.sub(r'[\']', ' ', sentence.lower())
    cleaned = re.sub(r'<[^>]{0,5}>|[^a-zA-Z\s]', ' ', cleaned)

    ids = []
    for word in cleaned.split():
        if word in stop_words:
            continue
        if word not in words_index:
            continue
        ids.append(words_index[word])

    # Right-pad short sequences with id 0, then cut long ones down to size.
    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids.extend([0] * shortfall)
    return ids[:max_len]

def preprocess_training_data(sentences, labels, words_index, max_len = 15000):
    """Encode every (sentence, label) pair as a pair of tensors.

    Args:
        sentences: iterable of raw texts.
        labels: iterable of labels, paired with ``sentences`` by position.
        words_index: mapping from word to integer id, forwarded to
            ``sentence_to_index_tokens``.
        max_len: sequence length forwarded to ``sentence_to_index_tokens``.

    Returns:
        list of ``(token_id_tensor, label_tensor)`` tuples in input order.
    """
    return [
        (
            torch.tensor(sentence_to_index_tokens(text, words_index, max_len=max_len)),
            torch.tensor(target),
        )
        for text, target in zip(sentences, labels)
    ]
        
# Every review is padded or truncated to this many token ids.
MAX_LEN = 1000
# List of (token_id_tensor, label_tensor) pairs, one per review.
data = preprocess_training_data(sentences, labels, words_index, max_len=MAX_LEN)


In [None]:
def create_embedding_matrix(embedded_words, words_index):
    """Stack word vectors into a matrix whose row ``i`` belongs to the word with id ``i``.

    Args:
        embedded_words: mapping from word to its 1-D embedding vector.
        words_index: mapping from word to row index in the result.

    Returns:
        ``float32`` tensor of shape ``(len(words_index), embedding_dim)``.
        Rows of words missing from ``embedded_words`` stay all-zero.
    """
    # Take the width from the vectors themselves rather than assuming 50 dimensions,
    # so the function works with any GloVe variant. 50 is kept as the fallback when
    # there is no vector to inspect.
    first_vector = next(iter(embedded_words.values()), None)
    embedding_dim = 50 if first_vector is None else len(first_vector)

    embedding_matrix = np.zeros((len(words_index), embedding_dim), dtype='float32')
    for word, i in words_index.items():
        embedding_vector = embedded_words.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding table keep their all-zero row.
            embedding_matrix[i] = embedding_vector

    return torch.tensor(embedding_matrix)
    
# Tensor of pre-trained vectors, one row per entry of words_index.
embedding_matrix = create_embedding_matrix(embedded_words, words_index)



In [None]:
# Sanity check: element type and (rows, columns) of the embedding matrix, one per line.
print(embedding_matrix.dtype, embedding_matrix.shape, sep='\n')

In [None]:
# 80/20 train/test split; the remainder goes to the test set so the two lengths sum to len(data).
train_len = int(0.8 * len(data))
test_len = len(data) - train_len
BATCH_SIZE = 32
# A fixed generator seed keeps the split the same across runs.
train, test = random_split(data, [train_len, test_len], generator=torch.Generator().manual_seed(77))
# Only the training loader shuffles its samples.
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test, batch_size=BATCH_SIZE)


In [None]:
# Stand-alone frozen embedding layer backed by the pre-trained matrix.
# from_pretrained takes its size from the matrix, so no vocabulary size or dimension
# is hard-coded and no randomly initialised table is allocated only to be replaced.
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)



In [None]:
# Sanity check: embed the token ids of the first review and display the resulting shape.
embedding_layer(data[0][0]).shape

In [None]:
class LSTMModel(nn.Module):
    """LSTM classifier: embedding -> (bi)LSTM -> dropout -> linear layer producing logits.

    Args:
        vocab_size: number of embedding rows; used only when ``embedding_matrix`` is None.
        embedding_dim: embedding width; used only when ``embedding_matrix`` is None.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per sample.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the final hidden state before the
            linear layer (the LSTM itself is built without inter-layer dropout).
        embedding_matrix: optional 2-D tensor of pre-trained embeddings. When given,
            it defines the embedding table (size and width) and is kept frozen.
        batch_first: forwarded to ``nn.LSTM``.
        device: device on which the layers are created.

    Attributes:
        loss: ``nn.BCEWithLogitsLoss`` instance, so ``forward`` returns raw logits.
    """

    def __init__(self, vocab_size = 400001,
                embedding_dim = 50,
                hidden_dim = 256,
                output_dim = 1,
                n_layers = 2,
                bidirectional = True,
                dropout = 0.3,
                embedding_matrix = None,
                batch_first = True,
                device = 'cpu'):
        super(LSTMModel, self).__init__()

        if embedding_matrix is not None:
            # Size the table from the matrix itself so the LSTM input width always
            # matches it, and avoid allocating a random table that is thrown away.
            embedding_dim = embedding_matrix.shape[1]
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix.to(device), freeze=True)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, device=device)

        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, device=device, batch_first=batch_first)

        # A bidirectional LSTM contributes one final hidden state per direction.
        fc_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(fc_in, output_dim, device=device)

        self.loss = nn.BCEWithLogitsLoss()

        self.device = device
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids for a batch of sequences.

        Returns:
            Tensor of shape ``(batch,)`` when ``output_dim == 1``, otherwise
            ``(batch, output_dim)``. The batch dimension is kept even for a
            single-sample batch.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.rnn(embedded)
        if self.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        hidden = self.dropout(hidden)

        logits = self.fc(hidden)

        # Drop only the trailing singleton logit dimension. A bare squeeze() would also
        # drop the batch dimension for a batch of one, which no longer matches the
        # shape of the targets passed to the loss.
        return logits.squeeze(-1)

    

In [None]:
   

def fit(model, data, device='cpu', epochs=10, lr=0.001):
    """Train ``model`` with Adam and print the running training accuracy.

    Args:
        model: module whose call returns one logit per sample and which exposes a
            ``loss`` callable taking ``(logits, targets)``.
        data: iterable of ``(x_batch, y_batch)`` pairs; iterated once per epoch.
        device: device the batches are moved to before the forward pass.
        epochs: number of passes over ``data``.
        lr: learning rate for Adam.

    Side effects:
        Prints a progress line every 50 batches and saves the model's
        ``state_dict`` to ``lstm-<epoch>.pth`` after every 5th epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()

    for e in range(epochs):
        correct = 0
        seen = 0
        for i, (x_batch, y_batch) in enumerate(data):
            x = x_batch.to(device)
            y = y_batch.float().to(device)

            optimizer.zero_grad()

            y_pred = model(x)

            loss = model.loss(y_pred, y)
            loss.backward()
            optimizer.step()

            # Accuracy bookkeeping needs no gradients, so work on a detached copy.
            predicted = torch.round(torch.sigmoid(y_pred.detach()))
            correct += (predicted == y).sum().item()
            # Count the samples actually seen so partial batches do not skew accuracy.
            seen += y.numel()

            if i % 50 == 0:
                print("{:<15} {:<15} {:<30} {:<30}".format("Epoch: " + str(e), "| Batch: " + str(i), "| Loss: " + str(loss.item()), "| accuracy: " + str(correct / seen)))
        if (e + 1) % 5 == 0:
            torch.save(model.state_dict(), 'lstm-'+str(e+1)+'.pth')


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Two-layer model initialised with the pre-trained embedding matrix.
lstm = LSTMModel(n_layers=2, embedding_matrix=embedding_matrix, device = device)
print("Training on",device)
fit(lstm, train_loader, device)

In [None]:
# Rebuild the architecture and restore the weights saved after epoch 10.
lstm = LSTMModel(n_layers=2, embedding_matrix=embedding_matrix, device = device)
# map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
lstm.load_state_dict(torch.load("./lstm-10.pth", map_location=device))

In [None]:
def eval(model, test, device = 'cpu'):
    """Return the classification accuracy of ``model`` over the batches in ``test``.

    Note: this function shadows Python's built-in ``eval`` in this namespace.

    Args:
        model: module whose call returns one logit per sample.
        test: iterable of ``(x, y)`` batches with 0/1 labels.
        device: device the batches are moved to before the forward pass.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0`` when
        ``test`` yields no samples.
    """
    model.eval()
    correct = 0
    n = 0
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for x, y in test:
            x = x.to(device)
            y = y.float().to(device)
            predicted = torch.round(torch.sigmoid(model(x)))
            correct += (predicted == y).sum().item()
            # Count real samples; the last batch may be smaller than the batch size.
            n += y.numel()

    return correct / n if n else 0.0
        


In [None]:
# Accuracy of the reloaded model on the held-out test batches.
print(eval(lstm, test_loader, device))

In [None]:
# Score a hand-written review with the trained model.
a = "Today I'm really exicited about this. I love it. In my whole time in the cinema, I can't stop laughing. This is the best movie I have ever seen. Thanks"
# Encode with the sequence length used for training (MAX_LEN), not the function's
# default of 15000, so the input matches what the model was trained on.
s = sentence_to_index_tokens(a, words_index, max_len=MAX_LEN)

lstm.eval()  # make sure dropout is disabled for inference
with torch.no_grad():
    prob = torch.sigmoid(lstm(torch.tensor([s], device=device)))
if(prob >= 0.5):
    print("Good:", prob.item())
else:
    print("Bad:", 1 - prob.item())