In [6]:
import pandas as pd
from string import punctuation
import numpy as np
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import json

In [7]:
with open("sentiment.txt") as f:
    reviews = f.read()
    
data = pd.DataFrame([review.split('\t') for review in reviews.split('\n')])

data.columns = ['Review','Sentiment']

data = data.sample(frac=1)

In [8]:
data.head()

Unnamed: 0,Review,Sentiment
2579,Due to this happening on every call I was forc...,0
1766,One of the few places in Phoenix that I would ...,1
949,Simply beautiful.,1
2372,Battery life is also great!,1
1025,That's right....the red velvet cake.....ohhh t...,1


In [9]:
def split_words_reviews(data):
    text = list(data['Review'].values)
    clean_text = []
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
    tokenized = [word_tokenize(x) for x in clean_text]
    all_text = []
    for tokens in tokenized:
        for t in tokens:
            all_text.append(t)
    return tokenized, set(all_text)

reviews, vocab = split_words_reviews(data)

reviews[0]

['due',
 'to',
 'this',
 'happening',
 'on',
 'every',
 'call',
 'i',
 'was',
 'forced',
 'to',
 'stop',
 'using',
 'this',
 'headset']

In [10]:
def create_dictionaries(words):
    word_to_int_dict = {w:i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i:w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict

word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)

int_to_word_dict

{1: 'selfdiscovery',
 2: 'life',
 3: 'definitely',
 4: 'palmtopcameracellphone',
 5: 'indie',
 6: 'suffering',
 7: 'watered',
 8: 'sympathetic',
 9: 'r',
 10: 'constructed',
 11: 'aria',
 12: 'meanings',
 13: 'serve',
 14: 'meats',
 15: 'topvery',
 16: 'chimplike',
 17: 'disturbing',
 18: 'tops',
 19: 'tooth',
 20: 'drinks',
 21: 'earlier',
 22: 'similar',
 23: 'empty',
 24: 'insulin',
 25: 'yummy',
 26: 'kabuki',
 27: 'blanket',
 28: 'lovely',
 29: 'trythe',
 30: 'monolog',
 31: 'nuts',
 32: 'sangria',
 33: 'hanks',
 34: 'favorite',
 35: 'candace',
 36: 'heaven',
 37: 'experiences',
 38: 'counter',
 39: 'superficial',
 40: 'boot',
 41: 'buds',
 42: 'shipping',
 43: 'fanciful',
 44: 'again',
 45: 'bloodiest',
 46: 'sells',
 47: 'pancakes',
 48: 'afraid',
 49: 'seafood',
 50: 'filmmostly',
 51: 'rock',
 52: 'taped',
 53: 'across',
 54: 'prompted',
 55: 'loops',
 56: 'unconditional',
 57: 'setting',
 58: 'joins',
 59: 'workers',
 60: 'cheekbones',
 61: 'exquisite',
 62: 'writer',
 63: 'f

In [11]:
with open('word_to_int_dict.json', 'w') as fp:
    json.dump(word_to_int_dict, fp)

In [12]:
def pad_text(tokenized_reviews, seq_length):
    
    reviews = []
    
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length-len(review)) + review)
        
    return np.array(reviews)

padded_sentences = pad_text(reviews, seq_length = 50)

padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', 'due', 'to', 'this', 'happening', 'on', 'every', 'call', 'i',
       'was', 'forced', 'to', 'stop', 'using', 'this', 'headset'],
      dtype='<U33')

In [13]:
int_to_word_dict[0] = ''
word_to_int_dict[''] = 0

In [14]:
encoded_sentences = np.array([[word_to_int_dict[word] for word in review] for review in padded_sentences])

encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 2419, 1729, 2801, 5156, 4817,  526, 4298, 2774,  937,
       2849, 1729, 4482,  671, 2801, 1078])

In [15]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  
        self.n_layers = n_layers 
        self.n_hidden = n_hidden 
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                          
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words) 
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)                  
        sigmoid_out = self.sigmoid(fc_out)              
        sigmoid_out = sigmoid_out.view(batch_size, -1)  
        
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):
        
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [16]:
n_vocab = len(word_to_int_dict)
n_embed = 50
n_hidden = 100
n_output = 1
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [17]:
labels = np.array([int(x) for x in data['Sentiment'].values])

train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2

total = len(encoded_sentences)
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(labels[train_cutoff : valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [39]:
print_every = 2400
step = 0
n_epochs = 33
clip = 5  
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

In [40]:
output.squeeze().shape

torch.Size([])

In [41]:
for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1  
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output, labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []

            for v_inputs, v_labels in valid_loader:
                       
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output, v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

  nn.utils.clip_grad_norm(net.parameters(), clip)


Epoch: 1/33 Step: 2400 Training Loss: 0.3792 Validation Loss: 0.5718
Epoch: 2/33 Step: 4800 Training Loss: 1.4280 Validation Loss: 0.5825
Epoch: 3/33 Step: 7200 Training Loss: 0.0001 Validation Loss: 0.7038
Epoch: 4/33 Step: 9600 Training Loss: 0.0124 Validation Loss: 0.8224
Epoch: 5/33 Step: 12000 Training Loss: 0.0001 Validation Loss: 1.1294
Epoch: 6/33 Step: 14400 Training Loss: 0.0418 Validation Loss: 1.1879
Epoch: 7/33 Step: 16800 Training Loss: 0.0067 Validation Loss: 1.1236
Epoch: 8/33 Step: 19200 Training Loss: 0.0000 Validation Loss: 1.4580
Epoch: 9/33 Step: 21600 Training Loss: 0.0001 Validation Loss: 1.6247
Epoch: 10/33 Step: 24000 Training Loss: 0.0000 Validation Loss: 1.6294
Epoch: 11/33 Step: 26400 Training Loss: 0.0001 Validation Loss: 1.8337
Epoch: 12/33 Step: 28800 Training Loss: 0.0001 Validation Loss: 1.8131
Epoch: 13/33 Step: 31200 Training Loss: 0.0000 Validation Loss: 2.0953
Epoch: 14/33 Step: 33600 Training Loss: 0.0000 Validation Loss: 2.1164
Epoch: 15/33 Step: 

In [42]:
torch.save(net.state_dict(), 'model.pkl')

In [43]:
net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net.load_state_dict(torch.load('model.pkl'))

<All keys matched successfully>

In [44]:
def preprocess_review(review):
    review = review.translate(str.maketrans('', '', punctuation)).lower().rstrip()
    tokenized = word_tokenize(review)
    if len(tokenized) >= 50:
        review = tokenized[:50]
    else:
        review= ['0']*(50-len(tokenized)) + tokenized
    
    final = []
    
    for token in review:
        try:
            final.append(word_to_int_dict[token])
            
        except:
            final.append(word_to_int_dict[''])
        
    return final

In [45]:
def predict(review):
    net.eval()
    words = np.array([preprocess_review(review)])
    padded_words = torch.from_numpy(words)
    pred_loader = DataLoader(padded_words, batch_size = 1, shuffle = True)
    for x in pred_loader:
        output = net(x)[0].item()
    
    msg = "This is a positive review." if output >= 0.5 else "This is a negative review."
    print(msg)
    print('Prediction = ' + str(output))

In [50]:
predict("the song is bad")

This is a negative review.
Prediction = 5.213006026849598e-09
