In [1]:
!pip install torch==1.0.0

Collecting torch==1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/f5/3b/0b8de6e654c2983898564226792c6f09d9bcaba97b7b29c40e4ed4ae43ed/torch-1.0.0-cp37-cp37m-manylinux1_x86_64.whl (591.8MB)
[K     |████████████████████████████████| 591.8MB 30kB/s 
[31mERROR: torchvision 0.9.1+cu101 has requirement torch==1.8.1, but you'll have torch 1.0.0 which is incompatible.[0m
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have torch 1.0.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.8.1+cu101
    Uninstalling torch-1.8.1+cu101:
      Successfully uninstalled torch-1.8.1+cu101
Successfully installed torch-1.0.0


In [2]:
import torch
print(torch.__version__)

1.0.0


In [3]:
import pandas as pd
from string import punctuation
import numpy as np
import torch 
from nltk.tokenize import word_tokenize 
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import json

In [4]:
with open("sentiment.txt") as f:
  reviews = f.read()

data = pd.DataFrame([review.split('\t') for review in reviews.split('\n')])

data.columns = ['Review', 'Sentiment']

data = data.sample(frac=1)


In [5]:
data.head()

Unnamed: 0,Review,Sentiment
2771,"VERY cheap plastic, creaks like an old wooden ...",0
224,The flat reenactments don't hold your attentio...,0
1422,"Furthermore, you can't even find hours of oper...",0
25,This short film certainly pulls no punches.,0
2487,Unfortunately the ability to actually know you...,0


In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
def split_words_reviews(data):
  text = list(data['Review'].values)
  clean_text = []
  for t in text:
    clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
  tokenized = [word_tokenize(x) for x in clean_text]
  all_text = []
  for tokens in tokenized:
    for t in tokens:
      all_text.append(t)
  return tokenized, set(all_text)

In [8]:
reviews, vocab = split_words_reviews(data)

In [9]:
reviews[0]

['very', 'cheap', 'plastic', 'creaks', 'like', 'an', 'old', 'wooden', 'floor']

In [10]:
def create_dictionaries(words):
  word_to_int_dict = {w:i+1 for i, w in enumerate(words)}
  in_to_word_dict = {i:w for w, i in word_to_int_dict.items()}
  return word_to_int_dict, in_to_word_dict

In [11]:
word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)

In [12]:
int_to_word_dict

{1: 'spinach',
 2: 'open',
 3: 'kidnapped',
 4: 'largely',
 5: 'exceeding',
 6: 'funnyall',
 7: 'plays',
 8: 'message',
 9: 'install',
 10: 'rickman',
 11: 'violinists',
 12: 'customize',
 13: 'complaint',
 14: 'remorse',
 15: 'lock',
 16: '95',
 17: 'overhip',
 18: 'eyepleasing',
 19: 'crocdodile',
 20: 'shame',
 21: 'professionals',
 22: 'disgraceful',
 23: 'owed',
 24: 'impossible',
 25: 'famed',
 26: 'dualpurpose',
 27: 'hot',
 28: 'florida',
 29: 'extremely',
 30: 'expanded',
 31: 'dusted',
 32: 'copier',
 33: 'four',
 34: 'hbo',
 35: 'recessed',
 36: 'vinaigrette',
 37: '350',
 38: 'tract',
 39: 'future',
 40: 'jealousy',
 41: 'killing',
 42: 'wildly',
 43: 'tired',
 44: 'endearing',
 45: 'beensteppedinandtrackedeverywhere',
 46: 'technology',
 47: 'stars',
 48: 'peculiarity',
 49: 'wake',
 50: 'advertised',
 51: 'shoots',
 52: 'survived',
 53: 'enthusiastic',
 54: 'still',
 55: 'shipped',
 56: 'minor',
 57: 'little',
 58: 'looking',
 59: 'masterpieces',
 60: 'reasonable',
 61: '

In [13]:
with open('word_to_int_dict.json', 'w') as fp:
  json.dump(word_to_int_dict, fp)

In [14]:
print(np.max([len(x) for x in reviews]))
print(np.mean([len(x) for x in reviews]))

70
11.783666666666667


In [15]:
def pad_text(tokenized_reviews, seq_length):
    
    reviews = []
    
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length-len(review)) + review)
        
    return np.array(reviews)


In [16]:
padded_sentences = pad_text(reviews, seq_length = 50)

padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', 'very', 'cheap', 'plastic', 'creaks',
       'like', 'an', 'old', 'wooden', 'floor'], dtype='<U33')

In [17]:
int_to_word_dict[0] = ''
word_to_int_dict[''] = 0

In [18]:
encoded_sentences = np.array([[word_to_int_dict[word] for word in review] for review in padded_sentences])

encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0, 2536, 4406,  845,
       4776, 2242, 1506, 3139, 2961, 1346])

In [19]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  
        self.n_layers = n_layers 
        self.n_hidden = n_hidden 
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                          
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words) 
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)                  
        sigmoid_out = self.sigmoid(fc_out)              
        sigmoid_out = sigmoid_out.view(batch_size, -1)  
        
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):
        
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [20]:
n_vocab = len(word_to_int_dict)
n_embed = 50
n_hidden = 100
n_output = 1
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)


In [21]:
labels = np.array([int(x) for x in data['Sentiment'].values])

train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2

total = len(encoded_sentences)
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(labels[train_cutoff : valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [22]:
print_every = 2400
step = 0
n_epochs = 3
clip = 5
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [23]:
for epoch in range(n_epochs):
  h = net.init_hidden(batch_size)

  for inputs, labels in train_loader:
    step = step + 1
    net.zero_grad()
    output, h = net(inputs)
    loss = criterion(output.squeeze(), labels.float())
    loss.backward()
    nn.utils.clip_grad_norm(net.parameters(), clip)
    optimizer.step()

    if (step % print_every) == 0:
      net.eval()
      valid_losses = []

      for v_inputs, v_labels in valid_loader:

        v_output, v_h = net(v_inputs)
        v_loss = criterion(v_output.squeeze(), v_labels.float())
        valid_losses.append(v_loss.item())


      print("Epoch: {}/{}".format((epoch+1), n_epochs),
            "Step: {}".format(step),
            "Training Loss: {:.4f}".format(loss.item()),
            "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
      net.train()

  "Please ensure they have the same size.".format(target.size(), input.size()))
  # Remove the CWD from sys.path while we load stuff.


Epoch: 1/3 Step: 2400 Training Loss: 0.4108 Validation Loss: 0.6099
Epoch: 2/3 Step: 4800 Training Loss: 2.3610 Validation Loss: 0.5489
Epoch: 3/3 Step: 7200 Training Loss: 0.0244 Validation Loss: 0.6919


In [24]:
torch.save(net.state_dict(), 'model.pkl')

In [25]:
net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net.load_state_dict(torch.load('model.pkl'))

In [26]:
net.eval()
test_losses = []
num_correct = 0

for inputs, labels in test_loader:

  test_output, test_h = net(inputs)
  loss = criterion(test_output, labels)
  test_losses.append(loss.item())

  preds = torch.round(test_output.squeeze())
  correct_tensor = preds.eq(labels.float().view_as(preds))
  correct = np.squeeze(correct_tensor.numpy())
  num_correct = num_correct + np.sum(correct)

print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

Test Loss: 0.7250
Test Accuracy: 0.76


In [30]:
def preprocess_review(review):
  review = review.translate(str.maketrans('', '', punctuation)).lower().rstrip()
  tokenized = word_tokenize(review)
  if len(tokenized) >= 50:
    review = tokenized[:50]
  else:
    review = ['0'] * (50-len(tokenized)) + tokenized

  final = []

  for token in review:
    try:
      final.append(word_to_int_dict[token])
    except:
      final.append(word_to_int_dict[''])

    
  return final

In [31]:
def predict(review):
  net.eval()
  words = np.array([preprocess_review(review)])
  padded_words = torch.from_numpy(words)
  pred_loader = DataLoader(padded_words, batch_size=1, shuffle=True)
  for x in pred_loader:
    output = net(x)[0].item()
  
  msg = "This is a positive review." if output >= 0.5 else "This is a negative review"
  print(msg)
  print('Prediction = ' + str(output))
  

In [32]:
predict("The film was good")

This is a positive review.
Prediction = 0.989628791809082


In [33]:
predict("It was not good")

This is a negative review
Prediction = 0.008618216030299664
