# Sentiment with RNN

In [None]:
import numpy as np
import torch
import torch.nn as nn

In [None]:
# Read the raw reviews and their sentiment labels. Using `with` ensures
# each file handle is closed as soon as its contents have been read.
with open('data/reviews.txt') as f:
    reviews = f.read()

with open('data/labels.txt') as f:
    labels = f.read()

# Show the first few characters of each file as a quick sanity check.
print(reviews[:20])
print(labels[:20])

### Pre-processing

Get rid of punctuation

In [None]:
from string import punctuation

# Lower-case everything so differently-cased spellings share one token.
reviews = reviews.lower()

# Strip every character listed in `string.punctuation` in a single pass.
text = reviews.translate(str.maketrans('', '', punctuation))

# The file stores one review per line.
reviews_split = text.split('\n')

### Encoding

Deal with reviews

In [None]:
from collections import Counter

# Tokenise the whole cleaned corpus. `str.split()` with no argument splits
# on any whitespace, so the newlines separating reviews are handled too.
words = text.split()

# Rank tokens by frequency, most common first.
cnt = Counter(words)
vocabulary = sorted(cnt, key=cnt.get, reverse=True)

# Token ids start at 1, which leaves 0 unused by any token (the padding
# step later fills unused positions with zeros).
vocabulary2int = {token: idx for idx, token in enumerate(vocabulary, 1)}

# Replace every review by the list of its token ids.
reviews_ints = [
    [vocabulary2int[token] for token in review.split()]
    for review in reviews_split
]

# Vocabulary size and the first encoded review.
print(len(vocabulary2int))
print(reviews_ints[0])

Deal with labels

In [None]:
# The labels file holds one label per line.
labels_split = labels.split('\n')

# Integer-encode: 1 for the exact string 'positive', 0 for anything else.
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

### Removing outliers and padding

In [None]:
# Histogram of review lengths: token count -> number of reviews.
review_lens = Counter(len(review) for review in reviews_ints)

# Number of zero-length reviews, then the largest review length seen.
print(review_lens[0])
print(max(review_lens))

In [None]:
# Drop zero-length reviews together with their labels.
# The indices to keep are collected first and then both sequences are
# filtered with them, so reviews and labels stay aligned even when several
# reviews are empty. This also avoids calling np.delete on a ragged list
# of lists.
non_empty_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]

reviews_ints = [reviews_ints[i] for i in non_empty_idx]
encoded_labels = np.array([encoded_labels[i] for i in non_empty_idx])

# print
#print(len(reviews_ints))
#print(len(encoded_labels))

Padding

In [None]:
def pad_features(reviews_ints, seq_length):
    """Left-pad or truncate each encoded review to exactly `seq_length` ids.

    Args:
        reviews_ints: sequence of reviews, each a sequence of integer ids.
        seq_length: number of columns in the returned array.

    Returns:
        An integer array of shape (len(reviews_ints), seq_length). Reviews
        shorter than `seq_length` are right-aligned with zeros on the left;
        longer reviews keep only their first `seq_length` ids. An empty
        review yields an all-zero row.
    """
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    for i, row in enumerate(reviews_ints):
        kept = np.asarray(row)[:seq_length]
        # An empty slice must be skipped: `-0:` would select the whole row
        # and the assignment of an empty array would fail to broadcast.
        if len(kept) == 0:
            continue
        features[i, -len(kept):] = kept

    return features

In [None]:
# Every review is padded or truncated to this many token ids.
seq_length = 200

features = pad_features(reviews_ints, seq_length)

# print
#print(features[:30, :10])

### Training, validation, test

In [None]:
# The first 80% of the samples form the training set.
index = int(0.8 * len(features))

train_x, remaining_x = np.split(features, [index])
train_y, remaining_y = np.split(encoded_labels, [index])

# The remainder is cut in half: validation first, then test.
test_index = int(0.5 * len(remaining_x))

valid_x, test_x = np.split(remaining_x, [test_index])
valid_y, test_y = np.split(remaining_y, [test_index])

# Report the shape of each feature split.
for title, split in (("Training set: {}", train_x),
                     ("Validation set: {}", valid_x),
                     ("Test set: {}", test_x)):
    print(title.format(split.shape))

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Number of reviews per mini-batch, shared by all three loaders.
batch_size = 20

# Wrap each (features, labels) pair of NumPy arrays as a tensor dataset.
train_data = TensorDataset(*(torch.from_numpy(a) for a in (train_x, train_y)))
valid_data = TensorDataset(*(torch.from_numpy(a) for a in (valid_x, valid_y)))
test_data = TensorDataset(*(torch.from_numpy(a) for a in (test_x, test_y)))

# Every loader reshuffles its dataset on each pass.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
# Draw one batch from the training loader and display it.
batch_iter = iter(train_loader)
sampler_x, sampler_y = next(batch_iter)

print(sampler_x)
print(sampler_y)

### Sentiment network with pytorch

GPU

In [None]:
# True when PyTorch reports that CUDA is available. Read later by the model
# and the training loop to decide whether tensors are moved to the GPU.
train_on_gpu = torch.cuda.is_available()

In [None]:
class Sentiment_RNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    For each input sequence the network returns the sigmoid output taken at
    the last position of the flattened per-step outputs.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_lyaers, drop_prob=0.5):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            output_size: number of output units of the final linear layer.
            embedding_dim: size of each embedding vector.
            hidden_dim: size of the LSTM hidden state.
            n_lyaers: number of stacked LSTM layers (the misspelled name is
                kept so existing callers are unaffected).
            drop_prob: dropout probability applied between LSTM layers.
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        # Take the layer count from the constructor argument rather than
        # from a module-level `n_layers` variable that may not exist.
        self.n_layers = n_lyaers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_lyaers, batch_first=True, dropout=drop_prob)

        # Dropout module applied to the LSTM outputs before the linear layer.
        self.drop_prob = nn.Dropout(0.3)

        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, n_hidden):
        """Run a batch through the network.

        Args:
            x: batch of token ids; the first dimension is the batch
                (the LSTM is built with batch_first=True).
            n_hidden: (h, c) tuple passed to the LSTM as its initial state.

        Returns:
            (sig_out, hidden): one sigmoid value per batch element, and the
            LSTM state after the last step.
        """
        e = self.embedding(x)
        l_out, hidden = self.lstm(e, n_hidden)

        # Flatten all time steps so the linear layer sees (N, hidden_dim).
        l_out = l_out.contiguous().view(-1, self.hidden_dim)

        out = self.drop_prob(l_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        batch_size = x.size(0)

        # Regroup by batch element and keep only the final value per row.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed (h, c) state of shape (n_layers, batch_size, hidden_dim).

        The tensors are created with the dtype and device of the model's own
        parameters, so the state always lives where the model lives and no
        module-level GPU flag is needed.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

        return hidden

In [None]:
# Token ids start at 1, so the embedding table needs one extra row for 0.
vocab_size = 1 + len(vocabulary2int)

# Remaining hyper-parameters of the network.
output_size, embedding_dim, hidden_dim, n_layers = 1, 200, 128, 3

net = Sentiment_RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

# Show the layer summary.
print(net)

### Training

In [None]:
# Binary cross-entropy on the sigmoid outputs.
criterion = nn.BCELoss()

# Adam over all model parameters.
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)

In [None]:
epochs = 3
counter = 0
print_every = 100  # run validation and report every this many steps
clip = 5           # max gradient norm

# The model must live on the same device as the batches fed to it.
if train_on_gpu:
    net.cuda()

net.train()

for e in range(epochs):
    h = net.init_hidden(batch_size)

    # `targets` rather than `labels`, so the raw labels text read earlier
    # in the notebook is not overwritten.
    for inputs, targets in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        if inputs.size(0) == h[0].size(1):
            # Detach the carried-over state so backprop stops at this batch.
            h = tuple(each.detach() for each in h)
        else:
            # A short batch needs a state with a matching batch dimension.
            h = net.init_hidden(inputs.size(0))

        net.zero_grad()

        output, h = net(inputs, h)

        # reshape(-1) keeps a 1-D tensor even for a batch of one.
        loss = criterion(output.reshape(-1), targets.float())

        loss.backward()

        # Clip gradients before the optimizer step.
        nn.utils.clip_grad_norm_(net.parameters(), clip)

        optimizer.step()

        if counter % print_every == 0:

            val_h = net.init_hidden(batch_size)

            val_losses = []
            net.eval()

            # No gradients are needed while measuring validation loss.
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:

                    if train_on_gpu:
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    if val_inputs.size(0) != val_h[0].size(1):
                        val_h = net.init_hidden(val_inputs.size(0))

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.reshape(-1), val_targets.float())
                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))