# Deep learning tips and tricks

In this notebook we will cover a couple of tips and tricks for tweaking a neural text classifier. We will use an LSTM model for our experiments. We use torch version 1.4.
The code is inspired by [this](https://github.com/lukysummer/Movie-Review-Sentiment-Analysis-LSTM-Pytorch/blob/master/sentiment_analysis_LSTM.py) repository.

### 1. LOAD THE TRAINING TEXT

In [135]:
from sklearn.datasets import load_files

In [140]:
from nltk import download

In [None]:
download("movie_reviews", download_dir=".data")

In [145]:
movies = load_files(".data/movie_reviews")

In [151]:
reviews, encoded_labels = [review.decode() for review in movies.data], movies.target

### 2. TEXT PRE-PROCESSING

In [152]:
from string import punctuation
import re

word_re = re.compile(r"\b[a-z]{2,}\b")

def tokenize(text):
    processed_text = "".join(ch for ch in text.lower() if ch not in punctuation)
    processed_text = processed_text.replace("\n", " ")
    return word_re.findall(processed_text)

def flatten(tokenized_texts):
    return [word for text in tokenized_texts for word in text]

In [153]:
all_reviews = list(map(tokenize, reviews))
all_words = flatten(all_reviews)

### 3. CREATE DICTIONARIES & ENCODE REVIEWS

In [154]:
from collections import Counter

word_counts = Counter(all_words)
word_list = sorted(word_counts, key=lambda k: word_counts[k], reverse = True)
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)}
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}
encoded_reviews = [[vocab_to_int[word] for word in review] for review in all_reviews]

### 4. CHECK LABELS

In [155]:
assert len(encoded_reviews) == len(encoded_labels), "# of encoded reviews & encoded labels must be the same!"

### 5. GET RID OF LENGTH-0 REVIEWS

In [156]:
import numpy as np
import torch

encoded_labels = np.array( [label for idx, label in enumerate(encoded_labels) if len(encoded_reviews[idx]) > 0] )
encoded_reviews = [review for review in encoded_reviews if len(review) > 0]

### 6. MAKE ALL REVIEWS SAME LENGTH

In [157]:
def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0] * (seq_length - len(review)) + review)
        
    return np.array(reviews)

padded_reviews = pad_text(encoded_reviews, seq_length=200)

### 7. SPLIT DATA & GET (REVIEW, LABEL) DATALOADER

In [239]:
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2
total = padded_reviews.shape[0]
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = torch.from_numpy(padded_reviews[:train_cutoff]), torch.from_numpy(encoded_labels[:train_cutoff])
valid_x, valid_y = torch.from_numpy(padded_reviews[train_cutoff:valid_cutoff]), torch.from_numpy(encoded_labels[train_cutoff:valid_cutoff])
test_x, test_y = torch.from_numpy(padded_reviews[valid_cutoff:]), torch.from_numpy(encoded_labels[valid_cutoff:])

from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

BATCH_SIZE = 50
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = BATCH_SIZE, shuffle = True)
test_loader = DataLoader(test_data, batch_size = BATCH_SIZE, shuffle = True)

### 8. DEFINE THE LSTM MODEL

During the model definition step, we might re-implement model weight initialisation, apply another tricks such as adding various types of dropout to the needed layers etc. There is a noteworthy [discussion](https://stackoverflow.com/questions/49433936/how-to-initialize-weights-in-pytorch) on wheather one should initialize weights manually or not, and, if yes, how? The functions that implement various initialisation methods are located in the `torch.nn.init` module.

In [306]:
from torch import nn

class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p) 
        self.fc = nn.Linear(n_hidden, n_output)
        self.dropout = nn.Dropout(drop_p) # this layer is designed to turn any neuron off with probability drop_p
        self.sigmoid = nn.Sigmoid()
        self.__init_linear()
        
        
    def forward (self, input_words, hidden_state):
                                             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words, hidden_state)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(input_words.shape[0], -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    def __init_linear(self):
        self.fc.weight.data.normal_(0.0, 1/np.sqrt(self.n_hidden))
        self.fc.bias.data.fill_(0)
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

### 9. INSTANTIATE THE MODEL W/ HYPERPARAMETERS

In [307]:
n_vocab = len(vocab_to_int)
n_embed = 100
n_hidden = 256
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

### 10. DEFINE LOSS & OPTIMIZER

L2-regularization is already included into SGD optimizer. The `weight_decay` parameter is responsible for controlling its intensity.

In [308]:
from torch import optim

criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr = 0.001, weight_decay=1e-5)

### 11. TRAIN THE NETWORK!

To prevent the exploding gradient problem in LSTM/RNN we use the `clip_grad_norm_` function, that takes the `clip` parameter.


In [309]:
print_every = 100
step = 0
n_epochs = 4  # validation loss increases from ~ epoch 3 or 4
clip = 5  # gradient clip to prevent exploding gradient problem in LSTM/RNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(device)
net.train()

for epoch in range(n_epochs):
    h = net.init_hidden(BATCH_SIZE)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(BATCH_SIZE)
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs, v_h)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

Epoch: 4/4 Step: 100 Training Loss: 0.6990 Validation Loss: 0.6933


### 12. TEST THE TRAINED MODEL ON THE TEST SET

In [310]:
net.eval()
test_losses = []
num_correct = 0
test_h = net.init_hidden(BATCH_SIZE)
net.to(device)
for inputs, labels in test_loader:
    test_h = tuple([each.data for each in test_h])
    inputs, labels = inputs.to(device), labels.to(device)
    test_output, test_h = net(inputs, test_h)
    loss = criterion(test_output, labels.float())
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct / len(test_loader.dataset)))

Test Loss: 0.6923
Test Accuracy: 0.56


### 13. TEST THE TRAINED MODEL ON A RANDOM SINGLE REVIEW

In [290]:
def predict(net, review, seq_length = 200):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    words = tokenize(review)
    encoded_words = [vocab_to_int[word] for word in words]
    padded_words = pad_text([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words.reshape(1, -1)).to(device)
    
    if(len(padded_words) == 0):
        "Your review must contain at least 1 word!"
        return None
    
    net.eval()
    h = net.init_hidden(1)
    output, h = net(padded_words, h)
    pred = torch.round(output.squeeze())
    msg = "This is a positive review." if pred == 0 else "This is a negative review."
    
    print(msg)


review1 = "It made me cry."
review2 = "It was so good it made me cry."
review3 = "It's ok."
review4 = "This movie had the best acting and the dialogue was so good. I loved it."
review5 = "Garbage"
                       ### OUTPUT ###
predict(net, review1)  ## negative ##
predict(net, review2)  ## positive ##
predict(net, review3)  ## negative ##
predict(net, review4)  ## positive ##
predict(net, review5)  ## negative ##

This is a positive review.
This is a positive review.
This is a positive review.
This is a positive review.
This is a positive review.
