## 1. Data Preprocessing

In [1]:
import pandas as pd
import torch
import nltk
import numpy as np

from torch import nn
from tqdm.notebook import tqdm

from utils.helpers import Utility
from utils.model import RNN

# nltk.download('stopwords')

Using TensorFlow backend.


In [2]:
# Build the training-split tensors. No tokenizer is supplied (None); the returned
# dict carries a 'tokenizer' entry that the validation and test splits reuse.
# NOTE(review): presumably the helper fits a fresh tokenizer when given None -
# confirm in utils.helpers.
train_result = Utility.generate_tensors_from_csv_file('tn_reviews', None)

In [3]:
# Keep the tokenizer returned with the training split so the other splits
# (and the vocabulary size used for the model) are derived from the same one.
tokenizer = train_result['tokenizer']

In [4]:
# Build the validation-split tensors, passing in the training tokenizer.
valid_result = Utility.generate_tensors_from_csv_file('vd_reviews', tokenizer)

In [5]:
# Build the test-split tensors, passing in the training tokenizer.
test_result = Utility.generate_tensors_from_csv_file('tt_reviews', tokenizer)

In [6]:
# Show which entries the helper returns for a split.
test_result.keys()

dict_keys(['tokenizer', 'review_tensor', 'label_tensor'])

## 2. Batch Data

In [7]:
# Mini-batch size: used to build the dataloaders and passed to train_rnn,
# which uses it to size the initial hidden state.
batch_size = 50

In [8]:
# One dataloader per split, built in train / validation / test order with the
# same batch size for all three.
train_dataloader, valid_dataloader, test_dataloader = (
    Utility.generate_dataloader_from_tensors(split_result, batch_size)
    for split_result in (train_result, valid_result, test_result)
)

In [9]:
# Pull a single batch from the training loader as a sanity check.
reviews, labels = next(iter(train_dataloader))

# Cell output: the shapes of the review batch and of the label batch.
reviews.shape, labels.shape

(torch.Size([50, 5000]), torch.Size([50, 1]))

## 3. Build the Neural Network

In [10]:
# True when PyTorch reports a usable CUDA device; used further down to decide
# whether the model is moved to the GPU.
train_on_gpu = torch.cuda.is_available()

In [11]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """Run one optimisation step: forward pass, loss, backward pass, update.

    Parameters
    ----------
    rnn : torch.nn.Module
        Model invoked as ``rnn(inp, hidden)``; must return ``(output, hidden)``.
    optimizer : torch.optim.Optimizer
        Optimizer whose ``step()`` is applied after back-propagation.
    criterion : callable
        Loss invoked as ``criterion(output, target)``; both arguments are cast
        to ``torch.float`` first.
    inp, target : torch.Tensor
        One batch of inputs and the matching labels.
    hidden : iterable of torch.Tensor
        Hidden state carried over from the previous step (or a fresh one).

    Returns
    -------
    tuple
        ``(loss, hidden)`` where ``loss`` is the batch loss as a Python float
        and ``hidden`` is the hidden state returned by the model.
    """
    # Follow the device the model actually lives on rather than a notebook-level
    # flag: this keeps the function usable on its own and avoids a device
    # mismatch when CUDA is available but the model was left on the CPU.
    first_param = next(rnn.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        # Parameter-less model: fall back to "GPU if there is one".
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    inp, target = inp.to(device), target.to(device)

    # Detach the carried-over hidden state so back-propagation stops at this
    # batch instead of walking through the history of every earlier batch.
    hidden = tuple(each.detach().to(device) for each in hidden)

    # Clear gradients accumulated by the previous step.
    rnn.zero_grad()

    # Forward pass.
    output, hidden = rnn(inp, hidden)

    # The criterion is fed float tensors regardless of the incoming dtypes.
    loss = criterion(output.to(dtype=torch.float), target.to(dtype=torch.float))

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    return loss.item(), hidden

In [12]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100,
              dataloader=None, checkpoint_path='./output/trained_rnn_final.pt'):
    """Train ``rnn`` for ``n_epochs`` epochs, checkpointing the best epoch.

    After every epoch the mean of the per-batch losses is printed; whenever it
    is lower than in any earlier epoch the whole model is written to
    ``checkpoint_path`` with ``torch.save``.

    Parameters
    ----------
    rnn : object
        Model exposing ``train()`` and ``init_hidden(batch_size)``; it is handed
        to ``forward_back_prop`` for every batch.
    batch_size : int
        Batch size used to create the hidden state at the start of each epoch.
    optimizer, criterion
        Forwarded unchanged to ``forward_back_prop``.
    n_epochs : int
        Number of passes over the dataloader.
    show_every_n_batches : int, optional
        Currently unused; accepted so existing callers keep working.
    dataloader : iterable, optional
        Yields ``(inputs, labels)`` tensor pairs. Defaults to the notebook-level
        ``train_dataloader``.
    checkpoint_path : str, optional
        Destination of the best-epoch checkpoint.

    Returns
    -------
    The same ``rnn`` object, after training (its last-epoch state, which is not
    necessarily the checkpointed one).
    """
    import os

    if dataloader is None:
        # Backward-compatible default: the loader defined earlier in the notebook.
        dataloader = train_dataloader

    # torch.save does not create missing directories.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
    best_epoch_loss = np.inf

    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        batch_losses = []

        # Fresh hidden state at the start of every epoch.
        hidden = rnn.init_hidden(batch_size)
        hidden_batch_size = batch_size

        for inputs, labels in tqdm(dataloader):

            inputs, labels = inputs.to(torch.int64), labels.to(torch.int64)

            # A batch of a different size (typically the last one of an epoch)
            # cannot reuse the carried-over hidden state, so rebuild it.
            current_size = inputs.size(0)
            if current_size != hidden_batch_size:
                hidden = rnn.init_hidden(current_size)
                hidden_batch_size = current_size

            # Forward pass, back-propagation and optimizer step.
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            batch_losses.append(loss)

        avg_train_loss = np.average(batch_losses)

        # Keep only the model from the epoch with the lowest mean training loss.
        if avg_train_loss < best_epoch_loss:
            torch.save(rnn, checkpoint_path)
            best_epoch_loss = avg_train_loss

        print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
            epoch_i, n_epochs, avg_train_loss))

    # returns a trained rnn
    return rnn

In [13]:
# ---- Training parameters ----
# Number of passes over the training dataloader.
num_epochs = 100
# Step size handed to the Adam optimizer.
# NOTE(review): the recorded run below fluctuates between roughly 0.82 and 1.37
# over 17 epochs without settling; 0.06 may be too large for Adam - consider
# trying a smaller value.
learning_rate = 0.06

# Word -> integer index mapping taken from the training tokenizer.
vocab_to_int = tokenizer.word_index

# ---- Model parameters ----
# Vocabulary size: one more than the number of indexed words.
# NOTE(review): presumably the extra slot is for index 0 (padding) - confirm
# against the tokenizer used in utils.helpers.
vocab_size = len(vocab_to_int) + 1
# Size of the model output per review (labels in the sampled batch have one column).
output_size = 1
# Embedding dimension.
embedding_dim = 100
# Hidden dimension.
hidden_dim = 200
# Number of RNN layers.
n_layers = 2

# Passed to train_rnn as show_every_n_batches; train_rnn does not read it.
show_every_n_batches = 500

In [None]:
# Instantiate the model from the hyper-parameters defined above.
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
# Move the model to the GPU when one is available.
if train_on_gpu:
    rnn.cuda()

# Adam optimizer over all model parameters; binary cross-entropy on raw logits
# (BCEWithLogitsLoss applies the sigmoid itself).
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()

# Train the model. train_rnn prints the mean loss per epoch and returns the
# same model object after the final epoch.
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# No explicit save here: train_rnn already writes the model to
# './output/trained_rnn_final.pt' whenever an epoch's mean loss is the lowest
# seen so far.

Training for 100 epoch(s)...


HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    1/100   Loss: 1.088809347698952



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    2/100   Loss: 0.9458807024030351



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    3/100   Loss: 0.8879722544767786



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    4/100   Loss: 0.9587946175886293



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    5/100   Loss: 0.9406988366916174



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    6/100   Loss: 0.9138570860068431



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    7/100   Loss: 0.9219032482638192



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    8/100   Loss: 0.8944483548804435



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:    9/100   Loss: 1.012117697704192



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   10/100   Loss: 0.9102738649054679



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   11/100   Loss: 1.0611662054961581



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   12/100   Loss: 0.9793439520337189



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   13/100   Loss: 0.8679956750407052



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   14/100   Loss: 0.9149771472514479



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   15/100   Loss: 1.3666312577268185



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   16/100   Loss: 0.8345119527408055



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))


Epoch:   17/100   Loss: 0.8215271454615735



HBox(children=(FloatProgress(value=0.0, max=371.0), HTML(value='')))

In [None]:
# Save the model object returned by train_rnn (its state after the last epoch).
# NOTE(review): train_rnn writes its best-epoch checkpoint to this same path,
# so this call replaces that file with the last-epoch model - confirm that is
# intended, or save to a different file name.
torch.save(trained_rnn, './output/trained_rnn_final.pt')
print('Model Trained and Saved')