In [None]:
import pandas as pd
import numpy as np
import torch

import tqdm
from importlib import reload

from torchtext import datasets

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#### Custom imports and download data

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz && tar -xf aclImdb_v1.tar.gz

In [None]:
!git clone https://github.com/zemerov/greenatom_assignment.git

In [None]:
import greenatom_assignment.classifier.preproc as preproc
import greenatom_assignment.classifier.models as models
import greenatom_assignment.classifier.utils as utils

In [None]:
!cd greenatom_assignment && git pull

preproc = reload(preproc)
models = reload(models)
utils = reload(utils)

In [None]:
!ls  # Sanity check: the listing should contain the extracted `aclImdb` directory

In [None]:
# Tokenise every review of both splits. Each sample is kept as a
# (text, score) pair exactly as yielded by the tokenizer; positive reviews are
# read before negative ones.
tokenizer = preproc.ManualTokenizer()

train = [
    (text, score)
    for review_dir in ('aclImdb/train/pos/', 'aclImdb/train/neg/')
    for text, score in tokenizer.get_tokens_and_score(review_dir)
]

test = [
    (text, score)
    for review_dir in ('aclImdb/test/pos/', 'aclImdb/test/neg/')
    for text, score in tokenizer.get_tokens_and_score(review_dir)
]

### Build token mapping

In [None]:
# Turn the sample lists into (n_samples, 2) arrays so that column 0 (the text)
# can be sliced out below.
# dtype=object is required: assuming each sample is (token list, score), the
# rows are ragged, and NumPy >= 1.24 raises ValueError when asked to build an
# array from ragged nested sequences without an explicit object dtype.
np_train = np.array(train, dtype=object)
np_test = np.array(test, dtype=object)

In [None]:
# The vocabulary is fitted on the texts (column 0) of BOTH splits.
# NOTE(review): this lets test-set tokens influence the vocabulary - confirm
# that this is intended.
corpus = np.concatenate([np_train[:, 0], np_test[:, 0]])

vocab = preproc.Vocabulary(special_tokens=['END', 'BEGIN', 'PAD', 'UNK'])
vocab.fit(corpus, min_count=7)

print("vocab size:", len(vocab))

In [None]:
# Show the 10 most frequent entries of the vocabulary's counter as a quick
# sanity check of the fitted vocabulary.
vocab.counter.most_common(10)

### Create dataloader

In [None]:
# Wrap the tokenised samples and the vocabulary into dataset objects.
# NOTE(review): `overfit_size=1200` presumably restricts training to a small
# subset for a quick overfitting check - confirm in utils.Dataset and drop it
# for a full training run.
train_dataset = utils.Dataset(
    train,
    vocab,
    overfit_size=1200,
)
test_dataset = utils.Dataset(
    test,
    vocab,
)

In [None]:
BATCH_SIZE = 64
# The vocabulary is called with a nested list and the first element of the
# first row of its result is taken as the id used for padding.
PAD_TOKEN = train_dataset.vocab([['PAD']])[0][0]

print('PAD TOKEN {}; BATCH SIZE {}'.format(PAD_TOKEN, BATCH_SIZE))

# Only the training loader shuffles; both loaders collate batches with a
# Padder configured with the PAD id.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=utils.Padder(pad_symbol=PAD_TOKEN),
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=utils.Padder(pad_symbol=PAD_TOKEN),
)

### Train models

In [None]:
# ---- Hyper-parameters ----
embedding_dim = 256
hidden_size = 128
lr = 10e-3  # note: 10e-3 == 0.01 (not 1e-3)

num_epoch = 3
batch_size = 64
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# [3, 4, 5] is passed through to models.CNN - presumably the convolution
# kernel sizes; confirm in the model definition.
model = models.CNN(len(vocab.i2t), embedding_dim, [3, 4, 5],  hidden_size).to(device)
#model = GRU(len(vocab.i2t), embedding_dim, hidden_size, dropout=0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The targets are per-review class labels, not token ids, so no `ignore_index`
# is set: passing the vocabulary id of 'PAD' there would silently drop every
# review whose label happens to equal that id from the loss.
criterion = nn.NLLLoss()

In [None]:
%%time
epochs = 3
val_losses = []
epoch_losses = []

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    cnt = 0
    
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds, _ = model(x)
        #print(preds.shape, h.shape, y.shape)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        if not np.isnan(loss.item()):
            running_loss += loss.item()

        if cnt % 50 == 0:
          print('current loss on iter {}'.format(cnt), loss.item() / batch_size)
        cnt += 1
    
    epoch_loss = running_loss / len(train_dataset)
    
    epoch_losses.append(epoch_loss)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0

    for x, y in test_loader:
        with torch.no_grad():
            x = x.to(device)
            y = y.to(device)
            
            preds, _ = model(x)
            loss = criterion(preds, y)
            if not np.isnan(loss.item()):
                val_loss += loss.item()
        
    val_loss /= len(test_dataset)
    val_losses.append(val_loss)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

In [None]:
def calculate_metrics(pred, real):
    """Compute binary classification metrics.

    Args:
        pred: array-like of predicted labels; entries equal to 1 (or True)
            count as positive, entries equal to 0 (or False) as negative.
        real: array-like of ground-truth labels with the same length and
            encoding as ``pred``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` of Python floats. A metric
        whose denominator is zero (no positive predictions, no positive
        labels, empty input) is reported as 0.0 instead of NaN.
    """
    pred = np.asarray(pred)
    real = np.asarray(real)

    def _safe_ratio(numerator, denominator):
        # Avoid 0/0 -> NaN (and the accompanying NumPy runtime warning).
        return float(numerator) / float(denominator) if denominator else 0.0

    is_positive = real == 1
    tp = int((pred[is_positive] == 1).sum())  # predicted 1, actually 1
    fp = int((pred[real == 0] == 1).sum())    # predicted 1, actually 0
    fn = int((pred[is_positive] == 0).sum())  # predicted 0, actually 1

    accuracy = _safe_ratio((pred == real).sum(), real.shape[0])
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    return accuracy, precision, recall, f1

In [None]:
# Run the model over the test loader and gather predicted and true labels.
# Batches are collected in lists and concatenated once at the end; calling
# np.concatenate inside the loop copies the growing array on every iteration.
model.eval()  # make the cell independent of the mode left by earlier cells

predicted_batches = []
real_batches = []

with torch.no_grad():
    for x, y in test_loader:
        preds, _ = model(x.to(device))
        # argmax over axis 1 picks the predicted class index; it is shifted by +1.
        # NOTE(review): `real` below is NOT shifted - confirm that the labels
        # produced by the dataset are on the same scale as `argmax + 1`.
        predicted_batches.append(preds.cpu().numpy().argmax(axis=1) + 1)
        real_batches.append(y.numpy())

# The leading empty float array keeps the original float64 result dtype and
# makes the concatenation valid even when the loader yields no batches.
predicted = np.concatenate([np.array([])] + predicted_batches)
real = np.concatenate([np.array([])] + real_batches)

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch loss recorded by the training loop for both splits.
plt.figure(figsize=(15, 8))
plt.rcParams.update({'font.size': 22})

points = np.arange(1, len(val_losses) + 1)  # epoch numbers, starting at 1
plt.plot(points, val_losses)
plt.plot(points, epoch_losses)

# Name the plot after the model that was actually trained instead of a
# hard-coded architecture name.
plt.title('{} train error'.format(type(model).__name__))
plt.xlabel('Epoch')
plt.ylabel('Loss')
# One tick per recorded epoch (previously fixed to [1, 2, 3, 4]).
plt.xticks(points)
plt.legend(['Test dataset', 'Train_dataset'])
plt.grid(linestyle='-', linewidth=1)

plt.show()

In [None]:
# Binarise both label arrays with the same threshold (> 6 counts as positive)
# and report the resulting classification metrics.
metrics = calculate_metrics(predicted > 6, real > 6)
print("Accuracy {} \nPrecision {}\nRecall {}\nF1 {}".format(*metrics))

### Save model

In [None]:
# Uncomment to persist the trained model object inside the cloned repository.
# NOTE(review): the file name says "gru" - adjust it if a different
# architecture was trained above.
#torch.save(model, 'greenatom_assignment/classifier/gru.pt')