In [26]:
import pandas as pd
import numpy as np
import torch

import spacy
from spacy.symbols import ORTH
import string
import io
import os
import collections
import itertools
import tqdm
from importlib import reload

from torchtext import datasets

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#### Custom imports and download data

In [2]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz && tar -xf aclImdb_v1.tar.gz

--2020-06-19 21:18:53--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2020-06-19 21:19:11 (4.54 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [3]:
!git clone https://github.com/zemerov/greenatom_assignment.git

fatal: destination path 'greenatom_assignment' already exists and is not an empty directory.


In [21]:
import greenatom_assignment.classifier.preproc as preproc
import greenatom_assignment.classifier.models as models
import greenatom_assignment.classifier.utils as utils

In [45]:
!cd greenatom_assignment && git pull

preproc = reload(preproc)
models = reload(models)
utils = reload(utils)

remote: Enumerating objects: 11, done.[K
remote: Counting objects:   9% (1/11)[Kremote: Counting objects:  18% (2/11)[Kremote: Counting objects:  27% (3/11)[Kremote: Counting objects:  36% (4/11)[Kremote: Counting objects:  45% (5/11)[Kremote: Counting objects:  54% (6/11)[Kremote: Counting objects:  63% (7/11)[Kremote: Counting objects:  72% (8/11)[Kremote: Counting objects:  81% (9/11)[Kremote: Counting objects:  90% (10/11)[Kremote: Counting objects: 100% (11/11)[Kremote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects:  25% (1/4)[Kremote: Compressing objects:  50% (2/4)[Kremote: Compressing objects:  75% (3/4)[Kremote: Compressing objects: 100% (4/4)[Kremote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 2), reused 6 (delta 2), pack-reused 0[K
Unpacking objects:  16% (1/6)   Unpacking objects:  33% (2/6)   Unpacking objects:  50% (3/6)   Unpacking objects:  66% (4/6)   Unpacking objects:  83% (5/6)   Unpac

In [22]:
!ls  # You have to see aclImdb directory

aclImdb		   aclImdb_v1.tar.gz.1	 sample_data
aclImdb_v1.tar.gz  greenatom_assignment


In [7]:
tokenizer = preproc.ManualTokenizer()

train = []
test = []

for current_dir in ['aclImdb/train/pos/', 'aclImdb/train/neg/']:
    for text, score in tokenizer.get_tokens_and_score(current_dir):
        train.append((text, score))

for current_dir in ['aclImdb/test/pos/', 'aclImdb/test/neg/']:
    for text, score in tokenizer.get_tokens_and_score(current_dir):
        test.append((text, score))

### Build token mapping

In [8]:
np_train = np.array(train)
np_test = np.array(test)

In [9]:
vocab = preproc.Vocabulary(special_tokens=['END', 'BEGIN', "PAD", 'UNK'])

vocab.fit(np.concatenate([np_train[:, 0], np_test[:, 0]]), min_count=7)

print("vocab size:", len(vocab))

vocab size: 35940


In [10]:
vocab.counter.most_common(10)

[('the', 663905),
 ('and', 320719),
 ('a', 320574),
 ('of', 288484),
 ('to', 266931),
 ('is', 210514),
 ('in', 185063),
 ('it', 154907),
 ('i', 152115),
 ('this', 149904)]

### Create dataloader

In [46]:
train_dataset = utils.Dataset(train, vocab, overfit_size=1200)
test_dataset = utils.Dataset(test, vocab)

In [47]:
BATCH_SIZE = 64
PAD_TOKEN = train_dataset.vocab([['PAD']])[0][0]

print('PAD TOKEN {}; BATCH SIZE {}'.format(PAD_TOKEN, BATCH_SIZE))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=utils.Padder(pad_symbol=PAD_TOKEN))
val_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=utils.Padder(pad_symbol=PAD_TOKEN))

PAD TOKEN 2; BATCH SIZE 64


### Train models

In [49]:
embedding_dim = 256
hidden_size = 128
lr = 10e-3

num_epoch = 3
batch_size = 64

model = models.GRU(len(vocab.i2t), embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.NLLLoss(ignore_index=PAD_TOKEN)

NameError: ignored

In [44]:
for x, y in train_loader:
    print(x.shape)
    print(y)
    break

torch.Size([64, 970])
[tensor([4.]), tensor([8.]), tensor([10.]), tensor([8.]), tensor([3.]), tensor([8.]), tensor([3.]), tensor([8.]), tensor([1.]), tensor([7.]), tensor([10.]), tensor([1.]), tensor([10.]), tensor([8.]), tensor([9.]), tensor([10.]), tensor([2.]), tensor([10.]), tensor([1.]), tensor([9.]), tensor([3.]), tensor([10.]), tensor([1.]), tensor([1.]), tensor([8.]), tensor([4.]), tensor([3.]), tensor([7.]), tensor([4.]), tensor([7.]), tensor([7.]), tensor([1.]), tensor([1.]), tensor([9.]), tensor([7.]), tensor([10.]), tensor([9.]), tensor([9.]), tensor([3.]), tensor([3.]), tensor([7.]), tensor([10.]), tensor([4.]), tensor([3.]), tensor([7.]), tensor([1.]), tensor([10.]), tensor([10.]), tensor([1.]), tensor([8.]), tensor([4.]), tensor([1.]), tensor([8.]), tensor([4.]), tensor([2.]), tensor([10.]), tensor([1.]), tensor([4.]), tensor([9.]), tensor([4.]), tensor([8.]), tensor([1.]), tensor([1.]), tensor([2.])]


In [None]:
%%time
epochs = 5

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    cnt = 0
    
    for x, y in train_loader:
        optimizer.zero_grad()
        preds = model(x)
        loss = criterion(preds, y)
        loss.backward()
        opt.step()
        running_loss += loss.item()

        if cnt % 50 == 0:
          print('current loss on iter {}'.format(cnt), loss.item())
        cnt += 1
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0

    for x, y in val_iter:
        preds = model(x)
        loss = criterion(preds, y)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

### Save model