In [1]:
import ipdb
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import torch.optim as optim
import mlearn.base as base
from mlearn.base import Field
from sklearn.metrics import accuracy_score
from mlearn.utils.pipeline import process_and_batch
from mlearn.data_processing.data import GeneralDataset
import mlearn.modeling.onehot as oh

## Load and process data

In [2]:
text_field = Field('text', train = True, label = False, ignore = False, ix = 5, cname = 'text')
label_field = Field('label', train = False, label = True, cname = 'label', ignore = False, ix = 4)
ignore_field = Field('ignore', train = False, label = False, cname = 'ignore', ignore = True)

fields = [text_field, label_field]
dataset = GeneralDataset(data_dir = '~/PhD/projects/tools/mlearn/tests/',
                         ftype = 'csv', fields = fields, train = 'garcia_stormfront_train.tsv', dev = None,
                         test = 'garcia_stormfront_test.tsv', train_labels = None, tokenizer = lambda x: x.split(),
                         lower = True, preprocessor = None, transformations = None,
                         label_processor = None, sep = '\t', name = 'Test')
dataset.load('train')
dataset.load('test')

Loading Test (train): 1914it [00:00, 39755.65it/s]
Loading Test (test): 478it [00:00, 11693.38it/s]


## Encode the documents and labels

In [3]:
dataset.build_token_vocab(dataset.data)
dataset.build_label_vocab(dataset.data)

Building vocabulary: 100%|██████████| 1914/1914 [00:00<00:00, 55954.85it/s]
Encoding vocabulary: 100%|██████████| 6291/6291 [00:00<00:00, 121708.33it/s]
Encode label vocab: 100%|██████████| 2/2 [00:00<00:00, 4522.16it/s]


In [4]:
train = process_and_batch(dataset, dataset.data, 32, onehot = True)
test = process_and_batch(dataset, dataset.test, 32, onehot = True)

In [5]:
print("Number of batches:", len(train))
print("Length of first batch:", len(train[0]))

Number of batches: 60
Length of first batch: 32


## Prepare the models

### RNN

In [6]:
model = oh.RNNClassifier(dataset.vocab_size(), hidden_dim = 128, embedding_dim = dataset.vocab_size(), output_dim = 3, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [7]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)


100%|██████████| 60/60 [02:10<00:00,  2.17s/it, accuracy=0.423, loss=0.125] 


### LSTM

In [8]:
model = oh.LSTMClassifier(dataset.vocab_size(), hidden_dim = 128, embedding_dim = 128, output_dim = 3, num_layers = 1, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [None]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)

  0%|          | 0/60 [00:00<?, ?it/s]

> /Users/zeerakw/.virtualenvs/mlearn/lib/python3.7/site-packages/mlearn-0.0.1-py3.7.egg/mlearn/modeling/onehot.py(45)forward()
-> sequence = sequence.float()


### MLP

In [None]:
model = oh.MLPClassifier(dataset.vocab_size(), hidden_dim = 128, embedding_dim = 128, output_dim = 3, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [None]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)

### CNN

In [None]:
model = oh.CNNClassifier(window_sizes = [2,3,4], num_filters = 128, max_feats = dataset.vocab_size(), 
                      output_dim = 3, vocab_size = dataset.vocab_size(), hidden_dim = 128,
                      batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [None]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)