In [27]:
import ipdb
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import torch.optim as optim
import mlearn.base as base
from mlearn.base import Field
from sklearn.metrics import accuracy_score
from mlearn.utils.pipeline import process_and_batch
from mlearn.data_processing.data import GeneralDataset
from mlearn.modeling.embedding import EmbeddingRNNClassifier, EmbeddingLSTMClassifier, CNNClassifier, EmbeddingMLPClassifier

## Load and process data

In [2]:
text_field = Field('text', train = True, label = False, ignore = False, ix = 5, cname = 'text')
label_field = Field('label', train = False, label = True, cname = 'label', ignore = False, ix = 4)
ignore_field = Field('ignore', train = False, label = False, cname = 'ignore', ignore = True)

fields = [text_field, label_field]
dataset = GeneralDataset(data_dir = '~/PhD/projects/tools/mlearn/tests/',
                         ftype = 'csv', fields = fields, train = 'garcia_stormfront_train.tsv', dev = None,
                         test = 'garcia_stormfront_test.tsv', train_labels = None, tokenizer = lambda x: x.split(),
                         lower = True, preprocessor = None, transformations = None,
                         label_processor = None, sep = '\t', name = 'Test')
dataset.load('train')
dataset.load('test')

Loading Test (train): 1914it [00:00, 26759.84it/s]
Loading Test (test): 478it [00:00, 24797.80it/s]


## Encode the documents and labels

In [3]:
dataset.build_token_vocab(dataset.data)
dataset.build_label_vocab(dataset.data)

Building vocabulary: 100%|██████████| 1914/1914 [00:00<00:00, 128214.55it/s]
Encoding vocabulary: 100%|██████████| 6291/6291 [00:00<00:00, 494108.21it/s]
Encode label vocab: 100%|██████████| 2/2 [00:00<00:00, 1766.77it/s]


In [4]:
train = process_and_batch(dataset, dataset.data, 32, onehot = False)
test = process_and_batch(dataset, dataset.test, 32, onehot = False)

In [8]:
print("Number of batches:", len(train))
print("Length of first batch:", len(train[0]))

Number of batches: 60
Length of first batch: 32


## Prepare the models

### RNN

In [9]:
model = EmbeddingRNNClassifier(dataset.vocab_size(), hidden_dim = 128, embedding_dim = dataset.vocab_size(), output_dim = 3, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [10]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)


100%|██████████| 60/60 [04:26<00:00,  4.44s/it, accuracy=0, loss=0.125]  


### LSTM

In [13]:
model = EmbeddingLSTMClassifier(dataset.vocab_size(), hidden_dim = 128, embedding_dim = 128, output_dim = 3, num_layers = 1, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [12]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)

100%|██████████| 60/60 [03:09<00:00,  3.16s/it, accuracy=0.308, loss=0.126]


### MLP

In [14]:
model = EmbeddingMLPClassifier(dataset.vocab_size(), hidden_dim = 128, embedding_dim = 128, output_dim = 3, batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [15]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)

100%|██████████| 60/60 [00:11<00:00,  5.22it/s, accuracy=0.5, loss=1]       


### CNN

In [36]:
class CNNClassifier(nn.Module):
    """CNN Classifier."""

    def __init__(self, window_sizes: base.List[int], num_filters: int, max_feats: int, input_dim: int, 
                 embedding_dim: int, output_dim: int, batch_first: bool = True, **kwargs) -> None:
        """
        Initialise the model.

        :window_sizes: The size of the filters (e.g. 1: unigram, 2: bigram, etc.)
        :no_filters: The number of filters to apply.
        :max_feats: The maximum length of the sequence to consider.
        :hidden_dim (int): Hidden dimension size.
        :output_dim (int): Output dimension.
        :batch_first (bool, default: True): True if the batch is the first dimension.
        """
        super(CNNClassifier, self).__init__()
        self.batch_first = batch_first
        self.name = 'cnn'

        self.itoh = nn.Embedding(input_dim, embedding_dim)  # Works
        self.conv = nn.ModuleList([nn.Conv2d(1, num_filters, (w, embedding_dim)) for w in window_sizes])
        self.linear = nn.Linear(len(window_sizes) * num_filters, output_dim)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, sequence) -> base.DataType:
        """
        Forward step of the model.

        :sequence: The sequence to be predicted on.
        :return (base.DataType): The scores computed by the model.
        """
        # CNNs expect batch first so let's try that
        if not self.batch_first:
            sequence = sequence.transpose(0, 1)

        emb = self.itoh(sequence)  # Get embeddings for sequence
        output = [F.relu(conv(emb.unsqueeze(1))).squeeze(3) for conv in self.conv]
        output = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in output]
        output = torch.cat(output, 1)
        scores = self.softmax(self.linear(output))

        return scores

In [37]:
model = CNNClassifier(window_sizes = [2,3,4], num_filters = 128, max_feats = 100, 
                      output_dim = 3, vocab_size = dataset.vocab_size(), embedding_dim = 128,
                      batch_first = True)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
loss = nn.NLLLoss()

In [38]:
with tqdm(train) as loop:
    for X, y in loop:
        res = model(X.long())
        l = loss(res, y)
        
        acc = accuracy_score(res.argmax(dim=1).cpu(), y.cpu())

        l.backward()
        optimizer.step()
        loop.set_postfix(loss = l.data.item() / X.shape[0], accuracy = acc)

100%|██████████| 60/60 [00:20<00:00,  2.89it/s, accuracy=0.5, loss=0.854]   
