In [None]:
# Plan
# [x] load doc vectors
# [x] create FFNN
# [x] load word vectors
# [x] create LSTM
# [ ] write report

In [15]:
import numpy as np
import pandas as pd

In [8]:
# get data

import pickle

def load_data(name):
    """Unpickle and return the object stored in ``<name>.pickle``.

    Parameters
    ----------
    name : str
        File name (or path) without the ``.pickle`` extension.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    path = f'{name}.pickle'
    # Unpickling can execute arbitrary code, so only point this at trusted files.
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
# Reads 'cleaned.pickle' relative to the current working directory; the cells
# below use the loaded object's `.data` and `.target` attributes.
dataset = load_data('cleaned')

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 33% of the samples for evaluation; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): no `stratify=` is passed, so rare classes may end up on only
# one side of the split — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.33, random_state=42)

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fit on the training labels and encode them in a single pass (previously the
# fit_transform result was thrown away and the labels were transformed again).
y_train_enc = le.fit_transform(y_train)

# The encoder only knows labels present in y_train; LabelEncoder.transform
# raises ValueError if y_test contains a label that was not seen during fit.
y_test_enc = le.transform(y_test)

In [17]:
# Dimensionality of the document vectors; used both as the Doc2Vec vector_size
# and as the FFNN input width further down.
VEC_SIZE = 300
# Number of distinct labels the encoder was fitted on = size of the output layer.
NUM_CLASS = len(le.classes_)

## FFNN + doc2vec

In [16]:
# get doc vectors

from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Wrap every token list in a TaggedDocument whose single tag is the document's
# positional index (as a string) within its split.
X_doc_train = [TaggedDocument(words=words, tags=[str(i)])
               for i, words in enumerate(X_train)]

# The test documents are wrapped the same way, but only their `.words` are read
# below; they are never passed to build_vocab/train.
X_doc_test = [TaggedDocument(words=words, tags=[str(i)])
              for i, words in enumerate(X_test)]

# dm=1 selects gensim's distributed-memory (PV-DM) training mode; vocabulary and
# weights are learned from the training split only.
# NOTE(review): the name `model` is rebound to the FFNN in a later cell, so the
# Doc2Vec instance is no longer reachable after that cell runs.
model = Doc2Vec(dm=1, vector_size=VEC_SIZE, min_count=5, window=10, workers=4, epochs=100)
model.build_vocab(X_doc_train)
model.train(X_doc_train, total_examples=model.corpus_count, epochs=model.epochs)

# Both splits get their features through infer_vector (the vectors learned for
# the training tags are not used), so train and test are produced the same way.
# NOTE(review): no seed is set for gensim here and workers=4 — inferred vectors
# may differ between runs; confirm whether reproducibility matters.
X_train_vec = np.array([model.infer_vector(doc.words) for doc in X_doc_train])
X_test_vec = np.array([model.infer_vector(doc.words) for doc in X_doc_test])

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.nn.functional import relu

# Seed torch's global RNG so weight initialisation and dropout masks are
# repeatable. Only torch is seeded in this cell.
torch.manual_seed(0)

<torch._C.Generator at 0x112b162d0>

In [21]:
def train(model, loss, train_data, train_target, epochs=1, optimizer=None):
    """Run ``epochs`` full-batch optimisation steps on ``model``.

    Every step feeds the whole of ``train_data`` through the model, computes
    ``loss(output, train_target)``, back-propagates and applies one optimizer
    update. The loss is printed on the first step, every 50th step and the
    last step (the value printed is the one measured *before* that step's
    update).

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is updated in place.
    loss : callable
        Criterion called as ``loss(model_output, train_target)``.
    train_data, train_target : torch.Tensor
        Inputs and targets for the whole training set.
    epochs : int, default 1
        Number of update steps to perform.
    optimizer : torch.optim.Optimizer, optional
        Optimizer bound to ``model``'s parameters. When omitted, the
        notebook-level variable named ``optimizer`` is used, which keeps
        existing calls that relied on that global working.

    Returns
    -------
    torch.nn.Module
        The same ``model`` object that was passed in.

    Raises
    ------
    NameError
        If no ``optimizer`` argument is given and no global ``optimizer``
        exists.
    """
    if optimizer is None:
        # Backward compatibility with callers that define `optimizer` in a
        # notebook cell instead of passing it explicitly.
        if 'optimizer' not in globals():
            raise NameError("name 'optimizer' is not defined")
        optimizer = globals()['optimizer']

    # Make sure train-only layers (e.g. dropout) are active even if the model
    # was switched to eval mode by an earlier cell.
    model.train()

    for step in range(1, epochs + 1):
        optimizer.zero_grad()

        train_output = model(train_data)
        train_error = loss(train_output, train_target)
        train_error.backward()

        if step % 50 == 0 or step == 1 or step == epochs:
            print("Step = {}/{} Error = {}".format(step, epochs, train_error.item()))

        optimizer.step()

    return model

In [25]:
# build model

class FFNN(nn.Module):
    """Feed-forward classifier with two hidden layers (128 and 64 units).

    Each hidden layer is Linear -> BatchNorm1d -> ReLU; the final Linear layer
    returns raw, un-normalised class scores.

    Parameters
    ----------
    input_size : int
        Width of each input feature vector.
    num_class : int
        Number of output scores per sample.
    """

    def __init__(self, input_size, num_class):
        super().__init__()
        hidden_a, hidden_b = 128, 64

        self.linear1 = nn.Linear(input_size, hidden_a)
        self.linear2 = nn.Linear(hidden_a, hidden_b)
        self.linear3 = nn.Linear(hidden_b, num_class)

        # Only the two hidden layers are re-initialised with Xavier-normal
        # weights; the output layer keeps nn.Linear's default initialisation.
        for hidden_layer in (self.linear1, self.linear2):
            nn.init.xavier_normal_(hidden_layer.weight)

        # track_running_stats=False: no running mean/variance is kept, so the
        # statistics of the current batch are used in eval mode as well.
        self.bn1 = nn.BatchNorm1d(hidden_a, track_running_stats=False)
        self.bn2 = nn.BatchNorm1d(hidden_b, track_running_stats=False)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors ``x``."""
        hidden = relu(self.bn1(self.linear1(x)))
        hidden = relu(self.bn2(self.linear2(hidden)))
        return self.linear3(hidden)

In [26]:
# prepare data

# Wrap the doc2vec features and the encoded labels as tensors.
# torch.from_numpy shares memory with the source array instead of copying it.
train_data = torch.from_numpy(X_train_vec)
train_target = torch.from_numpy(y_train_enc)
test_data = torch.from_numpy(X_test_vec)
test_target = torch.from_numpy(y_test_enc)

In [27]:
# train model

# Rebinds `model` (previously the Doc2Vec instance) to the classifier.
model = FFNN(VEC_SIZE, NUM_CLASS)
# CrossEntropyLoss takes the raw scores returned by FFNN.forward and integer
# class targets.
loss = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.2, momentum=0.9)
lr = 0.01 # 3e-4
# `optimizer` has to exist under exactly this notebook-level name: train()
# picks it up from the notebook namespace when it is not handed one.
optimizer = optim.Adam(model.parameters(), lr=lr)

# 100 steps, each over the entire training set (no mini-batching).
model = train(model, loss, train_data, train_target, 100)

Step = 1/100 Error = 5.4134063720703125
Step = 50/100 Error = 1.8324609994888306
Step = 100/100 Error = 1.0295368432998657


In [33]:
# print report

# Inference only: switch to eval mode and disable autograd so no computation
# graph is built for the whole test set, then restore the previous mode so a
# later training cell is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    y_pred = model(test_data)
model.train(was_training)

# Predicted class = index of the highest score in each row.
y_pred = torch.max(y_pred, 1).indices

pd.DataFrame(classification_report(test_target, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,197,198,199,200,201,202,203,accuracy,macro avg,weighted avg
precision,0.154472,0.0,0.377049,0.108911,0.181818,0.178571,0.190476,0.25,0.186047,0.418269,...,0.1,0.076923,0.197802,0.26,0.135135,0.263158,0.105263,0.361506,0.227509,0.346951
recall,0.159664,0.0,0.30531,0.139241,0.166667,0.227273,0.125,0.111111,0.25,0.426471,...,0.034483,0.045455,0.214286,0.236364,0.080645,0.238095,0.090909,0.361506,0.190774,0.361506
f1-score,0.157025,0.0,0.337408,0.122222,0.173913,0.2,0.150943,0.153846,0.213333,0.42233,...,0.051282,0.057143,0.205714,0.247619,0.10101,0.25,0.097561,0.361506,0.199573,0.347858
support,119.0,14.0,226.0,79.0,24.0,22.0,32.0,18.0,32.0,204.0,...,29.0,22.0,84.0,220.0,62.0,21.0,22.0,0.361506,20824.0,20824.0


## LSTM + word2vec

In [34]:
import spacy

# Load a spaCy pipeline (with word vectors) from a local directory.
# NOTE(review): hardcoded absolute path under /tmp — the notebook will not run
# on a machine where this directory has not been created first; consider a
# configurable path.
nlp = spacy.load('/tmp/uk_vectors')

def vec(text):
    """Return the vector of the first token spaCy produces for ``text``.

    ``text`` is run through the loaded ``nlp`` pipeline; any tokens after the
    first one are ignored.
    """
    first_token = nlp(text)[0]
    return first_token.vector

def vectorize(tokens):
    """Collapse the word vectors of ``tokens`` into one document vector.

    The accumulator starts from the vector of the literal token ``'unk'``,
    the vector of every token is added to it, and the sum is divided by
    ``len(tokens)``.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        The combined vector. For an empty ``tokens`` the plain ``'unk'``
        vector is returned instead of dividing by zero.
    """
    # Work on a private copy: the array handed back by vec() may be a view
    # into spaCy's shared vector table, in which case the in-place `+=` / `/=`
    # below would overwrite the stored 'unk' vector and leak state from one
    # document into the next.
    v = vec('unk').copy()

    if len(tokens) == 0:
        # Nothing to average; avoid a division by zero that would yield
        # NaN/inf features.
        return v

    for t in tokens:
        v += vec(t)

    # NOTE(review): the 'unk' vector is part of the sum but is not counted in
    # the divisor, so this is not a strict mean. Kept as in the original —
    # confirm whether len(tokens) + 1 (or a zero start vector) was intended.
    v /= len(tokens)

    return v

In [60]:
# get word vectors

# One averaged word-vector row per document, stacked into a 2-D array.
# This rebinds X_train_vec / X_test_vec, replacing the doc2vec features that
# were stored under the same names earlier in the notebook.
# NOTE(review): `.apply` implies X_train / X_test are pandas objects —
# presumably Series of token lists; confirm against the pickled dataset.
X_train_vec = np.vstack(X_train.apply(vectorize))
X_test_vec = np.vstack(X_test.apply(vectorize))

In [87]:
# build lstm model

class LSTM(nn.Module):
    """Sequence classifier: frozen pretrained embeddings -> LSTM -> linear layer.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Width of each embedding vector.
    hidden_dim : int
        Size of the LSTM hidden state.
    vec_weights : torch.Tensor
        Pretrained vectors copied into the embedding table; must match the
        table's ``(vocab_size, embedding_dim)`` shape for the copy to succeed.
    classes : int
        Number of output scores per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, vec_weights, classes):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Overwrite the randomly initialised table with the pretrained vectors
        # and freeze it so the optimizer never updates the embeddings.
        with torch.no_grad():
            self.embeddings.weight.copy_(vec_weights)
        self.embeddings.weight.requires_grad_(False)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return class scores computed from the last layer's final hidden state."""
        embedded = self.dropout(self.embeddings(x))
        _, (final_hidden, _) = self.lstm(embedded)
        return self.linear(final_hidden[-1])

In [85]:
# prepare data

# Rebind the train/test tensors to the spaCy-based features: X_train_vec and
# X_test_vec now hold the stacked outputs of vectorize(), not the doc2vec
# vectors used for the FFNN.
train_data = torch.from_numpy(X_train_vec)
train_target = torch.from_numpy(y_train_enc)
test_data = torch.from_numpy(X_test_vec)
test_target = torch.from_numpy(y_test_enc)

In [None]:
# train model

# Size the embedding table after spaCy's vector table and reuse its rows as the
# pretrained weights.
n_vocab, vocab_dim = nlp.vocab.vectors.shape
weights = torch.from_numpy(nlp.vocab.vectors.data)

model = LSTM(n_vocab, vocab_dim, 256, weights, NUM_CLASS)
loss = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.2, momentum=0.9)
lr = 0.01 # 3e-4
# `optimizer` has to exist under exactly this notebook-level name: train()
# picks it up from the notebook namespace when it is not handed one.
optimizer = optim.Adam(model.parameters(), lr=lr)

# NOTE(review): train_data at this point holds one averaged float word vector
# per document, but LSTM.forward passes its input straight into nn.Embedding,
# which looks rows up by integer index. As written this call is expected to
# fail (the cell has no execution count); the model most likely needs padded
# sequences of vocabulary row indices instead — confirm and rework the
# data-preparation cells before relying on this section.
model = train(model, loss, train_data, train_target, 100)

In [31]:
# print report

# Inference only: eval mode turns the model's dropout layer off (otherwise the
# predictions are randomly perturbed on every run) and no_grad avoids building
# a computation graph for the whole test set. The previous mode is restored so
# a later training cell is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    y_pred = model(test_data)
model.train(was_training)

# Predicted class = index of the highest score in each row.
y_pred = torch.max(y_pred, 1).indices

# NOTE(review): the output saved with this cell is identical to the FFNN report
# and its execution count predates the FFNN report cell, so it looks stale —
# re-run after the LSTM has actually been trained.
pd.DataFrame(classification_report(test_target, y_pred, output_dict=True))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,197,198,199,200,201,202,203,accuracy,macro avg,weighted avg
precision,0.154472,0.0,0.377049,0.108911,0.181818,0.178571,0.190476,0.25,0.186047,0.418269,...,0.1,0.076923,0.197802,0.26,0.135135,0.263158,0.105263,0.361506,0.227509,0.346951
recall,0.159664,0.0,0.30531,0.139241,0.166667,0.227273,0.125,0.111111,0.25,0.426471,...,0.034483,0.045455,0.214286,0.236364,0.080645,0.238095,0.090909,0.361506,0.190774,0.361506
f1-score,0.157025,0.0,0.337408,0.122222,0.173913,0.2,0.150943,0.153846,0.213333,0.42233,...,0.051282,0.057143,0.205714,0.247619,0.10101,0.25,0.097561,0.361506,0.199573,0.347858
support,119.0,14.0,226.0,79.0,24.0,22.0,32.0,18.0,32.0,204.0,...,29.0,22.0,84.0,220.0,62.0,21.0,22.0,0.361506,20824.0,20824.0
