# LSTM classifier with word embedding representation

In [1]:
import numpy as np
from sklearn.metrics import roc_auc_score

import torch
from torch import nn
from torch.nn.utils.rnn import (pad_sequence,
                                pack_padded_sequence,
                                pad_packed_sequence)
import torch.nn.functional as F
import spacy
nlp = spacy.load('en_core_web_md')

from helper import load_data

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
x_train, x_validation, x_test, y_train, y_validation, y_test = load_data()

N_CLASSES = y_train.columns.size
BATCH_SIZE = 64

In [3]:
def text2tensor(text):
    doc = nlp(text)
    tokens = [token.vector for token in doc if token.is_alpha]
    if len(tokens) < 10:
        tokens = [token.vector for token in doc]
    return torch.Tensor(np.vstack(tokens)).to(device)

In [4]:
def batch_pad_pack_train(corpus, batch_size=64):
    groups = np.arange(len(corpus)) // batch_size
    batched = corpus.groupby(groups)

    tensors = []
    for i, text_batch in batched:
        batched_tensors = [text2tensor(text) for text in text_batch]
        padded = pad_sequence(batched_tensors, batch_first=True)
        packed = pack_padded_sequence(
            padded,
            lengths=[tensor.shape[0] for tensor in batched_tensors],
            batch_first=True,
            enforce_sorted=False)
        tensors.append(packed)

    return tensors

def batch_labels(labels, batch_size=64):
    out = []
    for _, df in labels.groupby(np.arange(len(labels)) // batch_size):
        out.append(torch.Tensor(df.values).to(device))
    return out

In [5]:
x_train_tensors = batch_pad_pack_train(x_train, BATCH_SIZE)

In [6]:
y_train_tensors = batch_labels(y_train, BATCH_SIZE)

x_validation_tensors = batch_pad_pack_train(x_validation, BATCH_SIZE)

In [59]:
class Model(nn.Module):
    EMBEDDINGS_DIM = 300
    LSTM_HIDDEN_SIZE = 128
    LSTM_STACK_DEPTH = 2
    
    def __init__(self, n_classes):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size=Model.EMBEDDINGS_DIM,
                            hidden_size=Model.LSTM_HIDDEN_SIZE,
                            num_layers=Model.LSTM_STACK_DEPTH,
                            dropout=0.5)
        self.dense = nn.Linear(in_features=Model.LSTM_HIDDEN_SIZE,
                               out_features=n_classes)
    
    def forward(self, embeddings):
        lstm_out_packed, _ = self.lstm(embeddings)
        lstm_last_out = pad_packed_sequence(lstm_out_packed, batch_first=True)[0][:, len(lstm_out_packed) - 1]
        dense_out = self.dense(lstm_last_out)
        return torch.sigmoid(dense_out)

In [57]:
def inference_score(model, x, y, average='macro'):
    """
    @param x - list of PyTorch's PackedSequence
    @param y - numpy ndarray (n_samples, n_classes)
    @return ROC AUC score
    """
    batch_size = x[0].sorted_indices.shape[0]
    y_pred = np.zeros_like(y, dtype=np.float32)
    with torch.no_grad():
        for i, batch in enumerate(x):
            pred_i = model(batch).numpy()
            y_pred[i * batch_size : i * batch_size + pred_i.shape[0]] = pred_i

    return roc_auc_score(y, y_pred, average=average)

In [60]:
N_EPOCHS = 5
model = Model(N_CLASSES)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()

training_scores = []
validation_scores = []

for epoch in range(N_EPOCHS):
    epoch_loss = 0.0
    for text_embedding, ground_truth in zip(x_train_tensors, y_train_tensors):
        optimizer.zero_grad()
        output = model(text_embedding)
        loss = criterion(output, ground_truth)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    train_score = inference_score(model, x_train_tensors, y_train)
    validation_score = inference_score(model, x_validation_tensors, y_validation)
    print('Epoch [{}/{}] | loss: {:.5f}'.format(
          epoch + 1, N_EPOCHS,
          epoch_loss / len(x_train_tensors)))
    print('ROC AUC score - train: {:.5f}, validation: {:.5f}\n'.format(
          train_score, validation_score))
    training_scores.append(train_score)
    validation_scores.append(validation_score)

Epoch [1/5] | loss: 0.42788
ROC AUC score - train: 0.84704, validation: 0.83745

Epoch [2/5] | loss: 0.32531
ROC AUC score - train: 0.87011, validation: 0.84834



KeyboardInterrupt: 

## Model performance evaluation

x_test_tensors = corpus2tensors(x_test)

test_score = inference_score(x_test_tensors, y_test)
print('Test ROC AUC:', test_score)