In [None]:
import os
import csv
import shutil
import zipfile
import pickle
import itertools
import urllib.parse
import urllib.request
from collections import Counter

import numpy as np
import torch
import torch.utils.data
import sklearn.datasets
from sklearn.metrics import accuracy_score

## Introduction

In this tutorial, we will build a simple neural network for sentence classification using word embeddings. The model simply averages the embeddings of the tokens in the sentence and passes the result through several fully connected layers.

## Dataset

We will use the [Stanford Sentiment Treebank](https://nlp.stanford.edu/sentiment/index.html) dataset, converted into a two-way classification problem: given an input sentence, the goal is to determine whether it is positive or negative.

In [None]:
def maybe_download_and_unzip_file(file_url, file_name=None):
    """
    Download a remote file if it does not exist locally yet and, if it is a zip archive, extract it

    The content is first written to a temporary ``<file_name>.part`` file and renamed only after
    the download has completed, so an interrupted download is never mistaken for a finished one.

    :param file_url: Url of the file
    :param file_name:  (Default value = None) The filename to save the content. If None, the last path component of the url is used (query string dropped, percent-escapes decoded)

    """
    if file_name is None:
        # Only the path part of the url names the file: a query string ("?alt=media&token=...")
        # or percent-escapes ("%2F") must not end up in the local file name
        url_path = urllib.parse.unquote(urllib.parse.urlparse(file_url).path)
        file_name = os.path.basename(url_path)

        if not file_name:
            raise ValueError(f'Cannot derive a file name from url: {file_url}')

    if os.path.exists(file_name):
        print(f'Exists: {file_name}')
        return

    print(f'Downloading: {file_name}')

    partial_name = f'{file_name}.part'
    try:
        with urllib.request.urlopen(file_url) as response, open(partial_name, 'wb') as target_file:
            shutil.copyfileobj(response, target_file)

        # Publish the file under its final name only once it is complete
        os.replace(partial_name, file_name)
    finally:
        # Only present here if the download failed part-way
        if os.path.exists(partial_name):
            os.remove(partial_name)

    print(f'Downloaded: {file_name}')

    if os.path.splitext(file_name)[1] == '.zip':
        print(f'Extracting: {file_name}')
        with zipfile.ZipFile(file_name, 'r') as zip_file:
            zip_file.extractall('.')

In [None]:
# Remote location of the zipped SST-2 data and the local name the archive is saved under.
# NOTE(review): the URL embeds a `token` query parameter — confirm it is meant to be public.
dataset_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
dataset_filename = 'SST-2.zip'

In [None]:
# Train / dev TSV files, relative to the working directory.
# Presumably these paths appear once SST-2.zip has been extracted — TODO confirm archive layout.
train_filename = 'SST-2/train.tsv'
val_filename = 'SST-2/dev.tsv'

In [None]:
# Download and extract the dataset archive; only prints a message if the archive file already exists
maybe_download_and_unzip_file(dataset_url, dataset_filename)

### Vocabulary

Before the data gets loaded into the model, it has to be converted from raw text to a numeric representation. One way to achieve this is to introduce a token-to-id mapping. More specifically, we will use a vocabulary class that maintains the mapping between tokens and their IDs, lets us add tokens flexibly, and keeps token counts so that the vocabulary can be pruned by frequency (pruning itself is not implemented in this tutorial). When the input dataset is very large, vocabulary pruning is widely used in practice for more efficient memory usage.

In [None]:
class Vocab(object):
    """ Keeps a token <-> id mapping together with per-token occurrence counts """
    END_TOKEN = '<end>'
    START_TOKEN = '<start>'
    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'

    def __init__(self, special_tokens=None):
        """
        Create an empty vocabulary, optionally seeded with special tokens

        :param special_tokens:  (Default value = None) Tokens to register first, in the given order, so they receive the lowest ids (a PAD token placed first gets id 0)

        """
        super().__init__()

        self.token_counts = Counter()
        self.token2id = {}
        self.id2token = {}
        self.special_tokens = special_tokens

        if special_tokens is not None:
            self.add_document(special_tokens)

    def add_document(self, document, rebuild=True):
        """
        Count every token of a document and assign ids to the tokens seen for the first time

        :param document: A list of tokens in the document
        :param rebuild:  (Default value = True) Whether to refresh the id -> token mapping afterwards

        """
        for tok in document:
            self.token_counts[tok] += 1
            # setdefault leaves known tokens untouched; a new token gets the next free id
            self.token2id.setdefault(tok, len(self.token2id))

        if not rebuild:
            return

        self._rebuild_id2token()

    def add_documents(self, documents):
        """
        Add several documents at once, refreshing the id -> token mapping a single time at the end

        :param documents: A list of documents, where each document is a list of tokens

        """
        for document in documents:
            self.add_document(document, rebuild=False)

        self._rebuild_id2token()

    def _rebuild_id2token(self):
        """ Recreate the id -> token mapping as the inverse of the token -> id mapping """
        self.id2token = dict(zip(self.token2id.values(), self.token2id.keys()))

    def get(self, item, default=None):
        """
        Look up the id of a token, falling back to a default for unknown tokens

        :param item: A token
        :param default:  (Default value = None) Value returned when the token is not in the vocabulary

        """
        return self.token2id.get(item, default)

    def __getitem__(self, item):
        """
        Look up the id of a token; an unknown token raises KeyError

        :param item: A token

        """
        return self.token2id[item]

    def __contains__(self, item):
        """
        Tell whether a token has an id in the vocabulary

        :param item: A token

        """
        return item in self.token2id

    def __len__(self):
        """ Number of distinct tokens in the vocabulary """
        return len(self.token2id)

    def __str__(self):
        """ Short human-readable summary of the vocabulary """
        return '{} tokens'.format(len(self))

Now, let's create a dataset class. Notice how the vocabulary can be shared between the train and the validation datasets.

In [None]:
class SSTDataset(torch.utils.data.Dataset):
    """ Stanford Sentiment Treebank sentences and labels, served as fixed-length arrays of token ids """
    def __init__(self, filename, vocab=None, max_len=None):
        """
        Initialize the Stanford Sentiment Treebank Dataset

        :param filename: Path to the dataset from the GLUE benchmark
        :param vocab:  (Default value = None) Vocabulary to use, will be created from the sentences of the file if None
        :param max_len:  (Default value = None) Maximum length of the sentences, in tokens. The longer sentences will be cut. If None, the length of the longest sentence of the file is used

        """
        super().__init__()

        data = self._load_file(filename)

        # Sentences are tokenized by splitting on single spaces
        self.sentences = [sent.split(' ') for sent, label in data]
        self.labels = [int(label) for sent, label in data]

        print(f'Sentences: {len(self.sentences)}')
        print(f'Labels: {len(self.labels)}')

        if vocab is None:
            vocab = Vocab(special_tokens=[Vocab.PAD_TOKEN, Vocab.UNK_TOKEN])
            vocab.add_documents(self.sentences)
            print(f'Creating vocab: {vocab}')

        if max_len is None:
            # Every sentence is a list of tokens, so len() is its token count.
            # default=0 keeps an empty file from failing on max() of nothing
            max_len = max((len(sent) for sent in self.sentences), default=0)
            print(f'Calculating max len: {max_len}')

        self.max_len = max_len
        self.vocab = vocab

    def _load_file(self, filename):
        """
        Read the dataset from a tab-separated file with `sentence` and `label` columns

        :param filename: Path to the dataset
        :return: A list of (stripped sentence, label string) tuples

        """
        # newline='' is what the csv module expects; the explicit encoding avoids
        # depending on the platform default.
        # NOTE(review): the default csv quoting is kept; if the file is not CSV-quoted,
        # quoting=csv.QUOTE_NONE may be the safer choice — confirm against the data
        with open(filename, 'r', encoding='utf-8', newline='') as csv_file:
            reader = csv.DictReader(csv_file, delimiter='\t')
            data = [(r['sentence'].strip(), r['label']) for r in reader]

            return data

    def _pad_sentnece(self, sent):
        """
        Cut the sentence if needed and pad it to the maximum len

        :param sent: The input sentence, a list of tokens
        :return: A new list of exactly `max_len` tokens

        """
        sent = sent[:self.max_len]

        nb_pad = self.max_len - len(sent)
        sent = sent + [Vocab.PAD_TOKEN,] * nb_pad

        return sent

    def __getitem__(self, index):
        """
        Return a processed and ready to be batched item from the dataset by its index

        :param index: The index of the sentence in the dataset
        :return: A tuple (int64 numpy array of `max_len` token ids, integer label)

        """
        sent = self.sentences[index]
        label = self.labels[index]

        sent = self._pad_sentnece(sent)
        # Tokens missing from the vocabulary are mapped to the UNK id
        sent = [self.vocab[t] if t in self.vocab else self.vocab[Vocab.UNK_TOKEN] for t in sent]
        # np.long is not available in every NumPy release; int64 is the explicit equivalent
        sent = np.array(sent, dtype=np.int64)

        return sent, label

    def __len__(self):
        """ Return the length of the dataset """
        return len(self.labels)

In [None]:
# No vocab / max_len given, so both are derived from the training file
dataset_train = SSTDataset(train_filename)

In [None]:
# Reuse the training vocabulary and max_len so token ids and array lengths match the training set
dataset_val = SSTDataset(val_filename, vocab=dataset_train.vocab, max_len = dataset_train.max_len)

In [None]:
# Inspect the first example: a (token-id array, label) pair
dataset_train[0]

## Word embeddings

We'll use the [fastText](https://fasttext.cc/) embeddings, trained on Common Crawl. We've converted them into a dictionary and pickled them using the standard `pickle` module.

In [None]:
# Remote location of the pickled embeddings and the local file name they are saved under
embeddings_url = 'https://mednli.blob.core.windows.net/shared/word_embeddings/crawl-300d-2M.pickled'
embeddings_filename = 'crawl-300d-2M.pickled'

In [None]:
# Download the embeddings file; only prints a message if the file already exists
maybe_download_and_unzip_file(embeddings_url, embeddings_filename)

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing. This file is
# fetched from a remote URL, so only run this cell if that source is trusted.
with open(embeddings_filename, 'rb') as pkl_file:
    word_embeddings = pickle.load(pkl_file)

In [None]:
# Peek at one vector through an iterator rather than copying every key into a list first
print(f'Word embeddings: {len(word_embeddings)} tokens, shape {next(iter(word_embeddings.values())).shape}')

In [None]:
# Take only the first ten keys instead of listing every key and then slicing
list(itertools.islice(word_embeddings, 10))

In [None]:
# Shape of a single word vector
word_embeddings['cat'].shape

In [None]:
# First 20 components of the vector stored for 'cat'
word_embeddings['cat'][:20]

### Embedding matrix

Since we do not need all the embeddings, let's create a matrix, where each row will correspond to a token in the vocabulary and will contain the corresponding embedding.

In [None]:
def create_embeddings_matrix(word_embeddings, vocab):
    """
    Given word embeddings dictionary and the vocabulary, construct the embeddings matrix, where each row corresponds to a token and contains the embedding of this token

    The PAD token gets a zero vector. The UNK token and every vocabulary token without a
    pretrained embedding get a random vector drawn uniformly from [-0.3, 0.3).

    :param word_embeddings: Word embeddings dictionary, token -> numpy array
    :param vocab: Vocabulary
    :return: A float32 numpy array of shape (len(vocab), embedding size), row i holding the vector of the token with id i
    :raises ValueError: If `word_embeddings` is empty, since the embedding size cannot be inferred

    """
    if not word_embeddings:
        raise ValueError('word_embeddings is empty, cannot infer the embedding size')

    # Peek at a single vector instead of copying all the keys into a list
    embedding_size = next(iter(word_embeddings.values())).shape[0]

    W_emb = np.zeros((len(vocab), embedding_size), dtype=np.float32)

    special_tokens = {
        Vocab.UNK_TOKEN: np.random.uniform(-0.3, 0.3, (embedding_size,)),
        Vocab.PAD_TOKEN: np.zeros((embedding_size,)),
    }

    # Number of regular tokens that had no pretrained embedding
    nb_unk = 0
    for i, t in vocab.id2token.items():
        if t in special_tokens:
            W_emb[i] = special_tokens[t]
        elif t in word_embeddings:
            W_emb[i] = word_embeddings[t]
        else:
            W_emb[i] = np.random.uniform(-0.3, 0.3, embedding_size)
            nb_unk += 1

    print(f'Nb unk: {nb_unk}')

    return W_emb

In [None]:
# Number of tokens in the training vocabulary (the special tokens are counted too)
len(dataset_train.vocab)

In [None]:
# One row per token of the training vocabulary, indexed by token id
W_emb = create_embeddings_matrix(word_embeddings, dataset_train.vocab)

## Model

Finally, let's declare a simple model. Notice how we put fully connected layers inside a `torch.nn.Sequential` container.

In [None]:
class BOWModel(torch.nn.Module):
    """ Bag-of-words classifier: the mean of the non-padding token embeddings followed by a feedforward network """
    def __init__(self, vocab_size, embedding_size, hidden_size, dropout, trainable_embeddings, nb_classes, pad_index, W_emb=None):
        """
        Initialize a simple feedforward Bag-of-words model with several hidden layers

        :param vocab_size: Vocabulary size
        :param embedding_size: Dimension of the embeddings
        :param hidden_size: The size of the hidden layers
        :param dropout: Probability of the dropout
        :param trainable_embeddings: Whether the embedding layer will be trainable or frozen
        :param nb_classes: Number of the classes to classify the input to
        :param pad_index: Index of the PAD token
        :param W_emb:  (Default value = None) Initial values of the embedding layer, a numpy array

        """
        super().__init__()

        self.pad_index = pad_index

        self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=pad_index)
        if W_emb is not None:
            # Initialise in place without recording the copy in the autograd graph
            with torch.no_grad():
                self.embedding.weight.copy_(torch.from_numpy(W_emb))
        if not trainable_embeddings:
            self.embedding.weight.requires_grad = False

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(embedding_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, nb_classes),
        )

    def forward(self, inputs):
        """
        Perform the forward pass of the model

        :param inputs: Input sentences, a 2-D integer tensor of token ids (batch, sentence length) padded with `pad_index`
        :return: Unnormalized class scores (logits), one row per sentence

        """
        non_pad = inputs != self.pad_index

        embedded = self.embedding(inputs)
        # Copying W_emb may have put a non-zero vector in the PAD row, so drop padding explicitly
        embedded = embedded * non_pad.unsqueeze(-1).to(embedded.dtype)

        # A sentence made only of padding would otherwise divide by zero and produce NaNs
        inputs_lengths = non_pad.sum(dim=1).clamp(min=1)

        z = torch.sum(embedded, dim=1) / inputs_lengths.unsqueeze(-1).to(embedded.dtype)

        logits = self.classifier(z)

        return logits

In [None]:
# Width of the hidden layers of the classifier
hidden_size = 128
# Dropout probability applied after each hidden layer
dropout = 0.3
# Keep the pretrained embeddings frozen during training
trainable_embeddings = False

In [None]:
# Sizes are taken from the data: one embedding row per vocabulary token,
# one output per distinct label found in the training set
model = BOWModel(
    vocab_size=len(dataset_train.vocab), 
    embedding_size = W_emb.shape[1], 
    hidden_size=hidden_size, 
    dropout=dropout, 
    trainable_embeddings=trainable_embeddings, 
    nb_classes=len(set(dataset_train.labels)), 
    pad_index=dataset_train.vocab[Vocab.PAD_TOKEN], 
    W_emb=W_emb
)

In [None]:
# Use the GPU when one is available, otherwise keep the model on the CPU
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Display the structure of the model
model

## Training

In [None]:
# Number of sentences per batch
batch_size=256
# Number of passes over the training set
nb_epochs = 5
# Step size and weight decay passed to the optimizer
learning_rate=0.001
weight_decay = 0.00001

In [None]:
# Only the training data is shuffled; the validation order stays fixed
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

In [None]:
# Cross-entropy loss on the logits, optimized with Adam using the learning rate and weight decay set above
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

Run the training!

In [None]:
# Run on whatever device the model lives on instead of assuming a GPU is present
device = next(model.parameters()).device

for i in range(nb_epochs):
    epoch_losses_train = []
    epoch_losses_val = []
    epoch_predictions = []
    epoch_targets = []

    # Training pass: dropout active, parameters updated after every batch
    model.train()
    for inputs, targets in dataloader_train:
        optimizer.zero_grad()

        inputs = inputs.to(device)
        targets = targets.to(device)

        logits = model(inputs)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        epoch_losses_train.append(loss.item())

    # calc accuracy on the dev set: dropout disabled, no gradients tracked
    model.eval()
    with torch.no_grad():
        for inputs, targets in dataloader_val:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            loss = criterion(logits, targets)
            pred = torch.argmax(logits, dim=1)

            epoch_losses_val.append(loss.item())
            epoch_predictions.append(pred.cpu().numpy())
            epoch_targets.append(targets.cpu().numpy())

    epoch_predictions = np.concatenate(epoch_predictions, axis=0)
    epoch_targets = np.concatenate(epoch_targets, axis=0)
    epoch_accuracy = accuracy_score(epoch_targets, epoch_predictions)
    # Losses are averaged over batches
    epoch_loss_train = np.mean(epoch_losses_train)
    epoch_loss_val = np.mean(epoch_losses_val)

    print(f'Epoch: {i+1}, train loss: {epoch_loss_train:.3f}, val loss: {epoch_loss_val:.3f}, accuracy: {epoch_accuracy:.3f}')