In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [2]:
import spacy # you may need to install spacy "conda install -c conda-forge spacy"
import string
import re
from collections import Counter

In [3]:
import os
from io import open

## Sentiment analysis data

In [4]:
def unpack_dataset():
    """Download the IMDB review archive and extract it under ``data/``.

    Uses IPython ``!`` shell escapes (mkdir, wget, tar), so it only works when
    run from a notebook / IPython session with those tools available.
    """
    # Create the target directory (no error if it already exists).
    ! mkdir -p data/aclImdb
    # Fetch the tarball into the current working directory.
    ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    # Extract into data/ (presumably the archive's top-level folder is aclImdb/ -- confirm).
    ! tar -zxvf aclImdb_v1.tar.gz -C data
# Uncomment on the first run to fetch the data.
#unpack_dataset()

In [5]:
from pathlib import Path
# Root of the extracted dataset; later cells build every file path from it.
PATH = Path("data/aclImdb/")
# List the top-level entries to confirm the data is in place.
list(PATH.iterdir())

[PosixPath('data/aclImdb/imdbEr.txt'),
 PosixPath('data/aclImdb/imdb.vocab'),
 PosixPath('data/aclImdb/train'),
 PosixPath('data/aclImdb/test'),
 PosixPath('data/aclImdb/README')]

In [6]:
# Show the raw text of one review from the positive training folder.
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

### Tokenization
We use spaCy to tokenize each review into a list of word and punctuation tokens.

In [7]:
# Run this once, the first time, to download the spaCy English model
#!python3 -m spacy download en

In [8]:
# Matches HTML line-break tags such as <br>, <br/>, <BR /> (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    return re_br.sub("\n", x)

# Loaded once at module level; only its tokenizer component is used below.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize raw text `x` into a list of token strings.

    HTML line-break tags are first replaced by newlines via `sub_br`.
    """
    cleaned = sub_br(x)
    return [token.text for token in my_tok.tokenizer(cleaned)]

In [9]:
# Sanity check: first 10 tokens of one review.
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index
Here we compute a vocabulary and assign an index to each word in it.

In [10]:
# Collect every training review file; positives are listed first, then negatives.
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
# Preview a few paths.
all_files[:5]

[PosixPath('data/aclImdb/train/pos/10544_8.txt'),
 PosixPath('data/aclImdb/train/pos/9530_9.txt'),
 PosixPath('data/aclImdb/train/pos/9901_8.txt'),
 PosixPath('data/aclImdb/train/pos/11951_8.txt'),
 PosixPath('data/aclImdb/train/pos/7441_7.txt')]

In [11]:
# takes some time
def count_word_freq(paths):
    """Tokenize the text of every file in `paths` and count token occurrences.

    :param paths: iterable of path objects exposing ``read_text()``
    :return: a ``Counter`` mapping token string -> number of occurrences
    """
    return Counter(
        token
        for path in paths
        for token in spacy_tok(path.read_text())
    )

In [12]:
def delete_rare_words(counts, min_freq=4):
    """Remove, in place, every word whose count is less than or equal to `min_freq`.

    :param counts: mapping of word -> count (mutated by this call)
    :param min_freq: words must occur strictly more often than this to be kept
    :return: the same `counts` object, after pruning
    """
    # Collect the keys first so the mapping is not resized while being iterated.
    rare_words = [word for word, freq in counts.items() if freq <= min_freq]
    for word in rare_words:
        del counts[word]
    return counts

In [13]:
def compute_vocab2_index(counts):
    """Build the word <-> index lookup tables for a vocabulary.

    Index 0 is reserved for the empty string and index 1 for "UNK"; the words
    of `counts` follow in iteration order starting at index 2.

    :param counts: iterable of words (e.g. a Counter keyed by word)
    :return: ``(vocab2index, words)`` where ``vocab2index`` maps word -> index
             and ``words`` is the list mapping index -> word
    """
    words = ["", "UNK"]
    words.extend(counts)
    vocab2index = {"": 0, "UNK": 1}
    for index in range(2, len(words)):
        vocab2index[words[index]] = index
    return vocab2index, words

In [14]:
# Tokenize every training review and tally token frequencies (slow).
counts = count_word_freq(all_files)

In [15]:
# Prune rare tokens (delete_rare_words mutates `counts` in place), then index the remainder.
counts = delete_rare_words(counts)
vocab2index, words_list = compute_vocab2_index(counts)

In [16]:
# Vocabulary size, including the two reserved entries "" and "UNK".
len(words_list)

33909

### Text representation
Each review is represented as a fixed-length array of vocabulary indices.

In [17]:
# spacy_tok takes a while. Run it just once
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    """Encode the text file at `path` as a fixed-length array of vocabulary indices.

    Tokens missing from `vocab2index` map to the "UNK" index. At most the first
    `N` tokens are kept; unused slots stay 0.

    :param path: path object exposing ``read_text()``
    :param vocab2index: mapping of token -> integer index, must contain "UNK"
    :param N: length of the returned array
    :param padding_start: if true, tokens fill the FRONT of the array and the
        zeros sit at the end; if false, tokens are right-aligned and the zeros
        sit at the front.
        NOTE(review): the flag name suggests the opposite -- confirm the intent
        before relying on it.
    :return: ``(encoded, length)`` -- an int32 array of size `N` and the number
        of tokens actually stored in it
    """
    tokens = spacy_tok(path.read_text())
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    n_kept = min(N, len(token_ids))

    encoded = np.zeros(N, dtype=np.int32)
    if padding_start:
        encoded[:n_kept] = token_ids[:n_kept]
    else:
        encoded[N - n_kept:] = token_ids[:n_kept]
    return encoded, n_kept

In [18]:
# Example review for trying out encode_sentence; uncomment the call to inspect its output.
path = PATH/"train/neg/211_4.txt"
#encode_sentence(path, vocab2index, N=400, padding_start=True)

## Dataset and dataloader

In [19]:
class ImdbDataset(Dataset):
    """Dataset of encoded reviews read from ``PATH/<train>/pos`` and ``PATH/<train>/neg``.

    Labels are 1 for files under ``pos`` and 0 for files under ``neg``.
    Relies on the module-level ``vocab2index`` and ``encode_sentence``.
    """

    def __init__(self, PATH, train="train", seq_length=400, padding_start=True):
        """
        :param PATH: dataset root directory (path object)
        :param train: name of the split sub-directory, e.g. "train" or "test"
        :param seq_length: fixed length every review is encoded to
        :param padding_start: forwarded unchanged to ``encode_sentence``
        """
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files

        # Labels follow the file order: positives (1) first, then negatives (0).
        pos_labels = np.ones(len(self.pos_files), dtype=int)
        neg_labels = np.zeros(len(self.neg_files), dtype=int)
        self.y = np.concatenate((pos_labels, neg_labels), axis=0)

        # Encode every review once up front so __getitem__ does no tokenization.
        self.X = []
        for path in self.files:
            self.X.append(encode_sentence(path, vocab2index, seq_length, padding_start))

    def __len__(self):
        """Number of reviews in the split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(encoded_review, label)``; the stored token count is dropped."""
        encoded, _length = self.X[idx]
        return encoded, self.y[idx]

In [20]:
# Building a dataset encodes every file of the split inside ImdbDataset.__init__.
train_ds = ImdbDataset(PATH)
# The "test" split is used as the validation set.
valid_ds = ImdbDataset(PATH, "test")

In [88]:
# NOTE(review): this cell's execution count (88) is out of sequence with its
# neighbours; on a top-to-bottom run these loaders are replaced by the
# batch_size=50 ones defined further down, before any training call.
batch_size = 5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

## Model

In [21]:
# Model hyperparameters. NOTE(review): these names are reassigned in the
# Experiments section before the first model is built on a top-to-bottom run.
num_tokens = len(words_list)
num_classes = 1
seq_length = 400 # this is N
emb_size = 200
heads = 4
num_layers = 1

In [70]:
class CTransformer(nn.Module):
    """
    Transformer encoder for classifying token sequences.

    Token embeddings and learned positional embeddings are summed, passed
    through a stack of ``TransformerEncoderLayer`` blocks, max-pooled over the
    sequence dimension and projected to ``num_classes`` logits.
    """

    def __init__(self, emb_size, heads, num_layers, seq_length, num_tokens, num_classes,
                 dim_feedforward=200):
        """
        :param emb_size: Embedding dimension (must be divisible by ``heads``)
        :param heads: nr. of attention heads
        :param num_layers: Number of transformer encoder layers
        :param seq_length: Expected maximum sequence length
        :param num_tokens: Number of tokens in the vocabulary
        :param num_classes: Number of classes.
        :param dim_feedforward: Hidden size of each encoder layer's feed-forward network
        """
        super().__init__()

        self.num_tokens = num_tokens

        self.token_embedding = nn.Embedding(num_tokens, emb_size)
        self.pos_embedding = nn.Embedding(seq_length, emb_size)

        # Honour the caller's dim_feedforward (it used to be hard-coded to 200 here).
        encoder_layers = TransformerEncoderLayer(emb_size, heads, dim_feedforward=dim_feedforward)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers)
        self.linear = nn.Linear(emb_size, num_classes)

        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """
        :param x: A batch by sequence length integer tensor of token indices.
                  x has shape (batch_size x seq_length ); seq_length must not
                  exceed the ``seq_length`` given at construction.
        :return: predicted logit vectors, shape (batch_size x num_classes).
        """
        emb = self.token_embedding(x)
        _, seq_length, _ = emb.size()

        # Create the position indices on the input's device so the model runs
        # on CPU or any GPU, not only on the default CUDA device.
        positions = torch.arange(seq_length, device=x.device)
        pos = self.pos_embedding(positions)[None]
        x = emb + pos
        x = self.dropout(x)

        # wants seq_length x batch_size x emb_size
        x = torch.transpose(x, 0, 1)
        # NOTE(review): no padding mask is passed, so padded positions take
        # part in attention and in the pooling below.
        x = self.transformer_encoder(x)
        x, _ = x.max(dim=0) # pool over the time dimension
        x = self.linear(x)

        return x

## Training functions

In [31]:
def train_epocs(model, optimizer, train_dl, val_dl, epochs=10):
    """Train `model` for `epochs` passes over `train_dl`, printing metrics after each.

    Uses binary cross-entropy on the model's logits. After every epoch the
    average training loss is printed together with the validation loss and
    accuracy returned by ``val_metrics``.

    :param model: module mapping a (batch, seq) long tensor to (batch, 1) logits
    :param optimizer: optimizer already bound to the model's parameters
    :param train_dl: iterable of ``(x, y)`` training batches
    :param val_dl: iterable of ``(x, y)`` validation batches
    :param epochs: number of passes over `train_dl`
    """
    # Run on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for _ in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x)
            optimizer.zero_grad()
            # Targets get a trailing dimension to match the (batch, 1) logits.
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per example.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [25]:
def val_metrics(model, valid_dl):
    """Compute the average loss and accuracy of `model` over `valid_dl`.

    The model is switched to eval mode (and left in it). Predictions are
    thresholded at logit 0.

    :param model: module mapping a (batch, seq) long tensor to (batch, 1) logits
    :param valid_dl: iterable of ``(x, y)`` batches
    :return: ``(mean_loss, accuracy)`` -- a float and a 0-dim tensor
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    correct = 0
    total = 0
    sum_loss = 0.0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y in valid_dl:
            x = x.long().to(device)
            y = y.float().unsqueeze(1).to(device)
            y_hat = model(x)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # A logit above 0 corresponds to a predicted probability above 0.5.
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            # Weight by batch size so the final average is per example.
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [26]:
def update_optimizer(optimizer, lr):
    """Set the learning rate of every parameter group of `optimizer` to `lr`, in place."""
    for group in optimizer.param_groups:
        group["lr"] = lr

In [71]:
# Rebuild the loaders with the batch size used for the experiments below.
batch_size = 50
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

## Experiments

In [72]:
# Experiment 1 configuration: 100-dim embeddings, 2 encoder layers.
num_tokens = len(words_list)
num_classes = 1
seq_length = 400 # this is N
emb_size = 100
heads = 4
num_layers = 2

In [73]:
# Fresh model on the GPU and an Adam optimizer at lr=1e-3.
model = CTransformer(emb_size, heads, num_layers, seq_length, num_tokens, num_classes).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# NOTE(review): the saved output shows 6 of the 10 requested epochs and the cell
# has no execution count -- presumably the run was interrupted; re-run to confirm.
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

train loss 0.621 val loss 0.506 and val accuracy 0.751
train loss 0.449 val loss 0.427 and val accuracy 0.802
train loss 0.350 val loss 0.357 and val accuracy 0.847
train loss 0.285 val loss 0.371 and val accuracy 0.848
train loss 0.238 val loss 0.337 and val accuracy 0.860
train loss 0.200 val loss 0.345 and val accuracy 0.866


In [None]:
# Experiment 2 configuration: same as above but with 200-dim embeddings.
num_tokens = len(words_list)
num_classes = 1
seq_length = 400 # this is N
emb_size = 200
heads = 4
num_layers = 2

In [None]:
# Fresh model and optimizer for the 200-dim configuration.
model = CTransformer(emb_size, heads, num_layers, seq_length, num_tokens, num_classes).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# NOTE(review): no output is saved for this run.
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

In [54]:
# Warm-up experiment: train a fresh model for 3 epochs at a low learning rate
# (1e-4) before raising it in the next cell.
model = CTransformer(emb_size, heads, num_layers, seq_length, num_tokens, num_classes).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=3)

train loss 0.681 val loss 0.651 and val accuracy 0.631
train loss 0.600 val loss 0.546 and val accuracy 0.723
train loss 0.506 val loss 0.488 and val accuracy 0.762


In [55]:
# Raise the learning rate to 1e-3 on the same optimizer and keep training the warmed-up model.
update_optimizer(optimizer, lr=0.001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

train loss 0.431 val loss 0.365 and val accuracy 0.840
train loss 0.295 val loss 0.339 and val accuracy 0.858
train loss 0.208 val loss 0.459 and val accuracy 0.827
train loss 0.151 val loss 0.426 and val accuracy 0.843
train loss 0.116 val loss 0.461 and val accuracy 0.850
train loss 0.090 val loss 0.487 and val accuracy 0.855
train loss 0.078 val loss 0.515 and val accuracy 0.851
train loss 0.089 val loss 0.457 and val accuracy 0.849
train loss 0.099 val loss 0.561 and val accuracy 0.833
train loss 0.080 val loss 0.518 and val accuracy 0.853


## Smaller embedding size

In [65]:
# Smaller model: 48-dim embeddings (12 dimensions per attention head).
num_tokens = len(words_list)
num_classes = 1
seq_length = 400 # this is N
emb_size = 48
heads = 4
num_layers = 2

In [66]:
# Rebuild the loaders with a larger batch size for this experiment.
batch_size = 100
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [67]:
# Fresh small model; start at the low warm-up learning rate (1e-4).
model = CTransformer(emb_size, heads, num_layers, seq_length, num_tokens, num_classes).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [68]:
# Warm-up phase: 3 epochs at the low learning rate.
train_epocs(model, optimizer, train_dl, valid_dl, epochs=3)

train loss 0.696 val loss 0.691 and val accuracy 0.526
train loss 0.689 val loss 0.688 and val accuracy 0.544
train loss 0.684 val loss 0.680 and val accuracy 0.575


In [69]:
# Main phase: raise the learning rate to 1e-3 and train the same model for 10 more epochs.
update_optimizer(optimizer, lr=0.001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

train loss 0.650 val loss 0.593 and val accuracy 0.683
train loss 0.521 val loss 0.459 and val accuracy 0.781
train loss 0.404 val loss 0.402 and val accuracy 0.817
train loss 0.328 val loss 0.432 and val accuracy 0.813
train loss 0.276 val loss 0.358 and val accuracy 0.847
train loss 0.240 val loss 0.377 and val accuracy 0.841
train loss 0.211 val loss 0.386 and val accuracy 0.846
train loss 0.177 val loss 0.387 and val accuracy 0.851
train loss 0.155 val loss 0.394 and val accuracy 0.856
train loss 0.135 val loss 0.413 and val accuracy 0.855
