# Homework 1

Preprocessing is done with spaCy, which is used for tokenization (textacy, a library built on top of spaCy, is not required by the code below).

In [1]:
# Run this cell only if `more_itertools` is not already installed in the kernel's environment.
!pip install more_itertools



## The next cell loads and tokenizes the data independently of the n-gram size, the maximum vocabulary size and the maximum sequence length, so it needs to be run only once

In [52]:
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
import random
import numpy
import itertools
from operator import itemgetter 
from glob import glob
from tqdm import tqdm_notebook, tqdm
from collections import Counter
import torch.nn as nn
import torch.nn.functional as F
import string
import re
import more_itertools as mit  # not built-in package
_tqdm = tqdm_notebook  # prolly you need jupyter widget for this, change for tqdm for simple tqdm

# get the training data
# glob() returns paths in arbitrary, filesystem-dependent order. Sorting them makes the
# positional train/validation split below reproducible across runs and machines.
TRAIN_FILES_POS = sorted(glob('/home/kulikov/vlgwork/aclImdb/train/pos/*'))
pos_train_texts = []
TRAIN_FILES_NEG = sorted(glob('/home/kulikov/vlgwork/aclImdb/train/neg/*'))
neg_train_texts = []

# get training text in RAM
# The encoding is given explicitly so decoding does not depend on the machine's locale.
for fname in _tqdm(TRAIN_FILES_NEG):
    with open(fname, 'r', encoding='utf-8') as f:
        neg_train_texts.append(f.read())
for fname in _tqdm(TRAIN_FILES_POS):
    with open(fname, 'r', encoding='utf-8') as f:
        pos_train_texts.append(f.read())

print("Positive training samples : {} \nNegaitve training samples : {}".format(len(pos_train_texts), len(neg_train_texts)))

TRAIN_SIZE=10000  # number of reviews per class kept for training; change this if you want

# Split training data on train valid parts now:
# the first TRAIN_SIZE reviews of each class are used for training, the rest for validation.
pos_valid_texts = pos_train_texts[TRAIN_SIZE:]
pos_train_texts = pos_train_texts[:TRAIN_SIZE]
neg_valid_texts = neg_train_texts[TRAIN_SIZE:]
neg_train_texts = neg_train_texts[:TRAIN_SIZE]

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
# String of ASCII punctuation characters used to filter tokens.
punctuations = string.punctuation
# Matches anything that looks like an HTML tag, e.g. "<br />".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return `text` with every HTML-tag-like substring deleted (replaced by nothing)."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped


def lower_case_remove_punc(parsed):
    """Lower-case the tokens of `parsed`, dropping punctuation tokens.

    :param parsed: iterable of objects exposing a `.text` string (e.g. a spaCy Doc)
    :returns: list of lower-cased token strings
    """
    kept = []
    for tok in parsed:
        raw = tok.text
        # `punctuations` is a str, so this is a substring test against it.
        if raw in punctuations:
            continue
        kept.append(raw.lower())
    return kept

def tokenize_dataset(dataset):
    """Tokenize every text in `dataset` with the module-level spaCy pipeline `tokenizer`.

    :param dataset: iterable of raw text strings
    :returns: tuple ``(token_dataset, all_tokens)`` where `token_dataset` holds one list of
        lower-cased, punctuation-filtered tokens per input text and `all_tokens` is the flat
        concatenation of those lists (kept to build a vocabulary later)
    """
    token_dataset = []
    all_tokens = []

    # `n_threads` is deliberately not passed: it is deprecated in spaCy 2.1+ and no longer
    # accepted by Language.pipe in spaCy 3, where it raises a TypeError.
    for sample in _tqdm(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512)):
        tokens = lower_case_remove_punc(sample)
        token_dataset.append(tokens)
        all_tokens.extend(tokens)

    return token_dataset, all_tokens
                       
#clean from html tags
pos_valid_texts = [remove_tags(t) for t in pos_valid_texts]
neg_valid_texts = [remove_tags(t) for t in neg_valid_texts]
pos_train_texts = [remove_tags(t) for t in pos_train_texts]
neg_train_texts = [remove_tags(t) for t in neg_train_texts]

pos_valid_texts_toked, n1 =  tokenize_dataset(pos_valid_texts)
neg_valid_texts_toked, n2 =  tokenize_dataset(neg_valid_texts)
pos_train_texts_toked, n3 =  tokenize_dataset(pos_train_texts)
neg_train_texts_toked, n4 =  tokenize_dataset(neg_train_texts)

# Distinct word types over the training and validation splits.
# set().union(...) avoids materialising one huge concatenated token list first.
voc = list(set().union(n1, n2, n3, n4))
print('Word vocabulary size: {} words'.format(len(voc)))

# Reserved ids: 0 is the padding id (read by batchify), 1 is the unknown-token id.
PAD_IDX = 0
UNK_IDX = 1

# get the test data
# Sorted so the file order does not depend on the filesystem.
TEST_FILES_POS = sorted(glob('/home/kulikov/vlgwork/aclImdb/test/pos/*'))
pos_test_texts = []
TEST_FILES_NEG = sorted(glob('/home/kulikov/vlgwork/aclImdb/test/neg/*'))
neg_test_texts = []

# get test text in RAM
# The encoding is given explicitly so decoding does not depend on the machine's locale.
for fname in _tqdm(TEST_FILES_NEG):
    with open(fname, 'r', encoding='utf-8') as f:
        neg_test_texts.append(f.read())
for fname in _tqdm(TEST_FILES_POS):
    with open(fname, 'r', encoding='utf-8') as f:
        pos_test_texts.append(f.read())

# These are test-set counts; the message previously labelled them as training samples.
print("Positive test samples : {} \nNegative test samples : {}".format(len(pos_test_texts), len(neg_test_texts)))

#clean from html tags
pos_test_texts = [remove_tags(t) for t in pos_test_texts]
neg_test_texts = [remove_tags(t) for t in neg_test_texts]

# n1 / n2 are rebound to the flat test-token lists here; `voc` above was already computed.
pos_test_texts_toked, n1 =  tokenize_dataset(pos_test_texts)
neg_test_texts_toked, n2 =  tokenize_dataset(neg_test_texts)



def find_ngrams(input_list, n):
    """Build the n-grams of every sequence in `input_list`.

    :param input_list: iterable of sliceable sequences (e.g. lists of tokens)
    :param n: n-gram size
    :returns: list with, for each input sequence, the list of its consecutive
        n-tuples; sequences shorter than `n` yield an empty list
    """
    def _ngrams(seq):
        # zip over the sequence shifted by 0..n-1 positions stops at the shortest shift.
        shifted = [seq[start:] for start in range(n)]
        return list(zip(*shifted))

    return [_ngrams(seq) for seq in input_list]

class ImdbDataset(Dataset):
    """Dataset of (token ids, label) pairs converted to tensors once, at construction time."""

    def __init__(self, data_list, max_inp_length=None, device='cpu'):
        """
        data_list is a list of tuples: (x,y) where x is a list of ids and y is a label

        :param max_inp_length: if not None, every id list is truncated to this many entries
        :param device: device on which the tensors are stored
        """
        self.data = data_list
        self.max_len = max_inp_length
        self.data_tensors = []
        # Use the notebook-wide `_tqdm` alias so the progress-bar backend can be switched in one place.
        for (ids, label) in _tqdm(self.data):
            inp = torch.LongTensor(ids[:self.max_len]).to(device)
            tgt = torch.LongTensor([label]).to(device)
            self.data_tensors.append((inp, tgt))

    def __getitem__(self, key):
        """Return ``(input_ids, target, input_length)`` for the sample at `key`."""
        (inp, tgt) = self.data_tensors[key]

        return inp, tgt, len(inp)

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data)

def pad(tensor, length, dim=0, pad=0):
    """Extend `tensor` along one dimension up to `length` entries.

    :param tensor: tensor to pad
    :param length: target size along `dim`
    :param dim: (default 0) dimension to pad
    :param pad: (default 0) fill value for the added entries
    :returns: a new, padded tensor if `tensor` is shorter than `length` along `dim`,
        otherwise `tensor` itself, unchanged
    """
    current = tensor.size(dim)
    if current >= length:
        return tensor

    sizes = tensor.size()
    # Same shape as `tensor`, except that `dim` holds only the missing entries.
    filler_shape = (*sizes[:dim], length - current, *sizes[dim + 1:])
    filler = tensor.new(*filler_shape).fill_(pad)
    return torch.cat([tensor, filler], dim=dim)
    
def batchify(batch):
    """Collate samples of the form (input, target, length) into two stacked tensors.

    Every input is padded with PAD_IDX along dimension 0 up to the largest length
    found in the batch.

    :param batch: list of (input tensor, target tensor, length) tuples
    :returns: tuple (input_batch, target_batch) of tensors stacked along a new dimension 0
    """
    longest_sample = max(batch, key=lambda sample: sample[2])
    maxlen = longest_sample[-1]

    padded_inputs = [pad(sample[0], maxlen, dim=0, pad=PAD_IDX) for sample in batch]
    targets = [sample[1] for sample in batch]

    return torch.stack(padded_inputs, 0), torch.stack(targets, 0)

class BagOfNGrams(nn.Module):
    """Bag-of-n-grams classifier: an EmbeddingBag followed by a feed-forward stack.

    The stack is Linear -> activation -> Dropout, repeated, and ends in a Linear layer
    with a single output unit (one logit per sample).
    """

    def init_layers(self):
        """Xavier-initialise the weight matrices of the linear layers."""
        for l in self.layers:
            # Restrict to nn.Linear: some activations (e.g. nn.PReLU) also expose a `weight`,
            # but it is 1-D and xavier_uniform_ raises on tensors with fewer than 2 dimensions.
            if isinstance(l, nn.Linear):
                torch.nn.init.xavier_uniform_(l.weight)

    def __init__(self, vocab_size, emb_dim=300, hidden_size=512, reduce='sum', nlayers=2, act='ReLU', nclasses=2, dropout=0.1, batch_norm=False):
        """
        :param vocab_size: number of rows of the embedding table
        :param emb_dim: embedding dimension
        :param hidden_size: width of the hidden linear layers
        :param reduce: EmbeddingBag mode used to pool the token embeddings
        :param nlayers: number of linear layers including the output layer; because the first
            hidden layer and the output layer are always created, values below 2 behave like 2
        :param act: name of the activation class looked up in torch.nn
        :param nclasses: stored on the module; the output layer always has a single unit
        :param dropout: dropout probability applied after every activation
        :param batch_norm: if truthy, apply BatchNorm1d to the pooled embedding
        """
        super(BagOfNGrams, self).__init__()
        self.emb_dim = emb_dim
        self.reduce = reduce
        self.nlayers = nlayers
        self.hidden_size = hidden_size
        self.nclasses = nclasses
        self.act = getattr(nn, act)
        self.embedding = nn.EmbeddingBag(num_embeddings=vocab_size, embedding_dim=emb_dim, mode=reduce)
        # The attribute only exists when batch norm is enabled; forward() checks for its presence.
        if batch_norm:
            self.batch_norm = nn.BatchNorm1d(self.emb_dim)
        self.layers = nn.ModuleList([nn.Linear(self.emb_dim, self.hidden_size)])
        self.layers.append(self.act())
        self.layers.append(nn.Dropout(p=dropout))
        for i in range(self.nlayers-2):
            self.layers.append(nn.Linear(self.hidden_size, self.hidden_size))
            self.layers.append(self.act())
            self.layers.append(nn.Dropout(p=dropout))
        self.layers.append(nn.Linear(self.hidden_size, 1))
        self.init_layers()

    def forward(self, x):
        """Pool the embeddings of `x`, optionally batch-normalise, then apply the layer stack.

        :param x: tensor of token ids accepted by nn.EmbeddingBag
        :returns: output of the final linear layer (one value per sample)
        """
        postemb = self.embedding(x)
        if hasattr(self, 'batch_norm'):
            x = self.batch_norm(postemb)
        else:
            x = postemb
        for l in self.layers:
            x = l(x)

        return x

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))

Positive training samples : 12500 
Negaitve training samples : 12500


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Word vocabulary size: 92929 words


HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))

Positive training samples : 12500 
Negaitve training samples : 12500


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [61]:
import itertools

# Hyper-parameter grid: every key maps to the list of values to try for it.
OPTS = dict(
    lr=[0.001, 0.1],
    batch_size=[512],
    max_voc_size=[1000, 100000],
    context_length=[1, 2, 3, 4],
    max_input_length=[None, 300],
    embedding_size=[30, 300],
    num_epochs=[20],
    optimizer=['Adam', 'SGD'],
)

# One list of single-entry dicts per hyper-parameter ...
indp = []
for key, values in OPTS.items():
    indp.append([{key: value} for value in values])

# ... whose cartesian product enumerates every combination as a tuple of single-entry dicts.
product_options = list(itertools.product(*indp))

In [62]:
# Merge each tuple of single-entry dicts into one options dict per grid point.
prod_dicts = []
for option_tuple in product_options:
    merged = {}
    for single_option in option_tuple:
        merged.update(single_option)
    prod_dicts.append(merged)

In [84]:
def do_train_val(opts):
    """Train and validate one bag-of-n-grams classifier for a single hyper-parameter setting.

    :param opts: dict with the keys 'lr', 'batch_size', 'max_voc_size', 'context_length',
        'max_input_length', 'embedding_size', 'num_epochs' and 'optimizer' ('Adam' or 'SGD')
    :returns: list ``[opts, test_dataset, test_loader, val_accs, model]`` where `val_accs`
        holds one ``(loss of the last training batch, validation accuracy in percent)``
        tuple per epoch
    :raises ValueError: if ``opts['optimizer']`` is neither 'Adam' nor 'SGD'
    """
    ngram = opts['context_length']  # n-gram size used for every split
    # Use the GPU when there is one, otherwise fall back to the CPU instead of crashing.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_data = {'pos': find_ngrams(pos_train_texts_toked, ngram),
                  'neg': find_ngrams(neg_train_texts_toked, ngram)}
    valid_data = {'pos': find_ngrams(pos_valid_texts_toked, ngram),
                  'neg': find_ngrams(neg_valid_texts_toked, ngram)}
    test_data = {'pos': find_ngrams(pos_test_texts_toked, ngram),
                 'neg': find_ngrams(neg_test_texts_toked, ngram)}

    # --- vocabulary -----------------------------------------------------------------------
    # Index PAD_IDX (0) is reserved for padding and UNK_IDX (1) for unknown n-grams; both are
    # the module-level constants, so the ids agree with the padding value used by batchify.
    # NOTE(review): the counts cover the training AND validation n-grams, as before.
    counted_tokens = Counter(mit.flatten(itertools.chain(
        train_data['pos'], train_data['neg'], valid_data['pos'], valid_data['neg'])))
    # Taking the keys directly also works when no n-gram was found at all
    # (unpacking zip(*...) would raise on an empty list).
    vocab = [token for token, _ in counted_tokens.most_common(opts['max_voc_size'])]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX

    def _text2id(doc):
        """Map the n-grams of one document to ids; out-of-vocabulary n-grams become UNK_IDX."""
        return [token2id.get(t, UNK_IDX) for t in doc]

    def _make_loader(data, shuffle):
        """Build (dataset, loader) for one split; positive reviews get label 0, negative 1."""
        examples = [(_text2id(d), 0) for d in data['pos']]
        examples += [(_text2id(d), 1) for d in data['neg']]
        dataset = ImdbDataset(examples, max_inp_length=opts['max_input_length'], device=device)
        loader = DataLoader(dataset, batch_size=opts['batch_size'], collate_fn=batchify, shuffle=shuffle)
        return dataset, loader

    train_dataset, train_loader = _make_loader(train_data, shuffle=True)
    # Accuracy does not depend on the sample order, so evaluation loaders are not shuffled.
    valid_dataset, valid_loader = _make_loader(valid_data, shuffle=False)
    test_dataset, test_loader = _make_loader(test_data, shuffle=False)

    model = BagOfNGrams(len(id2token), emb_dim=opts['embedding_size'], hidden_size=2048, act='Tanh', nlayers=1, reduce='mean', dropout=0.0, batch_norm=False)
    model.to(device)

    learning_rate = opts['lr']
    num_epochs = opts['num_epochs']  # number epoch to train

    # Criterion and Optimizer
    # The default reduction averages over the batch; the explicit 'elementwise_mean'
    # spelling is deprecated in newer PyTorch releases.
    criterion = torch.nn.BCEWithLogitsLoss()
    if opts['optimizer'] == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.99, nesterov=True)
    elif opts['optimizer'] == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), amsgrad=True, lr=learning_rate)
    else:
        raise ValueError("Unknown optimizer: {!r} (expected 'Adam' or 'SGD')".format(opts['optimizer']))

    def _accuracy(loader):
        """Return the accuracy (in percent) of `model` on `loader`."""
        correct = 0
        total = 0
        model.eval()
        # No gradients are needed for evaluation; this avoids building autograd graphs.
        with torch.no_grad():
            for data, labels in loader:
                outputs = torch.sigmoid(model(data))
                predicted = (outputs > 0.5).long()
                total += labels.size(0)
                correct += predicted.eq(labels.view_as(predicted)).sum().item()
        return (100 * correct / total)

    val_accs = []
    for epoch in _tqdm(range(num_epochs)):
        # _accuracy() leaves the model in eval mode, so switch back at the start of every epoch.
        model.train()
        for data, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs.view(-1), labels.float().view(-1))
            loss.backward()
            optimizer.step()
        val_acc = _accuracy(valid_loader)
        # `loss` is the loss of the last training batch of this epoch.
        val_accs.append((loss.item(), val_acc))

    return [opts, test_dataset, test_loader, val_accs, model]

In [None]:
results = []

for o in tqdm_notebook(prod_dicts):
    out = do_train_val(o)
    # do_train_val returns [opts, test_dataset, test_loader, val_accs, model]. Keep only
    # [opts, val_accs] per run: holding on to every model and test dataset would exhaust
    # memory over the whole grid, and the cells that read `results` index entry 1 as the
    # per-epoch validation history (r[1][-1][1]).
    results.append([out[0], out[3]])

HBox(children=(IntProgress(value=0, max=128), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

# Test set creation

In [59]:
def test_model(loader, model):
        """
        Help function that tests the model's performance on a dataset
        @param: loader - data loader for the dataset to test against
        """
        correct = 0
        total = 0
        model.eval()
        for data, labels in loader:
            outputs = torch.sigmoid(model(data))
            #import ipdb; ipdb.set_trace()
            predicted = (outputs > 0.5).long()
            total += labels.size(0)
            correct += predicted.eq(labels.view_as(predicted)).sum().item()
        return (100 * correct / total)

In [60]:
# NOTE(review): neither `tl` nor a module-level `model` is defined by any cell visible in this
# notebook; presumably both were left over in the kernel from an earlier session — confirm
# before re-running.
test_model(loader=tl, model=model)

57.616

In [69]:
# Number of grid-search runs collected in `results`.
len(results)

128

In [70]:
import pickle


In [72]:
# Persist the grid-search results to disk.
with open('gridres.pkl', 'wb') as f:
    pickle.dump(results, f)

In [74]:
# Print, for every grid-search run, the second field of the last entry of r[1].
# NOTE(review): this indexing assumes each entry of `results` is [opts, val_accs] with
# val_accs a list of (loss, accuracy) tuples — confirm against what do_train_val returned
# when `results` was built.
for r in results:
    print('Last valacc: {}'.format(r[1][-1][1]))

Last valacc: 80.92
Last valacc: 55.04
Last valacc: 80.98
Last valacc: 60.66
Last valacc: 82.58
Last valacc: 60.34
Last valacc: 73.6
Last valacc: 69.36
Last valacc: 75.96
Last valacc: 51.16
Last valacc: 58.54
Last valacc: 50.06
Last valacc: 75.92
Last valacc: 52.54
Last valacc: 69.22
Last valacc: 62.1
Last valacc: 50.34
Last valacc: 49.76
Last valacc: 55.48
Last valacc: 50.56
Last valacc: 69.3
Last valacc: 50.0
Last valacc: 63.32
Last valacc: 52.62
Last valacc: 50.0
Last valacc: 51.22
Last valacc: 50.52
Last valacc: 50.74
Last valacc: 54.22
Last valacc: 50.0
Last valacc: 62.68
Last valacc: 52.3
Last valacc: 84.38
Last valacc: 58.7
Last valacc: 84.22
Last valacc: 60.04
Last valacc: 83.18
Last valacc: 58.3
Last valacc: 79.5
Last valacc: 69.16
Last valacc: 84.48
Last valacc: 51.34
Last valacc: 87.06
Last valacc: 51.3
Last valacc: 82.04
Last valacc: 52.62
Last valacc: 84.28
Last valacc: 60.22
Last valacc: 78.2
Last valacc: 50.82
Last valacc: 67.86
Last valacc: 50.02
Last valacc: 78.26
Last 

In [76]:
# (index, value) pair of the run with the highest final value, read as r[1][-1][1] for each run.
final_val_accs = [(idx, run[1][-1][1]) for idx, run in enumerate(results)]
max(final_val_accs, key=itemgetter(1))

(104, 87.66)

In [78]:
# 104 is the index reported by the max(...) cell above.
# Only the last expression of a cell is displayed, so this shows prod_dicts[104].
results[104]
prod_dicts[104]

{'lr': 0.1,
 'batch_size': 512,
 'max_voc_size': 100000,
 'context_length': 2,
 'max_input_length': None,
 'embedding_size': 30,
 'num_epochs': 20,
 'optimizer': 'Adam'}

In [85]:
# Re-run training with the options at index 104 and keep the full return value of do_train_val.
out = do_train_val(prod_dicts[104])

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))