In [1]:
!head ./hw2_data/snli_train.tsv

sentence1	sentence2	label
A young girl in a pink shirt sitting on a dock viewing a body of water .	A young girl watching the sunset over the water .	neutral
A woman is smiling while the man next to her is focused on a blue object with a pattern on it .	Two people are next to each other .	entailment
Across the river , you can see a large building .	The large building is full of apartments and tenants	neutral
a man in white shorts and a black shirt is paragliding on the ocean	A man is riding a jetski on the ocean .	contradiction
Four black dogs run together on bright green grass .	Four dogs are preparing to be launched into space .	contradiction
A female laying on her stomach in the water outside with umbrellas .	There is a women outdoors	entailment
Children eat at a long table with black chairs .	Kids at a short table with red chairs .	contradiction
A person rides a motorcycle quickly .	The man is racing his motorcycle in a race .	neutral
Woman riding a red bicycle down a city 

In [2]:
# First, let's import the libraries that we are going to use in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
random.seed(134)

from bagofwords import BagOfWords
from module import Module as M
from hyperparameter import Hyperparameter as hp
import RR


import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize   
from module import Module as M
from train_f import *
%load_ext autoreload
%autoreload 2


PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

In [3]:
def build_all_tokens(inp1, inp2):
    """
    Flatten two collections of tokenized sentences into a single token list.

    @param inp1: iterable of sentences, each an iterable of tokens
    @param inp2: iterable of sentences, each an iterable of tokens
    @return: list holding every token of inp1 (in order) followed by every
             token of inp2 (in order)
    """
    flattened = [token for sentence in inp1 for token in sentence]
    flattened.extend(token for sentence in inp2 for token in sentence)
    return flattened
    
def read_data(filename, count = 9999999):
    """
    Read a tab-separated SNLI file and return whitespace-tokenized sentence pairs.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is lower-cased and split on tabs into
    (sentence1, sentence2, label); the label is mapped to an integer through
    hp.dummy2int.

    @param filename: file name, appended to hp.prepath_data
    @param count: maximum number of data rows to read
    @return: (x1_w, x2_w, y, MAX_SENTENCE_LENGTH, all_tokens) where
             x1_w / x2_w are lists of token lists, y is the list of integer
             labels, MAX_SENTENCE_LENGTH is the length cut-off computed below
             (0 when no rows were read) and all_tokens holds every token of
             x1_w followed by every token of x2_w
    """
    x1 = []
    x2 = []
    y = []
    filename = hp.prepath_data + filename
    with open(filename, "r") as f:
        f.readline()  # skip the header row
        for line in f:
            if len(y) >= count:
                break
            if not line.strip():
                # tolerate blank lines instead of failing on arr[1]
                continue
            arr = line.lower().split('\t')
            x1.append(arr[0])
            x2.append(arr[1])
            # strip the line terminator only if it is there, so a final line
            # without a trailing newline keeps its full label
            y.append(hp.dummy2int[arr[2].rstrip('\r\n')])

    x1_w = [sentence.split() for sentence in x1]
    x2_w = [sentence.split() for sentence in x2]

    # Sentence lengths of both columns, longest first.
    lengths_desc = sorted(
        [len(tokens) for tokens in x1_w] + [len(tokens) for tokens in x2_w],
        reverse = True)
    if not lengths_desc:
        MAX_SENTENCE_LENGTH = 0
    elif count < 1000 or len(lengths_desc) <= 1000:
        # small sample: keep every sentence whole
        MAX_SENTENCE_LENGTH = lengths_desc[0]
    else:
        # ignore the 1000 longest sentences and use the next length as cut-off
        MAX_SENTENCE_LENGTH = lengths_desc[1000]

    all_tokens = build_all_tokens(x1_w, x2_w)
    return x1_w, x2_w, y, MAX_SENTENCE_LENGTH, all_tokens


In [4]:
# Upper bound on the number of rows read per file; large enough that the
# whole file is consumed.
count = 9999999999
# The sentence-length cut-off and the token list used to build the vocabulary
# are taken from the training split only; the validation values are discarded.
train_x1, train_x2, train_y, MAX_SENTENCE_LENGTH, all_train_tokens  = read_data('snli_train.tsv', count = count)
val_x1, val_x2, val_y, _, _ = read_data('snli_val.tsv', count = count)

In [5]:
from collections import Counter

max_vocab_size = 20000
# save index 0 for pad and 1 for unk
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    """
    Build the token <-> id mappings from a flat list of tokens.

    Only the max_vocab_size most frequent tokens are kept. Ids 0 and 1
    (PAD_IDX and UNK_IDX) are reserved for '<pad>' and '<unk>', so real
    tokens start at id 2 in order of decreasing frequency.

    @param all_tokens: iterable of tokens (may be empty)
    @return: (token2id, id2token) where
             token2id is a dict mapping token -> id and
             id2token is a list such that id2token[i] is the token with id i
    """
    token_counter = Counter(all_tokens)
    # Taking the tokens straight from most_common() also works when the
    # counter is empty, where unpacking zip(*[]) would raise.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens)

In [6]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    """
    Convert tokenized sentences into lists of vocabulary ids.

    @param tokens_data: iterable of sentences, each an iterable of tokens
    @return: list of id lists; tokens missing from token2id map to UNK_IDX
    """
    return [[token2id.get(token, UNK_IDX) for token in tokens]
            for tokens in tokens_data]

# Index-encode both sentence columns of the train and validation splits
# with the vocabulary built from the training tokens.
train_x1_indices = token2index_dataset(train_x1)
train_x2_indices = token2index_dataset(train_x2)
val_x1_indices = token2index_dataset(val_x1)
val_x2_indices = token2index_dataset(val_x2)

# double checking: both columns of a split must have the same number of rows
print ("Train dataset 1 size is {}".format(len(train_x1_indices)))
print ("Train dataset 2 size is {}".format(len(train_x2_indices)))
print ("Val dataset 1 size is {}".format(len(val_x1_indices)))
print ("Val dataset 2 size is {}".format(len(val_x2_indices)))



Train dataset 1 size is 100000
Train dataset 2 size is 100000
Val dataset 1 size is 1000
Val dataset 2 size is 1000


In [7]:
ft_home = './'
words_to_load = 50000
emb_dim =  300

# Not filled by this cell; kept so that the names it has always defined
# remain available.
words_ft = {}
idx2words_ft = {}
ordered_words_ft = []

# One row per vocabulary id; rows stay all-zero until a fastText vector is
# found for the corresponding token.
loaded_embeddings_ft = np.zeros((len(id2token), emb_dim))

# The .vec file is a UTF-8 text file; state the encoding explicitly so the
# result does not depend on the platform default.
with open(ft_home + 'wiki-news-300d-1M.vec', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= words_to_load:
            break
        # Split from the right so that the last emb_dim fields are the vector
        # and whatever is left is the word.
        s = line.rstrip().rsplit(' ', emb_dim)
        if len(s) != emb_dim + 1:
            # header line ("<n_words> <dim>"), blank or malformed line
            continue
        #if the word in vocabulary is in fasttext, we load the embedding for that word.
        if s[0] in token2id:
            idx = token2id[s[0]]
            loaded_embeddings_ft[idx] = np.asarray(s[1:], dtype=np.float64)
        

In [8]:
# If a vocabulary word has no fastText vector (this includes <unk> and <pad>),
# initialize its row with a random vector drawn from N(0, 0.6^2).
# A row is considered "not loaded" when its first component is exactly 0,
# i.e. it is still the zero row created by np.zeros.
# NOTE(review): only Python's `random` module is seeded in the visible cells;
# NumPy's global RNG used here is not, so these rows differ between runs.
for i in range(len(id2token)):
    if loaded_embeddings_ft[i][0] == 0:
        loaded_embeddings_ft[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [9]:
def label2onehot_helper(x):
    """
    Return the length-3 one-hot vector for a class index.

    @param x: class index, one of 0, 1, 2
    @return: integer numpy array of shape (3,) with a single 1 at position x
    @raise ValueError: if x is not a known class index (previously this
                       returned None, which ended up as a row of NaN)
    """
    if x == 0:
        return np.array([1,0,0])
    elif x == 1:
        return np.array([0,1,0])
    elif x == 2:
        return np.array([0,0,1])
    raise ValueError("label must be 0, 1 or 2, got {!r}".format(x))

def label2onehot(labels):
    """
    One-hot encode a sequence of class indices.

    @param labels: sequence of class indices (each 0, 1 or 2)
    @return: float numpy array of shape (len(labels), 3)
    @raise ValueError: if any label is not 0, 1 or 2
    """
    output = np.zeros((len(labels), 3))
    for i, x in enumerate(labels):
        output[i] = label2onehot_helper(x)
    return output

In [10]:
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset of SNLI sentence
    pairs that's readable for PyTorch.
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list1, data_list2,target_list):
        """
        @param data_list1: list of token-id lists for the first sentence of each pair
        @param data_list2: list of token-id lists for the second sentence of each pair
        @param target_list: list of SNLI targets, one per pair

        All three lists must have the same length.
        """
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        self.target_list = target_list
        assert (len(self.data_list1) == len(self.target_list))
        # the second sentence list has to line up with the targets as well
        assert (len(self.data_list2) == len(self.target_list))

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data_list1)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [ids1, ids2, len(ids1), len(ids2), label] where both id
                 lists are truncated to MAX_SENTENCE_LENGTH entries
        """
        token_idx1 = self.data_list1[key][:MAX_SENTENCE_LENGTH]
        token_idx2 = self.data_list2[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx1, token_idx2, len(token_idx1), len(token_idx2),label]

def SNLI_collate_func(batch):
    """
    Customized function for DataLoader that pads every sentence of the batch
    to MAX_SENTENCE_LENGTH so that all data have the same length.

    @param batch: list of items as returned by SNLIDataset.__getitem__:
                  [ids1, ids2, len1, len2, label]
    @return: [data1, data2, lengths1, lengths2, labels] where data1 / data2 are
             integer tensors with one padded row per item, lengths1 / lengths2
             are LongTensors with the unpadded lengths and labels is a
             LongTensor of class indices with one entry per item
    """
    label_list = [datum[4] for datum in batch]
    length_list1 = [datum[2] for datum in batch]
    length_list2 = [datum[3] for datum in batch]

    data_list1 = []
    data_list2 = []
    # padding
    for datum in batch:
        # An explicit integer dtype keeps an empty sentence from turning its
        # row (and then the whole batch) into floats.
        padded_vec1 = np.pad(np.array(datum[0], dtype=np.int64),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=PAD_IDX)
        padded_vec2 = np.pad(np.array(datum[1], dtype=np.int64),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])),
                                mode="constant", constant_values=PAD_IDX)
        data_list1.append(padded_vec1)
        data_list2.append(padded_vec2)

    # Labels are returned as class indices: that is what CrossEntropyLoss
    # takes as an integer target and what test_model compares predictions
    # against. A one-hot LongTensor fits neither.
    return [torch.from_numpy(np.array(data_list1)), torch.from_numpy(np.array(data_list2)),
            torch.LongTensor(length_list1), torch.LongTensor(length_list2),torch.LongTensor(label_list)]

BATCH_SIZE = 32
# Each dataset pairs the first sentence (x1) with the second sentence (x2);
# passing x1 twice would make the model compare every sentence with itself.
train_dataset = SNLIDataset(train_x1_indices, train_x2_indices,train_y)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)

val_dataset = SNLIDataset(val_x1_indices, val_x2_indices, val_y)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)

In [11]:
def create_emb_layer(weights_matrix, non_trainable=False):
    """
    Build an nn.Embedding layer initialized from a pre-trained weight matrix.

    @param weights_matrix: 2-D numpy array with one embedding vector per row
    @param non_trainable: when True the embedding weights are frozen
                          (requires_grad is set to False)
    @return: (emb_layer, num_embeddings, embedding_dim)
    """
    pretrained = torch.from_numpy(weights_matrix)
    vocab_rows, vector_width = pretrained.shape

    layer = nn.Embedding(vocab_rows, vector_width)
    layer.load_state_dict({'weight': pretrained})
    if non_trainable:
        layer.weight.requires_grad = False

    return layer, vocab_rows, vector_width

In [12]:
class CNN(nn.Module):
    """
    Two-layer 1-D convolutional text classifier: embedding -> conv -> ReLU ->
    conv -> ReLU -> sum over the sequence -> linear.
    """

    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size, weights_matrix=None):
        """
        @param emb_size: embedding size (ignored when weights_matrix is given,
                         in which case the width of the matrix is used)
        @param hidden_size: number of channels of both convolutions
        @param num_layers: stored on the instance; the network always has two
                           convolutional layers
        @param num_classes: number of output classes
        @param vocab_size: number of rows of the embedding table (used only
                           when weights_matrix is None)
        @param weights_matrix: optional 2-D numpy array of pre-trained
                               embeddings; when None a fresh embedding table
                               is learned from scratch
        """
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        if weights_matrix is None:
            # No module-level weights_matrix exists in the notebook, so the
            # embedding has to be built from vocab_size / emb_size here.
            self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        else:
            self.embedding, num_embeddings, emb_size = create_emb_layer(weights_matrix, False)

        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        """
        @param x: integer tensor of token/char ids, one row per example
        @param lengths: unpadded lengths; accepted for interface
                        compatibility but not used
        @return: tensor of logits, one row per example
        """
        embed = self.embedding(x)
        # Conv1d wants channels before the sequence axis, so transpose once
        # and stay in that layout; ReLU is element-wise and needs no reshape.
        hidden = F.relu(self.conv1(embed.transpose(1,2)))
        hidden = F.relu(self.conv2(hidden))

        # sum the activations over the sequence axis
        hidden = torch.sum(hidden, dim=2)
        logits = self.linear(hidden)
        return logits

In [13]:

    
class RNN(nn.Module):
    """
    Bidirectional GRU encoder applied to both sentences of a pair, followed by
    a two-layer classifier.
    """

    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes):
        # RNN Accepts the following hyperparams:
        # weights_matrix: 2-D numpy array of pre-trained embeddings (one row per token id)
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, False)
        self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional = True) #dim1: batch dim2: sequence dim3: emb
        self.linear1 = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.linear2 = nn.Linear(hidden_size * 2, num_classes)

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers * 2, batch_size, hidden_size) because
        # the GRU is bidirectional.
        # Zeros (instead of random noise) keep evaluation deterministic;
        # new_zeros matches the dtype and device of the model parameters.
        hidden = self.linear1.weight.new_zeros(self.num_layers * 2, batch_size, self.hidden_size)

        return hidden
        
    def forward(self, x1,x2):
        """
        @param x1: integer tensor of token ids of the first sentences, (batch, seq_len)
        @param x2: integer tensor of token ids of the second sentences, (batch, seq_len)
        @return: logits of shape (batch, num_classes)
        """
        # reset hidden state
        batch_size, seq_len = x1.size()
        self.hidden = self.init_hidden(batch_size)

        # fprop through RNN; both sentences share the same encoder
        embed1 = self.embedding(x1)
        embed2 = self.embedding(x2)
        rnn_out1, self.hidden1 = self.rnn(embed1, self.hidden)
        rnn_out2, self.hidden2 = self.rnn(embed2, self.hidden)

        # Concatenate along the time axis and sum hidden activations across
        # time, giving one (hidden_size * 2) vector per pair. Without this
        # reduction the logits keep a time axis and cannot be fed to
        # CrossEntropyLoss.
        # NOTE(review): padded time steps are included in the sum because the
        # sequences are not packed.
        rnn_out = torch.cat((rnn_out1, rnn_out2),1)
        rnn_out = torch.sum(rnn_out, dim=1)

        # non-linearity so the two linear layers do not collapse into one
        linear_out1 = F.relu(self.linear1(rnn_out))
        logits = self.linear2(linear_out1)
        return logits


In [14]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1,data2,lengths1,length2,labels in loader:
        data1_batch, data2_batch,lengths1_batch,lengths2_batch, label_batch = data1,data2,lengths1,length2,labels
        outputs = F.softmax(model(data1_batch, data2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


model = RNN(weights_matrix=loaded_embeddings_ft, hidden_size=200, num_layers=1, num_classes=3)
learning_rate = 3e-4
num_epochs = 2 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    # running training loss for this epoch
    # NOTE(review): AverageMeter is not defined in this notebook (presumably
    # it comes from `train_f`); it is assumed to expose update(value, n) and .avg
    losses = AverageMeter()
    for i, (x1, x2, lengths1, lengths2, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(x1, x2)
        loss = criterion(outputs, labels)
        # record a plain float so the meter does not keep the autograd graph
        # of every batch alive
        losses.update(loss.item(), x1.size(0))
        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            # positional arguments must precede the keyword argument `loss`
            print(' Epoch: [{}/{}], Step: [{}/{}], Training loss: {loss.avg:.4f}, Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc, loss = losses))


KeyboardInterrupt: 

In [15]:
# Scratch cell: creates an AverageMeter instance. `a` is not referenced by any
# other visible cell. NOTE(review): AverageMeter presumably comes from
# `from train_f import *` - confirm.
a = AverageMeter()

In [None]:
def build_vocab(data):
    """
    Build the character <-> id mappings for the character-level dataset.

    NOTE: this redefines the name `build_vocab` that an earlier cell uses for
    the word-level vocabulary.

    @param data: iterable of samples whose first element is a sequence of chars
    @return: (char2id, id2char, max_len) where
             char2id is a dict mapping char -> id,
             id2char is a list such that id2char[i] is the char with id i and
             max_len is the length of the longest char sequence (0 when data
             is empty)
    """
    max_len = max((len(word[0]) for word in data), default=0)
    all_chars = []
    for word in data:
        all_chars += word[0]
    # Sort the unique chars so that ids are identical from run to run; the
    # iteration order of a set of strings is not stable across interpreter runs.
    unique_chars = sorted(set(all_chars))

    # ids 0 and 1 (PAD_IDX / UNK_IDX) are reserved, real chars start at 2
    char2id = dict(zip(unique_chars, range(2,2+len(unique_chars))))
    id2char = ['<pad>', '<unk>'] + unique_chars
    char2id['<pad>'] = PAD_IDX
    char2id['<unk>'] = UNK_IDX

    return char2id, id2char, max_len

def convert_to_chars(data):
    """
    Split the first element of every sample into a list of its items.

    @param data: iterable of samples indexable as (sequence, label)
    @return: list of (list_of_items, label) tuples in the original order
    """
    converted = []
    for sample in data:
        converted.append((list(sample[0]), sample[1]))
    return converted

### Function that loads and preprocesses the dataset
def read_data():
    """
    Load the pickled character-level dataset from "data.p" and build its vocabulary.

    NOTE: this redefines the name `read_data` that an earlier cell uses for
    the SNLI reader.

    @return: (train_data, val_data, test_data, char2id, id2char, max_len);
             the three splits are converted with convert_to_chars and the
             vocabulary is built from the training split only
    """
    # SECURITY: pickle.load can execute arbitrary code; only load a data.p
    # file that comes from a trusted source.
    with open("data.p", "rb") as f:  # make sure the file handle is closed
        data = pkl.load(f)
    train_data, val_data, test_data = data['train'], data['valid'], data['test']
    train_data, val_data, test_data = convert_to_chars(train_data), convert_to_chars(val_data), convert_to_chars(test_data)
    char2id, id2char, max_len = build_vocab(train_data)
    return train_data, val_data, test_data, char2id, id2char, max_len


In [None]:
# Load the character-level splits and vocabulary; the longest training
# sequence length becomes MAX_WORD_LENGTH, which the dataset and collate
# function below use for truncation and padding.
train_data, val_data, test_data, char2id, id2char, MAX_WORD_LENGTH = read_data()

print ("Maximum word length of dataset is {}".format(MAX_WORD_LENGTH))
# id2char includes the two reserved entries '<pad>' and '<unk>'
print ("Number of characters in dataset is {}".format(len(id2char)))
print ("Characters:")
print (char2id.keys())

In [None]:
# Reload the raw pickled splits; this replaces the char-converted
# train/val/test variables with the unconverted data.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open("data.p", "rb") as f:  # close the file handle deterministically
    data = pkl.load(f)
train_data, val_data, test_data = data['train'], data['valid'], data['test']
train_data

### Now let's build the PyTorch DataLoader as we did in the previous lab

In [None]:
class VocabDataset(Dataset):
    """
    PyTorch-readable train/validation/test dataset of character sequences.
    Inherits from torch.utils.data.Dataset.
    """

    def __init__(self, data_tuple, char2id):
        """
        @param data_tuple: iterable of (character sequence, target) pairs
        @param char2id: dict mapping a character to its id
        """
        sequences, targets = zip(*data_tuple)
        self.data_list, self.target_list = sequences, targets
        assert (len(self.data_list) == len(self.target_list))
        self.char2id = char2id

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Called for dataset[i]; returns [char ids, number of ids, label].
        The sequence is cut to MAX_WORD_LENGTH items and characters missing
        from char2id map to UNK_IDX.
        """
        clipped = self.data_list[key][:MAX_WORD_LENGTH]
        char_idx = [self.char2id.get(c, UNK_IDX) for c in clipped]
        return [char_idx, len(char_idx), self.target_list[key]]

def vocab_collate_func(batch):
    """
    Collate function for DataLoader: pads every sample of the batch to
    MAX_WORD_LENGTH and orders the batch by decreasing sequence length.

    @param batch: list of [char_idx, length, label] items
    @return: [data tensor, LongTensor of lengths, LongTensor of labels],
             all three in the same (length-descending) order
    """
    lengths = [datum[1] for datum in batch]
    labels = [datum[2] for datum in batch]

    # right-pad each id list with zeros up to MAX_WORD_LENGTH
    padded_rows = [
        np.pad(np.array(datum[0]),
               pad_width=((0, MAX_WORD_LENGTH - datum[1])),
               mode="constant", constant_values=0)
        for datum in batch
    ]

    # indices that sort the batch from longest to shortest sequence
    descending = np.argsort(lengths)[::-1]
    data_sorted = np.array(padded_rows)[descending]
    lengths_sorted = np.array(lengths)[descending]
    labels_sorted = np.array(labels)[descending]

    return [torch.from_numpy(np.array(data_sorted)),
            torch.LongTensor(lengths_sorted),
            torch.LongTensor(labels_sorted)]


In [None]:
# Build train, valid and test dataloaders

# NOTE: these assignments reuse the names train_dataset / train_loader /
# val_dataset / val_loader that the SNLI cell above also assigns.
train_dataset = VocabDataset(train_data, char2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_data, char2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

# the test loader is the only one that keeps the original sample order
test_dataset = VocabDataset(test_data, char2id)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

### Now let's implement a basic Recurrent Neural Net model

## Important things to keep in mind when using variable sized sequences in RNN in Pytorch

### RNN modules accept packed sequences as inputs
* pack_padded_sequence function packs a sequence (in Tensor format) containing padded sequences of variable length. **IMPORTANT: the sequences should be sorted by length in a decreasing order before passing to this function**

* pad_packed_sequence function is the inverse operation to pack_padded_sequence. It transforms a packed batch of variable-length sequences back into a padded tensor (and also returns the length of each sequence)

## Exercise 1:
### Implement LSTM cell instead of RNN cell. Train the model and compare the results.
### Hint (modify init_hidden function and cell in __init__) 

## Exercise 2:
### Implement Bidirectional LSTM. You can do it very easily by adding one argument to cell when you create it.
### For better understanding we recommend that you implement it yourself by reversing a sequence and passing it to another cell.

## Exercise 3:

### Add max-pooling (over time) after passing through RNN instead of summing over hidden layers through time

### Now let's implement a basic Convolutional Neural Net model for text


## Important things to keep in mind when using Convolutional Nets for Language Tasks in Pytorch

### The Conv1d module expects input of size (batch_size, num_channels, length), whereas in our case the input has size (batch_size, length, num_channels). Hence it is important to call transpose(1,2) before passing it to the convolutional layer and then to reshape it back to (batch_size, length, num_channels) by calling transpose(1,2) again

### Additionally, we need to reshape the hidden activations into a 2D tensor before passing them to the ReLU layer by calling view(-1, hidden.size(-1))

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


# Character-level CNN: the embedding table has one row per entry of id2char
# and the classifier has 5 output classes.
model = CNN(emb_size=100, hidden_size=200, num_layers=2, num_classes=5, vocab_size=len(id2char))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model switches the model to eval mode, so training mode is
        # re-enabled at the start of every step
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))


## Exercise 4:
### Implement Gated Relu activations as well as Gated Linear activations and compare them with Relu (reference: https://arxiv.org/pdf/1612.08083.pdf )
### Hint: Gated Relu activations are sigmoid(conv1_1(x)) * relu(conv1_2(x))
### Hint: Gated Linear activations are sigmoid(conv1_1(x)) * conv1_2(x)

### Feel free to play with other variants of gating


## Exercise 5:

### Add max-pooling (over time) after passing through conv as well as add non-linear fully connected layer

## Exercise 6:

### Use Bag-of-Words and Bag-of-NGrams model for this task and compare it with RNN and CNN

## Exercise 7:

### Use FastText for this task