In [1]:
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import random
%matplotlib inline
import pickle as pkl
from tqdm import tqdm_notebook
random.seed(134)

In [2]:
#change labels
def change_label(label):
    if label == 'neutral':
        return 0.0
    elif label == 'entailment':
        return 1.0
    elif label == 'contradiction':
        return 2.0
    
def split_file_token(file):
    file_ls = []
    y = []
    with open(file) as fd:
        rd = csv.reader(fd, delimiter="\t", quotechar=' ')
        for row in rd:
            if row[0] == 'sentence1':
                pass
            else:
                file_ls.append([row[0].split(),row[1].split(),row[3]])
                y.append(change_label(row[2]))
    return file_ls,y

In [3]:
mnli_val_path = "hw2_data/mnli_val.tsv"      
mnli_val_token,mnli_val_y = split_file_token(mnli_val_path)

In [4]:
genres_val = [mnli_val_token[i][2] for i in range(len(mnli_val_token))]
genres = np.unique(genres_val)
genres

array(['fiction', 'government', 'slate', 'telephone', 'travel'],
      dtype='<U10')

In [5]:
def tokenizer_dt(tokenizer_pair):
    premise = []
    hypothesis = []
    for i in tokenizer_pair:
        premise.append(i[0])
        hypothesis.append(i[1])
    return premise,hypothesis

#build vocabulary dictionary
PAD_IDX = 0
UNK_IDX = 1
Emb_Mtx = []
mnli_train_token2id = {}
mnli_train_id2token = {}
        
words_to_load = 60000    
with open('wiki-news-300d-1M.vec') as f:
    Emb_Mtx = np.zeros((words_to_load+2, 300))
    Emb_Mtx[UNK_IDX] = np.ones(300)
    mnli_train_token2id['<pad>'] = PAD_IDX 
    mnli_train_token2id['<unk>'] = UNK_IDX
    mnli_train_id2token[PAD_IDX] = '<pad>'
    mnli_train_id2token[UNK_IDX] = '<unk>'
    for i, line in tqdm_notebook(enumerate(f)):
        if i >= words_to_load: 
            break
        s = line.split()
        Emb_Mtx[i+2, :] = np.asarray(s[1:])
        mnli_train_token2id[s[0]] = i+2
        mnli_train_id2token[i+2] = s[0]

#get index
def token2index(tokens_data,token_id):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token_id[token] if token in token_id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
Emb_Mtx = torch.from_numpy(Emb_Mtx)

In [7]:
MAX_SENTENCE_LENGTH = 30
BATCH_SIZE = 32
class SNLIDataset(Dataset):
    def __init__(self, data_list1, data_list2, target_list):
        
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        self.target_list = target_list
        
        assert (len(self.data_list1) == len(self.target_list))
        assert (len(self.data_list2) == len(self.target_list))

    def __len__(self):
        #return len(self.data_list1)
        return len(self.target_list)
        
    def __getitem__(self, key):
        token_idx1 = self.data_list1[key][:MAX_SENTENCE_LENGTH]
        token_idx2 = self.data_list2[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx1, len(token_idx1),token_idx2, len(token_idx2), label]
    
def MNLI_collate_func(batch):
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list1 = []
    length_list2 = []
    for datum in batch:
        label_list.append(datum[4])
        length_list1.append(datum[1])
        length_list2.append(datum[3])
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list1.append(padded_vec1)
        
        padded_vec2 = np.pad(np.array(datum[2]), 
                             pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])), 
                             mode="constant", constant_values=0)
        data_list2.append(padded_vec2)
        
    return [torch.from_numpy(np.array(data_list1)).cuda(), torch.LongTensor(length_list1).cuda(), 
            torch.from_numpy(np.array(data_list2)).cuda(), torch.LongTensor(length_list2).cuda(), 
            torch.LongTensor(label_list).cuda()]

In [8]:
def genre_cut(genre):
    ind_val = [i for i, x in enumerate(genres_val) if x == genre]

    val_genre_x = [[mnli_val_token[i][0],mnli_val_token[i][1]] for i in ind_val]
    val_genre_y = [mnli_val_y[i] for i in ind_val]
    
    val_pre,val_hyp = tokenizer_dt(val_genre_x)
    
    indices_val_pre = token2index(val_pre,mnli_train_token2id)
    indices_val_hyp = token2index(val_hyp,mnli_train_token2id)

    val_dataset = SNLIDataset(indices_val_pre,indices_val_hyp, val_genre_y)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             collate_fn=MNLI_collate_func,
                                             shuffle=True)

    return val_loader,val_genre_y

# RNN Model

In [9]:
class RNN(nn.Module):
    def __init__(self, hidden_size1, hidden_size2,size, num_layers, num_classes, emb_size=300):

        super(RNN, self).__init__()

        self.num_layers, self.size = num_layers, size
        self.hidden_size1,self.hidden_size2 = hidden_size1, hidden_size2
        
        self.embedding = nn.Embedding.from_pretrained(Emb_Mtx).float()
       
        self.rnn1 = nn.GRU(emb_size, hidden_size1, num_layers, batch_first = True, bidirectional = True)
        self.rnn2 = nn.GRU(emb_size, hidden_size2, num_layers, batch_first = True, bidirectional = True)  
        
        self.linear1 = nn.Linear(hidden_size1 + hidden_size2, size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(size, num_classes)

    def init_hidden(self, batch_size, hidden_size):
        hidden = torch.randn(self.num_layers * 2 , batch_size, hidden_size).cuda()
        return hidden

    def forward(self, x1, lengths1, x2, lengths2):
        
        # reset hidden state
        batch_size1, seq_len1 = x1.size()
        batch_size2, seq_len2 = x2.size()
        
        self.hidden1 = self.init_hidden(batch_size1,self.hidden_size1)
        self.hidden2 = self.init_hidden(batch_size2,self.hidden_size2)

        # get embedding of characters
        embed1 = self.embedding(x1)
        embed2 = self.embedding(x2)
 
        # no pack padded sequence
    
        # fprop though RNN
        rnn_out1, self.hidden1 = self.rnn1(embed1, self.hidden1)
        rnn_out2, self.hidden2 = self.rnn2(embed2, self.hidden2)
        
        # sum hidden activations of RNN across time
        cat_out = torch.cat([self.hidden1,self.hidden2],dim = -1) 
        rnn_out = torch.sum(cat_out, dim=0)
        rnn_out = self.linear1(rnn_out)
        rnn_out = self.relu(rnn_out)
        logits = self.linear2(rnn_out)
        
        return logits

# CNN Model

In [10]:
class CNN(nn.Module):
    def __init__(self,k_size,p_size, hidden_size,size2, num_layers, num_classes,emb_size =300):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size, self.size2 = num_layers, hidden_size, size2
        self.k_size,self.p_size = k_size, p_size
        
        self.embedding = nn.Embedding.from_pretrained(Emb_Mtx).float()
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=k_size, padding=p_size)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=k_size, padding=p_size)
        self.conv3 = nn.Conv1d(emb_size, hidden_size, kernel_size=k_size, padding=p_size)
        self.conv4 = nn.Conv1d(hidden_size, hidden_size, kernel_size=k_size, padding=p_size)
        
        self.maxpool1 = nn.MaxPool1d(MAX_SENTENCE_LENGTH)
        self.maxpool2 = nn.MaxPool1d(MAX_SENTENCE_LENGTH)
        
        self.linear1 = nn.Linear(2*hidden_size, size2)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(size2, num_classes)

    def forward(self, x1, lengths1, x2, lengths2):
        batch_size1, seq_len1 = x1.size()
        batch_size2, seq_len2 = x2.size()

        embed1 = self.embedding(x1)
        embed2 = self.embedding(x2)
        
        hidden1 = self.conv1(embed1.transpose(1,2)).transpose(1,2)
        hidden2 = self.conv3(embed2.transpose(1,2)).transpose(1,2)
        
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, seq_len1, hidden1.size(-1))
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, seq_len2, hidden2.size(-1))

        hidden1 = self.conv2(hidden1.transpose(1,2)).transpose(1,2)
        hidden2 = self.conv4(hidden2.transpose(1,2)).transpose(1,2)
        
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, hidden1.size(-1), seq_len1)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, hidden2.size(-1), seq_len2)

        hidden1 = self.maxpool1(hidden1)
        hidden2 = self.maxpool2(hidden2)
        
        # sum hidden activations of CNN across time
        cat_out = torch.cat([hidden1,hidden2],dim = 1) 

        rnn_out = torch.sum(cat_out, dim = -1)
        
        rnn_out = self.linear1(rnn_out)
        rnn_out = self.relu(rnn_out)
        logits = self.linear2(rnn_out)
        
        return logits

# Evaluation

In [11]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, lengths1, data2, lengths2, labels in loader:
        data_batch1, lengths_batch1, label_batch = data1, lengths1, labels
        data_batch2, lengths_batch2 = data2, lengths2
        outputs = F.softmax(model(data_batch1, lengths_batch1,data_batch2, lengths_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [12]:
fic_val_loader,fic_val_genre_y = genre_cut('fiction')
gov_val_loader,gov_val_genre_y = genre_cut('government')
sla_val_loader,sla_val_genre_y = genre_cut('slate')
tel_val_loader,tel_val_genre_y = genre_cut('telephone')
tra_val_loader,tra_val_genre_y = genre_cut('travel')

In [19]:
model = CNN(k_size = 3,p_size=1,hidden_size = 200,size2 = 100, num_layers = 2, num_classes = 3)
model.cuda()

print("For Fiction Val acc of best CNN: {}".format(round(test_model(fic_val_loader, model),2)))
print("For Government Val acc of best CNN: {}".format(round(test_model(gov_val_loader, model),2)))
print("For Slate Val acc of best CNN: {}".format(round(test_model(sla_val_loader, model),2)))
print("For Telephone Val acc of best CNN: {}".format(round(test_model(gov_val_loader, model),2)))
print("For Travel Val acc of best CNN: {}".format(round(test_model(tra_val_loader, model),2)))

For Fiction Val acc of best CNN: 30.65
For Government Val acc of best CNN: 29.33
For Slate Val acc of best CNN: 35.03
For Telephone Val acc of best CNN: 29.33
For Travel Val acc of best CNN: 30.75


In [18]:
model = RNN(hidden_size1 = 150, hidden_size2 = 150,size = 50, num_layers = 1, num_classes = 3)
model.cuda()

print("For Fiction Val acc of best RNN: {}".format(round(test_model(fic_val_loader, model),2)))
print("For Government Val acc of best RNN: {}".format(round(test_model(gov_val_loader, model),2)))
print("For Slate Val acc of best RNN: {}".format(round(test_model(sla_val_loader, model),2)))
print("For Telephone Val acc of best RNN: {}".format(round(test_model(gov_val_loader, model),2)))
print("For Travel Val acc of best RNN: {}".format(round(test_model(tra_val_loader, model),2)))

For Fiction Val acc of best RNN: 34.27
For Government Val acc of best RNN: 33.96
For Slate Val acc of best RNN: 30.14
For Telephone Val acc of best RNN: 33.96
For Travel Val acc of best RNN: 34.01
