In [1]:
# coding: utf-8

# In[1]:

# First lets improve libraries that we are going to be used in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import csv
random.seed(134)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# In[2]:

# Fasttext


# In[3]:

def build_vocab():
    words_to_load = 50000

    with open('wiki-news-300d-1M.vec', 'rb') as f:
        loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
        words2idx_ft = {}
        idx2words_ft = {}
        ordered_words_ft = []
        #give <pad> and <unk> random vectors
        loaded_embeddings_ft[0] = np.random.rand(1,300) * 0
        loaded_embeddings_ft[1] = np.random.rand(1,300)

        idx2words_ft[PAD_IDX] =  '<pad>'
        idx2words_ft[UNK_IDX] = '<unk>'
        words2idx_ft['<pad>'] = PAD_IDX
        words2idx_ft['<unk>'] = UNK_IDX
        ordered_words_ft.append('<pad>')
        ordered_words_ft.append('<unk>')

        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.decode('utf8').split()
            loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
            words2idx_ft[s[0]] = i+2
            idx2words_ft[i+2] = s[0]
            ordered_words_ft.append(s[0])
        return words2idx_ft, idx2words_ft, loaded_embeddings_ft

In [3]:
# In[4]:
def convert_to_words(data):
    return [(sample[0].split(), sample[1].split(), sample[2]) for sample in data]

def read_data():
    train_dataset = 'snli_train.tsv'
    val_dataset = 'snli_val.tsv'
    with open(train_dataset) as tsvfile1:
        reader1 = csv.reader(tsvfile1, delimiter='\t')
        train_data =convert_to_words(reader1)
    max_len1 = max([len(word[0]) for word in train_data])
    max_len2 = max([len(word[0]) for word in train_data])
    max_len = max(max_len1, max_len2)
    
    with open(val_dataset) as tsvfile2:
        reader2 = csv.reader(tsvfile2, delimiter = '\t')
        val_data = convert_to_words(reader2)
    words2id, id2words, loaded_embeddings = build_vocab()
    return train_data[1:10000], val_data[1:10000], words2id, id2words, loaded_embeddings, max_len

In [4]:
pretrained_embeddings = build_vocab()[2]


In [5]:
# In[5]:

train_data, val_data, words2id, id2words, loaded_embeddings, max_len =read_data()
labels_list = ["entailment", "neutral", "contradiction"]

# In[6]:

print ("Maximum word length of dataset is {}".format(max_len))
print ("Number of words in dataset is {}".format(len(id2words)))
#print ("Characters:")
#print (char2id.keys())
#print(train_data[:10])


Maximum word length of dataset is 82
Number of words in dataset is 50002


# convert token to id in the dataset
def token2index_dataset(tokens_data):
    index_lists1 = []
    index_lists2 = []
    index_lists3 = []
    for tokens in tokens_data:
        index_list1 = [words2id[token] if token in words2id else UNK_IDX for token in tokens[0]]
        index_list2 = [words2id[token] if token in words2id else UNK_IDX for token in tokens[1]]
        index_lists3.append(tokens[2])
        index_lists1.append(index_list1)
        index_lists2.append(index_list2)
    return index_lists1,index_lists2, index_lists3

train_data_indices1, train_data_indices2, train_target_indices = token2index_dataset(train_data)
val_data_indices1, val_data_indices2, val_target_indices = token2index_dataset(val_data)

In [6]:
# In[7]:

class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_tuple, words2id):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_list1,self.data_list2, self.target_list = zip(*data_tuple)
        assert (len(self.data_list1) ==len(self.data_list2))
        assert (len(self.data_list2) == len(self.target_list))
        self.words2id = words2id

    def __len__(self):
        return len(self.data_list1)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        word_idx1 = [self.words2id[c] if c in self.words2id else UNK_IDX  for c in self.data_list1[key][:max_len]]
        word_idx2 = [self.words2id[c] if c in self.words2id else UNK_IDX  for c in self.data_list2[key][:max_len]]
        label = labels_list.index(self.target_list[key])
        return [word_idx1,word_idx2, len(word_idx1),len(word_idx2), label]


# In[8]:




In [7]:
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list1 = []
    length_list2 = []

    for datum in batch:
        label_list.append(datum[4])
        length_list1.append(datum[2])
        length_list2.append(datum[3])
        
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]),
                                pad_width=((0,max_len -datum[2])),
                                mode="constant", constant_values=0)
        data_list1.append(padded_vec1)
        
        padded_vec2 = np.pad(np.array(datum[1]),
                                pad_width=((0, max_len -datum[3])),
                                mode="constant", constant_values=0)
        data_list2.append(padded_vec2)

    

    return (torch.LongTensor(data_list1).cuda(), 
        torch.LongTensor(data_list2).cuda(),
        torch.LongTensor(length_list1).cuda(), 
        torch.LongTensor(length_list2).cuda(),
        torch.LongTensor(label_list).cuda())

In [8]:
# In[9]:

train_dataset = VocabDataset(train_data, words2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_data, words2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

In [9]:
# In[10]:

#CNN


# In[11]:

class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, vocab_size):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_embeddings), freeze=True).cuda()
        #self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        
        #self.embedding.weight.data.copy_(torch.from_numpy(build_vocab()[2]))
        #self.embedding.weight.requires_grad = False
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)
        

        self.maxpool = nn.MaxPool1d(82, stride = 2)
        self.fc1 = nn.Linear(hidden_size*2,64)
        
        self.fc2 = nn.Linear(64, 3)

        #self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x1,x2, lengths1, lengths2):
        batch_size1, seq_len1 = x1.size()
        batch_size2, seq_len2 = x2.size()
        x1 = x1.long()

        embed1 = self.embedding(x1).float()
        hidden1 = self.conv1(embed1.transpose(1,2)).transpose(1,2)
        #print(hidden1.size())
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, seq_len1, hidden1.size(-1))
        #print(hidden1.size())

        hidden1 = self.conv2(hidden1.transpose(1,2)).transpose(1,2)
        #print(hidden1.size())
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, seq_len1, hidden1.size(-1))
        #print(hidden1.size())
        
        output1 = self.maxpool(hidden1.transpose(1,2)).transpose(1, 2).squeeze(1)
        #print(hidden1.size())
        #output1 = torch.max(output1, dim=1)[0]
        
        
        batch_size2, seq_len2 = x1.size()
        x2 = x2.long()

        embed2 = self.embedding(x2)
        hidden2 = self.conv1(embed2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, seq_len2, hidden2.size(-1))

        hidden2 = self.conv2(hidden2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, seq_len2, hidden2.size(-1))
        #print(hidden2.transpose(1,2).size())
        output2 = self.maxpool(hidden2.transpose(1,2)).transpose(1, 2).squeeze(1)
        #print(output2.size())
        
        #output2 = torch.max(output2, dim=1)[0]
        #print(output2.size())
        output = torch.cat((output1, output2), dim=1)
        #print(output.size())
        output = F.relu(self.fc1(output))
        #print(output.size())
        output = self.fc2(output)
        
        '''
        
        
        output1 = torch.max(hidden1, dim=1)[0]
        #print(hidden1.size())
        #output1 = torch.sum(output1, dim=1)
        
        
        
        x2 = x2.long()

        embed2 = self.embedding(x2).float()
        hidden2 = self.conv1(embed2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, seq_len2, hidden2.size(-1))

        hidden2 = self.conv2(hidden2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, seq_len2, hidden2.size(-1))
        output2 = torch.max(hidden1, dim=1)[0]
        
        #output2 = torch.sum(output2, dim=1)
        output = torch.cat((output1, output2), dim=1)
    
        #print(output.size())
        output = F.relu(self.fc1(output))
        #print(output.size())
        output = self.fc2(output)
        #print(output.size())
        #output = F.softmax(output, dim =1)
        #print(output.size())
        
        
        

        #hidden = torch.sum(hidden, dim=1)
        #logits = self.linear(hidden)
        '''
        return output
    
    

In [10]:
# In[12]:

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1,data2, lengths1, lengths2, labels in loader:
        data_batch1,data_batch2, lengths_batch1,lengths_batch2, label_batch = data1,data2, lengths1, lengths2,labels
        outputs = F.softmax(model(data_batch1,data_batch2, lengths_batch1, lengths_batch2), dim=1)
        #outputs = model(data_batch1,data_batch2, lengths_batch1, lengths_batch2)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def earily_stop(val_acc_history, t=5, required_progress=0.001):
    """
    Stop the training if there is no non-trivial progress in k steps
    @param val_acc_history: a list contains all the historical validation acc
    @param required_progress: the next acc should be higher than the previous by 
        at least required_progress amount to be non-trivial
    @param t: number of training steps 
    @return: a boolean indicates if the model should earily stop
    """
    # TODO: Finished
    stop=False
    repeat=0
    if len(val_acc_history)<=t:
        stop=False
    else:
        for i in range(1,t+1):
            i=-i
            diff=val_acc_history[i]-val_acc_history[i-1]
            
            if diff-required_progress<=0.00001:
                repeat+=1
            if repeat==t:
                stop=True
                break
    return stop

In [11]:
learning_rate = 1e-4
num_epochs = 10 # number epoch to train

model = CNN(emb_size=300, hidden_size=200, num_layers=2, vocab_size=len(id2words)).cuda()

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

# Train the model
total_step = len(train_loader)
validation_acc_history = []
stop_training = False

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (data1,data2, lengths1,lengths2, labels) in enumerate(train_loader):
        
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data1,data2, lengths1,lengths2)
        #print(outputs.size())
        #print(labels.size())
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            train_acc = test_model(train_loader, model)
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}],  training loss: {}, Train Acc: {}, Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader),  running_loss/10, train_acc, val_acc))
            running_loss = 0.0
            validation_acc_history.append(val_acc)
            # check if we need to earily stop the model
            stop_training = earily_stop(validation_acc_history)
            
            if stop_training:
                print("earily stop triggered")
                break
    # because of the the nested loop
    if stop_training:
        break



# In[ ]:

Epoch: [1/10], Step: [101/313],  training loss: 11.1007155418396, Train Acc: 41.964196419641965, Validation Acc: 38.6
Epoch: [1/10], Step: [201/313],  training loss: 10.94191154241562, Train Acc: 37.85378537853786, Validation Acc: 36.2
Epoch: [1/10], Step: [301/313],  training loss: 10.900769138336182, Train Acc: 47.914791479147915, Validation Acc: 45.3
Epoch: [2/10], Step: [101/313],  training loss: 10.84502398967743, Train Acc: 49.104910491049104, Validation Acc: 48.5
Epoch: [2/10], Step: [201/313],  training loss: 10.393507212400436, Train Acc: 51.23512351235124, Validation Acc: 48.0
Epoch: [2/10], Step: [301/313],  training loss: 10.155423486232758, Train Acc: 55.105510551055104, Validation Acc: 51.6
Epoch: [3/10], Step: [101/313],  training loss: 9.699889653921128, Train Acc: 56.70567056705671, Validation Acc: 52.8
Epoch: [3/10], Step: [201/313],  training loss: 9.31410629749298, Train Acc: 56.37563756375638, Validation Acc: 54.5
Epoch: [3/10], Step: [301/313],  training loss: 9.3

In [12]:
# In[ ]:

#Bidirectional GRU


# In[ ]:

class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, linear_size):
    
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        #self.embedding = nn.Embedding.from_pretrained(embedding_weight_matrix, freeze=True)
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        
        self.embedding.weight.data.copy_(torch.from_numpy(build_vocab()[2]))
        self.embedding.weight.requires_grad = False
        self.gru= nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.linear_1 = nn.Linear(hidden_size*2, linear_size)
        self.linear_2 = nn.Linear(linear_size, num_classes)

    def init_hidden(self, batch_size):
        hidden = torch.randn(self.num_layers * 2, batch_size, self.hidden_size).cuda()
        '''
        
        if torch.cuda.is_available and torch.has_cudnn:
            hidden = torch.randn(self.num_layers * 2, batch_size, self.hidden_size).cuda()
        else:
            hidden = torch.randn(self.num_layers * 2, batch_size, self.hidden_size)
        '''
        return hidden
    

    def forward(self, premise, hypothesis, pre_length, hyp_length):
        # reset hidden state
        batch_size, seq_len_premise = premise.size()
        batch_size, seq_len_hypothesis = hypothesis.size()

        self.hidden = self.init_hidden(batch_size)
        
         # compute sorted sequence lengths
        _, idx_sort_1 = torch.sort(pre_length, dim=0, descending=True)
        _, idx_sort_2 = torch.sort(hyp_length, dim=0, descending=True)
        _, idx_unsort_1 = torch.sort(idx_sort_1, dim=0)
        _, idx_unsort_2 = torch.sort(idx_sort_2, dim=0)
        
        # get embedding of characters
        embed_1 = self.embedding(premise).float()
        embed_2 = self.embedding(hypothesis).float()
        # sort embeddings and lengths
        embed_1 = embed_1.index_select(0,idx_sort_1)
        embed_2 = embed_2.index_select(0,idx_sort_2)
        len1 = list(pre_length[idx_sort_1])
        len2 = list(hyp_length[idx_sort_2])
        
        # pack padded sequence
        embed_1 = torch.nn.utils.rnn.pack_padded_sequence(embed_1, np.array(len1), batch_first=True)
        embed_2 = torch.nn.utils.rnn.pack_padded_sequence(embed_2, np.array(len2), batch_first=True)
        
        # fprop though RNN
        _, hidden_1 = self.gru(embed_1, self.hidden)
        _, hidden_2 = self.gru(embed_2, self.hidden)
        
        # sum the hidden state on the first dimension
        hidden_1 = torch.sum(hidden_1, dim=0)
        hidden_2 = torch.sum(hidden_2, dim=0)
        
        # unsort the hidden state and concatenate the two
        hidden_1 = hidden_1.index_select(0, idx_unsort_1)
        hidden_2 = hidden_2.index_select(0, idx_unsort_2)
        
        concat_input = torch.cat((hidden_1, hidden_2), dim=1)
        output = self.linear_1(concat_input)
        output = self.relu(output)
        logits = self.linear_2(output)
        return logits

   

In [16]:
# In[12]:

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1,data2, lengths1, lengths2, labels in loader:
        data_batch1,data_batch2, lengths_batch1,lengths_batch2, label_batch = data1,data2, lengths1, lengths2,labels
        outputs = F.softmax(model(data_batch1,data_batch2, lengths_batch1, lengths_batch2), dim=1)
        #outputs = model(data_batch1,data_batch2, lengths_batch1, lengths_batch2)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

learning_rate = 3e-4
num_epochs = 10 # number epoch to train
vocab_size=len(id2words)


    
model = RNN(emb_size=300, hidden_size=200, num_layers=1, num_classes=3,linear_size =64).cuda()

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

# Train the model
total_step = len(train_loader)


for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (data1,data2, lengths1,lengths2, labels) in enumerate(train_loader):
        
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data1,data2, lengths1,lengths2)
        #print(outputs.size())
        #print(labels.size())
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            train_acc = test_model(train_loader, model)
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}],  training loss: {}, Train Acc: {}, Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader),  running_loss/10, train_acc, val_acc))
            running_loss = 0.0



# In[ ]:

Epoch: [1/10], Step: [101/313],  training loss: 11.065701937675476, Train Acc: 45.53455345534554, Validation Acc: 43.0
Epoch: [1/10], Step: [201/313],  training loss: 10.72524288892746, Train Acc: 46.83468346834683, Validation Acc: 45.4
Epoch: [1/10], Step: [301/313],  training loss: 10.43424677848816, Train Acc: 52.045204520452046, Validation Acc: 50.7
Epoch: [2/10], Step: [101/313],  training loss: 10.10987230539322, Train Acc: 52.34523452345235, Validation Acc: 53.3
Epoch: [2/10], Step: [201/313],  training loss: 9.699177259206772, Train Acc: 55.685568556855685, Validation Acc: 54.9
Epoch: [2/10], Step: [301/313],  training loss: 9.536523520946503, Train Acc: 56.925692569256924, Validation Acc: 55.8
Epoch: [3/10], Step: [101/313],  training loss: 9.310713976621628, Train Acc: 56.975697569756974, Validation Acc: 55.8
Epoch: [3/10], Step: [201/313],  training loss: 9.242854982614517, Train Acc: 58.58585858585859, Validation Acc: 56.8
Epoch: [3/10], Step: [301/313],  training loss: 9.1