In [None]:
#import the packages
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from google.colab import files


# Reserved vocabulary ids (used for the '<pad>' and '<unk>' entries when the
# embedding table is built) and the mini-batch size shared by the DataLoaders.
PAD_IDX, UNK_IDX = 0, 1
BATCH_SIZE = 32

In [None]:
# Read the three tab-separated tables that sit in the current working directory:
# SNLI train, SNLI validation, and the MNLI validation set (used as test data).
datadir = os.getcwd()
snli_train_data, snli_val_data, test_data = (
    pd.read_table(datadir + tsv_name)
    for tsv_name in ('/snli_train.tsv', '/snli_val.tsv', '/mnli_val.tsv')
)


In [None]:
#Load 50,000 pre-trained word embeddings
#Also build word2id, id2words and the ordered vocabulary list
#
# Produces:
#   loaded_embeddings : (words_to_load + 2, emb_size) float array; rows PAD_IDX and
#                       UNK_IDX are reserved for '<pad>' / '<unk>' and stay all-zero
#   word2id / id2words: token <-> row-index lookups
#   ordered_words     : tokens in row order
datadir = os.getcwd()
words_to_load = 50000
emb_size = 300

loaded_embeddings = np.zeros((words_to_load + 2, emb_size))
word2id = {'<pad>': PAD_IDX, '<unk>': UNK_IDX}
id2words = {PAD_IDX: '<pad>', UNK_IDX: '<unk>'}
ordered_words = ['<pad>', '<unk>']

# Open explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open(datadir + '/dataset_ml_2/hw2_data/wiki-news-300d-1M.vec', encoding='utf-8') as f:
    for line in f:
        if len(ordered_words) >= words_to_load + 2:
            break
        s = line.split()
        # fastText .vec files start with a "<vocab_size> <dim>" header line. Skip it (and any
        # other row that is not "token + emb_size floats") instead of storing it as a word
        # whose vector would be the single header number broadcast across the whole row.
        if len(s) != emb_size + 1:
            continue
        word_id = len(ordered_words)  # next free row; the first two are <pad> and <unk>
        loaded_embeddings[word_id, :] = np.asarray(s[1:], dtype=float)
        word2id[s[0]] = word_id
        id2words[word_id] = s[0]
        ordered_words.append(s[0])

In [None]:
def sentence2id(sentence_list):
    """Convert tokenised sentences into lists of vocabulary ids.

    Args:
        sentence_list: iterable of sentences, each an iterable of tokens.

    Returns:
        A list with one list of ints per sentence. Tokens found in the global
        ``word2id`` map to their id; every other token maps to ``UNK_IDX``.
    """
    return [
        [word2id.get(token, UNK_IDX) for token in tokens]
        for tokens in sentence_list
    ]

In [None]:
# List the genre values (the result is discarded because this is not the cell's last
# expression), then split the test table into one sub-frame per genre.
test_data['genre'].unique()
(test_data_fiction,
 test_data_telephone,
 test_data_slate,
 test_data_government,
 test_data_travel) = (
    test_data[test_data['genre'] == genre_name]
    for genre_name in ('fiction', 'telephone', 'slate', 'government', 'travel')
)

In [None]:
#Tokenise the SNLI train/val sentence pairs on whitespace; test data is handled after training
sentence1_train_data = [text.split() for text in snli_train_data.sentence1]
sentence2_train_data = [text.split() for text in snli_train_data.sentence2]
sentence1_val_data = [text.split() for text in snli_val_data.sentence1]
sentence2_val_data = [text.split() for text in snli_val_data.sentence2]

# Encode the labels as class ids. A label that is none of these three strings is skipped,
# which would leave the target list shorter than the sentence lists.
label_ids = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
train_target = [label_ids[name] for name in snli_train_data.label if name in label_ids]
val_target = [label_ids[name] for name in snli_val_data.label if name in label_ids]

#Write the processed tokens
# pkl.dump(sentence1_train_data, open('./train_1_tokens.p', 'wb'))
# pkl.dump(sentence2_train_data, open('./train_2_tokens.p', 'wb'))
# pkl.dump(sentence1_val_data, open('./val_1_tokens.p', 'wb'))
# pkl.dump(sentence2_val_data, open('./val_2_tokens.p', 'wb'))
# pkl.dump(train_target, open('./train_target.p', 'wb'))
# pkl.dump(val_target, open('./val_target.p', 'wb'))



In [None]:
#load the train and val data (only need to load this)
def _load_pickle(path):
    """Unpickle one object from `path`, closing the file handle afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


sentence1_train_data = _load_pickle(datadir+'/dataset_ml_2/train_1_tokens.p')
sentence2_train_data = _load_pickle(datadir+'/dataset_ml_2/train_2_tokens.p')
sentence1_val_data = _load_pickle(datadir+'/dataset_ml_2/val_1_tokens.p')
sentence2_val_data = _load_pickle(datadir+'/dataset_ml_2/val_2_tokens.p')
train_target = _load_pickle(datadir+'/dataset_ml_2/train_target.p')
val_target = _load_pickle(datadir+'/dataset_ml_2/val_target.p')


In [None]:
#Turn every tokenised sentence into a list of vocabulary ids and record the longest
#sentence in each of the four collections.
sentence1_train_id = sentence2id(sentence1_train_data)
sentence2_train_id = sentence2id(sentence2_train_data)
sentence1_val_id = sentence2id(sentence1_val_data)
sentence2_val_id = sentence2id(sentence2_val_data)

max_length1 = max(map(len, sentence1_train_id))
max_length2 = max(map(len, sentence2_train_id))
max_length3 = max(map(len, sentence1_val_id))
max_length4 = max(map(len, sentence2_val_id))

# Longest training sentence and longest validation sentence respectively.
MAX_SENTENCE_LENGTH = max(max_length1, max_length2)
MAX_SENTENCE_LENGTH_1 = max(max_length3, max_length4)
emb_size = 300


MAX_SENTENCE_LENGTH, MAX_SENTENCE_LENGTH_1#should be (82,53)

In [None]:
#Dataclass needed
class SnliDataset(Dataset):
    def __init__(self, sentence1_id, sentence2_id, target_list):
        self.sentence1_id = sentence1_id
        self.sentence2_id = sentence2_id
        self.target_list = target_list
        assert(len(self.sentence1_id) == len(self.target_list) & len(self.sentence2_id) == len(self.target_list))
        
    def __len__(self):
        return len(self.target_list)
    
    def __getitem__(self, index):
        content1 = self.sentence1_id[index][:MAX_SENTENCE_LENGTH]
        content2 = self.sentence2_id[index][:MAX_SENTENCE_LENGTH]
        label = self.target_list[index]
        return [content1, len(content1), content2, len(content2), label]
    
def vocab_collate_func(batch):
    """Collate SnliDataset items into CUDA tensors padded to MAX_SENTENCE_LENGTH.

    Args:
        batch: list of items shaped like [ids1, len1, ids2, len2, label].

    Returns:
        [sentence1, length1, sentence2, length2, labels]: the two sentence tensors hold
        one zero-padded row per item; the other three are LongTensors. All five are
        moved to the GPU.
    """
    def _pad(ids, length):
        # Right-pad with zeros up to the global MAX_SENTENCE_LENGTH.
        return np.pad(np.array(ids),
                      pad_width=(0, MAX_SENTENCE_LENGTH - length),
                      mode='constant', constant_values=0)

    label_list = [item[4] for item in batch]
    length1_list = [item[1] for item in batch]
    length2_list = [item[3] for item in batch]

    sentence1_list = [_pad(item[0], item[1]) for item in batch]
    sentence2_list = [_pad(item[2], item[3]) for item in batch]

    sentence1 = torch.from_numpy(np.array(sentence1_list)).cuda()
    length1 = torch.LongTensor(length1_list).cuda()
    sentence2 = torch.from_numpy(np.array(sentence2_list)).cuda()
    length2 = torch.LongTensor(length2_list).cuda()
    labels = torch.LongTensor(label_list).cuda()
    return [sentence1, length1, sentence2, length2, labels]
        

In [None]:
# Wrap the id-encoded SNLI splits in datasets and shuffled, padded mini-batch loaders.
loader_options = dict(batch_size = BATCH_SIZE,
                      collate_fn = vocab_collate_func,
                      shuffle = True)

train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)

train_loader = torch.utils.data.DataLoader(dataset = train_dataset, **loader_options)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset, **loader_options)

### CNN models first
### 1. Concatenation with different hidden sizes

In [None]:
def plot_learning_curve(loss_list):
    """Plot the sequence of loss values as a line on a new 12x10 figure."""
    _, axes = plt.subplots(figsize = (12,10))
    axes.plot(loss_list)

def test_model_cnn(loader, model):
    correct = 0
    total = 0
    model.eval()
    for data_1, length_1, data_2, length_2, labels in loader:
        data_1_batch, length_1_batch, data_2_batch, length_2_batch,labels_batch = data_1, length_1, data_2, length_2, labels
        data_1_batch.cuda()
        data_2_batch.cuda()
        length_1_batch.cuda()
        length_2_batch.cuda()
        labels_batch.cuda()

        outputs = model(data_1_batch, data_2_batch)
        
        outputs = F.softmax(outputs, dim = 1)
   
        predicted = outputs.max(1, keepdim = True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        
        return (100 * correct/total)

In [None]:
class CNN(nn.Module):
    """Two-tower 1-D CNN for sentence pairs; the two sentence encodings are concatenated.

    Each sentence goes through its own pair of Conv1d+ReLU layers, is max-pooled over
    the time dimension, and the concatenated encodings are classified by two linear layers.

    Args:
        emb_size: width of one embedding vector (input channels of the first conv).
        hidden_size: number of conv channels and width of the hidden linear layer.
        kernel_size: kernel width of all four convolutions.
        num_layers: stored on the instance but not otherwise used by this class.
        num_classes: number of output scores per pair.
        loaded_embedding: NumPy array indexed by token id to obtain embeddings; it is
            not a parameter and is never updated by training.

    Note:
        The embeddings keep the dtype of `loaded_embedding` and the linear layers are
        created as double, so the module is meant to be used after `.double()`.
        NOTE(review): padding is fixed at 1 whatever `kernel_size` is, so each conv
        shortens the sequence when kernel_size > 3.
    """
    def __init__(self, emb_size, hidden_size, kernel_size, num_layers, num_classes, loaded_embedding):
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = loaded_embedding
        # Prefer the GPU as before, but fall back to CPU when CUDA is unavailable.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.conv1_1 = nn.Conv1d(emb_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)
        self.conv2_1 = nn.Conv1d(hidden_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)

        self.conv1_2 = nn.Conv1d(emb_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)
        self.conv2_2 = nn.Conv1d(hidden_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)

        self.fc1 = nn.Linear(2*hidden_size, hidden_size).double().to(device)
        self.fc2 = nn.Linear(hidden_size, num_classes).double().to(device)

    def _encode(self, token_ids, conv_a, conv_b):
        """Embed (batch, seq_len) ids, apply two conv+ReLU layers, max-pool over time."""
        device = conv_a.weight.device
        # A NumPy array cannot be indexed with a CUDA tensor, so do the lookup on the host.
        index = token_ids.detach().cpu().numpy()
        vectors = torch.from_numpy(self.embedding[index]).to(device)  # (batch, seq_len, emb)
        hidden = F.relu(conv_a(vectors.transpose(1, 2)))              # (batch, hidden, len)
        hidden = F.relu(conv_b(hidden))
        # Max over the time axis; independent of the other sentence's length.
        return hidden.max(dim=2)[0]                                   # (batch, hidden)

    def forward(self, x1, x2):
        """Return class scores of shape (batch, num_classes) for id tensors x1 and x2."""
        hidden_1 = self._encode(x1, self.conv1_1, self.conv2_1)
        hidden_2 = self._encode(x2, self.conv1_2, self.conv2_2)

        hidden = torch.cat((hidden_1, hidden_2), dim = 1)

        hidden = F.relu(self.fc1(hidden))
        logits = self.fc2(hidden)

        return logits

In [None]:
##Test for models
# Sanity check: train the CNN for 500 steps on ONE training batch, then report the
# accuracy on that same batch and plot the loss curve.
model_cnn = CNN(emb_size = 300, hidden_size = 600, kernel_size = 3, num_layers = 2, num_classes = 3, 
           loaded_embedding = loaded_embeddings)
model_cnn = model_cnn.double()
model_cnn = model_cnn.cuda()

learning_rate = 3e-4
num_epochs = 6;
num_classes = 3;
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model_cnn.parameters(), lr = learning_rate)

val_acc_list = []

# Take the first batch produced by the training loader.
data_1, length_1, data_2, length_2, labels = next(iter(train_loader))

train_loss_list = []
for i in range(0,500):
    model_cnn.train()
    optimizer.zero_grad()
    outputs = model_cnn(data_1, data_2).cuda()
    loss = criterion(outputs, labels).cuda()
    train_loss_list.append(loss.item())
    loss.backward()
    optimizer.step()


total = 0
correct = 0
# Evaluate the model trained above. The original referenced `model_gru_mul`, which does
# not exist at this point in the notebook and takes a different argument list.
model_cnn.eval()
with torch.no_grad():
    outputs = model_cnn(data_1, data_2).cuda()

    outputs = F.softmax(outputs, dim = 1)

    predicted = outputs.max(1, keepdim = True)[1]

total += labels.size(0)
correct += predicted.eq(labels.view_as(predicted)).sum().item()

print('The train accuracy is {}'.format(100 * correct/total))
plot_learning_curve(train_loss_list)    
        

In [None]:
# Hidden-size sweep for the CNN: for every hidden size, train a fresh model for
# num_epochs epochs, logging the loss of every step and the validation accuracy every
# 500 steps, then pickle the collected curves.
hidden_list = [200,400,600, 800, 1000]
#hidden_list = [200,400]


total_train_loss_list = []
total_val_acc_list = []
total_final_val_acc = []

for hidden_size in hidden_list:
    print('The current hidden size is {}'.format(hidden_size))

    #Create new loader:
    train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
    train_loader = torch.utils.data.DataLoader(dataset = train_dataset, 
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
    val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
    val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

    #Create new models
    model_cnn = CNN(emb_size = 300, hidden_size = hidden_size, kernel_size = 3, num_layers = 2, num_classes = 3, 
           loaded_embedding = loaded_embeddings)
    model_cnn = model_cnn.double()
    model_cnn = model_cnn.cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    learning_rate = 3e-4

    optimizer = torch.optim.Adam(model_cnn.parameters(), lr = learning_rate)
    num_epochs = 10;
    #num_classes = 3;

    train_loss_list = []
    val_acc_list = []

    for epoch in range(num_epochs):
        for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
            # test_model_cnn switches the model to eval mode, so re-enable train mode each step.
            model_cnn.train()
            optimizer.zero_grad()

            outputs = model_cnn(data_1, data_2).cuda()
            loss = criterion(outputs, labels).cuda()
            train_loss_list.append(loss.item())

            loss.backward()
            optimizer.step()

            if i > 0 and i % 500 == 0:
                val_acc = test_model_cnn(val_loader, model_cnn)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                      epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    print('end of epoch iterations')
    total_train_loss_list.append(train_loss_list)
    total_val_acc_list.append(val_acc_list)

    final_val_acc = test_model_cnn(val_loader, model_cnn)
    total_final_val_acc.append(final_val_acc)

print('The length of total_final_val_acc is {}, should be 5'.format(len(total_final_val_acc)))
print('The length of total_train_loss is {}, should be 5, the lenfth of 1 should be 3125 x 10 {}'.format(len(total_train_loss_list), 
                                                                                               len(total_train_loss_list[0])))
print('The length of total_val_loss is {}, should be 5, the lenfth of 1 should be 60 {}'.format(len(total_val_acc_list), 
                                                                                               len(total_val_acc_list[0])))


# Write each result list through a context manager so the file is flushed and closed
# before anything (e.g. files.download) reads it.
for results, out_path in ((total_train_loss_list, 'cnn_cat_train_loss.p'),
                          (total_val_acc_list, 'cnn_cat_val_acc.p'),
                          (total_final_val_acc, 'cnn_cat_final_val_acc.p')):
    with open(out_path, 'wb') as out_file:
        pkl.dump(results, out_file)

# files.download('cnn_cat_train_loss.p')

# files.download('cnn_cat_val_acc.p')
# files.download('cnn_cat_final_val_acc.p')
  

### 2. Element-wise multiplication with different hidden sizes

In [None]:
class CNN(nn.Module):
    """Two-tower 1-D CNN for sentence pairs; the two encodings are multiplied element-wise.

    Each sentence goes through its own pair of Conv1d+ReLU layers, is max-pooled over
    the time dimension, and the product of the two encodings is classified by two
    linear layers.

    Args:
        emb_size: width of one embedding vector (input channels of the first conv).
        hidden_size: number of conv channels and width of the hidden linear layer.
        kernel_size: kernel width of all four convolutions.
        num_layers: stored on the instance but not otherwise used by this class.
        num_classes: number of output scores per pair.
        loaded_embedding: NumPy array indexed by token id to obtain embeddings; it is
            not a parameter and is never updated by training.

    Note:
        The embeddings keep the dtype of `loaded_embedding` and the linear layers are
        created as double, so the module is meant to be used after `.double()`.
        NOTE(review): padding is fixed at 1 whatever `kernel_size` is, so each conv
        shortens the sequence when kernel_size > 3.
    """
    def __init__(self, emb_size, hidden_size, kernel_size, num_layers, num_classes, loaded_embedding):
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = loaded_embedding
        # Prefer the GPU as before, but fall back to CPU when CUDA is unavailable.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.conv1_1 = nn.Conv1d(emb_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)
        self.conv2_1 = nn.Conv1d(hidden_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)

        self.conv1_2 = nn.Conv1d(emb_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)
        self.conv2_2 = nn.Conv1d(hidden_size, hidden_size, kernel_size =kernel_size, padding = 1).to(device)

        # Element-wise product keeps the width at hidden_size (no doubling as with concat).
        self.fc1 = nn.Linear(hidden_size, hidden_size).double().to(device)
        self.fc2 = nn.Linear(hidden_size, num_classes).double().to(device)

    def _encode(self, token_ids, conv_a, conv_b):
        """Embed (batch, seq_len) ids, apply two conv+ReLU layers, max-pool over time."""
        device = conv_a.weight.device
        # A NumPy array cannot be indexed with a CUDA tensor, so do the lookup on the host.
        index = token_ids.detach().cpu().numpy()
        vectors = torch.from_numpy(self.embedding[index]).to(device)  # (batch, seq_len, emb)
        hidden = F.relu(conv_a(vectors.transpose(1, 2)))              # (batch, hidden, len)
        hidden = F.relu(conv_b(hidden))
        # Max over the time axis; independent of the other sentence's length.
        return hidden.max(dim=2)[0]                                   # (batch, hidden)

    def forward(self, x1, x2):
        """Return class scores of shape (batch, num_classes) for id tensors x1 and x2."""
        hidden_1 = self._encode(x1, self.conv1_1, self.conv2_1)
        hidden_2 = self._encode(x2, self.conv1_2, self.conv2_2)

        hidden = torch.mul(hidden_1, hidden_2)

        hidden = F.relu(self.fc1(hidden))
        logits = self.fc2(hidden)

        return logits

In [None]:
# Hidden-size sweep for the CNN: for every hidden size, train a fresh model for
# num_epochs epochs, logging the loss of every step and the validation accuracy every
# 500 steps, then pickle the collected curves.
hidden_list = [200,400,600, 800, 1000]
#hidden_list = [200,400]


total_train_loss_list = []
total_val_acc_list = []
total_final_val_acc = []

for hidden_size in hidden_list:
    print('The current hidden size is {}'.format(hidden_size))

    #Create new loader:
    train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
    train_loader = torch.utils.data.DataLoader(dataset = train_dataset, 
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
    val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
    val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

    #Create new models
    model_cnn = CNN(emb_size = 300, hidden_size = hidden_size, kernel_size = 3, num_layers = 2, num_classes = 3, 
           loaded_embedding = loaded_embeddings)
    model_cnn = model_cnn.double()
    model_cnn = model_cnn.cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    learning_rate = 3e-4

    optimizer = torch.optim.Adam(model_cnn.parameters(), lr = learning_rate)
    num_epochs = 10;
    #num_classes = 3;

    train_loss_list = []
    val_acc_list = []

    for epoch in range(num_epochs):
        for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
            # test_model_cnn switches the model to eval mode, so re-enable train mode each step.
            model_cnn.train()
            optimizer.zero_grad()

            outputs = model_cnn(data_1, data_2).cuda()
            loss = criterion(outputs, labels).cuda()
            train_loss_list.append(loss.item())

            loss.backward()
            optimizer.step()

            if i > 0 and i % 500 == 0:
                val_acc = test_model_cnn(val_loader, model_cnn)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                      epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    print('end of epoch iterations')
    total_train_loss_list.append(train_loss_list)
    total_val_acc_list.append(val_acc_list)

    final_val_acc = test_model_cnn(val_loader, model_cnn)
    total_final_val_acc.append(final_val_acc)

print('The length of total_final_val_acc is {}, should be 5'.format(len(total_final_val_acc)))
print('The length of total_train_loss is {}, should be 5, the lenfth of 1 should be 3125 x 10 {}'.format(len(total_train_loss_list), 
                                                                                               len(total_train_loss_list[0])))
print('The length of total_val_loss is {}, should be 5, the lenfth of 1 should be 60 {}'.format(len(total_val_acc_list), 
                                                                                               len(total_val_acc_list[0])))


# Write each result list through a context manager so the file is flushed and closed
# before anything (e.g. files.download) reads it.
for results, out_path in ((total_train_loss_list, 'cnn_mul_train_loss.p'),
                          (total_val_acc_list, 'cnn_mul_val_acc.p'),
                          (total_final_val_acc, 'cnn_mul_final_val_acc.p')):
    with open(out_path, 'wb') as out_file:
        pkl.dump(results, out_file)

# files.download('cnn_mul_train_loss.p')

# files.download('cnn_mul_val_acc.p')
# files.download('cnn_mul_final_val_acc.p')
  

### Based on validation accuracy, we choose element-wise multiplication with hidden size 600
### Now we test different kernel sizes

In [None]:
#Now test different kernel sizes
# For every kernel size, train a fresh CNN (hidden size 600) for num_epochs epochs,
# logging the loss of every step and the validation accuracy every 500 steps, then
# pickle the collected curves.

kernel_list = [3, 5, 10,15, 20,25]

total_train_loss_list = []
total_val_acc_list = []
total_final_val_acc = []

for kernel_size in kernel_list:
    print('The current kernel size is {}'.format(kernel_size))

    #Create new loader:
    train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
    train_loader = torch.utils.data.DataLoader(dataset = train_dataset, 
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
    val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
    val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

    #Create new models
    model_cnn = CNN(emb_size = 300, hidden_size = 600, kernel_size = kernel_size, num_layers = 2, num_classes = 3, 
           loaded_embedding = loaded_embeddings)
    model_cnn = model_cnn.double()
    model_cnn = model_cnn.cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    learning_rate = 3e-4

    optimizer = torch.optim.Adam(model_cnn.parameters(), lr = learning_rate)
    num_epochs = 10;
    #num_classes = 3;

    train_loss_list = []
    val_acc_list = []

    for epoch in range(num_epochs):
        for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
            # test_model_cnn switches the model to eval mode, so re-enable train mode each step.
            model_cnn.train()
            optimizer.zero_grad()

            outputs = model_cnn(data_1, data_2).cuda()
            loss = criterion(outputs, labels).cuda()
            train_loss_list.append(loss.item())

            loss.backward()
            optimizer.step()

            if i > 0 and i % 500 == 0:
                val_acc = test_model_cnn(val_loader, model_cnn)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                          epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    print('end of epoch iterations')
    total_train_loss_list.append(train_loss_list)
    total_val_acc_list.append(val_acc_list)

    final_val_acc = test_model_cnn(val_loader, model_cnn)
    total_final_val_acc.append(final_val_acc)

# The expected count is the number of kernel sizes tried (6 here), not the 5 that was
# copied from the hidden-size sweeps.
print('The length of total_final_val_acc is {}, should be {}'.format(len(total_final_val_acc), len(kernel_list)))
print('The length of total_train_loss is {}, should be {}, the lenfth of 1 should be 3125 x 10 {}'.format(len(total_train_loss_list), 
                                                                                               len(kernel_list),
                                                                                               len(total_train_loss_list[0])))
print('The length of total_val_loss is {}, should be {}, the lenfth of 1 should be 60 {}'.format(len(total_val_acc_list), 
                                                                                               len(kernel_list),
                                                                                               len(total_val_acc_list[0])))


# Write each result list through a context manager so the file is flushed and closed.
for results, out_path in ((total_train_loss_list, 'cnn_kernel_train_loss.p'),
                          (total_val_acc_list, 'cnn_kernel_val_acc.p'),
                          (total_final_val_acc, 'cnn_kernel_final_val_acc.p')):
    with open(out_path, 'wb') as out_file:
        pkl.dump(results, out_file)



### We conclude, based on validation accuracy, that the best model is the CNN with multiplication, hidden size = 600, kernel size = 15

In [None]:
#Clean test data and remove outliers
# Drop the two rows by index label. `index=` replaces the positional axis argument
# (`drop(labels, 0)`), which pandas 2.x no longer accepts.
test_data_fiction_cln = test_data_fiction.drop(index=[4377,2920])#Remove the 2 abnormal one

#Form two lists - will be repeated later for different genre
sentence1_test_data = []
sentence2_test_data = []
    
for line in test_data_fiction_cln.sentence1:
    sentence1_test_data.append(line.split())
    
for line in test_data_fiction_cln.sentence2:
    sentence2_test_data.append(line.split())

# Encode labels as class ids; any other label value is skipped.
test_target = []
for label in test_data_fiction_cln.label:
    if label == 'entailment':
        test_target.append(0)
    elif label == 'contradiction':
        test_target.append(1)
    elif label == 'neutral':
        test_target.append(2)

sentence1_test_id = sentence2id(sentence1_test_data)
max_length1 = max(len(sentence1_test_id[i]) for i in range(0,len(sentence1_test_id)))
sentence2_test_id = sentence2id(sentence2_test_data)
max_length2 = max(len(sentence2_test_id[i]) for i in range(0,len(sentence2_test_id)))


# Rebinds the global that SnliDataset truncates to and vocab_collate_func pads to.
MAX_SENTENCE_LENGTH = max(max_length1, max_length2)
emb_size = 300


MAX_SENTENCE_LENGTH #Should be 55


test_dataset = SnliDataset(sentence1_test_id, sentence2_test_id, test_target)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                         batch_size = BATCH_SIZE,
                                         collate_fn = vocab_collate_func,
                                         shuffle = False)

In [1]:
def preprocess_test_data(test_data):
    """Tokenise, id-encode and wrap one test table in an unshuffled DataLoader.

    Args:
        test_data: table exposing `sentence1`, `sentence2` and `label` columns.

    Returns:
        (max_len, test_loader): the longest sentence length found in either column,
        and a DataLoader over the encoded pairs that uses vocab_collate_func.

    Note:
        The length is only returned; the global MAX_SENTENCE_LENGTH is not changed
        here. Labels other than the three known strings are skipped.
    """
    tokens_1 = [text.split() for text in test_data.sentence1]
    tokens_2 = [text.split() for text in test_data.sentence2]

    label_ids = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    test_target = [label_ids[name] for name in test_data.label if name in label_ids]

    sentence1_test_id = sentence2id(tokens_1)
    longest_1 = max(map(len, sentence1_test_id))
    sentence2_test_id = sentence2id(tokens_2)
    longest_2 = max(map(len, sentence2_test_id))

    test_dataset = SnliDataset(sentence1_test_id, sentence2_test_id, test_target)
    test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                         batch_size = BATCH_SIZE,
                                         collate_fn = vocab_collate_func,
                                         shuffle = False)
    return max(longest_1, longest_2), test_loader

In [None]:
# Evaluate the CNN on the "telephone" genre. The global MAX_SENTENCE_LENGTH is rebound
# before the loader is iterated because vocab_collate_func pads to it.
telephone_max_len, telephone_loader = preprocess_test_data(test_data_telephone)
MAX_SENTENCE_LENGTH, test_loader = telephone_max_len, telephone_loader
test_acc_telephone_cnn = test_model_cnn(test_loader, model_cnn)
#test_acc_telephone_cnn
#print(len(test_loader))
#print(should be 32)


In [None]:
# max_idx = 0
# sec_len = 0
# max_len = 0
# max_sen = ''
# for i in range(len(sentence1_test_data)):
#   curr_len = len(sentence1_test_data[i])
#   if curr_len > max_len:
#     sec_len = max_len
#     max_idx = i
#     max_len = curr_len
#     max_sen = sentence1_test_data[i]
    
# #The second-longest length is only 53, so we consider removing the abnormal sentence because it would result in excessive padding
   

In [None]:
# Evaluate the CNN on the remaining genres, one after another. MAX_SENTENCE_LENGTH and
# test_loader are rebound for each genre and end up holding the "travel" values.
genre_accuracy = {}
for genre_name, genre_frame in (('slate', test_data_slate),
                                ('government', test_data_government),
                                ('travel', test_data_travel)):
    MAX_SENTENCE_LENGTH, test_loader = preprocess_test_data(genre_frame)
    genre_accuracy[genre_name] = test_model_cnn(test_loader, model_cnn)

test_acc_slate_cnn = genre_accuracy['slate']
test_acc_government_cnn = genre_accuracy['government']
test_acc_travel_cnn = genre_accuracy['travel']


In [None]:
def test_model_cnn_extract(loader, model):
    
    model.eval()
    for i, (data_1, length_1, data_2, length_2, labels) in enumerate(loader):
        #TAKE THE second batch different from RNN
        if i == 2:
            break
      
    data_1_batch, length_1_batch, data_2_batch, length_2_batch,labels_batch = data_1, length_1, data_2, length_2, labels
    data_1_batch.cuda()
    data_2_batch.cuda()
    length_1_batch.cuda()
    length_2_batch.cuda()
    labels_batch.cuda()

    outputs = model(data_1_batch, data_2_batch)

    outputs = F.softmax(outputs, dim = 1)

    predicted = outputs.max(1, keepdim = True)[1]

    result = predicted.eq(labels.view_as(predicted))
        
    return data_1, data_2, labels, predicted, result


### Now RNN
### 1. concatenation with different hidden size

In [None]:
#Test function defined here for all gru modesl
def plot_learning_curve(loss_list):
    """Plot the sequence of loss values as a line on a new 12x10 figure."""
    _, axes = plt.subplots(figsize = (12,10))
    axes.plot(loss_list)
  
def test_model_gru(loader, model):
    correct = 0
    total = 0
    model.eval()
    
    for data_1, length_1, data_2, length_2, labels in loader:
        data_1_batch, length_1_batch, data_2_batch, length_2_batch,labels_batch = data_1, length_1, data_2, length_2, labels
        data_1_batch = data_1_batch.cuda()
        data_2_batch = data_2_batch.cuda()
        length_1_batch = length_1_batch.cuda()
        length_2_batch = length_2_batch.cuda()
        labels_batch = labels_batch.cuda()

    outputs = model(data_1_batch, data_2_batch, length_1_batch, length_2_batch).cuda()

    outputs = F.softmax(outputs, dim = 1)

    predicted = outputs.max(1, keepdim = True)[1].cuda()

    total += labels.size(0)
    correct += predicted.eq(labels.view_as(predicted)).sum().item()

    return (100 * correct/total)



In [None]:
#GRU bidirectional class with concatenations
class GRU(nn.Module):
    """Sentence-pair classifier: a shared bidirectional GRU encodes each sentence,
    the two sentence vectors are concatenated and fed to a two-layer MLP.

    Args:
        emb_size: dimensionality of the word embeddings.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        num_classes: number of output classes.
        loaded_embedding: numpy matrix of word vectors, indexed by token id
            (it is wrapped with torch.from_numpy after the lookup, so it is
            not a trainable parameter).
    """
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, loaded_embedding):
        super(GRU, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = loaded_embedding
        self.GRU = nn.GRU(emb_size, hidden_size, num_layers, batch_first = True, bidirectional = True).cuda()

        # Each sentence vector is 2*hidden_size (both directions); two sentences
        # concatenated give 4*hidden_size.
        self.fc1 = nn.Linear(4*hidden_size, hidden_size).float().cuda()
        self.fc2 = nn.Linear(hidden_size, num_classes).float().cuda()

    def init_hidden(self, batch_size):
        """Return a random initial hidden state of shape (num_layers*2, batch, hidden).

        NOTE(review): a random initial state makes evaluation non-deterministic;
        consider zeros if reproducible predictions are needed.
        """
        hidden = torch.randn((self.num_layers*2), batch_size, self.hidden_size).cuda()
        return hidden

    def _encode(self, x, lengths):
        """Encode one padded batch of token ids into (batch, 2*hidden) sentence vectors.

        Returns the sentence vectors in the ORIGINAL batch order together with
        the final GRU hidden state (which is in length-sorted order).
        """
        hidden = self.init_hidden(x.size(0))

        # pack_padded_sequence expects sequences sorted by decreasing length.
        lengths_sorted, sorted_idx = torch.sort(lengths, descending = True)
        _, unsorted_idx = torch.sort(sorted_idx)
        x_sorted = x.index_select(dim=0, index=sorted_idx.to(x.device))

        # The embedding matrix is a numpy array, so the lookup has to use a CPU
        # index array; a CUDA tensor cannot be converted to numpy implicitly.
        embed = torch.from_numpy(self.embedding[x_sorted.cpu().numpy()]).float().to(hidden.device)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths_sorted.cpu(), batch_first = True)

        _, hidden = self.GRU(packed, hidden)

        # Last layer's forward and backward final states (identical to rows 0
        # and 1 when num_layers == 1).
        sentence_vec = torch.cat((hidden[-2], hidden[-1]), dim=1)
        # Undo the length sort so rows line up with the labels again.
        sentence_vec = sentence_vec.index_select(dim=0, index=unsorted_idx.to(sentence_vec.device))
        return sentence_vec, hidden

    def forward(self, x1, x2, length1, length2):
        """Return class logits of shape (batch, num_classes) for the sentence pairs."""
        output_1, self.hidden_1 = self._encode(x1, length1)
        output_2, self.hidden_2 = self._encode(x2, length2)

        rnn_out = torch.cat((output_1, output_2), dim = 1)  # batch x (4*hidden_size)

        rnn_out = F.relu(self.fc1(rnn_out), inplace = True)
        logits = self.fc2(rnn_out)

        return logits
        

In [None]:
#Brief test: overfit one mini-batch to sanity-check the model and the training code
train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

model_gru_mul = GRU(emb_size = 300, hidden_size = 600, num_layers = 1, num_classes = 3, loaded_embedding = loaded_embeddings)
model_gru_mul = model_gru_mul.cuda()

#Hyper-parameters for the sanity check
learning_rate = 3e-4
num_epochs = 6
num_classes = 3  # a stray `xq` token after this assignment used to raise NameError
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model_gru_mul.parameters(), lr = learning_rate)

#Get a single mini-batch (the first one the shuffled loader yields)
data_1, length_1, data_2, length_2, labels = next(iter(train_loader))

#Train on that same batch 500 times; the loss should drop towards zero
train_loss_list = []
for i in range(0,500):
    model_gru_mul.train()
    optimizer.zero_grad()
    outputs = model_gru_mul(data_1, data_2, length_1, length_2).cuda()
    loss = criterion(outputs, labels).cuda()
    train_loss_list.append(loss.item())
    loss.backward()
    optimizer.step()

#Accuracy on the batch that was just memorised
total = 0
correct = 0
model_gru_mul.eval()
outputs = model_gru_mul(data_1, data_2, length_1, length_2).cuda()

outputs = F.softmax(outputs, dim = 1)

predicted = outputs.max(1, keepdim = True)[1]

total += labels.size(0)
correct += predicted.eq(labels.view_as(predicted)).sum().item()

print('The train accuracy is {}'.format(100 * correct/total))
plot_learning_curve(train_loss_list)

In [None]:
#Train for different hidden size
hidden_list = [200,400,600,800,1000]

# One entry per hidden size: per-step training losses, periodic validation
# accuracies, and the validation accuracy after the last epoch.
total_train_loss_list = []
total_val_acc_list = []
total_final_val_acc = []

# The data does not depend on the hidden size, so the loaders are built once.
train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

for hidden_size in hidden_list:
    print('The current hidden size is {}'.format(hidden_size))

    #Create a fresh model, loss and optimizer for this hidden size
    model_gru = GRU(emb_size = 300, hidden_size = hidden_size, num_layers = 1, num_classes = 3, loaded_embedding = loaded_embeddings).cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    learning_rate = 3e-4

    optimizer = torch.optim.Adam(model_gru.parameters(), lr = learning_rate)
    num_epochs = 10

    train_loss_list = []
    val_acc_list = []

    for epoch in range(num_epochs):
        for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
            # test_model_gru switches the model to eval mode, so train mode is
            # re-enabled on every step.
            model_gru.train()
            optimizer.zero_grad()

            outputs = model_gru(data_1, data_2, length_1, length_2).cuda()
            loss = criterion(outputs, labels).cuda()
            train_loss_list.append(loss.item())

            loss.backward()
            optimizer.step()

            # Validate every 500 mini-batches
            if i > 0 and i % 500 == 0:
                val_acc = test_model_gru(val_loader, model_gru)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                      epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    print('end of epoch iterations')
    total_train_loss_list.append(train_loss_list)
    total_val_acc_list.append(val_acc_list)

    final_val_acc = test_model_gru(val_loader, model_gru)
    total_final_val_acc.append(final_val_acc)

# Persist the results; `with` guarantees the files are flushed and closed.
for out_path, payload in (('hidden_size_train_loss.p', total_train_loss_list),
                          ('hidden_size_val_acc.p', total_val_acc_list),
                          ('total_final_val_acc.p', total_final_val_acc)):
    with open(out_path, 'wb') as out_file:
        pkl.dump(payload, out_file)

# files.download('hidden_size_train_loss.p')
# files.download('hidden_size_val_acc.p')
# files.download('total_final_val_acc.p')
  

### 2. Now let's test the GRU with element-wise multiplication across different hidden sizes

In [None]:
class GRU(nn.Module):
    """Sentence-pair classifier: a shared bidirectional GRU encodes each sentence,
    the two sentence vectors are multiplied element-wise and fed to a two-layer MLP.

    Args:
        emb_size: dimensionality of the word embeddings.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        num_classes: number of output classes.
        loaded_embedding: numpy matrix of word vectors, indexed by token id
            (it is wrapped with torch.from_numpy after the lookup, so it is
            not a trainable parameter).
    """
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, loaded_embedding):
        super(GRU, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = loaded_embedding
        self.GRU = nn.GRU(emb_size, hidden_size, num_layers, batch_first = True, bidirectional = True).cuda()

        # Each sentence vector is 2*hidden_size (both directions); the
        # element-wise product keeps that size.
        self.fc1 = nn.Linear(2*hidden_size, hidden_size).float().cuda()
        self.fc2 = nn.Linear(hidden_size, num_classes).float().cuda()

    def init_hidden(self, batch_size):
        """Return a random initial hidden state of shape (num_layers*2, batch, hidden).

        NOTE(review): a random initial state makes evaluation non-deterministic;
        consider zeros if reproducible predictions are needed.
        """
        hidden = torch.randn((self.num_layers*2), batch_size, self.hidden_size).cuda()
        return hidden

    def _encode(self, x, lengths):
        """Encode one padded batch of token ids into (batch, 2*hidden) sentence vectors.

        Returns the sentence vectors in the ORIGINAL batch order together with
        the final GRU hidden state (which is in length-sorted order).
        """
        hidden = self.init_hidden(x.size(0))

        # pack_padded_sequence expects sequences sorted by decreasing length.
        lengths_sorted, sorted_idx = torch.sort(lengths, descending = True)
        _, unsorted_idx = torch.sort(sorted_idx)
        x_sorted = x.index_select(dim=0, index=sorted_idx.to(x.device))

        # The embedding matrix is a numpy array, so the lookup has to use a CPU
        # index array; a CUDA tensor cannot be converted to numpy implicitly.
        embed = torch.from_numpy(self.embedding[x_sorted.cpu().numpy()]).float().to(hidden.device)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths_sorted.cpu(), batch_first = True)

        _, hidden = self.GRU(packed, hidden)

        # Last layer's forward and backward final states (identical to rows 0
        # and 1 when num_layers == 1).
        sentence_vec = torch.cat((hidden[-2], hidden[-1]), dim=1)
        # Undo the length sort so rows line up with the labels again.
        sentence_vec = sentence_vec.index_select(dim=0, index=unsorted_idx.to(sentence_vec.device))
        return sentence_vec, hidden

    def forward(self, x1, x2, length1, length2):
        """Return class logits of shape (batch, num_classes) for the sentence pairs."""
        output_1, self.hidden_1 = self._encode(x1, length1)
        output_2, self.hidden_2 = self._encode(x2, length2)

        # Element-wise product. torch.mul only accepts two operands, so the old
        # three-argument call torch.mul(output_1, 1, output_2) raised a TypeError.
        rnn_out = output_1 * output_2  # batch x (2*hidden_size)

        rnn_out = F.relu(self.fc1(rnn_out), inplace = True)
        logits = self.fc2(rnn_out)

        return logits

In [None]:
#Train for different hidden size
hidden_list = [200,400,600,800,1000]

# One entry per hidden size: per-step training losses, periodic validation
# accuracies, and the validation accuracy after the last epoch.
total_train_loss_list = []
total_val_acc_list = []
total_final_val_acc = []

# The data does not depend on the hidden size, so the loaders are built once.
train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

for hidden_size in hidden_list:
    print('The current hidden size is {}'.format(hidden_size))

    #Create a fresh model, loss and optimizer for this hidden size
    model_gru = GRU(emb_size = 300, hidden_size = hidden_size, num_layers = 1, num_classes = 3, loaded_embedding = loaded_embeddings).cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    learning_rate = 3e-4

    optimizer = torch.optim.Adam(model_gru.parameters(), lr = learning_rate)
    num_epochs = 10

    train_loss_list = []
    val_acc_list = []

    for epoch in range(num_epochs):
        for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
            # test_model_gru switches the model to eval mode, so train mode is
            # re-enabled on every step.
            model_gru.train()
            optimizer.zero_grad()

            outputs = model_gru(data_1, data_2, length_1, length_2).cuda()
            loss = criterion(outputs, labels).cuda()
            train_loss_list.append(loss.item())

            loss.backward()
            optimizer.step()

            # Validate every 500 mini-batches
            if i > 0 and i % 500 == 0:
                val_acc = test_model_gru(val_loader, model_gru)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                      epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    print('end of epoch iterations')
    total_train_loss_list.append(train_loss_list)
    total_val_acc_list.append(val_acc_list)

    final_val_acc = test_model_gru(val_loader, model_gru)
    # A leftover fragment of a commented-out print used to trail this line and
    # made the whole cell a SyntaxError.
    total_final_val_acc.append(final_val_acc)

# Persist the results; `with` guarantees the files are flushed and closed.
for out_path, payload in (('mul_size_train_loss_rnn.p', total_train_loss_list),
                          ('mul_size_val_acc_rnn.p', total_val_acc_list),
                          ('mul_total_final_val_acc_rnn.p', total_final_val_acc)):
    with open(out_path, 'wb') as out_file:
        pkl.dump(payload, out_file)

  

### 3. Now let's test the GRU with element-wise addition across different hidden sizes

In [None]:
class GRU(nn.Module):
    """Sentence-pair classifier: a shared bidirectional GRU encodes each sentence,
    the two sentence vectors are added element-wise and fed to a two-layer MLP.

    Args:
        emb_size: dimensionality of the word embeddings.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        num_classes: number of output classes.
        loaded_embedding: numpy matrix of word vectors, indexed by token id
            (it is wrapped with torch.from_numpy after the lookup, so it is
            not a trainable parameter).
    """
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, loaded_embedding):
        super(GRU, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = loaded_embedding
        self.GRU = nn.GRU(emb_size, hidden_size, num_layers, batch_first = True, bidirectional = True).cuda()

        # Each sentence vector is 2*hidden_size (both directions); the
        # element-wise sum keeps that size.
        self.fc1 = nn.Linear(2*hidden_size, hidden_size).float().cuda()
        self.fc2 = nn.Linear(hidden_size, num_classes).float().cuda()

    def init_hidden(self, batch_size):
        """Return a random initial hidden state of shape (num_layers*2, batch, hidden).

        NOTE(review): a random initial state makes evaluation non-deterministic;
        consider zeros if reproducible predictions are needed.
        """
        hidden = torch.randn((self.num_layers*2), batch_size, self.hidden_size).cuda()
        return hidden

    def _encode(self, x, lengths):
        """Encode one padded batch of token ids into (batch, 2*hidden) sentence vectors.

        Returns the sentence vectors in the ORIGINAL batch order together with
        the final GRU hidden state (which is in length-sorted order).
        """
        hidden = self.init_hidden(x.size(0))

        # pack_padded_sequence expects sequences sorted by decreasing length.
        lengths_sorted, sorted_idx = torch.sort(lengths, descending = True)
        _, unsorted_idx = torch.sort(sorted_idx)
        x_sorted = x.index_select(dim=0, index=sorted_idx.to(x.device))

        # The embedding matrix is a numpy array, so the lookup has to use a CPU
        # index array; a CUDA tensor cannot be converted to numpy implicitly.
        embed = torch.from_numpy(self.embedding[x_sorted.cpu().numpy()]).float().to(hidden.device)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths_sorted.cpu(), batch_first = True)

        _, hidden = self.GRU(packed, hidden)

        # Last layer's forward and backward final states (identical to rows 0
        # and 1 when num_layers == 1).
        sentence_vec = torch.cat((hidden[-2], hidden[-1]), dim=1)
        # Undo the length sort so rows line up with the labels again.
        sentence_vec = sentence_vec.index_select(dim=0, index=unsorted_idx.to(sentence_vec.device))
        return sentence_vec, hidden

    def forward(self, x1, x2, length1, length2):
        """Return class logits of shape (batch, num_classes) for the sentence pairs."""
        output_1, self.hidden_1 = self._encode(x1, length1)
        output_2, self.hidden_2 = self._encode(x2, length2)

        # Element-wise sum. The legacy positional form torch.add(a, 1, b)
        # (a + 1*b) is no longer accepted by current PyTorch; `+` is equivalent.
        rnn_out = output_1 + output_2  # batch x (2*hidden_size)

        rnn_out = F.relu(self.fc1(rnn_out), inplace = True)
        logits = self.fc2(rnn_out)

        return logits
        
        
        
        

In [None]:
#Train for different hidden size
hidden_list = [200,400,600,800,1000]

# One entry per hidden size: per-step training losses, periodic validation
# accuracies, and the validation accuracy after the last epoch.
total_train_loss_list = []
total_val_acc_list = []
total_final_val_acc = []

# The data does not depend on the hidden size, so the loaders are built once.
train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = BATCH_SIZE,
                                          collate_fn = vocab_collate_func,
                                          shuffle = True)
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)

for hidden_size in hidden_list:
    print('The current hidden size is {}'.format(hidden_size))

    #Create a fresh model, loss and optimizer for this hidden size
    model_gru = GRU(emb_size = 300, hidden_size = hidden_size, num_layers = 1, num_classes = 3, loaded_embedding = loaded_embeddings).cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()
    learning_rate = 3e-4

    optimizer = torch.optim.Adam(model_gru.parameters(), lr = learning_rate)
    num_epochs = 10

    train_loss_list = []
    val_acc_list = []

    for epoch in range(num_epochs):
        for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
            # test_model_gru switches the model to eval mode, so train mode is
            # re-enabled on every step.
            model_gru.train()
            optimizer.zero_grad()

            outputs = model_gru(data_1, data_2, length_1, length_2).cuda()
            loss = criterion(outputs, labels).cuda()
            train_loss_list.append(loss.item())

            loss.backward()
            optimizer.step()

            # Validate every 500 mini-batches
            if i > 0 and i % 500 == 0:
                val_acc = test_model_gru(val_loader, model_gru)
                val_acc_list.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                      epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    print('end of epoch iterations')
    total_train_loss_list.append(train_loss_list)
    total_val_acc_list.append(val_acc_list)

    final_val_acc = test_model_gru(val_loader, model_gru)
    # A leftover fragment of a commented-out print used to trail this line and
    # made the whole cell a SyntaxError.
    total_final_val_acc.append(final_val_acc)

# Persist the results; `with` guarantees the files are flushed and closed.
for out_path, payload in (('add_size_train_loss_rnn.p', total_train_loss_list),
                          ('add_size_val_acc_rnn.p', total_val_acc_list),
                          ('add_total_final_val_acc_rnn.p', total_final_val_acc)):
    with open(out_path, 'wb') as out_file:
        pkl.dump(payload, out_file)

  

In [None]:
# Load the final validation accuracies (one value per hidden size) of the three
# interaction variants. These pickles are written by the sweep cells of this
# notebook; only unpickle files you created yourself.
with open('total_final_val_acc.p', 'rb') as in_file:
    val_cat = pkl.load(in_file)
with open('mul_total_final_val_acc_rnn.p', 'rb') as in_file:
    val_mul = pkl.load(in_file)
with open('add_total_final_val_acc_rnn.p', 'rb') as in_file:
    val_add = pkl.load(in_file)
val_cat, val_mul, val_add

## As a result, I decided to adopt the concatenation at hidden size 60 to do the test

In [None]:
#Clean test data and remove outliers
# Drop the two abnormal rows by index label. The keyword form is used because
# pandas 2.x no longer accepts `axis` as a positional argument of drop().
test_data_fiction_cln = test_data_fiction.drop(index = [4377, 2920])

#Form the token lists and targets - will be repeated later for different genres
label_to_id = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

sentence1_test_data = []
sentence2_test_data = []
test_target = []

# Walk the rows once so that a row with an unknown label is skipped in all
# three lists together and sentences never get paired with the wrong target.
for premise, hypothesis, label in zip(test_data_fiction_cln.sentence1,
                                      test_data_fiction_cln.sentence2,
                                      test_data_fiction_cln.label):
    if label not in label_to_id:
        continue
    sentence1_test_data.append(premise.split())
    sentence2_test_data.append(hypothesis.split())
    test_target.append(label_to_id[label])

sentence1_test_id = sentence2id(sentence1_test_data)
max_length1 = max(len(token_ids) for token_ids in sentence1_test_id)
sentence2_test_id = sentence2id(sentence2_test_data)
max_length2 = max(len(token_ids) for token_ids in sentence2_test_id)

# Longest sentence (in tokens) over both columns; should be 55 for this subset.
MAX_SENTENCE_LENGTH = max(max_length1, max_length2)
emb_size = 300

test_dataset = SnliDataset(sentence1_test_id, sentence2_test_id, test_target)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                         batch_size = BATCH_SIZE,
                                         collate_fn = vocab_collate_func,
                                         shuffle = False)

In [None]:
# max_idx = 0
# sec_len = 0
# max_len = 0
# max_sen = ''
# for i in range(len(sentence1_test_data)):
#   curr_len = len(sentence1_test_data[i])
#   if curr_len > max_len:
#     sec_len = max_len
#     max_idx = i
#     max_len = curr_len
#     max_sen = sentence1_test_data[i]
    
# #The second-longest sentence has only 53 tokens, so we remove the abnormal sentence because it would result in excessive padding
   

In [None]:
def preprocess_test_data(test_data):
    """Tokenize a test DataFrame and wrap it in an unshuffled DataLoader.

    Args:
        test_data: DataFrame with string columns `sentence1`, `sentence2` and
            a `label` column holding 'entailment', 'contradiction' or 'neutral'.

    Returns:
        (MAX_SENTENCE_LENGTH, test_loader): the longest sentence length in
        tokens over both columns (0 when there are no rows) and a DataLoader
        over the id-encoded sentence pairs.
    """
    label_to_id = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

    sentence1_test_data = []
    sentence2_test_data = []
    test_target = []

    # Walk the rows once so that a row with an unknown label is skipped in all
    # three lists together; previously only its target was skipped, which
    # shifted every following label onto the wrong sentence pair.
    for premise, hypothesis, label in zip(test_data.sentence1, test_data.sentence2, test_data.label):
        if label not in label_to_id:
            continue
        sentence1_test_data.append(premise.split())
        sentence2_test_data.append(hypothesis.split())
        test_target.append(label_to_id[label])

    sentence1_test_id = sentence2id(sentence1_test_data)
    sentence2_test_id = sentence2id(sentence2_test_data)

    # default=0 keeps max() from raising on an empty subset.
    max_length1 = max((len(token_ids) for token_ids in sentence1_test_id), default = 0)
    max_length2 = max((len(token_ids) for token_ids in sentence2_test_id), default = 0)
    MAX_SENTENCE_LENGTH = max(max_length1, max_length2)

    test_dataset = SnliDataset(sentence1_test_id, sentence2_test_id, test_target)
    test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                         batch_size = BATCH_SIZE,
                                         collate_fn = vocab_collate_func,
                                         shuffle = False)
    return MAX_SENTENCE_LENGTH, test_loader

In [None]:
# Train the final model on the full SNLI training set and report its accuracy
# on the validation loader and on the current `test_loader`.
train_dataset = SnliDataset(sentence1_train_id, sentence2_train_id, train_target)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset, 
                                        batch_size = BATCH_SIZE,
                                        collate_fn = vocab_collate_func,
                                        shuffle = True)
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                      batch_size = BATCH_SIZE,
                                      collate_fn = vocab_collate_func,
                                      shuffle = True)
#Train the final models
# NOTE(review): `GRU` resolves to whichever of the notebook's GRU class
# definitions was executed most recently - confirm it is the intended variant
# before running this cell.
model_gru_mul = GRU(emb_size = 300, hidden_size = 400, num_layers = 1, 
                    num_classes = 3, loaded_embedding = loaded_embeddings).cuda()
criterion = torch.nn.CrossEntropyLoss().cuda()
learning_rate = 3e-4
optimizer = torch.optim.Adam(model_gru_mul.parameters(), lr = learning_rate)
num_epochs = 8;

# Per-mini-batch training loss and the validation accuracy recorded every 500 steps.
train_loss_list = []
val_acc_list = []

for epoch in range(num_epochs):
    for i, (data_1, length_1, data_2, length_2, labels) in enumerate(train_loader):
        # test_model_gru puts the model in eval mode, so train mode is restored on every step.
        model_gru_mul.train()
        optimizer.zero_grad()

        outputs = model_gru_mul(data_1, data_2, length_1, length_2).cuda()
        loss = criterion(outputs, labels).cuda()
        train_loss_list.append(loss.item())

        loss.backward()
        optimizer.step()

        # Periodic validation: every 500 mini-batches (skipping step 0).
        if i > 0 and i % 500 == 0:
            val_acc = test_model_gru(val_loader, model_gru_mul)
            val_acc_list.append(val_acc)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                    epoch+1, num_epochs, i+1, len(train_loader), val_acc))


# Final scores. `test_loader` is whatever loader was built last before this cell
# ran - presumably the fiction loader, given the message below; verify.
final_val_acc = test_model_gru(val_loader, model_gru_mul)
final_test_acc = test_model_gru(test_loader, model_gru_mul)

# Prints the validation accuracy first, then the test accuracy.
print('The test accuracy for fiction is {}, {}'.format(final_val_acc, final_test_acc))
  
# pkl.dump(train_loss_list, open('fiction_train.p', 'wb'))
# pkl.dump(val_acc_list, open('fiction_val.p', 'wb'))
# pkl.dump([final_val_acc, final_test_acc], open('fiction_test.p', 'wb'))
  

In [None]:
#Small test to find abnormal (unusually long) inputs
# max_idx = 0
# sec_len = 0
# max_len = 0
# max_sen = ''
# for i in range(len(sentence1_test_data)):
#   curr_len = len(sentence1_test_data[i])
#   if curr_len > max_len:
#     sec_len = max_len
#     max_idx = i
#     max_len = curr_len
#     max_sen = sentence1_test_data[i]

In [None]:
# Evaluate the remaining genres with the SAME final model that was scored on
# fiction (`model_gru_mul`). The previous code passed `model_gru`, which is the
# last model left over from the hidden-size sweep, so the per-genre numbers
# were not comparable with the fiction result.
# MAX_SENTENCE_LENGTH is reassigned at module level before each evaluation, as
# in the original cell.
MAX_SENTENCE_LENGTH, test_loader = preprocess_test_data(test_data_telephone)
test_acc_telephone = test_model_gru(test_loader, model_gru_mul)
MAX_SENTENCE_LENGTH, test_loader = preprocess_test_data(test_data_slate)
test_acc_slate = test_model_gru(test_loader, model_gru_mul)
MAX_SENTENCE_LENGTH, test_loader = preprocess_test_data(test_data_government)
test_acc_government = test_model_gru(test_loader, model_gru_mul)
MAX_SENTENCE_LENGTH, test_loader = preprocess_test_data(test_data_travel)
test_acc_travel = test_model_gru(test_loader, model_gru_mul)


### Visualization:

In [None]:
#Learning curves of the concatenation GRU for every hidden size
# The pickles are written by the sweep cells of this notebook; only unpickle
# files you created yourself. `with` closes the handles after loading.
with open('hidden_size_train_loss.p', 'rb') as in_file:
    cat_train_loss_rnn = pkl.load(in_file)
with open('hidden_size_val_acc.p', 'rb') as in_file:
    cat_val_acc_rnn = pkl.load(in_file)
#mul_train_loss_rnn = pkl.load(open('mul_size_train_loss_rnn.p', 'rb'))
#mul_val_acc_rnn = pkl.load(open('mul_size_val_acc_rnn.p', 'rb'))
#add_train_loss_rnn = pkl.load(open('add_size_train_loss_rnn.p', 'rb'))
#add_val_acc_rnn = pkl.load(open('add_size_val_acc_rnn.p', 'rb'))

hidden_size_list = [200,400,600,800,1000]
# Top row: training loss; bottom row: validation accuracy; one column per hidden size.
fig, ax = plt.subplots(nrows = 2, ncols = len(hidden_size_list), figsize = (30,8))

for i in range(len(hidden_size_list)):
    ax[0, i].plot(cat_train_loss_rnn[i])
    ax[1, i].plot(cat_val_acc_rnn[i])
    # The loss is recorded once per mini-batch and the accuracy once every
    # 500 mini-batches, so the x-axes are steps/checkpoints, not data size or epochs.
    ax[0,i].set_xlabel("Training step (mini-batch)")
    ax[1,i].set_xlabel("Validation checkpoint (every 500 steps)")
    ax[0,i].set_ylabel("Training loss")
    ax[1,i].set_ylabel("Evaluation accuracy")
    ax[0,i].set_title("Hidden Size: {}".format(hidden_size_list[i]))

fig.savefig('cat_hidden_size_rnn_plot.png')
files.download('cat_hidden_size_rnn_plot.png')

# fig.savefig('mul_hidden_size_rnn_plot.png')
# files.download('mul_hidden_size_rnn_plot.png')



# fig.savefig('add_hidden_size_rnn_plot.png')
# files.download('add_hidden_size_rnn_plot.png')



### Extract the examples by using non-shuffled validation data

In [None]:
# Rebuild the validation loader WITHOUT shuffling so that a position inside a
# batch can be mapped back to an index of the validation set.
val_dataset = SnliDataset(sentence1_val_id, sentence2_val_id, val_target)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle = False,
    collate_fn = vocab_collate_func,
    batch_size = BATCH_SIZE,
)

In [None]:
def test_model_gru_extract(loader, model):
    model.eval()
    for i, (data_1, length_1, data_2, length_2, labels) in enumerate(loader):
        if i == 1:
            break
      
    data_1_batch, length_1_batch, data_2_batch, length_2_batch,labels_batch = data_1, length_1, data_2, length_2, labels
    data_1_batch = data_1_batch.cuda()
    data_2_batch = data_2_batch.cuda()
    length_1_batch = length_1_batch.cuda()
    length_2_batch = length_2_batch.cuda()
    labels_batch = labels_batch.cuda()

    outputs = model(data_1_batch, data_2_batch, length_1_batch, length_2_batch).cuda()

    outputs = F.softmax(outputs, dim = 1)

    predicted = outputs.max(1, keepdim = True)[1].cuda()

    result = predicted.eq(labels.view_as(predicted))

    return data_1, data_2, labels, predicted, result

  

In [None]:
#Manual check for correct and incorrect classifications.
# Run the final model on one batch of the unshuffled validation loader and keep
# the inputs, labels, predictions and per-example correctness flags.
data1_e, data2_2, labels_e, predicted_e, result_e = test_model_gru_extract(val_loader, model_gru_mul)
# Copy the first-sentence token ids to the CPU so they can be viewed as numpy.
data1_e1 = data1_e.cpu()
# Show the token ids of the example at position 1 of that batch.
data1_e1.numpy()[1]


In [None]:
# Per-example flags for the extracted batch: does the prediction equal the label?
result_e

In [None]:
# Predicted class index of every example in the extracted batch.
predicted_e

In [None]:
# Raw sentences and target of validation example 37. With BATCH_SIZE = 32 and an
# unshuffled loader, index 37 lies in the second batch (indices 32-63), position 5.
sentence1_val_data[37], sentence2_val_data[37], val_target[37]