In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import bcolz
import pickle
import spacy
from tqdm import tqdm_notebook
import string
from collections import defaultdict
from functools import reduce
from collections import Counter
import ast
import torch.backends.cudnn as cudnn
import io
import matplotlib.pylab as plt
import json

## HyperParameter


In [None]:
# Reserved vocabulary ids: 0 is the padding id, 1 the out-of-vocabulary id.
PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 100  # batch size of every DataLoader built in this notebook
EMBED_DIM = 300  # number of columns of the embedding weight matrix
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer
# Maximum number of tokens SNILDataset keeps from sentence1 / sentence2.
MAX_SENT_LENGTH1=34
MAX_SENT_LENGTH2=34
# Maximum number of tokens MnliDataset keeps from sentence1 / sentence2;
# MNLIsent_collate_func also pads every MNLI batch to exactly this width.
mnli_MAX_SENTENCE_LENGTH1 = 25 
mnli_MAX_SENTENCE_LENGTH2 = 25
EPOCH_NUM = 10  # number of training epochs per configuration
MAX_VOCAB = 20000  # number of most frequent tokens kept by build_vocab
HIDDEN_SIZE = [100, 150, 200]  # hidden sizes swept by the training loop
KERNEL_SIZE = 3  # Conv1d kernel size used when the CNN is built
DROP_OUT = 0  # NOTE(review): not referenced anywhere in the visible code
CONCAT = ['Concatenation']  # NOTE(review): not referenced anywhere in the visible code

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Directory the SNLI .tsv files and the fastText vector file are read from.
# NOTE(review): hard-coded absolute path; despite the name, the visible code
# only loads fastText vectors from it.
glove_path = '/scratch/wz1218/hw2_data'
print(device)

## Tokenize

In [None]:
def tokenize_dataset(dataset):
    """
    Tokenize both sentence columns of a dataset and encode its labels.

    Sentences are lower-cased, stripped and split on single spaces.

    @param dataset: object exposing iterable `sentence1`, `sentence2` and
        `label` attributes (for example a pandas DataFrame with those columns)
    @return: tuple (token_dataset_sn1, token_dataset_sn2, all_tokens, num_label)
        - token_dataset_sn1 / token_dataset_sn2: one token list per sentence
        - all_tokens: every token of sentence1 followed by every token of sentence2
        - num_label: 0 for "contradiction", 1 for "entailment", 2 for "neutral"
    @raise ValueError: if a label is not one of the three known classes
    """
    label_to_id = {"contradiction": 0, "entailment": 1, "neutral": 2}

    def _tokenize(sentence):
        # Splitting on a single space (not on arbitrary whitespace) is kept on
        # purpose so that the produced tokens match the original tokenization.
        return sentence.lower().strip().split(" ")

    token_dataset_sn1 = [_tokenize(sentence) for sentence in dataset.sentence1]
    token_dataset_sn2 = [_tokenize(sentence) for sentence in dataset.sentence2]

    all_tokens = [tok for sentence in token_dataset_sn1 for tok in sentence]
    all_tokens += [tok for sentence in token_dataset_sn2 for tok in sentence]

    num_label = []
    for row, raw_label in enumerate(dataset.label):
        # Fail loudly instead of silently reusing the previous row's label.
        if raw_label not in label_to_id:
            raise ValueError(
                "unknown label {!r} at row {}; expected one of {}".format(
                    raw_label, row, sorted(label_to_id)))
        num_label.append(label_to_id[raw_label])
    return token_dataset_sn1, token_dataset_sn2, all_tokens, num_label

In [None]:
# Load the tab-separated SNLI splits. `sep` is passed by keyword because
# pandas 2.0 accepts only the path as a positional argument of read_csv.
snli_train = pd.read_csv(f"{glove_path}/snli_train.tsv", sep='\t')
snli_val = pd.read_csv(f"{glove_path}/snli_val.tsv", sep='\t')

In [None]:
# Tokenize both splits. Only the token list of the training split is kept
# (all_tokens); the validation token list is discarded.
tr_sen1, tr_sen2, all_tokens, tr_num_label = tokenize_dataset(snli_train)
val_sen1, val_sen2, _, val_num_label = tokenize_dataset(snli_val)

## Load fasttext

In [None]:
def load_vectors(fname, limit=50000):
    """
    Read word vectors from a text file in the fastText `.vec` layout.

    The first line must be a header made of two integers; every following line
    is "<word> <v1> <v2> ..." separated by single spaces.

    @param fname: path of the vector file
    @param limit: highest zero-based index of a vector line that is still read,
        so at most `limit + 1` vectors are loaded; pass None to read the whole file
    @return: dict mapping each word to its vector as a list of floats
    @raise ValueError: if the header is not two integers or a component is not a float
    """
    data = {}
    # The context manager guarantees the file is closed, including on errors.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed, but parsing validates the layout.
        _count, _dim = map(int, fin.readline().split())
        for index, line in enumerate(fin):
            if limit is not None and index > limit:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data

# Load the fastText vectors (word -> list of floats) from the data directory.
fname = f'{glove_path}/fasttext300d.vec'
fasttext = load_vectors(fname)

## Token2id & Id2token

In [None]:
def build_vocab(all_tokens, max_vocab_size=None):
    """
    Build the token <-> id mappings from a flat list of tokens.

    Ids 0 and 1 are reserved for '<pad>' (PAD_IDX) and '<unk>' (UNK_IDX); the
    most frequent tokens receive ids 2, 3, ... in decreasing frequency order.

    @param all_tokens: iterable of tokens (may be empty)
    @param max_vocab_size: number of most frequent tokens to keep; defaults to
        the notebook-level MAX_VOCAB when None
    @return: tuple (token2id, id2token) where token2id is a dict and id2token
        a list such that id2token[token2id[t]] == t
    """
    if max_vocab_size is None:
        max_vocab_size = MAX_VOCAB
    token_counter = Counter(all_tokens)
    # A comprehension (instead of zip(*...)) also works when there are no tokens.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = dict(zip(vocab, range(2, 2 + len(vocab))))
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

## Create Weight Matrix

In [None]:
# Build the vocabulary from the training tokens, then an embedding matrix in
# which row k holds the fastText vector of the token whose id is k.
token2id, id2token = build_vocab(all_tokens)
unique_token = token2id.keys()
emb_dim = EMBED_DIM
matrix_len = len(unique_token)
# Rows start as zeros, so '<pad>', '<unk>' and every token that has no
# fastText vector keep an all-zero embedding.
weights_matrix = np.zeros((matrix_len, emb_dim))
words_found = 0

# Rows must be addressed by token id, not by iteration position: token2id
# lists the corpus tokens first (ids 2, 3, ...) and the two special tokens
# last, so enumerate() positions are shifted by two relative to the ids that
# the embedding layer looks up.
for word, token_id in token2id.items():
    if word in fasttext:
        weights_matrix[token_id] = fasttext[word]
        words_found += 1

In [None]:
# Number of vocabulary entries for which a fastText vector was found.
print(words_found)

## DataLoader for the SNLI Dataset

In [None]:
class SNILDataset(Dataset):
    """
    PyTorch dataset over tokenized sentence pairs and their integer labels.
    Inherits from torch.utils.data.Dataset so it can be fed to a DataLoader.
    """

    def __init__(self, sent1, sent2, num_label, token2id):
        """
        @param sent1: list of token lists, one per first sentence
        @param sent2: list of token lists, one per second sentence
        @param num_label: list of integer labels, aligned with the sentences
        @param token2id: dict mapping a token to its vocabulary id
        """
        self.token2id = token2id
        self.sentense1 = sent1
        self.sentense2 = sent2
        self.target = num_label

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.sentense1)

    def _to_ids(self, tokens):
        """Map tokens to ids; tokens missing from the vocabulary become UNK_IDX."""
        return [self.token2id.get(tok, UNK_IDX) for tok in tokens]

    def __getitem__(self, key):
        """
        @param key: index of the example
        @return: [sentence1 ids, sentence2 ids, len(sentence1 ids),
                  len(sentence2 ids), label]; sentences are truncated to
                  MAX_SENT_LENGTH1 / MAX_SENT_LENGTH2 tokens
        """
        first_ids = self._to_ids(self.sentense1[key][:MAX_SENT_LENGTH1])
        second_ids = self._to_ids(self.sentense2[key][:MAX_SENT_LENGTH2])
        return [first_ids, second_ids, len(first_ids), len(second_ids), self.target[key]]



def sent_collate_func(batch):
    """
    Collate function for DataLoader: right-pads every sentence with zeros up
    to the longest sentence of the batch (separately for sentence 1 and 2).

    @param batch: list of items [sent1 ids, sent2 ids, len1, len2, label]
    @return: [sent1 tensor, sent2 tensor, len1 LongTensor, len2 LongTensor,
              label LongTensor]
    """
    sent1_length_list = [item[2] for item in batch]
    sent2_length_list = [item[3] for item in batch]
    label_list = [item[4] for item in batch]

    widest1 = np.max(sent1_length_list)
    widest2 = np.max(sent2_length_list)

    def _right_pad(ids, length, width):
        # Zero-fill on the right so that the row is exactly `width` long.
        return np.pad(np.array(ids), pad_width=((0, width - length)),
                      mode="constant", constant_values=0)

    sent1_data = [_right_pad(item[0], item[2], widest1) for item in batch]
    sent2_data = [_right_pad(item[1], item[3], widest2) for item in batch]

    sent1_tensor = torch.from_numpy(np.array(sent1_data))
    sent2_tensor = torch.from_numpy(np.array(sent2_data))
    return [sent1_tensor, sent2_tensor,
            torch.LongTensor(sent1_length_list), torch.LongTensor(sent2_length_list),
            torch.LongTensor(label_list)]

In [None]:
# Training split: shuffled on every pass, padded per batch by sent_collate_func.
train_data = SNILDataset(tr_sen1, tr_sen2, tr_num_label, token2id)
train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=True)

# Validation split: same vocabulary and collate function, kept in file order.
val_data = SNILDataset(val_sen1, val_sen2, val_num_label, token2id)
val_loader = torch.utils.data.DataLoader(dataset=val_data,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=sent_collate_func,
                                           shuffle=False)


## Create Embedding Layer

In [None]:
def create_emb_layer(weights_matrix, non_trainable=False):
    """
    Create an nn.Embedding initialised from a pre-computed weight matrix.

    @param weights_matrix: 2-D numpy array of shape (num_embeddings, embedding_dim)
    @param non_trainable: when True the embedding weights are frozen
        (requires_grad is set to False) so the optimizer never updates them
    @return: tuple (emb_layer, num_embeddings, embedding_dim); the layer is
        placed on the notebook-level `device`
    """
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    # copy_ casts the numpy values to the dtype of the layer's weight.
    with torch.no_grad():
        emb_layer.weight.copy_(torch.from_numpy(weights_matrix))
    emb_layer = emb_layer.to(device)
    if non_trainable:
        # Honour the flag; previously it was accepted but had no effect.
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

## GRU Model

In [None]:
class GRU(nn.Module):
    """
    Bidirectional GRU classifier for sentence pairs.

    Both sentences go through the same frozen embedding and the same GRU.
    Each sentence is represented by the final forward and backward hidden
    states of the last GRU layer (2 * hidden_size values). The two
    representations are combined either by concatenation or by element-wise
    multiplication and classified by a two-layer feed-forward head.

    Padding positions are not masked: the GRU reads the padded sequence as is.
    """

    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes, dropout, concat):
        """
        @param weights_matrix: numpy array (vocab size, embedding dim) with the pre-trained embeddings
        @param hidden_size: hidden size of the GRU (per direction)
        @param num_layers: number of stacked GRU layers
        @param num_classes: number of output classes
        @param dropout: dropout passed to nn.GRU (applied between stacked layers)
        @param concat: 'Concatenation' to concatenate the two sentence
            encodings; any other value multiplies them element-wise
        """
        super(GRU, self).__init__()
        self.num_layers, self.hidden_size, self.dropout, self.concat = num_layers, hidden_size, dropout, concat
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.embedding.weight.requires_grad = False
        self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True,
                          bidirectional=True, dropout=self.dropout)
        # One sentence encoding is 2 * hidden_size wide (forward + backward),
        # so the concatenation of two encodings is 4 * hidden_size wide ...
        self.linear1 = nn.Linear(hidden_size * 4, hidden_size)
        # ... and their element-wise product is 2 * hidden_size wide.
        self.mullinear = nn.Linear(hidden_size * 2, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, batch_size):
        """
        Return the initial hidden state, shape (num_layers * 2, batch_size, hidden_size).

        Zeros are used (rather than random values) so that evaluating the same
        input twice gives the same output. The tensor is created on the device
        and with the dtype of the embedding weights.
        """
        weight = self.embedding.weight
        return torch.zeros(self.num_layers * 2, batch_size, self.hidden_size,
                           device=weight.device, dtype=weight.dtype)

    def _encode(self, x, hidden):
        """Encode a batch of token ids (batch, seq_len) into (batch, 2 * hidden_size)."""
        _, h_n = self.rnn(self.embedding(x), hidden)
        # h_n is (num_layers * 2, batch, hidden); its last two entries are the
        # forward and backward final states of the top layer.
        top = h_n[-2:]
        # Bring the batch axis first BEFORE flattening, otherwise values of
        # different examples would be mixed into one row.
        return top.transpose(0, 1).contiguous().view(x.size(0), -1)

    def forward(self, x1, x2):
        """
        @param x1: LongTensor (batch, seq_len1) with the ids of the first sentences
        @param x2: LongTensor (batch, seq_len2) with the ids of the second sentences
        @return: logits of shape (batch, num_classes)
        """
        # Use the actual batch size: the last batch of a loader may be smaller.
        batch_size = x1.size(0)
        self.hidden = self.init_hidden(batch_size)

        enc1 = self._encode(x1, self.hidden)
        enc2 = self._encode(x2, self.hidden)

        if self.concat == 'Concatenation':
            hidden_combine = torch.cat((enc1, enc2), dim=1)
            features = self.relu(self.linear1(hidden_combine))
        else:
            hidden_combine = enc1 * enc2
            features = self.relu(self.mullinear(hidden_combine))
        logits = self.linear2(features)
        return logits


## 2 Layer CNN Model

In [None]:
class CNN(nn.Module):
    """
    Two-layer 1-D convolutional classifier for sentence pairs.

    Both sentences go through the same embedding and the same two
    Conv1d + ReLU layers, are max-pooled over the sequence axis, concatenated
    and classified by a two-layer feed-forward head.
    """

    def __init__(self, weights_matrix, hidden_size, kernel_size, num_class, stride, padding, bias):
        """
        @param weights_matrix: numpy array (vocab size, embedding dim) with the pre-trained embeddings
        @param hidden_size: number of channels of both convolutions and width of the hidden linear layer
        @param kernel_size: Conv1d kernel size
        @param num_class: number of output classes
        @param stride: Conv1d stride
        @param padding: Conv1d padding
        @param bias: whether the convolutions use a bias term
        """
        super(CNN, self).__init__()
        self.embedding, num_embeddings, self.embedding_dim = create_emb_layer(weights_matrix, True)
        self.hidden_size, self.kernel_size, self.num_class, self.stride, self.padding, self.bias \
            = hidden_size, kernel_size, num_class, stride, padding, bias
        self.conv_net1 = nn.Sequential(
            nn.Conv1d(self.embedding_dim, self.hidden_size, self.kernel_size,
                      self.stride, self.padding, bias=self.bias),
            nn.ReLU()
        )
        self.conv_net2 = nn.Sequential(
            nn.Conv1d(self.hidden_size, self.hidden_size, self.kernel_size,
                      self.stride, self.padding, bias=self.bias),
            nn.ReLU()
        )

        self.linear1 = nn.Linear(2 * self.hidden_size, hidden_size)
        # The output width follows num_class instead of a hard-coded 3.
        self.linear2 = nn.Linear(hidden_size, self.num_class)
        self.relu = nn.ReLU()

    def _encode(self, x):
        """Encode token ids (batch, seq_len) into a (batch, hidden_size) vector."""
        # Conv1d expects (batch, channels, seq_len), hence the transpose.
        channels_first = self.embedding(x).transpose(2, 1)
        feature_map = self.conv_net2(self.conv_net1(channels_first))
        # Max over the sequence axis keeps the strongest response per channel.
        pooled, _ = torch.max(feature_map, dim=2)
        return pooled

    def forward(self, x1, x2):
        """
        @param x1: LongTensor (batch, seq_len1) with the ids of the first sentences
        @param x2: LongTensor (batch, seq_len2) with the ids of the second sentences
        @return: logits of shape (batch, num_class)
        """
        hidden_combine = torch.cat((self._encode(x1), self._encode(x2)), 1)
        full_lc1 = self.relu(self.linear1(hidden_combine))
        full_lc2 = self.linear2(full_lc1)
        return full_lc2

In [None]:
def test_model(loader, model, criterion):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    val_total = 0
    for sent1, sent2, sent1_length, sent2_length, labels in loader:
        sent1, sent2, labels = sent1.to(device), sent2.to(device), labels.to(device)
        output = model(sent1, sent2)
        outputs = F.softmax(output, dim=1)
        val_loss = criterion(outputs, labels)
        val_total += val_loss
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return ((100 * correct / total), val_total)

In [None]:
# Learning curves of the sweep, keyed by the hidden size that produced them.
train_acc_all = defaultdict(list)
train_loss_b = defaultdict(list)
train_loss_e = defaultdict(list)
val_acc_all = defaultdict(list)
val_loss_all = defaultdict(list)
for hidden_size in HIDDEN_SIZE:
    #model = GRU(weights_matrix, hidden_size = hidden_size, num_layers=1, num_classes=3, dropout=0, concat='Concatenation').to(device)
    model = CNN(weights_matrix, hidden_size = hidden_size, kernel_size = KERNEL_SIZE, num_class = 3, stride = 1, padding = 1, bias = True).to(device)
    # Save under the file name that matches the architecture being trained.
    checkpoint_path = 'bestmodel_RNN.pth' if isinstance(model, GRU) else 'bestmodel_CNN.pth'
    learning_rate = LEARNING_RATE
    num_epochs = EPOCH_NUM
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    training_epoch_loss = []
    train_accs = []
    train_losses = []
    val_accs = []
    val_losses = []
    # Best validation accuracy seen so far for this hidden size.
    model_acc = 0
    for epoch in range(num_epochs):
        e_loss = 0.0
        # `step` (not `i`) so the batch counter cannot overwrite the hidden
        # size that keys the result dictionaries.
        for step, (sent1, sent2, sent1_length, sent2_length, labels) in enumerate(train_loader):
            model.train()
            optimizer.zero_grad()
            # Forward pass
            sent1, sent2, labels = sent1.to(device), sent2.to(device), labels.to(device)
            outputs = model(sent1, sent2)
            loss = criterion(outputs, labels)
            # .item() stores a plain float, so no computation graph is kept alive.
            e_loss += loss.item()
            loss.backward()
            optimizer.step()
            if step > 0 and step % 400 == 0:
                tr_acc, tr_loss = test_model(train_loader, model, criterion)
                val_acc, val_loss = test_model(val_loader, model, criterion)
                # float() accepts both Python numbers and 0-dim tensors.
                tr_loss, val_loss = float(tr_loss), float(val_loss)
                val_accs.append(val_acc)
                val_losses.append(val_loss)
                train_accs.append(tr_acc)
                train_losses.append(tr_loss)
                # Save the model only when the validation accuracy improves.
                if val_acc > model_acc:
                    model_acc = val_acc
                    torch.save(model.state_dict(), checkpoint_path)
                print('Epoch: [{}/{}], Step: [{}/{}], Training Acc: {}, Training Loss: {}, Validation Acc: {}, Validation Loss: {}'.format(
                    epoch + 1, num_epochs, step + 1, len(train_loader), tr_acc, tr_loss, val_acc, val_loss))

        training_epoch_loss.append(e_loss)
        print("epoch {0}, training loss = {1}".format(epoch, e_loss))
        # Store every curve under its hidden size; the result dictionaries
        # themselves are never replaced.
        train_acc_all[hidden_size] = train_accs
        train_loss_b[hidden_size] = train_losses
        train_loss_e[hidden_size] = training_epoch_loss
        val_acc_all[hidden_size] = val_accs
        val_loss_all[hidden_size] = val_losses

### Save Result to txt file
'''
CNN Hidden Tuning
'''
# Serialize the accuracy curves as JSON text. The top-level keys written
# here are 'train_acc' and 'val_acc'; any cell reading the file back has to
# use the same keys.
exDict = {'train_acc': train_acc_all, 'val_acc': val_acc_all}
with open('cnn_train_hidden.txt', 'w') as file:
     file.write(json.dumps(exDict))
'''
CNN Kernel Size Tuning
'''
# Same export for a kernel-size sweep, writing to a different file; disabled.
# exDict = {'train_acc': train_acc_all, 'val_acc': val_acc_all}
# with open('cnn_train_kernel.txt', 'w') as file:
#      file.write(json.dumps(exDict))


In [None]:
'''
CNN Kernel Tuning
'''
with open('cnn_train_kernel.txt', 'r') as f:
    result = json.load(f)
# The export cell writes the training curves under 'train_acc'; 'tr_acc' is
# still accepted in case a result file uses that key.
tr_acc = result['tr_acc'] if 'tr_acc' in result else result['train_acc']
val_acc = result['val_acc']

plt.figure(figsize = (20, 10))
for i in tr_acc:
    # The x axis is derived from each curve, so no particular kernel size
    # has to be present in the file.
    plt.plot(range(len(tr_acc[i])), tr_acc[i], label = 'Training Accuracy with kernel size {}*{}'.format(i, i))
    plt.plot(range(len(val_acc[i])), val_acc[i], label = 'Validation Accuracy with kernel size {}*{}'.format(i, i))
plt.xlabel("Steps")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# For every kernel size: best validation accuracy, then the training
# accuracy measured at that same evaluation step.
for i in tr_acc:
    print(np.max(val_acc[i]))
    print(tr_acc[i][np.argmax(val_acc[i])])

In [None]:
'''
CNN Hidden Size Tuning
'''
with open('cnn_train_hidden.txt', 'r') as f:
    result = json.load(f)
# The export cell writes the training curves under 'train_acc'; 'tr_acc' is
# still accepted in case a result file uses that key.
tr_acc = result['tr_acc'] if 'tr_acc' in result else result['train_acc']
val_acc = result['val_acc']

plt.figure(figsize = (20, 10))
for i in tr_acc:
    # The x axis is derived from each curve, so no particular hidden size
    # has to be present in the file.
    plt.plot(range(len(tr_acc[i])), tr_acc[i], label = 'Training Accuracy with hidden dimension {}'.format(i))
    plt.plot(range(len(val_acc[i])), val_acc[i], label = 'Validation Accuracy with hidden dimension {}'.format(i))
plt.xlabel("Steps")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# For every hidden size: best validation accuracy, then the training
# accuracy measured at that same evaluation step.
for i in tr_acc:
    print(np.max(val_acc[i]))
    print(tr_acc[i][np.argmax(val_acc[i])])

## Load Model

In [None]:
'''
load_model
'''
# map_location lets a checkpoint saved on a GPU be restored on whatever
# device this session runs on (including CPU-only machines).
state_dict_RNN = torch.load('bestmodel_RNN.pth', map_location=device)
state_dict_CNN = torch.load('bestmodel_CNN.pth', map_location=device)

# The recurrent model defined in this notebook is the class `GRU`, and its
# constructor keyword is `concat`.
rnn_model = GRU(weights_matrix, hidden_size = 200, num_layers=1, num_classes=3, dropout=0, concat = 'Concatenation').to(device)
rnn_model.load_state_dict(state_dict_RNN)

cnn_model = CNN(weights_matrix, hidden_size = 200, kernel_size = 3, num_class = 3, stride = 1, padding = 1, bias = True).to(device)
cnn_model.load_state_dict(state_dict_CNN)

## Pick up correct/incorrect

In [None]:
'''
pick up correct/incorrect
'''
def get_data(loader, model):
    """
    Collect the first example of every batch together with its prediction.

    @param loader: iterable of batches (sent1, sent2, sent1_length, sent2_length, labels)
    @param model: module called as model(sent1, sent2) and returning class logits
    @return: flat list that, for every batch in order, contains four tensors:
        sentence1 ids, sentence2 ids, true label (shape (1,)) and predicted
        label (shape (1,)) of the batch's first example
    """
    model.eval()
    Result = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for sent1, sent2, sent1_length, sent2_length, labels in loader:
            sent1, sent2, labels = sent1.to(device), sent2.to(device), labels.to(device)
            logits = model(sent1, sent2)
            # argmax of the logits equals argmax of their softmax.
            predicted = logits.argmax(dim=1, keepdim=True)
            Result.extend([sent1[0],
                           sent2[0],
                           labels.view_as(predicted)[0],
                           predicted[0]])
    return Result

# get_data takes (loader, model) and returns a flat list that repeats
# [sentence1 ids, sentence2 ids, true label, predicted label] per batch,
# so the first four entries describe the first collected example.
Examples = get_data(val_loader, rnn_model)
Example1 = (Examples[0], Examples[1], Examples[2], Examples[3])
# Map the ids back to tokens (padding ids show up as '<pad>').
sent1 = []
for i in Example1[0].tolist():
    sent1 += [id2token[i]]
print(" ".join(sent1), '\n')
sent2 = []
for i in Example1[1].tolist():
    sent2 += [id2token[i]]
print(" ".join(sent2), '\n')
# True label followed by predicted label, as plain integers.
print(Example1[2].item(), Example1[3].item())

## Tokenize MNLI Data & Make Data Loader

In [None]:
# `path` is not defined anywhere in the visible notebook; the data directory
# variable used for the SNLI files is `glove_path`.
# NOTE(review): assumes the MNLI files sit in the same directory as the SNLI files - confirm.
#mnli_train = pd.read_csv(f"{glove_path}/mnli_train.tsv", sep='\t')
mnli_val = pd.read_csv(f"{glove_path}/mnli_val.tsv", sep='\t')

In [None]:
#tr_sen1, tr_sen2, all_tokens, tr_num_label = tokenize_dataset(mnli_train)
mnli_val_sen1, mnli_val_sen2, _, mnli_val_num_label = tokenize_dataset(mnli_val)

# Integer code of every genre handled by this notebook.
genre_to_id = {
    "telephone": 0,
    "fiction": 1,
    "slate": 2,
    "government": 3,
    "travel": 4,
}
val_genre = []
for i in mnli_val.genre:
    # Fail loudly instead of silently reusing the previous row's genre code.
    if i not in genre_to_id:
        raise ValueError("unknown genre {!r}; expected one of {}".format(i, sorted(genre_to_id)))
    val_genre.append(genre_to_id[i])

In [None]:
class MnliDataset(Dataset):
    """
    PyTorch dataset over tokenized sentence pairs with a label and a genre
    code per pair.
    """

    def __init__(self, sent1, sent2, num_label, token2id, genre):
        """
        @param sent1: list of token lists, one per first sentence
        @param sent2: list of token lists, one per second sentence
        @param num_label: integer labels, aligned with the sentences
        @param token2id: dict mapping a token to its vocabulary id
        @param genre: per-example genre values, indexable by example position
        """
        self.token2id = token2id
        self.sentense1 = sent1
        self.sentense2 = sent2
        self.target = num_label
        self.genre = genre

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.sentense1)

    def _to_ids(self, tokens):
        """Map tokens to ids; tokens missing from the vocabulary become UNK_IDX."""
        return [self.token2id.get(tok, UNK_IDX) for tok in tokens]

    def __getitem__(self, key):
        """
        @param key: index of the example
        @return: [sentence1 ids, sentence2 ids, len(sentence1 ids),
                  len(sentence2 ids), label, genre]; sentences are truncated to
                  mnli_MAX_SENTENCE_LENGTH1 / mnli_MAX_SENTENCE_LENGTH2 tokens
        """
        first_ids = self._to_ids(self.sentense1[key][:mnli_MAX_SENTENCE_LENGTH1])
        second_ids = self._to_ids(self.sentense2[key][:mnli_MAX_SENTENCE_LENGTH2])
        return [first_ids, second_ids, len(first_ids), len(second_ids),
                self.target[key], self.genre[key]]

    
def MNLIsent_collate_func(batch):
    """
    Collate function for the MNLI DataLoader: right-pads every sentence with
    zeros to the fixed widths mnli_MAX_SENTENCE_LENGTH1 / mnli_MAX_SENTENCE_LENGTH2.

    @param batch: list of items [sent1 ids, sent2 ids, len1, len2, label, genre]
    @return: [sent1 tensor, sent2 tensor, len1 LongTensor, len2 LongTensor,
              label LongTensor, genre LongTensor]
    """
    sent1_length_list = [item[2] for item in batch]
    sent2_length_list = [item[3] for item in batch]
    label_list = [item[4] for item in batch]
    genre = [item[5] for item in batch]

    def _right_pad(ids, length, width):
        # Zero-fill on the right so that the row is exactly `width` long.
        return np.pad(np.array(ids), pad_width=((0, width - length)),
                      mode="constant", constant_values=0)

    sent1_data = [_right_pad(item[0], item[2], mnli_MAX_SENTENCE_LENGTH1) for item in batch]
    sent2_data = [_right_pad(item[1], item[3], mnli_MAX_SENTENCE_LENGTH2) for item in batch]

    sent1_tensor = torch.from_numpy(np.array(sent1_data))
    sent2_tensor = torch.from_numpy(np.array(sent2_data))
    return [sent1_tensor, sent2_tensor,
            torch.LongTensor(sent1_length_list), torch.LongTensor(sent2_length_list),
            torch.LongTensor(label_list), torch.LongTensor(genre)]

In [None]:
# The genre argument must be the list of integer genre codes (val_genre),
# not the raw DataFrame: the collate function packs it into a LongTensor.
mnli_val_data = MnliDataset(mnli_val_sen1, mnli_val_sen2, mnli_val_num_label, token2id, val_genre)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_data,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=MNLIsent_collate_func,
                                           shuffle=False)

## Test the Best RNN/CNN Models on MNLI

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    model.eval()
    Total_Acc = {}
    for sent1, sent2, sent1_length, sent2_length, labels, genre in loader:
        sent1, sent2, labels, genre = sent1.to(device), sent2.to(device), labels.to(device), genre.to(device)
        output = model(sent1, sent2)
        outputs = F.softmax(output, dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        unique_genre = genre.unique()
        for i in unique_genre:
            if i == 0:
                genre = "telephone"
            elif i == 1:
                genre = 'fiction'
            elif i == 2:
                genre = 'slate'
            elif i == 3:
                genre = 'government'
            elif i == 4:
                genre = 'travel'
            total = genre.eq(i).sum().item()
            g = genre.eq(i).nonzero().view(total)
            correct += predicted[g].eq(label_batch[g].view_as(predicted[g])).sum().item()
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
        Total_Acc{i} = 100 * correct / total
    return Total_Acc

In [None]:
# A notebook cell only displays its last expression, so both results are
# printed explicitly.
print('RNN', test_model(mnli_val_loader, rnn_model))
print('CNN', test_model(mnli_val_loader, cnn_model))