In [2]:
import pandas as pd
import os
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
import itertools
import glob

In [4]:
path = os.getcwd() + '/hw2_data/'
snli_train = pd.read_csv(path + 'snli_train.tsv', sep='\t')
snli_val = pd.read_csv(path + 'snli_val.tsv', sep='\t')
mnli_train = pd.read_csv(path + 'mnli_train.tsv', sep='\t')
mnli_val = pd.read_csv(path + 'mnli_val.tsv', sep='\t')

label_dict = {0: 'neutral', 1: 'entailment', 2: 'contradiction'}
genre_dict = {0: 'telephone', 1: 'fiction', 2: 'slate', 3: 'government', 4: 'travel'}

for idx, label in label_dict.items():
    snli_train['label'].loc[snli_train['label'] == label] = idx
    snli_val['label'].loc[snli_val['label'] == label] = idx
    mnli_train['label'].loc[mnli_train['label'] == label] = idx
    mnli_val['label'].loc[mnli_val['label'] == label] = idx

for idx, genre in genre_dict.items():
    mnli_train['genre'].loc[mnli_train['genre'] == genre] = idx
    mnli_val['genre'].loc[mnli_val['genre'] == genre] = idx

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


__Load Fasttext Embedding__

In [5]:
ft_path = os.getcwd() + '/wiki-news-300d-1M.vec'
words_to_load = 50000

PAD_IDX = 0
UNK_IDX = 1

fin = io.open(ft_path, 'r', encoding='utf-8', newline='\n', errors='ignore')

n, d = map(int, fin.readline().split())
vocab_size = words_to_load + 2
embedding_dim = d

embedding_mat = np.zeros((vocab_size, embedding_dim))
token2id = {}
id2token = {}
all_tokens = []

for i, line in enumerate(fin):
    if i >= words_to_load:
        break
    s = line.rstrip().split(' ')
    embedding_mat[i+2, :] = np.asarray(s[1:])
    token2id[s[0]] = i+2
    id2token[i+2] = s[0]
    all_tokens.append(s[0])

    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    id2token[PAD_IDX] = '<pad>'
    id2token[UNK_IDX] = '<unk>'
    embedding_mat[0, :] = np.zeros((1,d))
    #generate normal dist 1d array for UNK token
    embedding_mat[1, :] = np.random.normal(size=d) 

In [6]:
def token2index_dataset(tokens_data):
    indices_data = []
    for i in tokens_data:
        tokens = i.split()
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

In [7]:
train_sent1_indices = token2index_dataset(snli_train['sentence1'])
train_sent2_indices = token2index_dataset(snli_train['sentence2'])
val_sent1_indices = token2index_dataset(snli_val['sentence1'])
val_sent2_indices = token2index_dataset(snli_val['sentence2'])
train_y = [i for i in snli_train['label']]
val_y = [i for i in snli_val['label']]

mnli_train_sent1_indices = token2index_dataset(mnli_train['sentence1'])
mnli_train_sent2_indices = token2index_dataset(mnli_train['sentence2'])
mnli_val_sent1_indices = token2index_dataset(mnli_val['sentence1'])
mnli_val_sent2_indices = token2index_dataset(mnli_val['sentence2'])
mnli_train_y = [i for i in mnli_train['label']]
mnli_train_genre = [i for i in mnli_train['genre']]
mnli_val_y = [i for i in mnli_val['label']]
mnli_val_genre = [i for i in mnli_val['genre']]

In [8]:
# Max sent length = 99% length in the training set
# Tend to have longer premise sentences than hypothesis sentences. 
# Define two separate "max lengths" to reflect this.
sent_length_1 = [len(train_sent1_indices[i]) for i in range(len(train_sent1_indices))]
sent_length_2 = [len(train_sent2_indices[i]) for i in range(len(train_sent2_indices))]
MAX_SENTENCE_LENGTH_1 = int(np.percentile(sent_length_1,99))
MAX_SENTENCE_LENGTH_2 = int(np.percentile(sent_length_2,99))

mnli_sent_length_1 = [len(mnli_train_sent1_indices[i]) for i in range(len(mnli_train_sent1_indices))]
mnli_sent_length_2 = [len(mnli_train_sent2_indices[i]) for i in range(len(mnli_train_sent2_indices))]
mnli_MAX_SENTENCE_LENGTH_1 = int(np.percentile(mnli_sent_length_1,99))
mnli_MAX_SENTENCE_LENGTH_2 = int(np.percentile(mnli_sent_length_2,99))

__Data Loader__

In [9]:
class SnliDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_list_1, data_list_2, target_list):
        """
        @param data_list_1: list of sentence 1 tokens 
        @param data_list_2: list of sentence 2 tokens
        @param target_list: list of review targets 

        """
        self.data_list_1 = data_list_1
        self.data_list_2 = data_list_2
        self.target_list = target_list
        assert (len(self.data_list_1) == len(self.target_list))
        assert (len(self.data_list_2) == len(self.target_list))
        assert (len(self.data_list_1) == len(self.data_list_2))

    def __len__(self):
        return len(self.data_list_1)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx_1 = self.data_list_1[key][:MAX_SENTENCE_LENGTH_1]
        token_idx_2 = self.data_list_2[key][:MAX_SENTENCE_LENGTH_2]
        label = self.target_list[key]
        return [token_idx_1, len(token_idx_1), token_idx_2, len(token_idx_2), label]

def Snil_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list_1 = []
    data_list_2 = []
    label_list = []
    length_list_1 = []
    length_list_2 = []
    
    for datum in batch:
        label_list.append(datum[4])
        length_list_1.append(datum[1])
        length_list_2.append(datum[3])
    # padding
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH_1-datum[1])), 
                                mode="constant", constant_values=0).tolist()
        padded_vec_2 = np.pad(np.array(datum[2]), 
                        pad_width=((0,MAX_SENTENCE_LENGTH_2-datum[3])), 
                        mode="constant", constant_values=0).tolist()
        data_list_1.append(padded_vec_1)
        data_list_2.append(padded_vec_2)
    return [torch.from_numpy(np.array(data_list_1)), torch.LongTensor(length_list_1), 
            torch.from_numpy(np.array(data_list_2)), torch.LongTensor(length_list_2),
            torch.LongTensor(label_list)]

# create pytorch dataloader
BATCH_SIZE = 32
train_dataset = SnliDataset(train_sent1_indices, train_sent2_indices, train_y)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=Snil_collate_func,
                                           shuffle=True)

val_dataset = SnliDataset(val_sent1_indices, val_sent2_indices, val_y)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=BATCH_SIZE,
                                         collate_fn=Snil_collate_func,
                                         shuffle=True)

In [10]:
class MnliDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_list_1, data_list_2, target_list, genre_list):
        """
        @param data_list_1: list of sentence 1 tokens 
        @param data_list_2: list of sentence 2 tokens
        @param target_list: list of review targets 

        """
        self.data_list_1 = data_list_1
        self.data_list_2 = data_list_2
        self.target_list = target_list
        self.genre_list = genre_list
        assert (len(self.data_list_1) == len(self.target_list))
        assert (len(self.data_list_2) == len(self.target_list))
        assert (len(self.data_list_1) == len(self.data_list_2))
        assert (len(self.data_list_2) == len(self.genre_list))

    def __len__(self):
        return len(self.data_list_1)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx_1 = self.data_list_1[key][:mnli_MAX_SENTENCE_LENGTH_1]
        token_idx_2 = self.data_list_2[key][:mnli_MAX_SENTENCE_LENGTH_2]
        label = self.target_list[key]
        genre = self.genre_list[key]
        return [token_idx_1, len(token_idx_1), token_idx_2, len(token_idx_2), label, genre]

def Mnil_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list_1 = []
    data_list_2 = []
    label_list = []
    length_list_1 = []
    length_list_2 = []
    genre_list = []
    
    for datum in batch:
        label_list.append(datum[4])
        length_list_1.append(datum[1])
        length_list_2.append(datum[3])
        genre_list.append(datum[5])
    # padding
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,mnli_MAX_SENTENCE_LENGTH_1-datum[1])), 
                                mode="constant", constant_values=0).tolist()
        padded_vec_2 = np.pad(np.array(datum[2]), 
                        pad_width=((0,mnli_MAX_SENTENCE_LENGTH_2-datum[3])), 
                        mode="constant", constant_values=0).tolist()
        data_list_1.append(padded_vec_1)
        data_list_2.append(padded_vec_2)
    return [torch.from_numpy(np.array(data_list_1)), torch.LongTensor(length_list_1), 
            torch.from_numpy(np.array(data_list_2)), torch.LongTensor(length_list_2),
            torch.LongTensor(label_list), torch.LongTensor(genre_list)]

# create pytorch dataloader
BATCH_SIZE = 5000
mnli_train_dataset = MnliDataset(mnli_train_sent1_indices, mnli_train_sent2_indices, mnli_train_y, mnli_train_genre)
mnli_train_loader = torch.utils.data.DataLoader(dataset=mnli_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=Mnil_collate_func,
                                           shuffle=True)

mnli_val_dataset = MnliDataset(mnli_val_sent1_indices, mnli_val_sent2_indices, mnli_val_y, mnli_val_genre)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                                         batch_size=BATCH_SIZE,
                                         collate_fn=Mnil_collate_func,
                                         shuffle=True)

__RNN__

In [26]:
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, linear_dim, concat):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        super(RNN, self).__init__()
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding_mat), freeze = True)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, bidirectional = True, batch_first = True)
        self.linear_1 = nn.Linear(hidden_size * 2, linear_dim)
        self.linear_1_mult = nn.Linear(hidden_size, linear_dim)
        self.leaky_relu = nn.LeakyReLU(inplace=True)
        self.linear_2 = nn.Linear(linear_dim, num_classes)
        self.concat = concat

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        if use_gpu:
            self.hidden = torch.randn(num_layers * 2, batch_size, self.hidden_size).cuda()
        else:
            self.hidden = torch.randn(num_layers * 2, batch_size, self.hidden_size)
        return self.hidden

    def forward(self, x1, lengths_1, x2, lengths_2):
        # reset hidden state
        batch_size, seq_len_1 = x1.size()
        batch_size, seq_len_2 = x2.size()
        self.hidden = self.init_hidden(batch_size)
        
        # Compute sorted sequence lengths
        _, idx_sort_1 = torch.sort(lengths_1, dim=0, descending=True)
        _, idx_sort_2 = torch.sort(lengths_2, dim=0, descending=True)
        _, idx_unsort_1 = torch.sort(idx_sort_1, dim=0)
        _, idx_unsort_2 = torch.sort(idx_sort_2, dim=0)
        
        # get embedding of characters
        embed_1 = self.embedding(x1).float()
        embed_2 = self.embedding(x2).float()
        
        # Sort embedding and length
        embed_1 = embed_1.index_select(0, idx_sort_1)
        embed_2 = embed_2.index_select(0, idx_sort_2)
        lengths_1 = lengths_1.index_select(0, idx_sort_1)
        lengths_2 = lengths_2.index_select(0, idx_sort_2)
        
        # Pack padded sequence
        if use_gpu:
            embed_1 = torch.nn.utils.rnn.pack_padded_sequence(embed_1, lengths_1.cpu().numpy(), batch_first=True).cuda()
            embed_2 = torch.nn.utils.rnn.pack_padded_sequence(embed_2, lengths_2.cpu().numpy(), batch_first=True).cuda()
        else:
            embed_1 = torch.nn.utils.rnn.pack_padded_sequence(embed_1, lengths_1.cpu().numpy(), batch_first=True)
            embed_2 = torch.nn.utils.rnn.pack_padded_sequence(embed_2, lengths_2.cpu().numpy(), batch_first=True)
        
        # fprop through GRU
        rnn_out_1, hn_1 = self.rnn(embed_1, self.hidden)
        rnn_out_2, hn_2 = self.rnn(embed_2, self.hidden)
        
        # Sum two bidirectional hidden states over direction
        hn_1 = torch.sum(hn_1, dim=0)
        hn_2 = torch.sum(hn_2, dim=0)        
        
        # Unsort last hidden unit
        hn_1 = hn_1.index_select(0, idx_unsort_1)
        hn_2 = hn_2.index_select(0, idx_unsort_2)
        
        # Concat two sentence vectors
        if self.concat == 'Concatenation':
            hn_concat = torch.cat((hn_1, hn_2), 1)
            out = self.linear_1(hn_concat)
            out = self.leaky_relu(out)
            logits = self.linear_2(out)
        else:
            hn_mult = hn_1 * hn_2
            out = self.linear_1_mult(hn_mult)
            out = self.leaky_relu(out)
            logits = self.linear_2(out)
        return logits

In [27]:
#Parameters
emb_size = embedding_dim
hidden_size_ls = [100, 200, 300, 400, 500]
num_layers = 1
num_classes = 3
linear_dim = 50
concat_ls = ['Concatenation', 'Multiplication']
learning_rate_ls = [1e-2, 1e-3, 1e-4]
num_epochs = 20 # number epoch to train
use_gpu = torch.cuda.is_available()


# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for sent_1, len_1, sent_2, len_2, label in loader:
        if use_gpu:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1.cuda(), len_1.cuda(), sent_2.cuda(), len_2.cuda(), label.cuda()
        else:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1, len_1, sent_2, len_2, label
        outputs = F.softmax(model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += label_batch.size(0)
        correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
    return (100 * correct / total)


for i in itertools.product(hidden_size_ls, concat_ls, learning_rate_ls):
    hidden_size, concat, learning_rate = i[0], i[1], i[2]
    loss_hist = []
    train_acc_hist = []
    val_acc_hist = []
    best_val_acc = None
    init_learning_rate = learning_rate
    save_path = os.getcwd() + '/SnilRNN (Hidden Size-{} | {} | LR-{}).pt'.format(hidden_size, concat, init_learning_rate)

    model = RNN(emb_size, hidden_size, num_layers, num_classes, linear_dim, concat)
    if use_gpu:
        model = model.cuda()
    criterion = torch.nn.CrossEntropyLoss() 
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for i, (sent_1, len_1, sent_2, len_2, label) in enumerate(train_loader):
            model.train()
            if use_gpu:
                sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1.cuda(), len_1.cuda(), sent_2.cuda(), len_2.cuda(), label.cuda()
            else:
                sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1, len_1, sent_2, len_2, label
            optimizer.zero_grad()
            outputs = model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            if i > 0 and i % 1000 == 0:
                train_acc = test_model(train_loader, model)
                val_acc = test_model(val_loader, model)
                loss_ = loss.item()
                loss_hist.append(loss_)
                train_acc_hist.append(train_acc)
                val_acc_hist.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Training Loss: {}, Train Acc: {}, Validation Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), loss_, train_acc, val_acc))

                if not best_val_acc or val_acc > best_val_acc:
                    torch.save({
                                'epoch': epoch,
                                'model_state_dict': model.state_dict(),
                                'optimizer_state_dict': optimizer.state_dict(),
                                'train_loss': loss_,
                                'best_val_accuracy': best_val_acc
                                }, save_path)
                    best_val_acc = val_acc
                # else:
                #     # Anneal the learning rate if no improvement has been seen in the validation dataset.
                #     learning_rate /= 4.0
            
    fig = plt.figure(figsize=(13,3.5))
    ax1 = fig.add_subplot(1,3,1)
    ax1.plot(loss_hist, 
             label = 'RNN Hidden Size: {} | {} | LR: {}'.format(hidden_size, concat, init_learning_rate))
    ax1.set_xlabel('Step')
    ax1.set_ylabel('Train Loss')
    ax1.set_title('Train Loss')
    ax1.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2))

    ax2 = fig.add_subplot(1,3,2)
    ax2.plot(train_acc_hist)
    ax2.set_xlabel('Step')
    ax2.set_ylabel('Train Accuracy')
    ax2.set_title('Train Accuracy')

    ax2 = fig.add_subplot(1,3,3)
    ax2.plot(val_acc_hist)
    ax2.set_xlabel('Step')
    ax2.set_ylabel('Validation Accuracy')
    ax2.set_title('Validation Accuracy')

    plt.tight_layout()
    fig.savefig('RNN (Hidden Size-{} | {} | LR-{}).png'.format(hidden_size, concat, init_learning_rate), 
                dpi = 100, bbox_inches='tight')
    plt.show()

Epoch: [1/20], Step: [1001/3125], Training Loss: 1.0279985666275024, Train Acc: 58.2, Validation Acc: 57.6


KeyboardInterrupt: 

__3 Correct and 3 Incorrect Predictions__

In [28]:
RNN_path = os.getcwd() + '/RNN Results/'
config = []
best_val_acc = []
train_loss = []

for i in glob.glob(os.path.join(RNN_path, '*.pt')):
    checkpoint = torch.load(i)
    config.append(i)
    
    if not checkpoint['best_val_accuracy']: best = 0
    else: best = checkpoint['best_val_accuracy']
    best_val_acc.append(best)
    train_loss.append(checkpoint['train_loss'])

best_idx = np.argmax(best_val_acc)
best_checkpoint = torch.load(config[best_idx])
print(config[best_idx])

model = RNN(emb_size = 300, hidden_size = 300, num_layers = 1, num_classes = 3, linear_dim = 50, concat = 'Concatenation')
if use_gpu:
    model = model.cuda()
model.load_state_dict(best_checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model.eval()

/home/ms6771/HW2/RNN Results/SnilRNN (Hidden Size-300 | Concatenation | LR-0.001).pt


RNN(
  (embedding): Embedding(50002, 300)
  (rnn): GRU(300, 300, batch_first=True, bidirectional=True)
  (linear_1): Linear(in_features=600, out_features=50, bias=True)
  (linear_1_mult): Linear(in_features=300, out_features=50, bias=True)
  (leaky_relu): LeakyReLU(negative_slope=0.01, inplace)
  (linear_2): Linear(in_features=50, out_features=3, bias=True)
)

In [29]:
sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = next(iter(val_loader))
if use_gpu:
    sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1_batch.cuda(), len_1_batch.cuda(), sent_2_batch.cuda(), len_2_batch.cuda(), label_batch.cuda()
        
outputs = F.softmax(model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch), dim=1)
predicted = outputs.max(1, keepdim=True)[1]
correct = predicted.eq(label_batch.view_as(predicted))

In [17]:
correct_idx = correct.eq(1).nonzero()[:,0][0:3]
incorrect_idx = correct.eq(0).nonzero()[:,0][0:3]
correct_dict = {}
incorrect_dict = {}

for i in correct_idx:
    words = []
    for j in sent_1_batch[i]:
        words.append(id2token[j.item()])
    sent = ' '.join(words)
    correct_dict[i.item()] = [sent]
    
    words = []
    for j in sent_2_batch[i]:
        words.append(id2token[j.item()])
    sent = ' '.join(words)
    correct_dict[i.item()].append(sent)
    
    correct_dict[i.item()].append("Label: {}".format(label_dict[label_batch[i].item()]))
    correct_dict[i.item()].append("Predicted: {}".format(label_dict[predicted.view(label_batch.size())[i].item()]))
      
for i in correct_dict.keys():
    print("Three Correct Predictions:\nSentence 1: {}\nSentence 2: {}\n{}\n{}\n".format(correct_dict[i][0], correct_dict[i][1], correct_dict[i][2], correct_dict[i][3]))
    
    
for i in incorrect_idx:
    words = []
    for j in sent_1_batch[i]:
        words.append(id2token[j.item()])
    sent = ' '.join(words)
    incorrect_dict[i.item()] = [sent]
    
    words = []
    for m in sent_2_batch[i]:
        words.append(id2token[m.item()])
    sent = ' '.join(words)
    incorrect_dict[i.item()].append(sent)

    incorrect_dict[i.item()].append("Label: {}".format(label_dict[label_batch[i].item()]))
    incorrect_dict[i.item()].append("Predicted: {}".format(label_dict[predicted.view(label_batch.size())[i].item()]))
      
for i in incorrect_dict.keys():
    print("Three Incorrect Predictions:\nSentence 1: {}\nSentence 2: {}\n{}\n{}\n".format(incorrect_dict[i][0], incorrect_dict[i][1], incorrect_dict[i][2], incorrect_dict[i][3]))

Three Correct Predictions:
Sentence 1: Two girls laying in the grass smile as their picture is taken . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Sentence 2: The girls are sisters . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Label: neutral
Predicted: neutral

Three Correct Predictions:
Sentence 1: A man with a mustache who is wearing white and beige carefully <unk> a sand castle . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Sentence 2: A man building a sand castle <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Label: entailment
Predicted: entailment

Three Correct Predictions:
Sentence 1: The view of a man with a shaved head and blue shirt through a <unk> fence . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Sentence 2: A

__Evaluate on MNLI Validation Dataset__

In [30]:
def mnli_test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    model.eval()
    
    for sent_1, len_1, sent_2, len_2, label, genre in loader:
        if use_gpu:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch, genre_batch = sent_1.cuda(), len_1.cuda(), sent_2.cuda(), len_2.cuda(), label.cuda(), genre.cuda()
        else:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch, genre_batch = sent_1, len_1, sent_2, len_2, label, genre
        outputs = F.softmax(model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        genre_acc = {}
        for i in genre_batch.unique():
            correct = 0
            total = 0
            
            total = genre_batch.eq(i).sum().item()
            genre_id = genre_batch.eq(i).nonzero().view(total)
            correct += predicted[genre_id].eq(label_batch[genre_id].view_as(predicted[genre_id])).sum().item()
            genre_acc[genre_dict[i.item()]] = 100 * correct / total
    return genre_acc

In [31]:
mnli_test_model(mnli_val_loader, model)

{'telephone': 49.75124378109453,
 'fiction': 48.34170854271357,
 'slate': 46.706586826347305,
 'government': 50.39370078740158,
 'travel': 47.45417515274949}

__CNN__

In [32]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, kernel_size):
        super(CNN, self).__init__()
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding_mat), freeze = True)
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size, padding=1)
        self.linear = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x1, lengths_1, x2, lengths_2):
        batch_size, seq_len_1 = x1.size()
        batch_size, seq_len_2 = x2.size()

        embed_1 = self.embedding(x1).float()
        embed_2 = self.embedding(x2).float()
        
        hidden_1 = self.conv1(embed_1.transpose(1,2)).transpose(1,2)
        hidden_2 = self.conv1(embed_2.transpose(1,2)).transpose(1,2)
        
        hidden_1 = F.relu(hidden_1.contiguous().view(-1, hidden_1.size(-1))).view(batch_size, hidden_1.size(1), hidden_1.size(-1))
        hidden_2 = F.relu(hidden_2.contiguous().view(-1, hidden_2.size(-1))).view(batch_size, hidden_2.size(1), hidden_2.size(-1))

        hidden_1 = self.conv2(hidden_1.transpose(1,2)).transpose(1,2)
        hidden_2 = self.conv2(hidden_2.transpose(1,2)).transpose(1,2)
        
        hidden_1 = F.relu(hidden_1.contiguous().view(-1, hidden_1.size(-1))).view(batch_size, hidden_1.size(1), hidden_1.size(-1))
        hidden_2 = F.relu(hidden_2.contiguous().view(-1, hidden_2.size(-1))).view(batch_size, hidden_2.size(1), hidden_2.size(-1))

        hidden_1 = torch.max(hidden_1, dim=1, keepdim=False)[0]
        hidden_2 = torch.max(hidden_2, dim=1, keepdim=False)[0]
        
        hidden_concat = torch.cat((hidden_1, hidden_2), 1)
        logits = self.linear(hidden_concat)
        return logits

In [33]:
#Parameters
emb_size = embedding_dim
hidden_size_ls = [200, 300, 400, 500]
num_layers = 1
num_classes = 3
kernel_size_ls = [4, 5]
learning_rate_ls = [1e-2, 1e-3,1e-4]
num_epochs = 20 # number epoch to train
use_gpu = torch.cuda.is_available()

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for sent_1, len_1, sent_2, len_2, label in loader:
        if use_gpu:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1.cuda(), len_1.cuda(), sent_2.cuda(), len_2.cuda(), label.cuda()
        else:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1, len_1, sent_2, len_2, label
        outputs = F.softmax(model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += label_batch.size(0)
        correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
    return (100 * correct / total)

for i in itertools.product(hidden_size_ls, kernel_size_ls, learning_rate_ls):
    hidden_size, kernel_size, learning_rate = i[0], i[1], i[2]
    loss_hist = []
    train_acc_hist = []
    val_acc_hist = []
    best_val_acc = None
    init_learning_rate = learning_rate
    save_path = os.getcwd() + '/CNN Results/SnilCNN(Hidden Size-{} | Kernel Size-{} | LR-{}).pt'.format(hidden_size, kernel_size, init_learning_rate)
    model = CNN(emb_size, hidden_size, num_layers, num_classes, kernel_size)
    if use_gpu:
        model = model.cuda()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        for i, (sent_1, len_1, sent_2, len_2, label) in enumerate(train_loader):
            model.train()
            if use_gpu:
                sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1.cuda(), len_1.cuda(), sent_2.cuda(), len_2.cuda(), label.cuda()
            else:
                sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1, len_1, sent_2, len_2, label
            optimizer.zero_grad()
            outputs = model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()

            if i > 0 and i % 1000 == 0:
                train_acc = test_model(train_loader, model)
                val_acc = test_model(val_loader, model)
                loss_ = loss.item()
                loss_hist.append(loss_)
                train_acc_hist.append(train_acc)
                val_acc_hist.append(val_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Training Loss: {}, Train Acc: {}, Validation Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), loss_, train_acc, val_acc))

                if not best_val_acc or val_acc > best_val_acc:
                    torch.save({
                                'epoch': epoch,
                                'model_state_dict': model.state_dict(),
                                'optimizer_state_dict': optimizer.state_dict(),
                                'train_loss': loss_,
                                'best_val_accuracy': best_val_acc
                                }, save_path)
                    best_val_acc = val_acc
                # else:
                #     # Anneal the learning rate if no improvement has been seen in the validation dataset.
                #     learning_rate /= 4.0
                    
    fig = plt.figure(figsize=(13,3.5))
    ax1 = fig.add_subplot(1,3,1)
    ax1.plot(loss_hist, 
             label = 'CNN Hidden: {} | Kernel: {} | LR: {}'.format(hidden_size, kernel_size, init_learning_rate))
    ax1.set_xlabel('Step')
    ax1.set_ylabel('Train Loss')
    ax1.set_title('Train Loss')
    ax1.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2))

    ax2 = fig.add_subplot(1,3,2)
    ax2.plot(train_acc_hist)
    ax2.set_xlabel('Step')
    ax2.set_ylabel('Train Accuracy')
    ax2.set_title('Train Accuracy')

    ax2 = fig.add_subplot(1,3,3)
    ax2.plot(val_acc_hist)
    ax2.set_xlabel('Step')
    ax2.set_ylabel('Validation Accuracy')
    ax2.set_title('Validation Accuracy')

    plt.tight_layout()
    fig.savefig('CNN (Hidden Size-{} | Kernel Size-{} | LR-{}).png'.format(hidden_size, kernel_size, init_learning_rate), 
                dpi = 100, bbox_inches='tight')
    plt.show()

Epoch: [1/20], Step: [1001/3125], Training Loss: 1.0018014907836914, Train Acc: 56.846, Validation Acc: 54.6


KeyboardInterrupt: 

__3 Correct and 3 Incorrect Predictions__

In [34]:
CNN_path = os.getcwd() + '/CNN Results/'
config = []
best_val_acc = []
train_loss = []

for i in glob.glob(os.path.join(CNN_path, '*.pt')):
    checkpoint = torch.load(i)
    config.append(i)
    
    if not checkpoint['best_val_accuracy']: best = 0
    else: best = checkpoint['best_val_accuracy']
    best_val_acc.append(best)
    train_loss.append(checkpoint['train_loss'])

best_idx = np.argmax(best_val_acc)
best_checkpoint = torch.load(config[best_idx])
print(config[best_idx])

model = CNN(emb_size = 300, hidden_size = 400, num_layers = 1, num_classes = 3, kernel_size = 3)
if use_gpu:
    model = model.cuda()
model.load_state_dict(best_checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model.eval()

/home/ms6771/HW2/CNN Results/SnilCNN(Hidden Size-400 | Kernel Size-3 | LR-0.001).pt


CNN(
  (embedding): Embedding(50002, 300)
  (conv1): Conv1d(300, 400, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(400, 400, kernel_size=(3,), stride=(1,), padding=(1,))
  (linear): Linear(in_features=800, out_features=3, bias=True)
)

In [21]:
sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = next(iter(val_loader))
if use_gpu:
    sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch = sent_1_batch.cuda(), len_1_batch.cuda(), sent_2_batch.cuda(), len_2_batch.cuda(), label_batch.cuda()
        
outputs = F.softmax(model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch), dim=1)
predicted = outputs.max(1, keepdim=True)[1]
correct = predicted.eq(label_batch.view_as(predicted))

In [22]:
correct_idx = correct.eq(1).nonzero()[:,0][0:3]
incorrect_idx = correct.eq(0).nonzero()[:,0][0:3]
correct_dict = {}
incorrect_dict = {}

for i in correct_idx:
    words = []
    for j in sent_1_batch[i]:
        words.append(id2token[j.item()])
    sent = ' '.join(words)
    correct_dict[i.item()] = [sent]
    
    words = []
    for j in sent_2_batch[i]:
        words.append(id2token[j.item()])
    sent = ' '.join(words)
    correct_dict[i.item()].append(sent)
    
    correct_dict[i.item()].append("Label: {}".format(label_dict[label_batch[i].item()]))
    correct_dict[i.item()].append("Predicted: {}".format(label_dict[predicted.view(label_batch.size())[i].item()]))
      
for i in correct_dict.keys():
    print("Three Correct Predictions:\nSentence 1: {}\nSentence 2: {}\n{}\n{}\n".format(correct_dict[i][0], correct_dict[i][1], correct_dict[i][2], correct_dict[i][3]))
    
    
for i in incorrect_idx:
    words = []
    for j in sent_1_batch[i]:
        words.append(id2token[j.item()])
    sent = ' '.join(words)
    incorrect_dict[i.item()] = [sent]
    
    words = []
    for m in sent_2_batch[i]:
        words.append(id2token[m.item()])
    sent = ' '.join(words)
    incorrect_dict[i.item()].append(sent)

    incorrect_dict[i.item()].append("Label: {}".format(label_dict[label_batch[i].item()]))
    incorrect_dict[i.item()].append("Predicted: {}".format(label_dict[predicted.view(label_batch.size())[i].item()]))
      
for i in incorrect_dict.keys():
    print("Three Incorrect Predictions:\nSentence 1: {}\nSentence 2: {}\n{}\n{}\n".format(incorrect_dict[i][0], incorrect_dict[i][1], incorrect_dict[i][2], incorrect_dict[i][3]))

Three Correct Predictions:
Sentence 1: Two asian men in a wood shop . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Sentence 2: There are two asian men in that wood shop . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Label: entailment
Predicted: entailment

Three Correct Predictions:
Sentence 1: A silhouette at the bottom of an escalator . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Sentence 2: The <unk> is creeping out the children . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Label: neutral
Predicted: neutral

Three Correct Predictions:
Sentence 1: A black and white dog prepares to catch a <unk> . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Sentence

__Evaluate on MNLI Validation Dataset__

In [35]:
def mnli_test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    model.eval()
    
    for sent_1, len_1, sent_2, len_2, label, genre in loader:
        if use_gpu:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch, genre_batch = sent_1.cuda(), len_1.cuda(), sent_2.cuda(), len_2.cuda(), label.cuda(), genre.cuda()
        else:
            sent_1_batch, len_1_batch, sent_2_batch, len_2_batch, label_batch, genre_batch = sent_1, len_1, sent_2, len_2, label, genre
        outputs = F.softmax(model(sent_1_batch, len_1_batch, sent_2_batch, len_2_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        genre_acc = {}
        for i in genre_batch.unique():
            correct = 0
            total = 0
            
            total = genre_batch.eq(i).sum().item()
            genre_id = genre_batch.eq(i).nonzero().view(total)
            correct += predicted[genre_id].eq(label_batch[genre_id].view_as(predicted[genre_id])).sum().item()
            genre_acc[genre_dict[i.item()]] = 100 * correct / total
    return genre_acc

In [36]:
mnli_test_model(mnli_val_loader, model)

{'telephone': 45.97014925373134,
 'fiction': 43.618090452261306,
 'slate': 41.71656686626746,
 'government': 44.389763779527556,
 'travel': 45.010183299389}