In [1]:
# First lets improve libraries that we are going to be used in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import spacy
import string
import csv
import io
import argparse
random.seed(134)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 128
MAX_WORD_LENGTH = 78

#parser = argparse.ArgumentParser(description='PyTorch CNN text classifier.')
#parser.add_argument('--hidden_dim', type=int, default=200, metavar='N',
#                    help='the hidden dimension of the encoder (default: 200)')
#parser.add_argument('--ker_size', type=int, default=3, metavar='N',
#                    help='the kernel size of the encoder (default: 3)')
#parser.add_argument('--epochs', type=int, default=6, metavar='N',
#                    help='number of epochs to train (default: 6)')
#parser.add_argument('--lr', type=float, default=3e-4, metavar='LR',
#                        help='learning rate (default: 3e-4)')
#args = parser.parse_args()
#print("training epochs: {}, learning rate: {}, hidden dimension: {}, kernel size: {}".
#      format(args.epochs, args.lr, args.hidden_dim, args.ker_size))

In [2]:
# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        #data[tokens[0]] = map(float, tokens[1:])
        data[tokens[0]] = tokens[1:]
    return data

# entailment=0, contradict=1, neural=2
def read_snli_tsv(path):
    readin_data = [] 
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            if row[2] == 'label':
                continue
            if row[2] == 'entailment':
                readin_data.append((row[0],row[1], 0))
            elif row[2] == 'contradiction':
                readin_data.append((row[0],row[1], 1))
            else:
                readin_data.append((row[0],row[1], 2))
    return readin_data

data_vector = load_vectors('wiki-news-300d-1M.vec')

In [3]:
def build_vocab(data):
    # Returns:
    # id2char: list of chars, where id2char[i] returns char that corresponds to char i
    # char2id: dictionary where keys represent chars and corresponding values represent indices
    # some preprocessing
    max_len1 = max([len(word[0]) for word in data])
    max_len2 = max([len(word[1]) for word in data])
    max_len = max(max_len1, max_len2)
    all_chars = []
    for word in data:
        all_chars += word[0]
        all_chars += word[1]
    unique_chars = list(set(all_chars))

    id2char = unique_chars
    char2id = dict(zip(unique_chars, range(2,2+len(unique_chars))))
    id2char = ['<pad>', '<unk>'] + id2char
    char2id['<pad>'] = PAD_IDX
    char2id['<unk>'] = UNK_IDX

    return char2id, id2char, max_len

def covert_to_token(data):
    return [(tokenize(sample[0]), tokenize(sample[1]), sample[2]) for sample in data]

def tokenize(sent):
   tokens = tokenizer(sent)
   out_tokens = []
   for token in tokens:
        if token.lemma_ not in punctuations:
            if token.lemma_ in data_vector:
                out_tokens.append(token.lemma_.lower())
            else:
                out_tokens.append('<unk>')
   return out_tokens

### Function that preprocessed dataset
def read_data():
    train_data = read_snli_tsv('./hw2_data/snli_train.tsv')
    val_data = read_snli_tsv('./hw2_data/snli_val.tsv')
    train_data, val_data = covert_to_token(train_data), covert_to_token(val_data)
    char2id, id2char, max_len = build_vocab(train_data)
    return train_data, val_data, char2id, id2char, max_len



In [4]:
#train_data, val_data, char2id, id2char, MAX_WORD_LENGTH = read_data()
#print ("Maximum word length of dataset is {}".format(MAX_WORD_LENGTH))
#print ("Number of characters in dataset is {}".format(len(id2char)))
#print ("Characters:")
#print (char2id.keys())
#print (len(char2id.keys()))

In [5]:
import pickle
#pickle.dump((train_data, val_data, char2id, id2char, MAX_WORD_LENGTH), open("snli_predata_save.p", "wb"))
train_data, val_data, _, _ = pickle.load(open("snli_data_save.p", "rb"))

In [6]:
all_chars = []
for key in data_vector:
    all_chars.append(key)
unique_chars = list(set(all_chars))  
id2char = unique_chars
char2id = dict(zip(unique_chars, range(2,2+len(unique_chars))))
id2char = ['<pad>', '<unk>'] + id2char
char2id['<pad>'] = PAD_IDX
char2id['<unk>'] = UNK_IDX

In [7]:
def process_data(in_data):
    for i in range(len(in_data)):
        for ii in range(len(in_data[i][0])):
            if in_data[i][0][ii] not in data_vector:
                in_data[i][0][ii] = '<unk>'
        for ii in range(len(in_data[i][1])):
            if in_data[i][1][ii] not in data_vector:
                in_data[i][1][ii] = '<unk>'
    return in_data

train_data = process_data(train_data)
val_data = process_data(val_data)

In [8]:
embedding_matrix = torch.zeros(len(id2char), 300, dtype=torch.float)
for i in range(len(id2char)):
    if id2char[i] == '<unk>' or id2char[i] == '<pad>':
        continue
    embedding_matrix[i,:] = torch.from_numpy(np.reshape(np.array(data_vector[id2char[i]],dtype=np.float32),(1,300)))

In [9]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_tuple, char2id):
        """
        @param data_list: list of character
        @param target_list: list of targets
        """
        self.data_list1, self.data_list2, self.target_list = zip(*data_tuple)
        assert (len(self.data_list1) == len(self.target_list))
        self.char2id = char2id

    def __len__(self):
        return len(self.data_list1)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        char_idx1 = [self.char2id[c] if c in self.char2id.keys() else UNK_IDX  for c in self.data_list1[key][:MAX_WORD_LENGTH]]
        char_idx2 = [self.char2id[c] if c in self.char2id.keys() else UNK_IDX  for c in self.data_list2[key][:MAX_WORD_LENGTH]]
        char_idx = (char_idx1, char_idx2)
        len_char = (len(char_idx1), len(char_idx2))
        label = self.target_list[key]
        return [char_idx, len_char, label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list1 = []
    length_list2 = []
    for datum in batch:
        label_list.append(datum[2])
        if datum[1][0] > MAX_WORD_LENGTH:
            length_list1.append(MAX_WORD_LENGTH)
        else:
            length_list1.append(datum[1][0])
        if datum[1][1] > MAX_WORD_LENGTH:
            length_list2.append(MAX_WORD_LENGTH)
        else:
            length_list2.append(datum[1][1])
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0][0]),
                                pad_width=((0,MAX_WORD_LENGTH-datum[1][0])),
                                mode="constant", constant_values=0)
        padded_vec2 = np.pad(np.array(datum[0][1]),
                                pad_width=((0,MAX_WORD_LENGTH-datum[1][1])),
                                mode="constant", constant_values=0)
        
        data_list1.append(padded_vec1)
        data_list2.append(padded_vec2)
        
    #ind_dec_order1 = np.argsort(length_list1)[::-1]
    #ind_back_order1 = np.zeros_like(ind_dec_order1)
    #for i in range(len(ind_dec_order1)):
    #    ind_back_order1[ind_dec_order1[i]] = i
    #ind_dec_order2 = np.argsort(length_list2)[::-1]
    #ind_back_order2 = np.zeros_like(ind_dec_order2)
    #for i in range(len(ind_dec_order2)):
    #    ind_back_order2[ind_dec_order2[i]] = i
    
    
    #data_list1 = np.array(data_list1)[ind_dec_order1]
    #data_list2 = np.array(data_list2)[ind_dec_order2]
    
    #length_list1 = np.array(length_list1)[ind_dec_order1]
    #length_list2 = np.array(length_list2)[ind_dec_order2]
    
    #label_list1 = np.array(label_list1)[ind_dec_order1]
    #label_list2 = np.array(label_list2)[ind_dec_order2]
    
    out_data = [torch.from_numpy(np.array(data_list1)), torch.from_numpy(np.array(data_list2))]
    #out_length = [torch.LongTensor(np.array(length_list1)), torch.LongTensor(np.array(length_list2))]
    #ind_back = (ind_back_order1, ind_back_order2)
    label_list = torch.LongTensor(np.array(label_list))
    return (out_data, label_list)



In [10]:
# Build train, valid and test dataloaders

train_dataset = VocabDataset(train_data, char2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_data, char2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

#test_dataset = VocabDataset(test_data, char2id)
#test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                           batch_size=BATCH_SIZE,
#                                           collate_fn=vocab_collate_func,
#                                           shuffle=False)

# Now lets implement basic Convolutional Neural Net model for text

In [11]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        #self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        #self.conv1_1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        #self.conv1_2 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear1 = nn.Linear(hidden_size*2, int(hidden_size*0.5))
        self.linear2 = nn.Linear(int(hidden_size*0.5), num_classes)
        
    def forward(self, x):
        batch_size, seq_len1 = x[0].size()
        batch_size, seq_len2 = x[1].size()

        embed1 = embedding_matrix[x[0],:]
        hidden1 = self.conv1(embed1.transpose(1,2)).transpose(1,2)
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size, seq_len1, hidden1.size(-1))
        hidden1 = self.conv2(hidden1.transpose(1,2)).transpose(1,2)
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size, seq_len1, hidden1.size(-1))
        hidden1,_ = torch.max(hidden1, dim=1)
        
        embed2 = embedding_matrix[x[1],:]
        hidden2 = self.conv1(embed2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size, seq_len2, hidden2.size(-1))
        hidden2 = self.conv2(hidden2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size, seq_len2, hidden2.size(-1))
        hidden2,_ = torch.max(hidden2, dim=1)
        
        hidden = torch.cat((hidden1, hidden2), 1)
        hidden = F.relu(self.linear1(hidden))
        logits = self.linear2(hidden)
        return logits

In [39]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, labels in loader:
        data_batch, label_batch = data, labels
        outputs = F.softmax(model(data_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

model = CNN(emb_size=300, hidden_size=200, num_layers=2, num_classes=3, vocab_size=len(id2char))
test_num = 0
val_acc = []
num_epochs = 7
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (data, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc.append(test_model(val_loader, model))
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc[test_num]))                
            test_num += 1
print('training acc: ',test_model(train_loader, model))

Epoch: [1/4], Step: [101/782], Validation Acc: 48.9
Epoch: [1/4], Step: [101/782], Validation Acc: 48.9
Epoch: [1/4], Step: [201/782], Validation Acc: 56.6
Epoch: [1/4], Step: [201/782], Validation Acc: 56.6
Epoch: [1/4], Step: [301/782], Validation Acc: 59.2
Epoch: [1/4], Step: [301/782], Validation Acc: 59.2
Epoch: [1/4], Step: [401/782], Validation Acc: 58.7
Epoch: [1/4], Step: [401/782], Validation Acc: 58.7
Epoch: [1/4], Step: [501/782], Validation Acc: 59.3
Epoch: [1/4], Step: [501/782], Validation Acc: 59.3
Epoch: [1/4], Step: [601/782], Validation Acc: 59.6
Epoch: [1/4], Step: [601/782], Validation Acc: 59.6
Epoch: [1/4], Step: [701/782], Validation Acc: 60.6
Epoch: [1/4], Step: [701/782], Validation Acc: 60.6
Epoch: [2/4], Step: [101/782], Validation Acc: 61.6
Epoch: [2/4], Step: [101/782], Validation Acc: 61.6
Epoch: [2/4], Step: [201/782], Validation Acc: 61.6
Epoch: [2/4], Step: [201/782], Validation Acc: 61.6
Epoch: [2/4], Step: [301/782], Validation Acc: 63.0
Epoch: [2/4]

In [None]:
print("training epochs: {}, learning rate: {}, hidden dimension: {}, kernel size: {}".
      format(args.epochs, args.lr, args.hidden_dim, args.ker_size))
torch.save(model.state_dict(), './model/cnn_text_e{}_h{}_k{}.pth'.format(args.epochs,
                                                                    args.hidden_dim, args.ker_size))

In [12]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, labels in loader:
        data_batch, label_batch = data, labels
        outputs = F.softmax(model(data_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

model = CNN(emb_size=300, hidden_size=200, num_layers=2, num_classes=3, vocab_size=len(id2char))
model.load_state_dict(torch.load('./model/cnn_text_e7_h200_k3.pth'))

In [14]:
print('validation acc: ',test_model(val_loader, model))

validation acc:  67.4


In [15]:
# entailment=0, contradict=1, neural=2
def read_mnli_tsv(path):
    telephone_data = [] 
    travel_data = [] 
    fiction_data = [] 
    government_data = [] 
    slate_data = [] 
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            if row[2] == 'label':
                continue
            if row[3] == 'telephone': 
                if row[2] == 'entailment':
                    telephone_data.append((row[0],row[1], 0))
                elif row[2] == 'contradiction':
                    telephone_data.append((row[0],row[1], 1))
                else:
                    telephone_data.append((row[0],row[1], 2))
            if row[3] == 'travel': 
                if row[2] == 'entailment':
                    travel_data.append((row[0],row[1], 0))
                elif row[2] == 'contradiction':
                    travel_data.append((row[0],row[1], 1))
                else:
                    travel_data.append((row[0],row[1], 2))
            if row[3] == 'fiction': 
                if row[2] == 'entailment':
                    fiction_data.append((row[0],row[1], 0))
                elif row[2] == 'contradiction':
                    fiction_data.append((row[0],row[1], 1))
                else:
                    fiction_data.append((row[0],row[1], 2))
            if row[3] == 'government': 
                if row[2] == 'entailment':
                    government_data.append((row[0],row[1], 0))
                elif row[2] == 'contradiction':
                    government_data.append((row[0],row[1], 1))
                else:
                    government_data.append((row[0],row[1], 2))
            if row[3] == 'slate': 
                if row[2] == 'entailment':
                    slate_data.append((row[0],row[1], 0))
                elif row[2] == 'contradiction':
                    slate_data.append((row[0],row[1], 1))
                else:
                    slate_data.append((row[0],row[1], 2))
                    
    return (telephone_data, travel_data, fiction_data, government_data, slate_data)

In [22]:
telephone_data, travel_data, fiction_data, government_data, slate_data = read_mnli_tsv('./hw2_data/mnli_val.tsv')

In [None]:
telephone_data = covert_to_token(telephone_data)
travel_data = covert_to_token(travel_data)
fiction_data = covert_to_token(fiction_data)
government_data = covert_to_token(government_data)
slate_data = covert_to_token(slate_data)

In [18]:
telephone_data = process_data(telephone_data)
travel_data = process_data(travel_data)
fiction_data = process_data(fiction_data)
government_data = process_data(government_data)
slate_data = process_data(slate_data)

In [19]:
telephone_dataset = VocabDataset(telephone_data, char2id)
telephone_loader = torch.utils.data.DataLoader(dataset=telephone_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)
travel_dataset = VocabDataset(travel_data, char2id)
travel_loader = torch.utils.data.DataLoader(dataset=travel_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)
fiction_dataset = VocabDataset(fiction_data, char2id)
fiction_loader = torch.utils.data.DataLoader(dataset=fiction_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)
government_dataset = VocabDataset(government_data, char2id)
government_loader = torch.utils.data.DataLoader(dataset=government_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)
slate_dataset = VocabDataset(slate_data, char2id)
slate_loader = torch.utils.data.DataLoader(dataset=slate_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

In [20]:
print('telephone', test_model(telephone_loader, model))
print('travel', test_model(travel_loader, model))
print('fiction', test_model(fiction_loader, model))
print('government', test_model(government_loader, model))
print('slate', test_model(slate_loader, model))

telephone 45.37313432835821
travel 45.21384928716904
fiction 42.814070351758794
government 44.09448818897638
slate 44.21157684630739
