In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
from tqdm import tqdm_notebook as tqdm
import pdb
import matplotlib.pyplot as plt
import io
random.seed(134)
%matplotlib inline

# Reserved vocabulary ids: row 0 is the padding token, row 1 the unknown-token fallback.
PAD_IDX = 0
UNK_IDX = 1
# Default mini-batch size passed to the DataLoaders below.
BATCH_SIZE = 32
# Label names; a label's position in this list is its integer class id.
idx2lab = ["contradiction", "entailment", "neutral"]
# Maximum number of distinct corpus tokens kept by build_vocab (the two special
# tokens are added on top of this).
max_vocab_size = 20000
# Read by the plotting cell, which expects config-string keys mapping to dicts
# with "loss" and "val_acc" entries.
results = {}

In [None]:
def read_file(file_name):
    """Load one tab-separated SNLI split.

    The first line is treated as a header and skipped. Every following
    non-blank line must hold exactly three tab-separated fields: the first
    sentence, the second sentence and the label name. Sentences are split
    into tokens on single spaces.

    Args:
        file_name: Path of the TSV file to read.

    Returns:
        A pair ``(examples, all_tokens)`` where ``examples`` is a list of
        ``[s1_tokens, s2_tokens, label_id]`` and ``all_tokens`` is the flat
        list of every token in file order.

    Raises:
        ValueError: If a data line does not have three fields or its label
            is not listed in ``idx2lab``.
    """
    train_data, all_tokens = [], []
    last_line = None
    with open(file_name, "r") as f:
        f.readline()  # skip the header row
        for line in f:
            # A trailing blank line would otherwise break the 3-way unpack below.
            if not line.strip():
                continue
            s1, s2, label = line.split("\t")
            s1, s2, label = s1.split(" "), s2.split(" "), idx2lab.index(label.strip())
            all_tokens.extend(s1)
            all_tokens.extend(s2)
            train_data.append([s1, s2, label])
            last_line = line
    # Sanity output: number of examples and the last raw line that was parsed.
    print(len(train_data))
    if last_line is not None:
        # Guarded so a header-only file no longer raises on an unbound name.
        print(last_line)
    return train_data, all_tokens

In [None]:
def build_vocab(all_tokens):
    """Build the token <-> id mappings from a flat list of tokens.

    The ``max_vocab_size`` most frequent tokens are kept. Ids 0 and 1 are
    reserved for ``'<pad>'`` and ``'<unk>'``; corpus tokens start at id 2 in
    descending frequency order.

    Args:
        all_tokens: Iterable of token strings (may be empty).

    Returns:
        A pair ``(token2id, id2token)``: a dict from token to id and the list
        whose position ``i`` holds the token with id ``i``.
    """
    token_counter = Counter(all_tokens)
    print("There are {} unique words. ".format(len(token_counter)))
    # A comprehension (rather than zip(*...)) also works when there are no tokens.
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = {token: index for index, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

def read_data():
    """Read the SNLI train/validation files and build the vocabulary.

    The vocabulary is built from the training tokens only; the validation
    tokens returned by read_file are discarded.

    Returns:
        ``(train_data, val_data, token2id, id2token)``.
    """
    snli_train, train_tokens = read_file("snli_train.tsv")
    snli_val, _ = read_file("snli_val.tsv")
    vocab_lookup, vocab_list = build_vocab(train_tokens)
    return snli_train, snli_val, vocab_lookup, vocab_list

def token2index_dataset(tokens_data):
    """Map the tokens of every example to vocabulary ids.

    Args:
        tokens_data: Iterable of examples whose first two items are token
            lists and whose third item is the label.

    Returns:
        List of ``[s1_ids, s2_ids, label]``; tokens missing from the
        module-level ``token2id`` become ``UNK_IDX``.
    """
    def _encode(sentence):
        # dict.get gives the same "known id or UNK" fallback in one lookup.
        return [token2id.get(word, UNK_IDX) for word in sentence]

    return [[_encode(sample[0]), _encode(sample[1]), sample[2]]
            for sample in tokens_data]

In [None]:
# Load both SNLI splits; the vocabulary comes from the training tokens only
# (read_data discards the validation tokens).
train_data, val_data, token2id, id2token = read_data()
# Replace token strings by vocabulary ids. The names are rebound, so the tokenised
# text is no longer reachable after this line.
train_data, val_data = token2index_dataset(train_data), token2index_dataset(val_data)

In [None]:
# Truncation/padding length: the 95th-percentile length of the first sentence (x[0])
# over the training examples. Second-sentence lengths are not considered here.
MAX_SENTENCE_LENGTH = sorted([len(x[0]) for x in train_data])[int(len(train_data) * 0.95)]

In [None]:
def load_vectors(fname, all_tokens):
    """Read word vectors from a text ``.vec`` file, keeping only wanted tokens.

    The first line of the file must be a ``"<count> <dim>"`` header. Every
    other line is a token followed by its space-separated float components.

    Args:
        fname: Path of the vector file.
        all_tokens: Iterable of tokens to keep; vectors for other tokens are
            skipped.

    Returns:
        Dict mapping each kept token to its vector as a list of floats.

    Raises:
        ValueError: If the header is not two integers or a kept line has a
            non-numeric component.
    """
    # A set gives O(1) membership tests; the file has one line per vector.
    wanted = set(all_tokens)
    data = {}
    # The context manager closes the file even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Parsing the header both skips it and checks the expected file format.
        _num_vectors, _dim = map(int, fin.readline().split())
        for line in tqdm(fin):
            tokens = line.rstrip().split(' ')
            if tokens[0] in wanted:
                data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [None]:
# One-off cache creation (kept for reference): filter the fastText vectors down to the
# vocabulary and pickle the result.
# pretrained = load_vectors("/scratch/yn811/wiki-news-300d-1M.vec", id2token)
# pkl.dump(pretrained, open("/scratch/yn811/hw2_pretrained.pickle", "wb"))
# NOTE(review): pickle.load executes arbitrary code from the file; only load a cache
# that was produced by the two lines above.
with open("/scratch/yn811/hw2_pretrained.pickle", "rb") as cache_file:
    # The context manager closes the handle that a bare open() would leak.
    pretrained = pkl.load(cache_file)

In [None]:
# One flag per get_pretrain_emb call: 1 when the token had no pretrained vector, else 0.
notPretrained = []
def get_pretrain_emb(pretrained, token):
    """Return the embedding row for ``token`` and record whether it was pretrained.

    Args:
        pretrained: Mapping from token to its pretrained vector.
        token: Vocabulary entry to look up.

    Returns:
        The vector stored in ``pretrained``, or a list of 300 zeros for
        ``'<pad>'`` and for tokens that are missing from ``pretrained``.

    Side effect:
        Appends one flag to the module-level ``notPretrained`` list: 0 for
        ``'<pad>'`` and for found tokens, 1 for missing tokens.
    """
    if token == '<pad>':
        notPretrained.append(0)
        return [0] * 300
    if token in pretrained:
        notPretrained.append(0)
        return pretrained[token]
    else:
        notPretrained.append(1)
        return [0] * 300
# Rows are produced in vocabulary order, so row i belongs to id2token[i].
embeddings = [get_pretrain_emb(pretrained, token) for token in id2token]
# Turn the flags into a (vocab_size, 1) float tensor on the GPU; Model.forward multiplies
# the embed2 weights by it. The name is rebound from a list to a tensor here, so
# get_pretrain_emb cannot append again unless this cell is re-run from the top.
notPretrained = torch.FloatTensor(np.array(notPretrained)[:, np.newaxis]).cuda()
print("There are {} not pretrained words out of {} total words.".format(notPretrained.sum().cpu().data, len(notPretrained)))

In [None]:
class SNLIDataset(Dataset):
    """Dataset over ``[s1_ids, s2_ids, label]`` examples.

    Items are truncated to the module-level ``MAX_SENTENCE_LENGTH`` when they
    are fetched; the stored sequences are left untouched.
    """

    def __init__(self, data_list):
        """Split ``data_list`` (rows of three items) into three parallel tuples."""
        columns = tuple(zip(*data_list))
        self.s1_list, self.s2_list, self.target_list = columns
        assert (len(self.s1_list) == len(self.target_list))

    def __len__(self):
        """Number of examples."""
        return len(self.target_list)

    def __getitem__(self, key):
        """Return ``[(s1, s2), (len(s1), len(s2)), label]`` for example ``key``.

        Both sentences are cut to at most ``MAX_SENTENCE_LENGTH`` ids and the
        reported lengths are the lengths after truncation.
        """
        limit = MAX_SENTENCE_LENGTH
        first = self.s1_list[key][:limit]
        second = self.s2_list[key][:limit]
        sizes = (len(first), len(second))
        return [(first, second), sizes, self.target_list[key]]

def sort_unsort(length_list):
    """Return the permutation that sorts lengths in decreasing order, and its inverse.

    Args:
        length_list: Sequence of sequence lengths.

    Returns:
        ``(order, inverse)`` as integer arrays: ``order`` lists positions from
        longest to shortest, and indexing a batch reordered by ``order`` with
        ``inverse`` restores the original order.
    """
    ascending = np.argsort(length_list)
    # Copy so the reversed result has positive strides (a plain reversed view does not).
    order = ascending[::-1].copy()
    # Inverse of a permutation by scatter: position order[j] maps back to j.
    inverse = np.empty_like(order)
    inverse[order] = np.arange(order.shape[0])
    return order, inverse

def _collate_func(batch):
    """
    Collate function for DataLoader: right-pads every sentence with zeros up to
    MAX_SENTENCE_LENGTH and moves the whole batch to the GPU.

    Each batch item is expected in the form produced by SNLIDataset:
    [(s1_ids, s2_ids), (s1_len, s2_len), label].

    Returns a list of five entries:
      (s1, s2) padded index tensors, (s1, s2) length tensors,
      (s1, s2) indices that sort by decreasing length, (s1, s2) indices that
      undo that sort, and the label tensor.
    """
    def _pad_to_max(indices, length):
        # Zero-fill on the right only; the left pad width is 0.
        return np.pad(np.array(indices),
                      pad_width=((0, MAX_SENTENCE_LENGTH - length)),
                      mode="constant", constant_values=0)

    def _long_cuda(values):
        return torch.LongTensor(values).cuda()

    s1_rows, s2_rows = [], []
    s1_lengths, s2_lengths = [], []
    labels = []
    for datum in batch:
        sentences, sizes = datum[0], datum[1]
        labels.append(datum[2])
        s1_lengths.append(sizes[0])
        s2_lengths.append(sizes[1])
        s1_rows.append(_pad_to_max(sentences[0], sizes[0]))
        s2_rows.append(_pad_to_max(sentences[1], sizes[1]))

    # The two sentences are sorted independently, so each gets its own permutation pair.
    s1_sort, s1_unsort = sort_unsort(s1_lengths)
    s2_sort, s2_unsort = sort_unsort(s2_lengths)

    padded = (torch.from_numpy(np.array(s1_rows)).cuda(),
              torch.from_numpy(np.array(s2_rows)).cuda())
    return [padded,
            (_long_cuda(s1_lengths), _long_cuda(s2_lengths)),
            (_long_cuda(s1_sort), _long_cuda(s2_sort)),
            (_long_cuda(s1_unsort), _long_cuda(s2_unsort)),
            _long_cuda(labels)]

In [None]:
# SNLI training loader; batches are reshuffled every epoch.
train_dataset = SNLIDataset(train_data)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=_collate_func,
                                           shuffle=True)

# SNLI validation loader; fixed order. Both names are rebound by later cells
# (error analysis and the per-genre MNLI loops).
val_dataset = SNLIDataset(val_data)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=_collate_func,
                                           shuffle=False)

In [None]:
def test_model(loader, model):
    correct = 0
    total = 0
    model.eval()
    for data, lengths, ind_sort, ind_unsort, labels in loader:
        outputs = F.softmax(model(data, lengths, ind_sort, ind_unsort), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [None]:
def init(layer):
    """Re-initialise every weight matrix of a recurrent layer with Xavier-normal values.

    Parameters whose name does not contain ``'weight'`` (the biases) are left
    as they are.

    Args:
        layer: Recurrent module exposing ``_all_weights`` (lists of parameter
            names, one list per layer/direction).

    Returns:
        The same ``layer`` object, modified in place.
    """
    # Flatten the per-layer name lists, keeping their original order.
    weight_names = [name
                    for group in layer._all_weights
                    for name in group
                    if 'weight' in name]
    for name in weight_names:
        nn.init.xavier_normal_(getattr(layer, name))
    return layer

class CNN(nn.Module):
    """Two-layer 1-D convolutional sentence encoder with max-over-time pooling.

    Args:
        emb_size: Number of input channels (embedding dimension).
        hidden_size: Number of output channels of both convolutions.
        num_layers: Stored on the instance; not used by this encoder.
        encoder_dropout: Accepted for signature parity with RNN; unused.
        kernel_size: Width of both convolutions. Padding is ``kernel_size // 2``
            on each side.
        same_encoder: Accepted for signature parity with RNN; unused.
    """

    def __init__(self, emb_size, hidden_size, num_layers, encoder_dropout, kernel_size, same_encoder):
        super(CNN, self).__init__()
        self.num_layers, self.hidden_size = num_layers, hidden_size
        padding = kernel_size // 2
        # The Sequential positions (0: conv, 1: ReLU, 2: conv, 3: ReLU) determine the
        # state-dict keys, so the layout must stay stable for saved checkpoints.
        self.cnn = nn.Sequential(
            nn.Conv1d(emb_size, hidden_size, kernel_size=kernel_size, stride=1, padding=padding),
            nn.ReLU(),
            # The second convolution now follows `kernel_size` as well; it used to be
            # fixed at 3 while its padding was derived from `kernel_size`.
            nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, stride=1, padding=padding),
            nn.ReLU(),
        )

    def forward(self, x):
        """Encode a batch of embedded sentences.

        Args:
            x: Tensor of shape (batch, seq_len, emb_size).

        Returns:
            Tensor of shape (batch, hidden_size): per-channel maximum over the
            sequence axis.
        """
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, seq_len)
        out = self.cnn(x)      # (batch, hidden_size, seq_len')
        return out.max(dim=2)[0]

class RNN(nn.Module):
    """Bidirectional GRU sentence encoder.

    The sentence representation is the final hidden state of the last GRU
    layer, forward and backward directions concatenated.

    Args:
        emb_size: Input feature size (embedding dimension).
        hidden_size: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        encoder_dropout: Accepted for signature parity with CNN; unused.
        kernel_size: Accepted for signature parity with CNN; unused.
        same_encoder: When False, the output is passed through BatchNorm1d.
    """

    def __init__(self, emb_size, hidden_size, num_layers, encoder_dropout, kernel_size, same_encoder):
        super(RNN, self).__init__()
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.same_encoder = same_encoder
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.rnn = init(self.rnn)
        self.bn = nn.BatchNorm1d(2 * hidden_size)

    def init_hidden(self, batch_size):
        """Return a freshly sampled (Xavier-normal) initial hidden state.

        The tensor has shape (num_layers * 2, batch_size, hidden_size) and is
        created on the same device as the module's parameters.

        NOTE(review): the state is re-sampled on every call, including in
        evaluation mode, so repeated evaluations are not bit-for-bit
        reproducible.
        """
        device = next(self.parameters()).device
        hidden = torch.empty(self.num_layers * 2, batch_size, self.hidden_size, device=device)
        nn.init.xavier_normal_(hidden)
        return hidden

    def forward(self, x, lengths):
        """Encode a batch of embedded sentences.

        Args:
            x: Tensor of shape (batch, seq_len, emb_size). Rows must already be
                ordered by decreasing length, as pack_padded_sequence requires
                by default.
            lengths: 1-D integer tensor with the true length of every row.

        Returns:
            Tensor of shape (batch, 2 * hidden_size); also stored on
            ``self.hidden``.
        """
        batch_size = x.size(0)
        initial = self.init_hidden(batch_size)
        packed = torch.nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True)
        _, final = self.rnn(packed, initial)
        # h_n is laid out layer by layer, two directions each; the last two entries
        # belong to the top layer. With num_layers == 1 this is the whole tensor.
        top = final[-2:]
        encoded = top.transpose(0, 1).contiguous().view(batch_size, self.hidden_size * 2)
        if not self.same_encoder:
            encoded = self.bn(encoded)
        self.hidden = encoded
        return encoded
    
class Model(nn.Module):
    """Sentence-pair classifier: embeddings, one encoder per sentence, and an MLP head.

    Both sentences are embedded with ``embed1``. When ``train_oov`` is set, a
    second embedding ``embed2`` is added on top after its weights have been
    multiplied by the module-level ``notPretrained`` mask. The two encoded
    sentences are combined (element-wise product for the RNN encoder,
    concatenation for the CNN encoder) and classified by ``fc``.
    """
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size, kernel_size,
                 fc_hidden_size, fc_dropout, drop_out_prob, train_oov, encoder, same_encoder, encoder_dropout):
        """Build the embeddings, encoder(s) and classifier head.

        Args:
            emb_size: Embedding dimension.
            hidden_size: Encoder hidden size.
            num_layers: Forwarded to the encoder constructor.
            num_classes: Number of output classes.
            vocab_size: Number of rows of both embedding tables.
            kernel_size: Forwarded to the encoder constructor.
            fc_hidden_size: Width of the hidden layer of the classifier head.
            fc_dropout: Accepted but not used in this class.
            drop_out_prob: Accepted but not used in this class.
            train_oov: Whether forward() adds the masked ``embed2`` embedding.
            encoder: ``"RNN"`` selects the RNN class; any other value selects CNN.
            same_encoder: When true, both sentences share a single encoder module.
            encoder_dropout: Forwarded to the encoder constructor.
        """
        super(Model, self).__init__()
        
        self.hidden_size = hidden_size
        self.train_oov = train_oov
        self.encoder = encoder
        self.embed1 = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.embed2 = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        encoder_type = RNN if encoder == "RNN" else CNN
        if same_encoder:
            # `encoder` is rebound from the type string to the module here; the string
            # was already saved on self.encoder above.
            encoder = encoder_type(emb_size, hidden_size, num_layers, encoder_dropout, kernel_size, same_encoder)
            # Both attributes point at the same module, so its weights are shared.
            self.encoder1 = encoder
            self.encoder2 = encoder
        else:
            self.encoder1 = encoder_type(emb_size, hidden_size, num_layers, encoder_dropout, kernel_size, same_encoder)
            self.encoder2 = encoder_type(emb_size, hidden_size, num_layers, encoder_dropout, kernel_size, same_encoder)
#         factor = 4 if self.encoder == "RNN" else 2
        # Input width of the head is factor * hidden_size. This matches both combine
        # steps in forward(): the product of two bidirectional RNN outputs and the
        # concatenation of two CNN outputs are each 2 * hidden_size wide.
        factor = 2
        # RNN head: Linear -> ReLU -> Linear. CNN head: the same, preceded by BatchNorm1d.
        self.fc = nn.Sequential(nn.Linear(factor*hidden_size, fc_hidden_size),
                                nn.ReLU(),
                               nn.Linear(fc_hidden_size, num_classes)) if self.encoder == "RNN" else nn.Sequential(
            nn.BatchNorm1d(factor*hidden_size),
            nn.Linear(factor*hidden_size, fc_hidden_size),
            nn.ReLU(),
            nn.Linear(fc_hidden_size, num_classes))
        
    def forward(self, x, lengths, ind_sort, ind_unsort):
        """Return unnormalised class scores for a batch of sentence pairs.

        Args:
            x: Pair of index tensors (first sentences, second sentences).
            lengths: Pair of length tensors; only used by the RNN encoder.
            ind_sort: Pair of index tensors used to reorder each sentence batch
                before the RNN encoder.
            ind_unsort: Pair of index tensors that restore the original order.

        Returns:
            The output of the classifier head, one row per example.
        """
        batch_size = x[0].size(0)
        
        # Both sentences are looked up in the same table, embed1.
        hid1 = self.embed1(x[0])
        hid2 = self.embed1(x[1])
        if self.train_oov:
            # Multiply every embed2 row by its notPretrained flag (in place, on each call)
            # so rows whose flag is 0 contribute nothing to the sum below.
            self.embed2.weight.data.mul_(notPretrained)
            hid1 += self.embed2(x[0])
            hid2 += self.embed2(x[1])
        
        if self.encoder == "RNN":
            # Reorder each sentence batch with its own ind_sort before encoding.
            hid1 = self.encoder1(hid1.index_select(0, ind_sort[0]), lengths[0].index_select(0, ind_sort[0]))
            hid2 = self.encoder2(hid2.index_select(0, ind_sort[1]), lengths[1].index_select(0, ind_sort[1]))
            
#             out = torch.cat((hid1.index_select(0, ind_unsort[0]), hid2.index_select(0, ind_unsort[1])), dim=1)
            # Undo both reorderings so rows line up again, then combine element-wise.
            out = hid1.index_select(0, ind_unsort[0]) * hid2.index_select(0, ind_unsort[1])
        else:
            # The CNN encoder takes no lengths; the two encodings are concatenated.
            hid1 = self.encoder1(hid1)
            hid2 = self.encoder2(hid2)
            out = torch.cat((hid1, hid2), dim=1)
#             out = hid1 * hid2

        out = self.fc(out)
        return out

In [None]:
def train(fail_tol, label=""):
    """Train the module-level ``model`` with early stopping on validation accuracy.

    Reads the notebook globals ``model``, ``learning_rate``, ``train_loader``,
    ``val_loader`` and ``encoder``. Every 100 training steps the model is
    evaluated with ``test_model``; whenever the accuracy improves, the state
    dict is saved to ``'model' + encoder + '.ckpt'`` (no label) or
    ``'model' + encoder + '-' + label + '.ckpt'`` (with a label).

    Args:
        fail_tol: Training stops once more than this many consecutive
            validations have failed to improve on the best accuracy.
        label: Optional suffix that distinguishes checkpoints, e.g. a genre name.

    Returns:
        ``(loss_list, val_acc_list)``: the training loss sampled every 20 steps
        and the validation accuracy of every evaluation.
    """
    num_epochs = 100

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.1)

    # Without a label the name is 'model<encoder>.ckpt', which is the file the
    # evaluation and fine-tuning cells load; the separator is only added when
    # there is a label to separate.
    checkpoint_path = 'model' + encoder + ("-" + label if label else "") + '.ckpt'

    loss_list, val_acc_list = [], []
    fail_cnt, cur_best = 0, 0
    for epoch in range(num_epochs):
        for i, (data, lengths, ind_sort, ind_unsort, labels) in enumerate(train_loader):
            # test_model leaves the model in eval mode, so switch back on every step.
            model.train()
            optimizer.zero_grad()

            outputs = model(data, lengths, ind_sort, ind_unsort)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            if i % 20 == 0:
                loss_list.append(loss.item())
            if i > 0 and i % 100 == 0:
                val_acc = test_model(val_loader, model)
                val_acc_list.append(val_acc)
                if (val_acc > cur_best):
                    print("found best! Current best: {}. save model...".format(val_acc))
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved")
                    cur_best = val_acc
                    fail_cnt = 0
                else:
                    fail_cnt += 1
                print("fail to improve {}/{} times".format(fail_cnt, fail_tol))
                if fail_cnt > fail_tol:
                    return loss_list, val_acc_list

                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))
                # The scheduler is stepped once per validation, not once per epoch.
                scheduler.step(val_acc)
    return loss_list, val_acc_list

In [None]:
# Encoder type for this run; train() also reads it to build the checkpoint file name.
encoder = "CNN"
# --------#
# Per-encoder hyper-parameters. The RNN branch uses separate encoders per sentence,
# the CNN branch a single shared encoder.
if encoder == "RNN":
    same_encoder = False
    learning_rate = 3e-3
    hidden_size = 64
    kernel_size = 3
    fc_hidden_size = 128
else:
    same_encoder = True
    learning_rate = 3e-4
    hidden_size = 256
    kernel_size = 3
    fc_hidden_size = 192
# The three dropout settings are passed to Model, which does not use them.
encoder_dropout = 0
fc_dropout = False
drop_out_prob = 0
# Whether the pretrained table (embed1) is updated during training.
emb_requires_grad = False
# Whether Model adds the masked second embedding (embed2) for tokens without a pretrained vector.
train_oov = True
model = Model(emb_size=300, hidden_size=hidden_size, num_layers=1, num_classes=3, vocab_size=len(id2token), kernel_size=kernel_size,
              fc_hidden_size=fc_hidden_size, fc_dropout=fc_dropout, drop_out_prob=drop_out_prob, train_oov=train_oov, 
              encoder=encoder, same_encoder=same_encoder, encoder_dropout=encoder_dropout)
# Copy the pretrained rows into embed1 and set whether they receive gradients.
model.embed1.weight.data.copy_(torch.from_numpy(np.array(embeddings)))
model.embed1.weight.requires_grad = emb_requires_grad
model.cuda()
#--------#
# Train with an early-stopping tolerance of 32 non-improving validations.
loss_list, val_acc_list = train(32)
# Key under which this run would be stored in `results`; the plotting cell parses it
# back by splitting on "_" and "-".
config = "hiddenSize-{}_fcHiddenSize-{}_encoder-{}_shareEncoder-{}_lr-{}_kernelSize-{}_reg-batchnorm_encReg-tbatchnorm".format(
    hidden_size, fc_hidden_size, encoder, same_encoder, learning_rate, kernel_size)
# NOTE(review): storing the curves is commented out, so `results` stays as it was
# and the plotting cell only shows runs recorded some other way.
# results[config] = {"loss": loss_list, "val_acc": val_acc_list, "best_val": max(val_acc_list)}
# pkl.dump(results, open("ablation.pkl", "wb"))

In [None]:
def get_info(s):
    """Parse one ``name-value`` fragment of a config key into a tuple.

    Rules, checked in this order on the first dash-separated part:
      * ``"Xavier"``            -> ``("init", "Xavier")``
      * ``"batchnorm"``         -> ``("reg", "batchnorm")``
      * ``"add"`` or ``"mul"``  -> ``("combo", <that word>)``
      * exactly three parts     -> ``(first, second + "-" + third)``, which
        keeps values such as ``7e-05`` in one piece
      * anything else           -> the tuple of parts unchanged
    """
    parts = tuple(s.split("-"))
    head = parts[0]
    fixed = {"Xavier": ("init", "Xavier"), "batchnorm": ("reg", "batchnorm")}
    if head in fixed:
        return fixed[head]
    if head in ("add", "mul"):
        return ("combo", head)
    if len(parts) == 3:
        return (head, "-".join(parts[1:]))
    return parts
# 2 x 3 grid: top row shows smoothed training loss, bottom row validation accuracy.
# Columns: CNN | RNN with shared encoder | RNN with separate encoders.
fig, ax = plt.subplots(2, 3, figsize=(20, 12))

for key, value in results.items():
    # A config key looks like "name-value_name-value_..."; turn it into a dict.
    info = dict(get_info(k) for k in key.split("_"))
    if info["encoder"] == "CNN":
        col = 0
    elif info["encoder"] == "RNN" and info["shareEncoder"] == "True":
        col = 1
    elif info["encoder"] == "RNN" and info["shareEncoder"] == "False":
        col = 2
    else:
        continue  # configuration that belongs to none of the three panels

    label = "fcHiddenSize: {}, lr: {}".format(info["fcHiddenSize"], info["lr"])
    losses = value["loss"]
    # Mean over consecutive, non-overlapping windows of five recorded losses.
    smoothed = [sum(losses[i-5:i])/5 for i in range(5, len(losses), 5)]
    ax[0, col].plot(smoothed, label=label)
    ax[0, col].legend()
    ax[1, col].plot(value["val_acc"], label=label)
    ax[1, col].legend()

column_titles = ["CNN based model ({}) -- shared encoder",
                 "RNN based model ({}) -- shared encoder",
                 "RNN based model ({}) -- separate encoders"]
for p, title in enumerate(column_titles):
    ax[0,p].set_title(title.format("loss"))
    ax[1,p].set_title(title.format("val acc"))
    ax[0,p].set_ylabel("moving average of loss (window_size=5)")
    ax[0,p].set_xlabel("num steps/100")
    ax[1,p].set_ylabel("validation accuracy (%)")
    ax[1,p].set_xlabel("num steps/100")
plt.savefig("training_curve")

### Evaluation and error analysis

In [None]:
def read_mnli_file(file_name):
    """Load one tab-separated MNLI split.

    The first line is a header; it is printed and skipped. Every following
    non-blank line must hold exactly four tab-separated fields: first
    sentence, second sentence, label name and genre name. Sentences are split
    into tokens on single spaces.

    Args:
        file_name: Path of the TSV file to read.

    Returns:
        A pair ``(examples, all_tokens)`` where ``examples`` is a list of
        ``[s1_tokens, s2_tokens, label_id, genre_id]`` and ``all_tokens`` is
        the flat list of every token in file order.

    Raises:
        ValueError: If a data line does not have four fields, its label is not
            in ``idx2lab`` or its genre is not in ``idx2genre``.
    """
    train_data, all_tokens = [], []
    with open(file_name, "r") as f:
        print(f.readline())
        for line in f:
            # A trailing blank line would otherwise break the 4-way unpack below.
            if not line.strip():
                continue
            s1, s2, label, genre = line.split("\t")
            s1, s2, label, genre = s1.split(" "), s2.split(" "), idx2lab.index(label.strip()), idx2genre.index(genre.strip())
            all_tokens.extend(s1)
            all_tokens.extend(s2)
            train_data.append([s1, s2, label, genre])
    print(len(train_data))
    return train_data, all_tokens
def token2index_mnli_dataset(tokens_data):
    """Map MNLI examples to vocabulary ids and split off their genre ids.

    Args:
        tokens_data: Iterable of ``[s1_tokens, s2_tokens, label, genre]`` rows.

    Returns:
        ``(indices_data, genres)``: a list of ``[s1_ids, s2_ids, label]`` and
        the parallel list of genre ids. Tokens missing from the module-level
        ``token2id`` become ``UNK_IDX``.
    """
    def _encode(sentence):
        return [token2id.get(word, UNK_IDX) for word in sentence]

    encoded_rows, genre_ids = [], []
    for row in tokens_data:
        encoded_rows.append([_encode(row[0]), _encode(row[1]), row[2]])
        genre_ids.append(row[3])
    return encoded_rows, genre_ids

In [None]:
# Rebuild the model with the same hyper-parameters as the training cell so that the
# saved state dict fits, then load the trained weights.
encoder = "CNN"
# --------#
if encoder == "RNN":
    same_encoder = False
    learning_rate = 3e-3
    hidden_size = 64
    kernel_size = 3
    fc_hidden_size = 128
else:
    same_encoder = True
    learning_rate = 3e-4
    hidden_size = 256
    kernel_size = 3
    fc_hidden_size = 192
encoder_dropout = 0
fc_dropout = False
drop_out_prob = 0
emb_requires_grad = False
train_oov = True
#--------#
model = Model(emb_size=300, hidden_size=hidden_size, num_layers=1, num_classes=3, vocab_size=len(id2token), kernel_size=kernel_size,
              fc_hidden_size=fc_hidden_size, fc_dropout=fc_dropout, drop_out_prob=drop_out_prob, train_oov=train_oov, 
              encoder=encoder, same_encoder=same_encoder, encoder_dropout=encoder_dropout)
# NOTE(review): train() builds its checkpoint name from `encoder` and its `label`
# argument; confirm that the file written by the training cell has exactly this name.
model.load_state_dict(torch.load('model' + encoder + '.ckpt'))
model = model.cuda()

In [None]:
# Re-create the validation loader with one example per batch so that individual
# predictions can be inspected. This rebinds the notebook-level `val_loader`.
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=1,
                                           collate_fn=_collate_func,
                                           shuffle=False)
correct_cnt, wrong_cnt = 0, 0
# Up to three correctly and three wrongly classified examples, each stored as
# (first-sentence indices, second-sentence indices, predicted class).
correct_list = []
wrong_list = []
# Per-example predicted and gold classes, in loader order.
prediction = []
label = []
model.eval()
for data, lengths, ind_sort, ind_unsort, labels in val_loader:
    outputs = F.softmax(model(data, lengths, ind_sort, ind_unsort), dim=1)
    predicted = outputs.max(1, keepdim=True)[1]
    prediction.append(predicted.data[0])
    label.append(labels.data[0])
    
    # With a batch of one, `correct` is 1 for a hit and 0 for a miss.
    correct = predicted.eq(labels.view_as(predicted)).sum().item()
    if (correct == 1) and (correct_cnt < 3):
        correct_cnt  += 1
        correct_list.append((data[0].data[0], data[1].data[0], predicted.data[0]))
    if (correct == 0) and (wrong_cnt < 3):
        wrong_cnt += 1
        wrong_list.append((data[0].data[0], data[1].data[0], predicted.data[0]))

In [None]:
def _decode(indices):
    """Join the tokens of one padded index row, skipping padding (index 0)."""
    return " ".join([id2token[i] for i in indices if i > 0])

# Print the collected examples as LaTeX list items. The backslash is escaped
# explicitly: "\i" is not a valid escape sequence and newer Python versions warn about it.
for ex in correct_list:
    print("\\item label:{}\n Premise: {}\n Hypothesis: {}".format(
        idx2lab[ex[2]], _decode(ex[0]), _decode(ex[1])))
print("-" * 20)
for ex in wrong_list:
    print("\\item predicted label:{}\n Premise: {}\n Hypothesis: {}".format(
        idx2lab[ex[2]], _decode(ex[0]), _decode(ex[1])))

In [None]:
# Genre names; a genre's position in this list is its integer id. read_mnli_file reads
# this global, so this assignment has to run before the first call below.
idx2genre = ['fiction', 'government', 'slate', 'telephone', 'travel']
# From here on `val_data` holds the MNLI validation examples instead of the SNLI ones.
val_data, _ = read_mnli_file("mnli_val.tsv")
# Token ids come from the existing token2id vocabulary; `genres` is parallel to val_data.
val_data, genres = token2index_mnli_dataset(val_data)

In [None]:
# Accuracy of the currently loaded model on each MNLI genre, printed as
# "genre & accuracy" rows. `val_dataset` and `val_loader` are rebound on every iteration.
for genre in range(len(idx2genre)):
    val_dataset = SNLIDataset([val_data[i] for i in range(len(val_data)) if genres[i] == genre])
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=_collate_func,
                                               shuffle=False)
    print("{} & {:.3}".format(idx2genre[genre], test_model(val_loader, model)))

### Fine-tuning on each MNLI genre

In [None]:
# From here on `train_data` holds the MNLI training examples instead of the SNLI ones.
train_data, _ = read_mnli_file("mnli_train.tsv")
# `train_genres` is parallel to train_data and holds each example's genre id.
train_data, train_genres = token2index_mnli_dataset(train_data)

In [None]:
# Fine-tune one copy of the base model per genre. train() reads the globals
# train_loader, val_loader, learning_rate and model, so they are rebound before each call.
for genre in range(len(idx2genre)):
    print(idx2genre[genre])
    train_dataset = SNLIDataset([train_data[i] for i in range(len(train_data)) if train_genres[i] == genre])
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=_collate_func,
                                               shuffle=True)
    val_dataset = SNLIDataset([val_data[i] for i in range(len(val_data)) if genres[i] == genre])
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=_collate_func,
                                               shuffle=False)
    # Learning rate used for fine-tuning (the initial training cell sets a different one).
    learning_rate = 7e-5
    # Restart from the base checkpoint so the genres do not build on each other.
    model.load_state_dict(torch.load('model' + encoder + '.ckpt'))
    model = model.cuda()
    # The genre name is passed as the label, which train() uses in the checkpoint name;
    # the returned curves are discarded.
    train(32, idx2genre[genre])

In [None]:
# Cross-genre table: one row per fine-tuned model, one column per evaluation genre,
# printed with " & " separators.
print(" & {}".format(" & ".join(idx2genre)))
for model_genre in range(len(idx2genre)):
    # Load the checkpoint written while fine-tuning on this genre.
    model.load_state_dict(torch.load('model' + encoder + "-" + idx2genre[model_genre] + '.ckpt'))
    model = model.cuda()
    accs = []
    for genre in range(len(idx2genre)):
        # The per-genre loader does not depend on model_genre; it is rebuilt for every row.
        val_dataset = SNLIDataset([val_data[i] for i in range(len(val_data)) if genres[i] == genre])
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=_collate_func,
                                                   shuffle=False)
        accs.append(test_model(val_loader, model))
    # Accuracies are cut to the first five characters of their string form.
    print("{} & {}".format(idx2genre[model_genre], " & ".join([str(x)[:5] for x in accs])))
    