In [None]:
#Load Modules
import numpy as np
import csv
from collections import Counter
import io
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

# Load Data
# Class names; each one maps to its position in this list
labels = ['contradiction', 'entailment', 'neutral']
label_dict = {name: position for position, name in enumerate(labels)}


def load_data(file_path, label_map=None):
    """
    Read a tab-separated sentence-pair file into tokenized pairs and numeric targets.

    The first row is treated as a header and skipped. In every remaining row,
    column 0 and column 1 hold the two sentences (split on whitespace) and
    column 2 holds the label name.

    @param file_path: path of the TSV file to read
    @param label_map: optional mapping from label name to class index;
        defaults to the module-level `label_dict`
    @return: tuple `(pairs, target)` where `pairs` is a list of
        `(sent1_tokens, sent2_tokens)` tuples and `target` is a list of class
        indices stored as floats, aligned one-to-one with `pairs`
    @raise KeyError: if a label is not present in the label mapping
    """
    if label_map is None:
        label_map = label_dict

    pairs = []
    target = []

    # newline="" is what the csv module expects from the file object it reads
    with open(file_path, encoding="utf-8", newline="") as file:
        rows = csv.reader(file, delimiter="\t")
        next(rows, None)  # skips the header row; tolerates an empty file
        for row in rows:
            if not row:
                continue  # blank line
            label = row[2].strip()
            if not label:
                # A row without a label is dropped as a whole, so sentences
                # and targets can never get out of step with each other.
                continue
            pairs.append((row[0].split(), row[1].split()))
            target.append(float(label_map[label]))

    return pairs, target


# Each split is the (pairs, target) tuple returned by load_data
snli_train = load_data("snli_train.tsv")
snli_val = load_data("snli_val.tsv")
mnli_train = load_data("mnli_train.tsv")
mnli_val = load_data("mnli_val.tsv")

# Check Number of Samples
# Element [0] of a split is its list of sentence pairs, so its length is the sample count
print("The number of samples in snli_train is {:,d}".format(len(snli_train[0])))
print("The number of samples in snli_val is {:,d}".format(len(snli_val[0])))
print("The number of samples in mnli_train is {:,d}".format(len(mnli_train[0])))
print("The number of samples in mnli_val is {:,d}".format(len(mnli_val[0])))

# Create Data Dictionary
# Reserved vocabulary indices: padding comes first, the unknown token second
PAD_IDX, UNK_IDX = 0, 1


# Load Pre-trained Word Vectors
def load_embeddings(word2vec, word2id, embedding_dim):
    """
    Build the embedding matrix for a vocabulary from pre-trained word vectors.

    @param word2vec: mapping from word to its pre-trained vector
    @param word2id: mapping from word to its row index in the matrix
    @param embedding_dim: length of every embedding vector
    @return: numpy array of shape `(len(word2id), embedding_dim)`; the row of
        a word missing from `word2vec` is drawn from a normal distribution
        with standard deviation 0.6
    """
    embeddings = np.zeros((len(word2id), embedding_dim))
    for word, index in word2id.items():
        try:
            vector = word2vec[word]
        except KeyError:
            # No pre-trained vector for this word: random initialisation,
            # sized by `embedding_dim` instead of a hard-coded 300.
            vector = np.random.normal(scale=0.6, size=(embedding_dim,))
        embeddings[index] = vector

    return embeddings


def load_vectors(fname, num_vecs=None):
    """
    Read word vectors from a text file whose first line is a `count dim` header.

    Every following line holds a token followed by its vector components,
    separated by single spaces.

    @param fname: path of the vectors file
    @param num_vecs: maximum number of vectors to read; `None` reads them all
    @return: dict mapping each token to its vector as a list of floats
    @raise ValueError: if the first line is not two integers
    """
    data = {}
    # The context manager closes the file even when parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not used, but parsing them fails fast on a
        # file that does not start with a `count dim` line.
        _count, _dim = map(int, fin.readline().split())
        for line in fin:
            # Check the cap before parsing so that num_vecs=0 yields nothing.
            if num_vecs is not None and len(data) >= num_vecs:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))

    return data


# NOTE(review): absolute path to a removable volume; point this at a configurable data directory before sharing.
# The second argument caps the number of vectors read at 50000.
word_vectors = load_vectors('/Volumes/Samsung USB/Fall 2018/wiki-news-300d-1M.vec',50000)

print("Total number of words embedded is {:,d}".format(len(word_vectors)))


def data_dictionary(tokens, vocab_size_limit):
    """
    Build the token <-> id lookup tables for the most frequent tokens.

    @param tokens: iterable of tokens (repeated tokens count towards frequency)
    @param vocab_size_limit: maximum number of distinct tokens to keep
    @return: tuple `(token2id, id2token)`; `id2token` starts with '<pad>' and
        '<unk>', regular tokens receive ids from 2 upwards, and `token2id`
        maps '<pad>' to PAD_IDX and '<unk>' to UNK_IDX
    """
    token_counter = Counter(tokens)
    # Building the list directly (instead of unpacking zip(*...)) also
    # handles an empty token stream.
    vocab = [token for token, _ in token_counter.most_common(vocab_size_limit)]

    id2token = ['<pad>', '<unk>'] + vocab
    token2id = dict(zip(vocab, range(2, 2 + len(vocab))))
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token


# Index every loaded word vector (same 50000 cap as used when reading the vectors)
token2id, id2token = data_dictionary(list(word_vectors), 50000)

# The reported size also counts the <unk> and <pad> entries
print("Total number of words in token2id is {:,d}".format(len(token2id)))


def token2index_dataset(tokens_data):
    """
    Convert tokenized sentences into lists of vocabulary ids.

    Looks tokens up in the module-level `token2id`; a token that is not in
    the vocabulary becomes `UNK_IDX`.

    @param tokens_data: iterable of token lists
    @return: list with one list of ids per input sentence
    """
    return [
        [token2id.get(token, UNK_IDX) for token in sentence]
        for sentence in tokens_data
    ]

# Embedding matrix with one 300-dimensional row per token2id entry
weights = load_embeddings(word_vectors, token2id, 300)
weights.shape

# Longest sentence, on either side of a pair, in the SNLI training split
MAX_SENT_LENGTH = max(len(sentence) for pair in snli_train[0] for sentence in pair)
BATCH_SIZE = 32


class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data, word2id):
        """
        @param data: tuple `(data_list, target_list)`; `data_list` holds
            `(sentence1_tokens, sentence2_tokens)` pairs and `target_list`
            holds one label per pair
        @param word2id: mapping from token to vocabulary index
        """
        self.data_list, self.target_list = data
        assert (len(self.data_list) == len(self.target_list))
        self.word2id = word2id

    def __len__(self):
        return len(self.data_list)

    def _to_indices(self, tokens):
        """Map tokens to ids, truncated to MAX_SENT_LENGTH; unknown tokens become UNK_IDX."""
        return [self.word2id.get(w, UNK_IDX) for w in tokens[:MAX_SENT_LENGTH]]

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: `[sent_idx1, len(sent_idx1), sent_idx2, len(sent_idx2), label]`
            where sentence 1 is the first element of the stored pair and
            sentence 2 the second
        """
        # Element 0 of the pair is sentence 1 and element 1 is sentence 2;
        # the two used to be swapped here.
        sent_idx1 = self._to_indices(self.data_list[key][0])
        sent_idx2 = self._to_indices(self.data_list[key][1])
        label = self.target_list[key]
        return [sent_idx1, len(sent_idx1), sent_idx2, len(sent_idx2), label]


def vocab_collate_func(batch):
    """
    Customized function for DataLoader that pads every sentence of the batch
    to MAX_SENT_LENGTH and orders the examples by decreasing length of their
    first sentence.

    @param batch: list of `[sent1_ids, len1, sent2_ids, len2, label]` items
    @return: list `[sent1, length1, sent2, length2, label]` of tensors; row k
        of every tensor belongs to the same example
    """
    def pad(indices, length):
        # The explicit integer dtype keeps an empty sentence from turning
        # into a float array; 0 is the padding index.
        return np.pad(np.array(indices, dtype=np.int64),
                      pad_width=(0, MAX_SENT_LENGTH - length),
                      mode="constant", constant_values=0)

    sent1_list = np.array([pad(datum[0], datum[1]) for datum in batch])
    length1_list = np.array([datum[1] for datum in batch], dtype=np.int64)
    sent2_list = np.array([pad(datum[2], datum[3]) for datum in batch])
    length2_list = np.array([datum[3] for datum in batch], dtype=np.int64)
    label_list = np.array([datum[4] for datum in batch], dtype=np.int64)

    # The same permutation is applied to all five arrays. Reordering only the
    # first sentence and the labels would pair every second sentence with the
    # wrong example.
    ind_dec_order = np.argsort(length1_list)[::-1]
    return [torch.from_numpy(sent1_list[ind_dec_order]),
            torch.from_numpy(length1_list[ind_dec_order]),
            torch.from_numpy(sent2_list[ind_dec_order]),
            torch.from_numpy(length2_list[ind_dec_order]),
            torch.from_numpy(label_list[ind_dec_order])]

# Build train and validation dataloaders

train_dataset = VocabDataset(snli_train, token2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(snli_val, token2id)
# Validation batches keep a fixed order: the metrics do not depend on it, and
# positional look-ups into collected validation batches stay reproducible.
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=BATCH_SIZE,
                                         collate_fn=vocab_collate_func,
                                         shuffle=False)


class CNN(nn.Module):
    """
    Sentence-pair classifier: both sentences go through the same embedding and
    the same two Conv1d + ReLU layers; the per-position features of both
    sentences are summed and fed to a linear layer.
    """

    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        """
        @param emb_size: embedding dimension (input channels of the first convolution)
        @param hidden_size: number of channels produced by each convolution
        @param num_layers: stored on the module; the network always has two convolutions
        @param num_classes: number of output classes
        @param vocab_size: not used; the embedding table is built from the
            module-level `weights` matrix
        """
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        # Embedding table initialised from the module-level `weights` array
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(weights).float())

        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear = nn.Linear(hidden_size, num_classes)

    def _encode(self, sent):
        """Embed a batch of index sequences and apply both convolution + ReLU layers."""
        embed = self.embedding(sent)
        # Conv1d works on (batch, channels, seq_len), hence the transposes.
        hidden = F.relu(self.conv1(embed.transpose(1, 2)))
        hidden = F.relu(self.conv2(hidden))
        return hidden.transpose(1, 2)

    def forward(self, sent1, length1, sent2, length2):
        """
        @param sent1: (batch, seq_len) index tensor of the first sentences
        @param length1: accepted for interface compatibility, not used
        @param sent2: (batch, seq_len) index tensor of the second sentences
        @param length2: accepted for interface compatibility, not used
        @return: (batch, num_classes) logits
        """
        hidden1 = self._encode(sent1)
        # The second sentence is encoded from its own embeddings. The second
        # convolution used to be fed the first sentence's features here,
        # which made the output independent of sent2.
        hidden2 = self._encode(sent2)

        # Concatenate along the sequence axis and sum over all positions
        hidden = torch.sum(torch.cat([hidden1, hidden2], dim=1), dim=1)
        logits = self.linear(hidden)

        return logits

def test_val_model(loader, model):
        """
        Help function that tests the model's performance on a dataset
        @param: loader - data loader for the dataset to test against
        """
        correct = 0
        total = 0
        test_loss = 0
        model.eval()
        for sent1, length1, sent2, length2, label in loader:
            sent1_batch, length1_batch, sent2_batch, length2_batch, label_batch = sent1, length1, sent2, length2, label
            outputs = F.softmax(model(sent1_batch, length1_batch, sent2_batch, length2_batch), dim=1)
            predicted = outputs.max(1, keepdim=True)[1]
            loss = criterion(outputs, label)
            test_loss += loss

            total += label.size(0)
            correct += predicted.eq(label.view_as(predicted)).sum().item()
        return (100 * correct / total), test_loss.item()

def test_train_model(loader, model):
        """
        Help function that tests the model's performance on a dataset
        @param: loader - data loader for the dataset to test against
        """
        correct = 0
        total = 0
        model.eval()
        for sent1, length1, sent2, length2, label in loader:
            sent1_batch, length1_batch, sent2_batch, length2_batch, label_batch = sent1, length1, sent2, length2, label
            outputs = F.softmax(model(sent1_batch, length1_batch, sent2_batch, length2_batch), dim=1)
            predicted = outputs.max(1, keepdim=True)[1]

            total += label.size(0)
            correct += predicted.eq(label.view_as(predicted)).sum().item()
        return (100 * correct / total)


#Save Data
# One entry is appended to each list at every validation checkpoint
v_acc = []
v_loss= []
t_acc = []
t_loss= []


model = CNN(emb_size=300, hidden_size=100, num_layers=2, num_classes=3, vocab_size=len(token2id))
test_loss = 0  # running sum of the training loss since the last checkpoint
val_loss = 0
learning_rate = .0001
num_epochs = 5  # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
steps_since_report = 0  # number of batches accumulated in test_loss

for epoch in range(num_epochs):
    for i, (sent1, length1, sent2, length2, label) in enumerate(train_loader):
        # The evaluation helpers switch the model to eval mode, so training
        # mode is restored at every step.
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(sent1, length1, sent2, length2)
        loss = criterion(outputs, label)
        test_loss += loss.item()
        steps_since_report += 1

        # Backward and optimize
        loss.backward()
        optimizer.step()

        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_val_model(val_loader, model)
            train_acc = test_train_model(train_loader, model)
            # Each sum is averaged over the batches it actually covers. The
            # step index i is neither the number of validation batches nor
            # the number of training batches since the last checkpoint.
            mean_val_loss = val_acc[1] / max(len(val_loader), 1)
            mean_train_loss = test_loss / steps_since_report
            # NOTE: the "Test" figures printed below are training-set metrics.
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Validation Loss: {} , Test Acc: {}, Test_Loss: {}'
                .format(epoch + 1, num_epochs, i + 1, len(train_loader), val_acc[0], mean_val_loss, train_acc,
                        mean_train_loss))
            v_acc.append(val_acc[0])
            v_loss.append(mean_val_loss)
            t_acc.append(train_acc)
            t_loss.append(mean_train_loss)
            test_loss = 0
            steps_since_report = 0




In [None]:
#Save Results
# One row per checkpoint; columns are v_acc, v_loss, t_acc, t_loss
p = np.array([v_acc, v_loss, t_acc, t_loss]).T
np.savetxt("CNN-100-weight_deca2y-Hidden.csv", p, delimiter=",")

In [None]:
#Figure out which questions the model got Wrong
# Reverse lookup: the sorted vocabulary ids are paired with id2token in order
id2word = dict(zip(sorted(token2id.values()), id2token))

def convert_to_word(index_array, dictionary):
    """
    Turn a sequence of vocabulary ids back into a space-separated string.

    @param index_array: iterable of ids
    @param dictionary: mapping from id to word
    @return: the words joined by single spaces; an empty string for an
        empty `index_array`
    @raise KeyError: if an id is missing from `dictionary`
    """
    # A single join after the look-ups also covers the empty case, which
    # used to fail because the result was only assigned inside the loop.
    return ' '.join(dictionary[index] for index in index_array)


# Collect, per validation batch, both index tensors and a mask telling which predictions were correct
sent_1=[]
sent_2= []
hold= []
# NOTE(review): correct, total and test_loss are reset here but not read again in this cell
correct = 0
total = 0
test_loss = 0
model.eval()
for sent1, length1, sent2, length2, label in val_loader:
    sent1_batch, length1_batch, sent2_batch, length2_batch, label_batch = sent1, length1, sent2, length2, label
    outputs = F.softmax(model(sent1_batch, length1_batch, sent2_batch, length2_batch), dim=1)
    # Index of the highest-probability class, kept as a column
    predicted = outputs.max(1, keepdim=True)[1]
    hold.append(predicted.eq(label.view_as(predicted)))
    sent_1.append(sent1)
    sent_2.append(sent2)

#Correct: first three positions of the second batch whose prediction matched the label
(hold[1] == 1).nonzero()[:3]

#Incorrect: first three positions of the second batch whose prediction did not match
(hold[1] == 0).nonzero()[:3]


#Convert to words
# NOTE(review): the row numbers below are hard-coded; presumably they were read off the two expressions above in an earlier run — confirm after re-running
a=convert_to_word((sent_1[1][1].numpy()).tolist(),id2word)
b=convert_to_word((sent_1[1][3].numpy()).tolist(),id2word)
c=convert_to_word((sent_1[1][9].numpy()).tolist(),id2word)
d=convert_to_word((sent_2[1][0].numpy()).tolist(),id2word)
e=convert_to_word((sent_2[1][1].numpy()).tolist(),id2word)
f=convert_to_word((sent_2[1][3].numpy()).tolist(),id2word)



In [None]:
#Test on the MNLI set, one genre at a time

genres = ['fiction', 'government', 'slate', 'telephone', 'travel']
labels = ['contradiction', 'entailment', 'neutral']

# Each class name maps to its position in `labels`
label_dict = {name: position for position, name in enumerate(labels)}

def load_data(file_path, genre=None, label_map=None):
    """
    Read a tab-separated sentence-pair file, optionally keeping a single genre.

    The first row is treated as a header and skipped. In every remaining row,
    column 0 and column 1 hold the two sentences (split on whitespace),
    column 2 the label name and column 3 the genre.

    @param file_path: path of the TSV file to read
    @param genre: keep only rows whose genre column equals this value; `None`
        keeps every row, so the one-argument call `load_data(path)` still works
        after this definition replaces the earlier one
    @param label_map: optional mapping from label name to class index;
        defaults to the module-level `label_dict`
    @return: tuple `(pairs, target)` where `pairs` is a list of
        `(sent1_tokens, sent2_tokens)` tuples and `target` is a list of class
        indices stored as floats, aligned one-to-one with `pairs`
    @raise KeyError: if a label is not present in the label mapping
    """
    if label_map is None:
        label_map = label_dict

    pairs = []
    target = []

    # newline="" is what the csv module expects from the file object it reads
    with open(file_path, encoding="utf-8", newline="") as file:
        rows = csv.reader(file, delimiter="\t")
        next(rows, None)  # skips the header row; tolerates an empty file
        for row in rows:
            if not row:
                continue  # blank line
            # A row without a genre column cannot match a requested genre
            if genre is not None and (len(row) < 4 or row[3] != genre):
                continue
            label = row[2].strip()
            if not label:
                # A row without a label is dropped as a whole, so sentences
                # and targets can never get out of step with each other.
                continue
            pairs.append((row[0].split(), row[1].split()))
            target.append(float(label_map[label]))

    return pairs, target

#Data
# One (pairs, target) dataset per genre, in the order of `genres`
datasets= [load_data("mnli_val.tsv",genre) for genre in genres]

# Validation accuracy per genre, in the same order
val_by_genre =[]

# NOTE(review): `loaded_model` is only created in a later cell of this notebook; that cell has to run first on a fresh kernel.
# NOTE(review): the loop rebinds the module-level val_dataset, val_loader and val_acc.
for data in datasets:
    val_dataset = VocabDataset(data, token2id)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)
    # Element [0] of the returned tuple is the accuracy
    val_acc = test_val_model(val_loader, loaded_model)[0]

    val_by_genre.append(val_acc)
    




In [None]:
import pandas as pd
# Two-column table (genre, validation accuracy), written to CSV without a header row
results=pd.DataFrame(np.column_stack((genres,val_by_genre)))
results.to_csv("CNN-MNLI.csv", header = False)

In [None]:
# Number of sentence pairs in the MNLI validation split
len(mnli_val[0])

In [None]:
# Persist the trained weights; the two literals concatenate to "RNN_bestmodel_states".
# NOTE(review): the file name says RNN although `model` is built from the CNN class.
torch.save(model.state_dict(), "RNN_best" + "model_states")

In [None]:
# Rebuild the architecture and restore the saved weights into it
loaded_model = CNN(emb_size=300, hidden_size=100, num_layers=2, num_classes=3, vocab_size=len(token2id))
loaded_model.load_state_dict(torch.load('RNN_bestmodel_states'))
# Switch the restored model to eval mode. This used to call model.eval(),
# which acted on the training-time `model` instead.
loaded_model.eval()
# Element [0] of the returned tuple is the accuracy
val_acc = test_val_model(val_loader, loaded_model)[0]

In [None]:
# Display the validation accuracy of the restored model
val_acc