In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import io
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import pdb

BATCH_SIZE = 32
VOCAB_SIZE = 50000

In [3]:
# nltk.download('punkt')

### Helper functions for each step in the pipeline

In [4]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    print(n, d)
    i = 0
    for line in fin:
        if i == VOCAB_SIZE:
            break
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(list(map(float, tokens[1:])))
        i += 1
    return data

In [5]:
from collections import Counter
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1
VOCAB_SIZE = 50000

def build_vocab():
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    word_vectors = pkl.load(open("fasttext_word_vectors.p", "rb"))
    id2token = list(word_vectors.keys())
    token2id = dict(zip(word_vectors, range(2,2+len(word_vectors)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return word_vectors, token2id, id2token

In [6]:
def convert_labels_to_integers(data_label):
    for i in range(len(data_label)):
        if data_label[i] == "contradiction":
            data_label[i] = 0
        elif data_label[i] == "entailment":
            data_label[i] = 1
        elif data_label[i] == "neutral":
            data_label[i] = 2
    return data_label

In [7]:
def verify_order(sent1_data, sent2_data, data_label):
    i = random.randint(1, len(sent1_data))
    print(sent1_data[i])
    print(sent2_data[i])
    print(data_label[i])

In [8]:
# Word tokenize each entry in a list of sentences
def tokenize(sentence_list):
    return [word_tokenize(sentence_list[i]) for i in range(len(sentence_list))]

In [9]:
# "one-hot encode": convert each token to id in vocabulary vector (token2id)
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data 

### Creating vocabulary & embedding matrix from FastText vectors

In [10]:
word_vectors, token2id, id2token = build_vocab()

In [11]:
_weights = np.array(list(word_vectors.values()))
pad_vec = np.zeros((1, 300))
unk_vec = np.random.randn(1, 300) * 0.01
pad_unk_vecs = np.vstack((pad_vec, unk_vec))
_WEIGHTS = np.vstack((pad_unk_vecs, _weights))
_WEIGHTS.shape

(50002, 300)

### Function to pre-process data for TwoSentenceModel
#### Shuffle, word tokenize, one-hot index into vocabulary

In [12]:
def data_pipeline(sent1s, sent2s, labels, verify=True):
    labels = convert_labels_to_integers(labels)
    seed = random.randint(1, 100)
    print("Random seed for shuffling: {}".format(seed))
    random.Random(seed).shuffle(sent1s)
    random.Random(seed).shuffle(sent2s)
    random.Random(seed).shuffle(labels)
    
    print("\nVerifying that the data and label match after shuffling")
    if verify:
        verify_order(sent1s, sent2s, labels)
        verify_order(sent1s, sent2s, labels)
          
    print("\nTokenizing sentence 1 list...")    
    sent1s_tokenized = tokenize(sent1s)
    print("done!")
    print("\nTokenizing sentence 2 list... ")  
    sent2s_tokenized = tokenize(sent2s)
    print("done!")
    
    print("\nOne-hot encoding words for sentence 1 list...")  
    sent1s_indices = token2index_dataset(sent1s_tokenized)
    print("done!")
    print("\nOne-hot encoding words for sentence 2 list...")  
    sent2s_indices = token2index_dataset(sent2s_tokenized)
    print("done!")
    
    return (sent1s_indices, sent2s_indices, labels)

### DataLoader

In [13]:
# MAX_SENTENCE_LENGTH = 30

import numpy as np
import torch
from torch.utils.data import Dataset

class TwoSentencesDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    """
    
    def __init__(self, sent1_data_list, sent2_data_list, target_list):
        """
        @param sent1_data_list: list of sentence1's (index matches sentence2's and target_list below)
        @param sent2_data_list: list of sentence2's
        @param target_list: list of correct labels

        """
        self.sent1_data_list = sent1_data_list
        self.sent2_data_list = sent2_data_list
        self.target_list = target_list
        assert (len(self.sent1_data_list) == len(self.target_list) and len(self.sent2_data_list) == len(self.target_list))

    def __len__(self):
        return len(self.sent1_data_list)
        
    def __getitem__(self, key):
        ###
        ### Returns [[sentence, 1, tokens], [sentence, 2, tokens]]
        ###
        """
        Triggered when you call dataset[i]
        """
        sent1_tokens_idx = self.sent1_data_list[key][:MAX_SENTENCE_LENGTH]
        sent2_tokens_idx = self.sent2_data_list[key][:MAX_SENTENCE_LENGTH]
        combined_tokens_idx = [sent1_tokens_idx, sent2_tokens_idx]
        label = self.target_list[key]
        return [combined_tokens_idx, len(sent1_tokens_idx), len(sent2_tokens_idx), label]

def twosentences_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    sent1_data_list = []
    sent2_data_list = []
    sent1_length_list = []
    sent2_length_list = []
    label_list = []
    combined_data_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[3])
        sent1_length_list.append(datum[1])
        sent2_length_list.append(datum[2])
    # padding
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0][0]), pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        padded_vec_2 = np.pad(np.array(datum[0][1]), pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])), 
                                mode="constant", constant_values=0)
        combined_data_list.append([padded_vec_1, padded_vec_2])
    return [torch.from_numpy(np.array(combined_data_list)), 
            torch.LongTensor(sent1_length_list), torch.LongTensor(sent2_length_list), torch.LongTensor(label_list)]

### Train Dataset creation

In [14]:
snli_train = pd.read_csv('snli_train.tsv', sep='\t')
TRAIN_SIZE = 100000

sent1_data = list(snli_train["sentence1"])[:TRAIN_SIZE]
sent2_data = list(snli_train["sentence2"])[:TRAIN_SIZE]
data_label = list(snli_train["label"])[:TRAIN_SIZE]
print("Size of training data: {}".format(len(sent1_data)))

Size of training data: 100000


In [15]:
sent1_train_indices, sent2_train_indices, train_label = data_pipeline(sent1_data, sent2_data, data_label)
train_dataset = TwoSentencesDataset(sent1_train_indices, sent2_train_indices, train_label)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=twosentences_collate_func,
                                           #shuffle=True
                                          )
print("Finished creating train_loader.")

Random seed for shuffling: 49

Verifying that the data and label match after shuffling
A little boy is sitting on a dock and with a fishing pole and he is fishing at a lake .
The little boy is out on a boat .
0
A man walks down an empty street noticing objects in the road before him .
a man noticing objects in the road
2

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for sentence 2 list...
done!
Finished creating train_loader.


### Val Dataset creation

In [16]:
snli_val = pd.read_csv('snli_val.tsv', sep='\t')
sent1_val = list(snli_val["sentence1"])
sent2_val = list(snli_val["sentence2"])
val_label = list(snli_val["label"])
print("Size of val data: {}".format(len(sent1_val)))

Size of val data: 1000


In [17]:
sent1_val_indices, sent2_val_indices, val_label = data_pipeline(sent1_val, sent2_val, val_label)
val_dataset = TwoSentencesDataset(sent1_val_indices, sent2_val_indices, val_label)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=twosentences_collate_func,
                                           #shuffle=True
                                          )

Random seed for shuffling: 66

Verifying that the data and label match after shuffling
A kid in a red and black coat is laying on his back in the snow with his arm in the air and a red sled is next to him .
The kid is on a sugar high .
0
A live band on a lawn jamming out .
A live band on a lawn jamming out for the holiday crowd .
2

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for sentence 2 list...
done!


In [18]:
pd.Series([len(x) for x in snli_train['sentence1']]).describe()['75%']
MAX_SENTENCE_LENGTH = 40

### CNN Model

In [22]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(_WEIGHTS), freeze=True)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)
        self.maxpool = nn.MaxPool1d(30)
        self.linear1 = nn.Linear(2*hidden_size, 100)
        self.linear2 = nn.Linear(100, num_classes)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x, sent1_lengths, sent2_lengths):
        
        batch_size = x.size()[0]
        seq_len = x.size()[2]
        
        sent1s = torch.tensor(x[:, 0, :]).cuda()
        sent2s = torch.tensor(x[:, 1, :]).cuda()
        ordered_sents = torch.cat([sent1s, sent2s], dim=0).cuda()

        embed = self.embedding(ordered_sents)
        hidden = self.conv1(embed.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(2*batch_size, seq_len, hidden.size(-1))

        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(2*batch_size, seq_len, hidden.size(-1))
        hidden = self.maxpool(hidden.transpose(1, 2)).transpose(1, 2).squeeze(dim=1)
        
        hidden_sent1s = hidden[0:batch_size, :]
        hidden_sent2s = hidden[batch_size:, :]     
        
        linear1 = self.linear1(torch.cat([hidden_sent1s, hidden_sent2s], dim=1))
#         linear1 = self.linear1(torch.tensor(hidden_sent1s) + torch.tensor(hidden_sent2s))
#         linear1 = self.linear1(torch.tensor(hidden_sent1s)*torch.tensor(hidden_sent2s))
        linear1 = F.relu(linear1.contiguous().view(-1, linear1.size(-1))).view(linear1.shape)
#         linear1 = self.dropout(linear1)
        logits = self.linear2(linear1)
        
        return logits

In [43]:
a = torch.randn(3, 4)
b = torch.randn(3, 4)
print(a)
print(b)
print(a*b)

tensor([[ 1.1231, -0.9311, -1.8957, -0.5239],
        [-0.4075, -0.4562,  2.4520,  1.0612],
        [ 1.9883,  0.8115, -0.7578,  0.2037]])
tensor([[-0.7995,  2.3921, -0.5130, -0.9673],
        [ 0.6436,  0.8357, -1.9722,  0.5505],
        [ 2.8373, -0.2573,  0.7888, -0.6565]])
tensor([[-0.8980, -2.2274,  0.9724,  0.5067],
        [-0.2623, -0.3813, -4.8358,  0.5841],
        [ 5.6414, -0.2088, -0.5977, -0.1338]])


In [23]:
# Function for testing the model
def test_model(loader, model):
    """
    Helper function that tests the model's performance on a dataset
    """
    correct = 0
    total = 0
    model.eval()
    for (data, sent1_lengths, sent2_lengths, labels) in loader:
        data_batch, sent1_length_batch, sent2_length_batch, label_batch = data.cuda(), sent1_lengths.cuda(), sent2_lengths.cuda(), labels.cuda()
        outputs = F.softmax(model(data_batch, sent1_length_batch, sent2_length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        labels = labels.cuda()
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def train_model(model, lr = 0.001, num_epochs = 7, criterion = nn.CrossEntropyLoss()):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr) 
    max_val_acc = 0
    for epoch in range(num_epochs):
        for i, (data, sent1_lengths, sent2_lengths, labels) in enumerate(train_loader):
            model.train()
            data_batch, sent1_length_batch, sent2_length_batch, label_batch = data.cuda(), sent1_lengths.cuda(), sent2_lengths.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model(data_batch, sent1_length_batch, sent2_length_batch)
            loss = criterion(outputs, label_batch)
            
            loss.backward()
            optimizer.step()
            # validate every 100 iterations
            if i > 0 and i % 100 == 0:
                # validate
                val_acc = test_model(val_loader, model)
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc))
                print('Epoch: [{}/{}], Step: [{}/{}], Training Loss: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), loss))
                
    print("Max Validation Accuracy: {}".format(max_val_acc))
    return max_val_acc

In [24]:
model = CNN(emb_size = 300, hidden_size=300, num_layers=1, num_classes=3).cuda()
train_model(model)

Epoch: [1/7], Step: [101/3125], Validation Acc: 41.2
Epoch: [1/7], Step: [101/3125], Training Loss: 1.1052814722061157
Epoch: [1/7], Step: [201/3125], Validation Acc: 49.0
Epoch: [1/7], Step: [201/3125], Training Loss: 1.0256174802780151
Epoch: [1/7], Step: [301/3125], Validation Acc: 51.3
Epoch: [1/7], Step: [301/3125], Training Loss: 0.903774619102478
Epoch: [1/7], Step: [401/3125], Validation Acc: 53.3
Epoch: [1/7], Step: [401/3125], Training Loss: 0.9491991400718689
Epoch: [1/7], Step: [501/3125], Validation Acc: 55.0
Epoch: [1/7], Step: [501/3125], Training Loss: 1.0173335075378418
Epoch: [1/7], Step: [601/3125], Validation Acc: 53.6
Epoch: [1/7], Step: [601/3125], Training Loss: 1.0602761507034302
Epoch: [1/7], Step: [701/3125], Validation Acc: 58.9
Epoch: [1/7], Step: [701/3125], Training Loss: 1.2470897436141968
Epoch: [1/7], Step: [801/3125], Validation Acc: 57.9
Epoch: [1/7], Step: [801/3125], Training Loss: 0.8171583414077759
Epoch: [1/7], Step: [901/3125], Validation Acc: 6

Epoch: [3/7], Step: [801/3125], Validation Acc: 64.4
Epoch: [3/7], Step: [801/3125], Training Loss: 0.7579377293586731
Epoch: [3/7], Step: [901/3125], Validation Acc: 65.4
Epoch: [3/7], Step: [901/3125], Training Loss: 0.6923515796661377
Epoch: [3/7], Step: [1001/3125], Validation Acc: 64.2
Epoch: [3/7], Step: [1001/3125], Training Loss: 0.6575340032577515
Epoch: [3/7], Step: [1101/3125], Validation Acc: 62.7
Epoch: [3/7], Step: [1101/3125], Training Loss: 0.7164822816848755
Epoch: [3/7], Step: [1201/3125], Validation Acc: 64.8
Epoch: [3/7], Step: [1201/3125], Training Loss: 0.9904353022575378
Epoch: [3/7], Step: [1301/3125], Validation Acc: 64.9
Epoch: [3/7], Step: [1301/3125], Training Loss: 0.8374388813972473
Epoch: [3/7], Step: [1401/3125], Validation Acc: 64.5
Epoch: [3/7], Step: [1401/3125], Training Loss: 0.8519344329833984
Epoch: [3/7], Step: [1501/3125], Validation Acc: 63.9
Epoch: [3/7], Step: [1501/3125], Training Loss: 0.8755441904067993
Epoch: [3/7], Step: [1601/3125], Val

Epoch: [5/7], Step: [1501/3125], Validation Acc: 63.9
Epoch: [5/7], Step: [1501/3125], Training Loss: 0.6899547576904297
Epoch: [5/7], Step: [1601/3125], Validation Acc: 63.9
Epoch: [5/7], Step: [1601/3125], Training Loss: 0.5691060423851013
Epoch: [5/7], Step: [1701/3125], Validation Acc: 65.2
Epoch: [5/7], Step: [1701/3125], Training Loss: 0.4655872583389282
Epoch: [5/7], Step: [1801/3125], Validation Acc: 65.3
Epoch: [5/7], Step: [1801/3125], Training Loss: 0.6965263485908508
Epoch: [5/7], Step: [1901/3125], Validation Acc: 66.6
Epoch: [5/7], Step: [1901/3125], Training Loss: 0.7459343075752258
Epoch: [5/7], Step: [2001/3125], Validation Acc: 65.4
Epoch: [5/7], Step: [2001/3125], Training Loss: 0.7615896463394165
Epoch: [5/7], Step: [2101/3125], Validation Acc: 65.4
Epoch: [5/7], Step: [2101/3125], Training Loss: 0.7232452630996704
Epoch: [5/7], Step: [2201/3125], Validation Acc: 65.1
Epoch: [5/7], Step: [2201/3125], Training Loss: 0.5541829466819763
Epoch: [5/7], Step: [2301/3125],

Epoch: [7/7], Step: [2201/3125], Validation Acc: 65.5
Epoch: [7/7], Step: [2201/3125], Training Loss: 0.5588942766189575
Epoch: [7/7], Step: [2301/3125], Validation Acc: 64.6
Epoch: [7/7], Step: [2301/3125], Training Loss: 0.9125675559043884
Epoch: [7/7], Step: [2401/3125], Validation Acc: 65.4
Epoch: [7/7], Step: [2401/3125], Training Loss: 0.6763523817062378
Epoch: [7/7], Step: [2501/3125], Validation Acc: 65.5
Epoch: [7/7], Step: [2501/3125], Training Loss: 0.5800447463989258
Epoch: [7/7], Step: [2601/3125], Validation Acc: 65.7
Epoch: [7/7], Step: [2601/3125], Training Loss: 0.37735041975975037
Epoch: [7/7], Step: [2701/3125], Validation Acc: 64.7
Epoch: [7/7], Step: [2701/3125], Training Loss: 0.596114456653595
Epoch: [7/7], Step: [2801/3125], Validation Acc: 64.4
Epoch: [7/7], Step: [2801/3125], Training Loss: 0.5707796216011047
Epoch: [7/7], Step: [2901/3125], Validation Acc: 65.7
Epoch: [7/7], Step: [2901/3125], Training Loss: 0.4598122537136078
Epoch: [7/7], Step: [3001/3125],

67.2

### Hyperparameter tuning

**Standard CNN** architecture and training hyperparameters: 2 Convolutional layers (each with kernel size 3) followed by 2 fully connected layers, hidden_size 300, 7 epochs, lr = 0.001. Used embedding layer with FastText vectors, with freeze = False. I've also included the results with freeze = True. 

| Hyperparameters changed                  | CNN (val accuracy) | RNN (val accuracy) |
|------------------------------------------|:------------------:|:------------------:|
| Standard, freeze=False                   |        67.5%       |        69.9%       |
| Standard, freeze=True                    |        67.2%       |        69.2%       |
| Hidden size: 400                         |        67.6%       |        70.0%       |
| Hidden size: 500                         |        67.1%       |        68.8%       |
| Dropout (p = 0.5)                        |        66.6%       |        68.9%       |
| Dropout (p = 0.2)                        |        67.2%       |        69.8%       |
| Interaction: sum                         |        62.2%       |        64.1%       |
| Interaction: element-wise multiplication |        65.8%       |        71.2%       |

### Evaluating on MNLI

In [5]:
mnli_val = pd.read_csv('mnli_val.tsv', sep='\t')

In [6]:
sentence_length_75 = pd.Series([len(x) for x in mnli_val['sentence1']]).describe()['75%']
print("75th percentile for sentence length (in characters): {}".format(sentence_length_75))
MAX_SENTENCE_LENGTH = 40

75th percentile for sentence length (in characters): 151.0


#### Build dictionary of (sent1, sent2, label) data, by genre.

In [9]:
mnli_val_dict = {}
for x in mnli_val['genre'].unique():
    filtered = mnli_val[mnli_val['genre'] == x]
    mnli_val_dict[x] = {}
    mnli_val_dict[x]["sent1s"] = list(filtered["sentence1"])
    mnli_val_dict[x]["sent2s"] = list(filtered["sentence2"])
    mnli_val_dict[x]["label"] = convert_labels_to_integers(list(filtered["label"]))

In [10]:
for x in mnli_val_dict.keys():
    print(len(mnli_val_dict[x]["sent1s"]))
    print(np.unique(mnli_val_dict[x]["label"]))

995
[0 1 2]
1005
[0 1 2]
1002
[0 1 2]
1016
[0 1 2]
982
[0 1 2]


In [16]:
# quick verify
verify_order(mnli_val_dict['travel']["sent1s"], mnli_val_dict['travel']["sent2s"], mnli_val_dict['travel']["label"])

To the south , the former fishing villages of Sorrento and Positano spill down the craggy cliffs of the serpentine Amalfi coast , justifiably tauted as one of the world 's most beautiful drives .
Sorrento used to be a fishing village .
1


In [41]:
# for each genre, build validation set and evaluate on it. 
cnn_results = {}
for genre in mnli_val_dict.keys():
    sent1_val_indices, sent2_val_indices, val_label = data_pipeline(mnli_val_dict[genre]["sent1s"], 
                                                                    mnli_val_dict[genre]["sent2s"], 
                                                                    mnli_val_dict[genre]["label"])
    val_dataset = TwoSentencesDataset(sent1_val_indices, sent2_val_indices, val_label)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                             batch_size=BATCH_SIZE, 
                                             collate_fn=twosentences_collate_func,
                                             #shuffle=True
                                             )
    cnn_results[genre] = test_model(val_loader, model)
    print("Genre {} has validation accuracy: {}".format(genre, cnn_results[genre]))
    

Random seed for shuffling: 51

Verifying that the data and label match after shuffling
Good sir , Jon began .
Jon addressed the man .
1
The day my deadline came , I got a business card .
The deadline to accept my promotion arrived and I got a business card with my new title .
2

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for sentence 2 list...
done!
Genre fiction has validation accuracy: 46.130653266331656
Random seed for shuffling: 29

Verifying that the data and label match after shuffling
really oh i thought it was great yeah
That was horrible
0
i think there would be an awful lot of resentment and um i i really do n't think it would be feasible on our country
The war would lead to a bunch of resentment among civilians .
2

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for 

### MNLI results

| genre      | CNN (val accuracy) | RNN (val accuracy) |
|------------|--------------------|--------------------|
| fiction    |       46.13%       |                    |
| telephone  |       45.97%       |                    |
| travel     |       48.17%       |                    |
| slate      |       44.61%       |                    |
| government |       51.38%       |                    |

In [None]:
###
### Ideas
###

# Dropout layers --> prob 0.5
# weight decay. 

#rnn --> layer normalize

# CNN masking
# do not backpropagate
# after conv, cresate tensor masked not update.
# right after regular linear layer
# 
# set all elements until padding to 1