In [0]:
from os.path import exists
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Environment bootstrap: builds the wheel platform tag, picks a CPU/GPU build,
# and installs a torch 0.4.0 wheel for that combination.
# The tag is "<impl><version>-<abi>" as reported by the wheel.pep425tags helpers.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Captures the output of the ldconfig/grep/sed pipeline (rewritten to "cuXY" form).
# NOTE(review): cuda_output is not referenced again in the visible cells.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# The 'cu80' build is chosen whenever /opt/bin/nvidia-smi exists, otherwise 'cpu'.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# Installs the torch 0.4.0 wheel matching {accelerator}/{platform}; torchvision is unpinned.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
import csv
import numpy as np
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
%matplotlib inline
import pickle as pkl

In [0]:
def data_preprocess(path, size):
    """
    Read a tab-separated sentence-pair file into four parallel lists.

    Each data row is expected to hold ``sentence1, sentence2, label, genre``.
    The header row (exactly those four column names) and blank rows are skipped.
    Sentences are whitespace-tokenised; labels are encoded as
    ``contradiction -> 0.0``, ``entailment -> 1.0``, ``neutral -> 2.0``.

    A row whose label is not one of those three prints ``'Error Label'`` and is
    dropped entirely, so the four returned lists always have equal length and
    stay index-aligned.

    @param path: path of the TSV file to read
    @param size: accepted for backward compatibility; it does not limit how
                 many rows are read
    @return: (premise, hypo, label, genre) - lists of token lists, token lists,
             float labels and genre strings
    """
    label_codes = {'contradiction': 0.0, 'entailment': 1.0, 'neutral': 2.0}
    header = ['sentence1', 'sentence2', 'label', 'genre']

    premise = []
    hypo = []
    label = []
    genre = []
    with open(path) as files:
        for row in csv.reader(files, delimiter='\t'):
            if not row or row == header:
                continue
            # Pull every field out first so a malformed row cannot leave the
            # output lists partially extended.
            sentence1, sentence2, tag, row_genre = row[0], row[1], row[2], row[3]
            if tag not in label_codes:
                print('Error Label')
                continue
            premise.append(sentence1.split())
            hypo.append(sentence2.split())
            label.append(label_codes[tag])
            genre.append(row_genre)
    return premise, hypo, label, genre

In [0]:
# Parse the MNLI train and validation TSVs into token lists, labels and genres.
# NOTE(review): the second argument is the `size` parameter of data_preprocess,
# which that function does not use to limit the number of rows.
(s1_train, s2_train, y_train, gen_train) = data_preprocess(
    '/content/drive/hw2_data/mnli_train.tsv',
    100000,
)
(s1_val, s2_val, y_val, gen_val) = data_preprocess(
    '/content/drive/hw2_data/mnli_val.tsv',
    1000,
)


In [42]:
# Distinct genre names present in the training split (displayed as the cell output).
unique_gen = {genre_name for genre_name in gen_train}
unique_gen

{'fiction', 'government', 'slate', 'telephone', 'travel'}

In [0]:
import io

def load_vectors(fname,size):
    """
    Load the first ``size`` word vectors from a text embedding file.

    The file's first line is a header ``"<count> <dim>"``; every following line
    is ``"<token> <v1> ... <v_dim>"`` separated by single spaces.

    Row 0 of the returned matrix is the all-zero ``<pad>`` vector and row 1 is
    the ``<unk>`` vector, drawn from ``np.random.rand`` after seeding NumPy's
    global generator with 1 (so it is reproducible, and the global seed is
    reset as a side effect). The k-th vector of the file is stored at row k+1.
    If the file holds fewer than ``size`` vectors the remaining rows stay zero.

    @param fname: path of the embedding file
    @param size: maximum number of vectors to read
    @return: (data, token2id, id2token) - a ``(size+2, dim)`` float array and
             the two lookup dictionaries
    """
    PAD_IDX = 0
    UNK_IDX = 1
    token2id = {'<pad>':PAD_IDX, '<unk>':UNK_IDX}
    id2token = {PAD_IDX:'<pad>', UNK_IDX:'<unk>'}

    # The context manager guarantees the file is closed, including on error.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Use the dimensionality declared by the file instead of assuming 300.
        _, dim = map(int, fin.readline().split())
        data = np.zeros((size+2, dim))

        np.random.seed(1)
        data[UNK_IDX] = np.random.rand(dim)

        # Rows 0 and 1 are reserved, so file vectors start at row 2.
        for row_idx, line in enumerate(fin, start=2):
            if row_idx > size + 1:
                break
            tokens = line.rstrip().split(' ')
            data[row_idx, :] = np.asarray(tokens[1:], dtype=float)
            token2id[tokens[0]] = row_idx
            id2token[row_idx] = tokens[0]

    return data, token2id, id2token

In [0]:
# Reserved vocabulary slots (padding, unknown token) and the number of
# pretrained vectors to load from the embedding file.
PAD_IDX, UNK_IDX = 0, 1
dictionary_size = 100000

vector_load, token2id, id2token = load_vectors(
    '/content/drive/hw2_data/wiki-news-300d-1M.vec',
    dictionary_size,
)

In [0]:

# Split the validation set into one (premise, hypothesis, label) triple of
# lists per genre.
s1_val_gove, s2_val_gove, y_val_gove = [], [], []
s1_val_tele, s2_val_tele, y_val_tele = [], [], []
s1_val_slate, s2_val_slate, y_val_slate = [], [], []
s1_val_trave, s2_val_trave, y_val_trave = [], [], []
s1_val_fict, s2_val_fict, y_val_fict = [], [], []

# Genre name -> the three lists that collect its examples.
_genre_buckets = {
    'fiction': (s1_val_fict, s2_val_fict, y_val_fict),
    'government': (s1_val_gove, s2_val_gove, y_val_gove),
    'telephone': (s1_val_tele, s2_val_tele, y_val_tele),
    'slate': (s1_val_slate, s2_val_slate, y_val_slate),
}
# Any genre not listed above is collected in the travel lists.
_travel_bucket = (s1_val_trave, s2_val_trave, y_val_trave)

for row_idx in range(len(s1_val)):
    s1_bucket, s2_bucket, y_bucket = _genre_buckets.get(gen_val[row_idx], _travel_bucket)
    s1_bucket.append(s1_val[row_idx])
    s2_bucket.append(s2_val[row_idx])
    y_bucket.append(y_val[row_idx])
    




In [0]:
import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    PyTorch-readable dataset of sentence pairs with one target per pair.
    Inherits from torch.utils.data.Dataset.
    """

    def __init__(self, data_list1,data_list2, target_list):
        """
        @param data_list1: list of token-index lists for the first sentence
        @param data_list2: list of token-index lists for the second sentence
        @param target_list: list of targets, one per sentence pair

        All three lists must have the same length.
        """
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        self.target_list = target_list
        n_targets = len(self.target_list)
        assert (len(self.data_list1) == n_targets)
        assert (len(self.data_list2) == n_targets)

    def __len__(self):
        """Number of sentence pairs in the dataset."""
        return len(self.data_list1)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        Each sentence is truncated to the module-level MAX_SENTENCE_LENGTH_1 /
        MAX_SENTENCE_LENGTH_2 and returned as
        [tokens1, len(tokens1), tokens2, len(tokens2), target].
        """
        first = self.data_list1[key][:MAX_SENTENCE_LENGTH_1]
        second = self.data_list2[key][:MAX_SENTENCE_LENGTH_2]
        return [first, len(first), second, len(second), self.target_list[key]]
              

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that pads every sentence in the batch
    with zeros so that all data have the same length.

    First sentences are right-padded to MAX_SENTENCE_LENGTH_1 and second
    sentences to MAX_SENTENCE_LENGTH_2 (module-level constants).

    @param batch: list of items shaped like
                  [tokens1, len(tokens1), tokens2, len(tokens2), label]
    @return: [padded tokens1, lengths1, padded tokens2, lengths2, labels];
             both token tensors are int64, and the lengths and labels are
             built with torch.LongTensor
    """
    label_list = [datum[4] for datum in batch]
    length_list1 = [datum[1] for datum in batch]
    length_list2 = [datum[3] for datum in batch]

    # padding
    # The dtype is forced to int64: an empty token list would otherwise become
    # a float64 array and promote the whole batch to a float tensor, which is
    # not usable as indices.
    data_list1 = [
        np.pad(np.asarray(datum[0], dtype=np.int64),
               pad_width=((0, MAX_SENTENCE_LENGTH_1 - datum[1])),
               mode="constant", constant_values=0)
        for datum in batch
    ]
    data_list2 = [
        np.pad(np.asarray(datum[2], dtype=np.int64),
               pad_width=((0, MAX_SENTENCE_LENGTH_2 - datum[3])),
               mode="constant", constant_values=0)
        for datum in batch
    ]

    return [torch.from_numpy(np.array(data_list1, dtype=np.int64)), torch.LongTensor(length_list1),
            torch.from_numpy(np.array(data_list2, dtype=np.int64)), torch.LongTensor(length_list2),
            torch.LongTensor(label_list)]
  
# Batch size shared by every DataLoader in this notebook.
BATCH_SIZE = 32

# NOTE(review): s1_train_indices, s2_train_indices, s1_val_indices and
# s2_val_indices are not assigned in any visible cell, and the dataset/collate
# code reads MAX_SENTENCE_LENGTH_1 / MAX_SENTENCE_LENGTH_2, which are not
# defined in the visible cells either - confirm they exist before running on
# a fresh kernel.
train_dataset = NewsGroupDataset(s1_train_indices, s2_train_indices, y_train)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=newsgroup_collate_func,
    shuffle=True,
)

val_dataset = NewsGroupDataset(s1_val_indices, s2_val_indices, y_val)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=newsgroup_collate_func,
    shuffle=True,
)




In [8]:
# Restore the two saved models from disk.
# NOTE(review): torch.load unpickles the file contents - only load checkpoints
# from a trusted source.
model_rnn, model_cnn = [
    torch.load(checkpoint)
    for checkpoint in ('RNN_model4 .pth', 'CNN_model3.pth')
]

1

In [0]:
def token2index_dataset(tokens_data):
    """
    Map every token of every sentence to its vocabulary index.

    Tokens missing from the module-level ``token2id`` mapping are replaced by
    ``UNK_IDX``.

    @param tokens_data: iterable of token lists
    @return: list of index lists, one per input sentence, in the same order
    """
    return [
        [token2id.get(token, UNK_IDX) for token in sentence]
        for sentence in tokens_data
    ]


In [0]:
# Convert each genre's premise / hypothesis token lists to vocabulary indices.
s1_val_gove_indices, s2_val_gove_indices = map(token2index_dataset, (s1_val_gove, s2_val_gove))
s1_val_tele_indices, s2_val_tele_indices = map(token2index_dataset, (s1_val_tele, s2_val_tele))
s1_val_trave_indices, s2_val_trave_indices = map(token2index_dataset, (s1_val_trave, s2_val_trave))
s1_val_fict_indices, s2_val_fict_indices = map(token2index_dataset, (s1_val_fict, s2_val_fict))
s1_val_slate_indices, s2_val_slate_indices = map(token2index_dataset, (s1_val_slate, s2_val_slate))



In [0]:
def test_model(loader, model):
    model.eval()
    cor = 0
    cnt = 0


    for s1, s1_l,  s2, s2_1, labels in loader:
        s1_batch1, batch1_1, s2_batch2, batch2_1, label_batch = s1, s1_l, s2, s2_1, labels
        
        
        outputs = F.softmax(model(s1_batch1, batch1_1, s2_batch2, batch2_1), dim=1)
        predict = outputs.max(1, keepdim=True)[1]

        cnt += labels.size(0)
        cor += predicted.eq(labels.view_as(predict)).sum().item()
        accu = cor / cnt * 100
        
        
    return (accu)

In [35]:
# Government-genre validation accuracy of both saved models.
# Uses the dataset class and collate function defined in this notebook.
val_dataset_gove = NewsGroupDataset(s1_val_gove_indices, s2_val_gove_indices, y_val_gove)
val_loader_gove = torch.utils.data.DataLoader(dataset=val_dataset_gove,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

# Score this cell's own loader (government).
print("CNN Val Acuu: {} ; RNN Val Acuu:{} ".format(test_model(val_loader_gove, model_cnn),
                                                   test_model(val_loader_gove, model_rnn)))
      

CNN Val Acuu: 41.1 ; RNN Val Acuu: 44.2 


In [36]:
# Travel-genre validation accuracy of both saved models.
# Uses the dataset class and collate function defined in this notebook.
val_dataset_trave = NewsGroupDataset(s1_val_trave_indices, s2_val_trave_indices, y_val_trave)
val_loader_trave = torch.utils.data.DataLoader(dataset=val_dataset_trave,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

# Score this cell's own loader (travel).
print("CNN Val Acuu:{} ; RNN Val Acuu: {} ".format(test_model(val_loader_trave, model_cnn),
                                                   test_model(val_loader_trave, model_rnn)))

CNN Val Acuu: 40.6 ; RNN Val Acuu: 42.3 


In [37]:
# Telephone-genre validation accuracy of both saved models.
# Uses the dataset class and collate function defined in this notebook.
val_dataset_tele = NewsGroupDataset(s1_val_tele_indices, s2_val_tele_indices, y_val_tele)
val_loader_tele = torch.utils.data.DataLoader(dataset=val_dataset_tele,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

# Score this cell's own loader (telephone).
print("CNN Val Acuu: {} ; RNN Val Acuu: {} ".format(test_model(val_loader_tele, model_cnn),
                                                    test_model(val_loader_tele, model_rnn)))

CNN Val Acuu: 40.3 ; RNN Val Acuu: 45.3 


In [38]:
# Slate-genre validation accuracy of both saved models.
# Uses the dataset class and collate function defined in this notebook, and
# the slate label list built by the per-genre split.
val_dataset_slate = NewsGroupDataset(s1_val_slate_indices, s2_val_slate_indices, y_val_slate)
val_loader_slate = torch.utils.data.DataLoader(dataset=val_dataset_slate,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

# Score this cell's own loader (slate).
print("CNN Val Acuu: {} ; RNN Val Acuu: {} ".format(test_model(val_loader_slate, model_cnn),
                                                    test_model(val_loader_slate, model_rnn)))

CNN Val Acuu: 42.2 ; RNN Val Acuu: 46.2 


In [39]:
# Fiction-genre validation accuracy of both saved models.
# Uses the dataset class and collate function defined in this notebook.
val_dataset_fict = NewsGroupDataset(s1_val_fict_indices, s2_val_fict_indices, y_val_fict)
# Bound to its own name so the government loader is not overwritten.
val_loader_fict = torch.utils.data.DataLoader(dataset=val_dataset_fict,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)

print("CNN Val Acuu: {} ; RNN Val Acuu: {} ".format(test_model(val_loader_fict, model_cnn),
                                                    test_model(val_loader_fict, model_rnn)))

CNN Val Acuu: 42.1 ; RNN Val Acuu: 45.2 
