# load SNLI data

In [8]:
import csv
import numpy as np
# Integer class id assigned to each NLI label string.
label_to_number={"contradiction":0, "entailment":1,  "neutral":2}

In [9]:
def snli_data_loader(filepath):
    """Load an SNLI-style TSV file into a list of tokenized examples.

    The first row of the file is treated as a header and skipped. Every
    other non-empty row must hold sentence 1, sentence 2 and the label name
    in its first three columns.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A list of ``[sentence1_tokens, sentence2_tokens, label_id]`` entries.
        The token lists are the whitespace-split sentences and ``label_id``
        is looked up in the module-level ``label_to_number`` mapping.

    Raises:
        KeyError: If a label is not a key of ``label_to_number``.
        IndexError: If a non-empty row has fewer than three columns.
    """
    data_loaded = []
    # newline="" is what the csv module asks for, and the explicit encoding
    # keeps decoding independent of the platform's locale default.
    with open(filepath, newline="", encoding="utf-8") as tsvfile:
        # NOTE(review): the reader keeps csv's default quote handling, so a
        # field that starts with '"' is parsed as a quoted field. Confirm
        # that this matches how the TSV files were written.
        reader = csv.reader(tsvfile, delimiter="\t")
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv yields [] for blank lines, e.g. a trailing newline.
                continue
            sentence1 = row[0].split()
            sentence2 = row[1].split()
            label = label_to_number[row[2]]
            data_loaded.append([sentence1, sentence2, label])

    return data_loaded

In [10]:
# Load the SNLI training and validation splits from their TSV files.
snli_data_train = snli_data_loader("./raw_data/snli_train.tsv")
snli_data_val = snli_data_loader("./raw_data/snli_val.tsv")
# Disabled: longest sentence length on each side. NOTE(review): `data` is not
# defined in the cells shown here, so these lines need a real variable first.
# max_length1=max ([len(instance[0]) for instance in data])
# max_length2=max ([len(instance[1]) for instance in data])

# Build Vocab

In [11]:
def build_vocab(filepath, words_to_load=100000, PAD_IDX = 0, UNK_IDX = 1, emb_dim=300):
    """Read word vectors from a text file and build the vocabulary tables.

    The first line of the file is skipped (presumably the ``<count> <dim>``
    header of a fastText ``.vec`` file). Each following line is expected to
    be ``token v1 v2 ... v<emb_dim>`` separated by single spaces.

    Args:
        filepath: Path of the UTF-8 encoded vector file.
        words_to_load: Maximum number of word lines to read.
        PAD_IDX: Id stored for the ``'<pad>'`` token.
        UNK_IDX: Id stored for the ``'<unk>'`` token; its embedding row is
            filled with random values.
        emb_dim: Number of vector components per word (default 300).

    Returns:
        A tuple ``(embeddings, token2id, id2token)``. ``embeddings`` is a
        float array of shape ``(words_to_load + 2, emb_dim)``; rows that are
        never filled (the pad row, and trailing rows when the file has fewer
        than ``words_to_load`` words) stay zero. ``token2id`` and
        ``id2token`` are the forward and reverse lookup dicts.

    Note:
        Word ids always start at 2, whatever ``PAD_IDX`` / ``UNK_IDX`` are,
        so non-default values other than 0 and 1 can collide with word ids.
        The function reseeds NumPy's global random generator with 1.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filepath, encoding="utf-8") as ft_vec:
        loaded_embeddings_ft = np.zeros((words_to_load + 2, emb_dim))
        # Fixed seed so the <unk> vector is identical on every run.
        np.random.seed(1)
        loaded_embeddings_ft[UNK_IDX] = np.random.rand(emb_dim)

        token2id = {'<pad>': PAD_IDX, '<unk>': UNK_IDX}
        id2token = {PAD_IDX: '<pad>', UNK_IDX: '<unk>'}

        # Skip the first line; the default avoids StopIteration on an
        # empty file.
        next(ft_vec, None)
        for i, line in enumerate(ft_vec):
            if i == words_to_load:
                break
            # Split from the right so the last emb_dim fields are always the
            # vector, even when the token itself contains whitespace.
            fields = line.rstrip().rsplit(" ", emb_dim)
            token = fields[0]
            index = i + 2
            loaded_embeddings_ft[index] = np.asarray(fields[1:], dtype=float)
            token2id[token] = index
            id2token[index] = token

    return loaded_embeddings_ft, token2id, id2token

In [12]:
# Read the word vectors and build the token <-> id lookup tables, using
# build_vocab's default limit of the first 100000 words.
loaded_embeddings_ft, token2id, id2token=build_vocab("./fasttext_vector/wiki-news-300d-1M.vec")

# index data

In [6]:
def token2index_dataset(snli_data, token2id, PAD_IDX = 0, UNK_IDX = 1):
    """Convert the token lists of every example into lists of vocabulary ids.

    Args:
        snli_data: Iterable of examples whose first two items are token
            sequences and whose third item is the label.
        token2id: Mapping from token to integer id.
        PAD_IDX: Accepted for signature compatibility; not used here.
        UNK_IDX: Id substituted for any token missing from ``token2id``.

    Returns:
        A new list of ``[sentence1_ids, sentence2_ids, label]`` entries in
        the same order as ``snli_data``; the input is left untouched.
    """
    def encode(tokens):
        # Out-of-vocabulary tokens fall back to the unknown id.
        return [token2id.get(tok, UNK_IDX) for tok in tokens]

    return [
        [encode(example[0]), encode(example[1]), example[2]]
        for example in snli_data
    ]

In [7]:
# Replace every token in both SNLI splits with its vocabulary id.
processed_snli_data_train=token2index_dataset(snli_data_train, token2id)
processed_snli_data_val=token2index_dataset(snli_data_val, token2id)

# Save data to files

In [8]:
import pickle as pkl
# Persist the indexed SNLI splits and the embedding matrix. Each file is
# opened in a `with` block so its handle is flushed and closed immediately
# instead of being left to the garbage collector.
with open("./data/processed_snli_data_train.p", "wb") as out_file:
    pkl.dump(processed_snli_data_train, out_file)
with open("./data/processed_snli_data_val.p", "wb") as out_file:
    pkl.dump(processed_snli_data_val, out_file)
with open("./data/loaded_embeddings_ft.p", "wb") as out_file:
    pkl.dump(loaded_embeddings_ft, out_file)

In [1]:
# Genre names used as keys when MNLI examples are grouped per genre.
Genres = ["fiction", "telephone", "slate", "government", "travel"]

In [2]:
import csv
def mnli_data_loader(filepath):
    """Load an MNLI-style TSV file and group its examples by genre.

    The first row of the file is treated as a header and skipped. Every
    other non-empty row must hold sentence 1, sentence 2, the label name
    and the genre in its first four columns.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A dict mapping genre name to a list of
        ``[sentence1_tokens, sentence2_tokens, label_id]`` entries. The keys
        "fiction", "telephone", "slate", "government" and "travel" are
        always present; any other genre found in the file gets its own key
        instead of raising ``KeyError``.

    Raises:
        KeyError: If a label is not a key of ``label_to_number``.
        IndexError: If a non-empty row has fewer than four columns.
    """
    data_loaded = {
        "fiction": [],
        "telephone": [],
        "slate": [],
        "government": [],
        "travel": [],
    }
    # newline="" is what the csv module asks for, and the explicit encoding
    # keeps decoding independent of the platform's locale default.
    with open(filepath, newline="", encoding="utf-8") as tsvfile:
        # NOTE(review): the reader keeps csv's default quote handling, so a
        # field that starts with '"' is parsed as a quoted field. Confirm
        # that this matches how the TSV files were written.
        reader = csv.reader(tsvfile, delimiter="\t")
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv yields [] for blank lines, e.g. a trailing newline.
                continue
            sentence1 = row[0].split()
            sentence2 = row[1].split()
            label = label_to_number[row[2]]
            genre = row[3]
            # setdefault keeps the five predefined buckets and tolerates
            # genres that are not in that list.
            data_loaded.setdefault(genre, []).append([sentence1, sentence2, label])
    return data_loaded

In [5]:
# Load the MNLI validation split, grouped by genre.
mnli_data_val= mnli_data_loader("./raw_data/mnli_val.tsv")

In [6]:
def token2index_dataset_mnli(mnli_data, token2id, PAD_IDX = 0, UNK_IDX = 1):
    """Convert per-genre MNLI token lists into lists of vocabulary ids.

    Args:
        mnli_data: Mapping from genre name to a list of examples whose first
            two items are token sequences and whose third item is the label.
            It must contain every name in the module-level ``Genres`` list.
        token2id: Mapping from token to integer id.
        PAD_IDX: Accepted for signature compatibility; not used here.
        UNK_IDX: Id substituted for any token missing from ``token2id``.

    Returns:
        A new dict with the five genre keys, each holding a list of
        ``[sentence1_ids, sentence2_ids, label]`` entries.
    """
    def to_ids(tokens):
        # Out-of-vocabulary tokens fall back to the unknown id.
        return [token2id.get(word, UNK_IDX) for word in tokens]

    indexed_by_genre = {
        name: []
        for name in ("fiction", "telephone", "slate", "government", "travel")
    }
    for genre in Genres:
        for example in mnli_data[genre]:
            indexed_by_genre[genre].append(
                [to_ids(example[0]), to_ids(example[1]), example[2]]
            )

    return indexed_by_genre

In [13]:
# Convert the MNLI validation tokens to vocabulary ids, genre by genre.
processed_mnli_data_val=token2index_dataset_mnli(mnli_data_val, token2id)

In [14]:
import pickle as pkl
# Persist the indexed MNLI validation data. The `with` block closes the
# file handle as soon as the dump finishes.
with open("./data/processed_mnli_data_val.p", "wb") as out_file:
    pkl.dump(processed_mnli_data_val, out_file)
# pkl.dump(loaded_embeddings_ft, open("./data/loaded_embeddings_ft.p", "wb"))