# Table of Contents
 <div class="lev1 toc-item"><a href="#Data-Preprocess" data-toc-modified-id="Data-Preprocess-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Preprocess</a></div><div class="lev1 toc-item"><a href="#Build-Dataset" data-toc-modified-id="Build-Dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build Dataset</a></div>

# Data Preprocess

In [1]:
import json
import os
import spacy
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV, NUM, PROPN

In [2]:
# spaCy pipeline; it is applied to every raw premise/alternative string when a
# Data object is built, and its tokens' pos / is_stop / lemma_ are read.
# NOTE(review): the 'en' shortcut name is presumably only resolvable on older
# spaCy releases — confirm against the installed version.
nlp = spacy.load('en')
# Despite the *_DIR names these are paths to individual JSON files, not
# directories: each one is handed straight to load_data(), which opens it.
# NOTE(review): absolute paths under one user's home directory — they will not
# resolve on another machine; consider making them configurable.
DEV_DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-dev.json'
TEST_DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-test.json'
SEMEVAL_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/semeval.json'

In [3]:
def load_data(path, name):
    """
    Load one field from every record of a JSON-lines file.

    Each non-blank line of the file is parsed as a JSON object and the value
    stored under ``name`` is collected, preserving file order.

    Args:
        path: Path of the JSON-lines file to read.
        name: Key to extract from each record.

    Returns:
        list: The ``name`` value of every record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``name`` key.
    """
    data = []
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(path, encoding='utf-8') as f:
        # Stream line by line instead of materialising the whole file.
        for line in f:
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line)[name])
    return data

def isNoise(token):
    """
    Decide whether a token should be dropped as noise.

    A token is kept only when its part of speech is one of NOUN, VERB, ADJ,
    ADV or NUM and it is not flagged as a stop word; everything else is noise.

    Args:
        token: Object exposing ``pos`` (compared against the part-of-speech
            constants imported from spaCy) and ``is_stop``.

    Returns:
        bool: True if the token is noise, False if it should be kept.
    """
    # PROPN is imported by the file but is not part of this set, so proper
    # nouns are filtered out along with function words and punctuation.
    if token.pos not in (NOUN, VERB, ADJ, ADV, NUM):
        return True
    return bool(token.is_stop)

def clean(token):
    """
    Normalise a token to its lemma.

    Returns the value of the token's ``lemma_`` attribute unchanged.
    """
    lemma = token.lemma_
    return lemma

In [4]:
# Pull the 'sentence' and 'label' fields out of the SemEval file as two
# lists in file order (the file is read once per field).
semevalSent, semevalLabel = (
    load_data(SEMEVAL_DIR, field) for field in ('sentence', 'label')
)

In [5]:
class Data:
    """
    One split of premise / two-alternative records loaded from a JSON-lines file.

    Attributes set by ``__init__``:
        rawPremise, rawAlternative1, rawAlternative2: values of the
            'premise', 'alternative1' and 'alternative2' fields.
        ask_for: values of the 'asks-for' field.
        label: values of the 'most-plausible-alternative' field.
        premise, alternative1, alternative2: for each record, the list of
            cleaned tokens that survive the noise filter.
    """

    def __init__(self, path):
        """
        Read every field from ``path`` and preprocess the three text fields.

        Args:
            path: Path of the JSON-lines file, forwarded to ``load_data``.
        """
        self.rawPremise = load_data(path, 'premise')
        self.ask_for = load_data(path, 'asks-for')
        self.rawAlternative1 = load_data(path, 'alternative1')
        self.rawAlternative2 = load_data(path, 'alternative2')
        self.label = load_data(path, 'most-plausible-alternative')
        self.premise = self._preprocess(self.rawPremise)
        self.alternative1 = self._preprocess(self.rawAlternative1)
        self.alternative2 = self._preprocess(self.rawAlternative2)

    @staticmethod
    def _preprocess(texts):
        """Run each text through ``nlp`` and keep the cleaned, non-noise tokens."""
        return [[clean(tok) for tok in nlp(text) if not isNoise(tok)]
                for text in texts]

    def train_data(self):
        """
        Build premise/alternative sequences in both orders with binary labels.

        Returns:
            tuple: ``(texts, labels)``. ``texts`` is four groups concatenated,
            each with one entry per record: premise+alternative1,
            premise+alternative2, alternative1+premise, alternative2+premise.
            ``labels`` is aligned with ``texts``: 1 when the alternative in
            that sequence is the labelled one, 0 otherwise.
        """
        records = list(zip(self.premise, self.alternative1, self.alternative2))
        texts = (
            [prem + alt1 for prem, alt1, _ in records]
            + [prem + alt2 for prem, _, alt2 in records]
            + [alt1 + prem for prem, alt1, _ in records]
            + [alt2 + prem for prem, _, alt2 in records]
        )
        # Compare on the string form so that a label stored as the JSON number
        # 1 is treated like the string '1' (test_data() likewise accepts both
        # through int()).
        first_wins = [1 if str(lab).strip() == '1' else 0 for lab in self.label]
        second_wins = [1 - flag for flag in first_wins]
        return texts, first_wins + second_wins + first_wins + second_wins

    def test_data(self):
        """
        Build one sequence per alternative, ordered by what the record asks for.

        When 'asks-for' is 'cause' the alternative comes first and the premise
        second; otherwise the premise comes first.

        Returns:
            tuple: ``(v1, v2, labels)`` where ``v1``/``v2`` hold the sequences
            for alternative 1 / alternative 2 and ``labels`` is every label
            converted with ``int``.
        """
        v1, v2 = [], []
        for ask, prem, alt1, alt2 in zip(self.ask_for, self.premise,
                                         self.alternative1, self.alternative2):
            if ask == 'cause':
                v1.append(alt1 + prem)
                v2.append(alt2 + prem)
            else:
                v1.append(prem + alt1)
                v2.append(prem + alt2)

        return v1, v2, [int(l) for l in self.label]

# Build Dataset

In [6]:
from keras.preprocessing.text import Tokenizer
import numpy as np
import pickle
import h5py
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [7]:
# Training and validation both come from the dev file; the test split has
# its own file. Each Data() call re-reads and re-parses its file with spaCy.
trainData = Data(DEV_DATA_DIR)
valData = Data(DEV_DATA_DIR)
testData = Data(TEST_DATA_DIR)

xTrain, yTrain = trainData.train_data()
# Each evaluation set is taken from its own Data object, so the test arrays
# really come from the copa-test file rather than from the dev split.
x1Val, x2Val, yVal = valData.test_data()
x1Test, x2Test, yTest = testData.test_data()

In [None]:
# Pool the validation, test and SemEval sequences, then flatten each one
# into a single space-separated string.
# NOTE(review): assumes every semevalSent entry is a list of tokens like the
# other sequences; a raw string would be joined character by character — confirm.
tok_sentWords = [*x1Val, *x2Val, *x1Test, *x2Test, *semevalSent]
tokTexts = list(map(' '.join, tok_sentWords))