# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev2 toc-item"><a href="#Load-Train-Data" data-toc-modified-id="Load-Train-Data-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Train Data</a></div><div class="lev2 toc-item"><a href="#Load-Test-Data" data-toc-modified-id="Load-Test-Data-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load Test Data</a></div><div class="lev1 toc-item"><a href="#Word-Segmentation" data-toc-modified-id="Word-Segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Word Segmentation</a></div><div class="lev1 toc-item"><a href="#Build-Dataset" data-toc-modified-id="Build-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Build Dataset</a></div><div class="lev1 toc-item"><a href="#Save-Data" data-toc-modified-id="Save-Data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Save Data</a></div>

# Load Data

In [1]:
import json
import os

In [2]:
def load_data(path, name):
    """
    Load one field from every record of a JSON Lines file.

    Each non-blank line of the file is parsed as a JSON object and the value
    stored under ``name`` is collected, in file order.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file to read (one JSON object per line).
    name : str
        Key to extract from every record.

    Returns
    -------
    list
        The extracted values, one per record.

    Raises
    ------
    KeyError
        If a record does not contain ``name``.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding='utf-8') as f:
        # Iterate lazily instead of materialising every line up front.
        for line in f:
            line = line.strip()
            # Blank lines (e.g. extra newlines at the end of the file) carry
            # no record and would otherwise make json.loads raise.
            if not line:
                continue
            data.append(json.loads(line)[name])
    return data

## Load Train Data

In [7]:
# Every field below is read from the same dev file, which this notebook uses
# as its training data.
devPath = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-dev.json'
premise = load_data(devPath, 'premise')
alternative1 = load_data(devPath, 'alternative1')
alternative2 = load_data(devPath, 'alternative2')

# Each premise is listed twice: the first copy lines up with alternative1,
# the second copy with alternative2.
premise = premise + premise
alternative = alternative1 + alternative2

rawLabel = list(map(int, load_data(devPath, 'most-plausible-alternative')))

# First half scores alternative1: +1 when it is the labelled answer, -1 when
# alternative2 is, 0 for any other label value. The second half scores
# alternative2 and is therefore the exact negation of the first half.
firstHalf = [{1: 1, 2: -1}.get(answer, 0) for answer in rawLabel]
label = firstHalf + [-score for score in firstHalf]

## Load Test Data

In [11]:
testPath = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-test.json'

# alternative1 / alternative2 / rawLabel are rebound here to the test-file
# values, replacing the ones read from the dev file.
premiseTest, alternative1, alternative2, rawLabel = (
    load_data(testPath, field)
    for field in ('premise', 'alternative1', 'alternative2', 'most-plausible-alternative')
)

# Test labels stay in their raw 1/2 form (index of the plausible alternative).
labelTest = list(map(int, rawLabel))

# Word Segmentation

In [13]:
from nltk import regexp_tokenize

In [14]:
# Nouns whose possessive form ("<noun>'s") is rewritten to the bare noun.
_POSSESSIVE_NOUNS = (
    'woman', 'man', 'patient', 'student', 'boy',
    'friend', 'enemy', 'parent', 'humanitarian',
    'child', 'professor', 'daughter', 'mother',
    'children', 'teller', 'company', 'group',
    'laptop', 'girl', 'salesman', 'cook', 'car',
    'offender', 'detective', 'librarian', 'caller',
    'victim', 'interviewer', 'ship', 'site',
    'chandelier', 'bully', 'river', 'puppy',
    'pilot', 'girlfriend', 'politician', 'couple',
    'son', 'actor', 'neighbor', 'nation',
    'classmate', 'businessman', 'architect',
    'imposter', 'kidnapper', 'colleague', 'flower',
    'bull', 'employee', 'team', 'other',
    'writer', 'baby', 'attacker', 'uncle', 'driver',
)

# Lookup table: possessive token -> bare noun (e.g. "woman's" -> 'woman').
replDict = {noun + "'s": noun for noun in _POSSESSIVE_NOUNS}

In [15]:
def cut(s):
    """
    Word segmentation

    Splits ``s`` into tokens with ``regexp_tokenize``. A token is either an
    abbreviation made of capital letters each followed by a dot (``U.S.A.``)
    or a run of word characters that may contain internal ``-``, ``&`` or
    ``'`` (``e-mail``, ``couldn't``). Anything else (spaces, punctuation)
    is dropped.

    Parameters
    ----------
    s : str
        Sentence to tokenise.

    Returns
    -------
    The tokens produced by ``regexp_tokenize`` for the pattern below.
    """
    # The inline (?x) flag has to be the very first thing in the pattern:
    # Python 3.11+ raises re.error for a global flag that appears after any
    # other character, including the newline/indentation of a multi-line
    # literal.
    pattern = r'''(?x)               # set flag to allow verbose regexps
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe
            '''
    return regexp_tokenize(s, pattern=pattern)

def clean(s):
    """
    Clean data

    Normalises a list of tokens:

    * ``"couldn't"`` / ``"wouldn't"`` are expanded to ``could not`` /
      ``would not``. Matching is done on the token as given (before
      lower-casing), so a capitalised ``"Couldn't"`` is not expanded.
    * Empty-string tokens are dropped.
    * Every remaining token is lower-cased and, when its lower-cased form is
      a key of ``replDict``, replaced by the mapped value.

    The input list is left untouched; a new list is returned.

    Parameters
    ----------
    s : list of str
        Tokens of one sentence.

    Returns
    -------
    list of str
        The cleaned tokens.
    """
    contractions = {
        "couldn't": ('could', 'not'),
        "wouldn't": ('would', 'not'),
    }
    expanded = []
    # Build a fresh list instead of inserting into ``s`` while indexing it:
    # in-place inserts shift later tokens past a precomputed index range, so
    # contractions near the end of the sentence were never reached.
    for token in s:
        if token in contractions:
            expanded.extend(contractions[token])
        elif token != '':
            expanded.append(token)
    return [replDict.get(token.lower(), token.lower()) for token in expanded]

In [16]:
# Tokenise each sentence with cut(), then normalise its tokens with clean().
# alternative1 / alternative2 hold the test-file alternatives at this point.
preWords = [clean(tokens) for tokens in map(cut, premise)]
altWords = [clean(tokens) for tokens in map(cut, alternative)]
pretestWords = [clean(tokens) for tokens in map(cut, premiseTest)]
alt1Words = [clean(tokens) for tokens in map(cut, alternative1)]
alt2Words = [clean(tokens) for tokens in map(cut, alternative2)]

# Build Dataset

In [23]:
import pickle
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [24]:
# Fixed length (in tokens) that premise sequences are padded/truncated to.
MAX_P_LEN = 13
# Fixed length (in tokens) that alternative sequences are padded/truncated to.
MAX_A_LEN = 11
# random_state used when shuffling the training arrays.
SEED = 42

In [25]:
# Load the lookup used below to turn each token into an index.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# files that come from a trusted source.
# NOTE(review): presumably a dict mapping word -> integer index; confirm
# against the code that wrote index.pkl.
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Sentence_Classification_Glove/data/index.pkl', 'rb') as fp:
    word2index = pickle.load(fp)

In [27]:
# Replace every token by its entry in word2index, sentence by sentence.
preSeq = [[word2index[token] for token in sentence] for sentence in preWords]
altSeq = [[word2index[token] for token in sentence] for sentence in altWords]

# Pad (and, if needed, truncate) at the end of each sequence so all rows share
# one length per input.
postPad = dict(padding='post', truncating='post')
xpTrain = pad_sequences(preSeq, maxlen=MAX_P_LEN, **postPad)
xaTrain = pad_sequences(altSeq, maxlen=MAX_A_LEN, **postPad)

# Training targets built in the "Load Train Data" cell.
yTrain = np.array(label)

In [30]:
# Replace every token by its entry in word2index, sentence by sentence.
pretestSeq = [[word2index[token] for token in sentence] for sentence in pretestWords]
alt1Seq = [[word2index[token] for token in sentence] for sentence in alt1Words]
alt2Seq = [[word2index[token] for token in sentence] for sentence in alt2Words]

# Same end-of-sequence padding/truncation as used for the training inputs.
postPad = dict(padding='post', truncating='post')
xpTest = pad_sequences(pretestSeq, maxlen=MAX_P_LEN, **postPad)
xa1Test = pad_sequences(alt1Seq, maxlen=MAX_A_LEN, **postPad)
xa2Test = pad_sequences(alt2Seq, maxlen=MAX_A_LEN, **postPad)

# Raw 1/2 answers for the test file.
yTest = np.array(labelTest)

In [31]:
# Validation inputs: every alternative1 row stacked on top of every
# alternative2 row.
xVal = np.vstack((xa1Test, xa2Test))

# Score of alternative1 for each example: +1 when the answer is 1, -1 when the
# answer is 2, 0 for any other value.
alt1Score = np.where(yTest == 1, 1, np.where(yTest == 2, -1, 0))

# alternative2 always receives the opposite score, so the second half is the
# negation of the first. The length follows yTest instead of a hard-coded
# 1000, which only fitted a test set of exactly 500 examples.
yVal = np.concatenate((alt1Score, -alt1Score))

In [34]:
# Shuffle the three training arrays with ONE shared permutation so that row i
# of xpTrain, xaTrain and yTrain keeps describing the same example.
# The previous train_test_split(..., test_size=0.) calls are rejected with a
# ValueError by current scikit-learn releases, and the two separate calls only
# stayed aligned because they happened to share a seed and a length.
# NOTE(review): expected to reproduce the ordering older scikit-learn releases
# gave for random_state=SEED; confirm if bit-for-bit parity with earlier runs
# matters.
shuffleOrder = np.random.RandomState(SEED).permutation(len(yTrain))
xpTrain, xaTrain, yTrain = xpTrain[shuffleOrder], xaTrain[shuffleOrder], yTrain[shuffleOrder]

# Save Data

In [37]:
import h5py

In [38]:
# Write the training and validation arrays. The context manager closes the
# file even when one of the dataset writes raises, so no half-open handle is
# left behind.
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/QA/train.h5', 'w') as fh:
    fh['xpTrain'] = xpTrain
    fh['xaTrain'] = xaTrain
    fh['xVal'] = xVal
    fh['yTrain'] = yTrain
    fh['yVal'] = yVal

In [39]:
# Write the test arrays. The context manager closes the file even when one of
# the dataset writes raises, so no half-open handle is left behind.
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/QA/test.h5', 'w') as fh:
    fh['xpTest'] = xpTest
    fh['xa1Test'] = xa1Test
    fh['xa2Test'] = xa2Test
    fh['yTest'] = yTest