# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev2 toc-item"><a href="#Load-Train-Data" data-toc-modified-id="Load-Train-Data-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Train Data</a></div><div class="lev2 toc-item"><a href="#Load-Test-Data" data-toc-modified-id="Load-Test-Data-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Load Test Data</a></div><div class="lev1 toc-item"><a href="#Word-Segmentation" data-toc-modified-id="Word-Segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Word Segmentation</a></div><div class="lev1 toc-item"><a href="#Build-Dataset" data-toc-modified-id="Build-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Build Dataset</a></div><div class="lev1 toc-item"><a href="#Save-Data" data-toc-modified-id="Save-Data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Save Data</a></div><div class="lev1 toc-item"><a href="#Checkpoint" data-toc-modified-id="Checkpoint-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Checkpoint</a></div><div class="lev1 toc-item"><a href="#Load-Model" data-toc-modified-id="Load-Model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Load Model</a></div><div class="lev2 toc-item"><a href="#Set-Hyperparameters" data-toc-modified-id="Set-Hyperparameters-61"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Set Hyperparameters</a></div><div class="lev2 toc-item"><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-62"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Import Libraries</a></div><div class="lev2 toc-item"><a href="#Build-Graph" data-toc-modified-id="Build-Graph-63"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Build Graph</a></div><div class="lev2 toc-item"><a href="#Model-Visualization" data-toc-modified-id="Model-Visualization-64"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>Model Visualization</a></div><div class="lev1 
toc-item"><a href="#Train" data-toc-modified-id="Train-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Train</a></div><div class="lev2 toc-item"><a href="#Load-Weights" data-toc-modified-id="Load-Weights-71"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Load Weights</a></div>

# Load Data

In [1]:
import json
import os

In [2]:
def load_data(path, name):
    """
    Load one field from every record of a JSON-lines file.

    Each non-blank line of the file is parsed as a JSON object and the value
    stored under ``name`` is collected.

    Args:
        path: Location of the JSON-lines file.
        name: Key to extract from each record.

    Returns:
        A list with one value per record, in file order.

    Raises:
        KeyError: If a record has no ``name`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(path, encoding='utf-8') as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            # Blank lines (e.g. a trailing newline at EOF) are not records.
            if not line.strip():
                continue
            data.append(json.loads(line)[name])
    return data

## Load Train Data

In [3]:
# Build the training pairs from the COPA dev split.
# With n items in the file, the final parallel lists are laid out as:
#   rows [0, n)    premise -> alternative1
#   rows [n, 2n)   premise -> alternative2
#   rows [2n, 4n)  the same 2n pairs with premise/alternative swapped
#                  and the question flipped (CAUSE <-> RESULT)
devPath = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-dev.json'
CAUSE_Q = 'What was the CAUSE of this?'
RESULT_Q = 'What happened as a RESULT?'

premise = load_data(devPath, 'premise')
asks_for = load_data(devPath, 'asks-for')
alternative1 = load_data(devPath, 'alternative1')
alternative2 = load_data(devPath, 'alternative2')

# Sizes are derived from the data; the previous hard-coded 1000 was only
# correct for a file with exactly 500 items.
nItems = len(premise)
nPairs = 2 * nItems

question = [CAUSE_Q if asks_for[i] == 'cause' else RESULT_Q for i in range(nItems)]

# Forward pairs: every premise is matched with both of its alternatives.
pairPremise = premise * 2
pairAlternative = alternative1 + alternative2
question = question * 2

# Reversed pairs: the alternative takes the premise slot and vice versa.
premise = pairPremise + pairAlternative
alternative = pairAlternative + pairPremise

# A reversed pair asks the opposite question of its forward counterpart.
q = [RESULT_Q if text == CAUSE_Q else CAUSE_Q for text in question]
question.extend(q)

rawLabel = load_data(devPath, 'most-plausible-alternative')
rawLabel = [int(l) for l in rawLabel]

# Per-pair binary label: the first n entries score alternative1, the next n
# score alternative2. Any label other than 1 or 2 leaves both entries at 0.
l = [int(r == 1) for r in rawLabel] + [int(r == 2) for r in rawLabel]

# The reversed pairs reuse the labels of the forward pairs.
labelCe = l * 2                                     # 0/1 targets
labelHi = [1 if v == 1 else -1 for v in labelCe]    # -1/+1 targets

# All parallel lists must stay aligned row by row.
assert len(premise) == len(alternative) == len(question) == 2 * nPairs
assert len(labelCe) == len(labelHi) == 2 * nPairs

## Load Test Data

In [4]:
# Read the COPA test split: one premise, two alternatives and a gold label per item.
testPath = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-test.json'
premiseTest, asks_for, alternative1, alternative2 = (
    load_data(testPath, key)
    for key in ('premise', 'asks-for', 'alternative1', 'alternative2')
)

# Turn the 'asks-for' field into the natural-language question shown to the model.
questionTest = [
    'What was the CAUSE of this?' if asks_for[idx] == 'cause' else 'What happened as a RESULT?'
    for idx in range(len(premiseTest))
]

# Gold labels are kept as the integers stored in the file (no 0/1 recoding here).
rawLabel = load_data(testPath, 'most-plausible-alternative')
labelTest = list(map(int, rawLabel))

# Word Segmentation

In [7]:
from nltk import regexp_tokenize

In [8]:
# Replacement table applied to lower-cased tokens by clean(): every listed
# surface form maps to the text before its apostrophe
# ("woman's" -> 'woman', "wouldn't" -> 'wouldn').
replDict = {
    form: form.split("'")[0]
    for form in (
        "woman's", "man's", "patient's", "student's", "boy's",
        "friend's", "enemy's", "parent's", "humanitarian's",
        "child's", "professor's", "daughter's", "mother's",
        "children's", "teller's", "company's", "group's",
        "laptop's", "girl's", "salesman's", "cook's", "car's",
        "offender's", "detective's", "librarian's", "caller's",
        "victim's", "interviewer's", "ship's", "site's",
        "chandelier's", "bully's", "river's", "puppy's",
        "pilot's", "girlfriend's", "politician's", "couple's",
        "son's", "actor's", "neighbor's", "nation's",
        "classmate's", "businessman's", "architect's",
        "imposter's", "kidnapper's", "colleague's", "flower's",
        "bull's", "employee's", "wouldn't", "team's", "other's",
        "writer's", "baby's", "attacker's", "uncle's", "driver's",
    )
}

In [9]:
def cut(s):
    """
    Word segmentation.

    Splits ``s`` into tokens with NLTK's ``regexp_tokenize``. A token is
    either a dotted upper-case abbreviation (e.g. ``U.S.A.``) or a run of
    word characters that may contain internal ``-``, ``&`` or ``'``.

    Args:
        s: The sentence to tokenise.

    Returns:
        The list of matched tokens, in order of appearance.
    """
    # The (?x) flag must be the very first thing in the pattern: Python 3.11+
    # raises "global flags not at the start of the expression" when it is
    # preceded by the newline/indentation of the triple-quoted string.
    pattern = r'''(?x)                   # set flag to allow verbose regexps
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A.
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe
            '''
    return regexp_tokenize(s, pattern=pattern)

def clean(s):
    """
    Clean a token list.

    Steps, in order:
      1. Expand every ``"couldn't"`` token into ``'could'``, ``'not'``.
      2. Drop empty-string tokens.
      3. Lower-case each token and map it through ``replDict``.

    The input list is left untouched; a new list is returned.

    Args:
        s: List of string tokens (as produced by ``cut``).

    Returns:
        A new list of normalised, lower-cased tokens.
    """
    expanded = []
    for token in s:
        # NOTE(review): this comparison runs before lower-casing, so a
        # capitalised "Couldn't" is not expanded - confirm that is intended.
        if token == "couldn't":
            # Building a new list (rather than inserting while indexing the
            # old one) guarantees every occurrence is expanded.
            expanded.extend(('could', 'not'))
        elif token != '':
            expanded.append(token)
    return [replDict.get(token.lower(), token.lower()) for token in expanded]

In [10]:
def _segment(texts):
    """Tokenise each string with cut() and normalise the tokens with clean()."""
    return [clean(cut(text)) for text in texts]

# Training split
preWords = _segment(premise)
altWords = _segment(alternative)
queWords = _segment(question)

# Test split
pretestWords = _segment(premiseTest)
alt1Words = _segment(alternative1)
alt2Words = _segment(alternative2)
quetestWords = _segment(questionTest)

# Build Dataset

In [18]:
# Dataset-building configuration.
MAX_LEN, MAX_Q_LEN = 13, 6   # padded length of premise/alternative sequences, and of question sequences
SEED = 42                    # random state used when shuffling the training arrays

In [25]:
import pickle
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [20]:
# Load the lookup table used below to turn tokens into sequence ids
# (presumably a dict mapping word -> integer index; verify against the
# notebook that wrote index.pkl).
# NOTE(review): pickle.load can execute arbitrary code while deserialising -
# only point this at a file you created yourself.
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Sentence_Classification_Glove/data/index.pkl', 'rb') as fp:
    word2index = pickle.load(fp)

In [23]:
# Encode every training sentence as a list of vocabulary ids.
# A token missing from word2index raises KeyError here.
preSeq = [[word2index[token] for token in sentence] for sentence in preWords]
altSeq = [[word2index[token] for token in sentence] for sentence in altWords]

# Premises are padded/truncated at the front, alternatives at the back.
xpTrain = pad_sequences(preSeq, padding='pre', truncating='pre', maxlen=MAX_LEN)
xaTrain = pad_sequences(altSeq, padding='post', truncating='post', maxlen=MAX_LEN)

# Two encodings of the same targets: 0/1 (labelCe) and -1/+1 (labelHi).
yceTrain = np.asarray(labelCe)
yhiTrain = np.asarray(labelHi)

In [24]:
# Encode the test sentences as vocabulary ids (KeyError on an unknown token).
pretestSeq = [[word2index[token] for token in sentence] for sentence in pretestWords]
alt1Seq = [[word2index[token] for token in sentence] for sentence in alt1Words]
alt2Seq = [[word2index[token] for token in sentence] for sentence in alt2Words]

# Same padding convention as the training arrays: premise at the front,
# alternatives at the back.
xpTest = pad_sequences(pretestSeq, padding='pre', truncating='pre', maxlen=MAX_LEN)
xa1Test = pad_sequences(alt1Seq, padding='post', truncating='post', maxlen=MAX_LEN)
xa2Test = pad_sequences(alt2Seq, padding='post', truncating='post', maxlen=MAX_LEN)

yTest = np.asarray(labelTest)

In [31]:
# Fit the question vocabulary on the cleaned tokens of both splits.
tokWords = queWords + quetestWords
tokTexts = [' '.join(i) for i in tokWords]
# filters='' disables Keras' punctuation stripping; the texts are already tokenised.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(tokTexts)
qword2index = tokenizer.word_index
print('Found %s unique tokens.' % len(qword2index))

# Encode the *cleaned* questions, i.e. the same text the tokenizer was fit on.
# Encoding the raw strings (with filters='') would leave "this?" / "result?"
# as tokens that are not in the vocabulary, so the last word of every
# question would be dropped from its sequence.
queSeq = tokenizer.texts_to_sequences([' '.join(words) for words in queWords])
xqTrain = pad_sequences(queSeq, maxlen=MAX_Q_LEN, padding='post', truncating='post')

quetestSeq = tokenizer.texts_to_sequences([' '.join(words) for words in quetestWords])
xqTest = pad_sequences(quetestSeq, maxlen=MAX_Q_LEN, padding='post', truncating='post')

Found 10 unique tokens.


In [32]:
# Shuffle all training arrays with one shared, seeded permutation so that
# row i of every array still describes the same example.
# This replaces train_test_split(..., test_size=0.), which recent
# scikit-learn releases reject with a ValueError. The permutation is drawn
# the same way (RandomState(SEED).permutation(n)), so it should reproduce the
# ordering older scikit-learn versions gave - TODO confirm against a stored run.
# NOTE: like the original, re-running this cell shuffles the arrays again.
nTrain = len(xpTrain)
assert all(len(arr) == nTrain for arr in (xaTrain, xqTrain, yceTrain, yhiTrain))

shuffleIdx = np.random.RandomState(SEED).permutation(nTrain)
xpTrain, xaTrain, xqTrain = xpTrain[shuffleIdx], xaTrain[shuffleIdx], xqTrain[shuffleIdx]
yceTrain, yhiTrain = yceTrain[shuffleIdx], yhiTrain[shuffleIdx]

# Save Data

In [37]:
import h5py

In [38]:
# Write the shuffled training arrays to train.h5, one dataset per array.
# The context manager closes the file even if one of the writes raises.
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/RN/data/train.h5', 'w') as fh:
    fh['xpTrain'] = xpTrain
    fh['xaTrain'] = xaTrain
    fh['xqTrain'] = xqTrain
    fh['yceTrain'] = yceTrain
    fh['yhiTrain'] = yhiTrain

In [39]:
# Write the test arrays to test.h5, one dataset per array.
# The context manager closes the file even if one of the writes raises.
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/RN/data/test.h5', 'w') as fh:
    fh['xpTest'] = xpTest
    fh['xqTest'] = xqTest
    fh['xa1Test'] = xa1Test
    fh['xa2Test'] = xa2Test
    fh['yTest'] = yTest