# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev1 toc-item"><a href="#Word-Segmentation" data-toc-modified-id="Word-Segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Word Segmentation</a></div><div class="lev1 toc-item"><a href="#Tokenize-Text" data-toc-modified-id="Tokenize-Text-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Tokenize Text</a></div><div class="lev1 toc-item"><a href="#Create-Word-Embeddings-with-GloVe" data-toc-modified-id="Create-Word-Embeddings-with-GloVe-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create Word Embeddings with GloVe</a></div><div class="lev2 toc-item"><a href="#Read-Glove" data-toc-modified-id="Read-Glove-41"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Read Glove</a></div><div class="lev2 toc-item"><a href="#Use-Glove-to-Initialize-Embedding-Matrix" data-toc-modified-id="Use-Glove-to-Initialize-Embedding-Matrix-42"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Use Glove to Initialize Embedding Matrix</a></div><div class="lev1 toc-item"><a href="#Build-Dataset" data-toc-modified-id="Build-Dataset-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Build Dataset</a></div><div class="lev1 toc-item"><a href="#Save-Data" data-toc-modified-id="Save-Data-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Save Data</a></div><div class="lev1 toc-item"><a href="#Checkpoint" data-toc-modified-id="Checkpoint-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Checkpoint</a></div><div class="lev1 toc-item"><a href="#Load-Model" data-toc-modified-id="Load-Model-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Load Model</a></div><div class="lev2 toc-item"><a href="#Set-Hyperparameters" data-toc-modified-id="Set-Hyperparameters-81"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Set Hyperparameters</a></div><div class="lev2 toc-item"><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-82"><span class="toc-item-num">8.2&nbsp;&nbsp;</span>Import Libraries</a></div><div class="lev2 toc-item"><a href="#Build-Graph" data-toc-modified-id="Build-Graph-83"><span class="toc-item-num">8.3&nbsp;&nbsp;</span>Build Graph</a></div><div class="lev1 toc-item"><a href="#Train" data-toc-modified-id="Train-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Train</a></div>

# Load Data

In [1]:
import json
import os

In [2]:
def load_data(path, name):
    """
    Load date from file
    """
    data = []
    input_file = os.path.join(path)
    with open(input_file) as f:
        lines = f.readlines()
    for line in lines:
        item = json.loads(line)
        data.append(item[name])
    return data

In [5]:
premise = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-all.json', 'premise')
asks_for = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-all.json', 'asks-for')
alternative1 = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-all.json', 'alternative1')
alternative2 = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-all.json', 'alternative2')

for i in range(len(premise)):
    if asks_for[i] == 'cause':
        premise[i] += ' What was the CAUSE of this?'
    else:
        premise[i] += ' What happened as a RESULT'

premise.extend(premise)
alternative = []
alternative.extend(alternative1)
alternative.extend(alternative2)

rawLabel = load_data('/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-all.json', 'most-plausible-alternative')
rawLabel = [int(l) for l in rawLabel]

l = [0] * len(premise)
for i in range(len(rawLabel)):
    if rawLabel[i] == 1:
        l[i] = 1
        l[i+len(rawLabel)] = 0
    if rawLabel[i] == 2:
        l[i] = 0
        l[i+len(rawLabel)] = 1
        
label = l

In [6]:
len(label)

2000

# Word Segmentation

In [7]:
from nltk import regexp_tokenize
import numpy as np

In [8]:
replDict = {"woman's": 'woman', "man's": 'man', "patient's": 'patient', "student's": 'student', "boy's": 'boy', 
            "friend's": 'friend', "enemy's": 'enemy', "parent's": 'parent', "humanitarian's": 'humanitarian', 
            "child's": 'child', "professor's": 'professor', "daughter's": 'daughter', "mother's": 'mother', 
            "children's": 'children', "teller's": 'teller', "company's": 'company', "group's": 'group', 
            "laptop's": 'laptop', "girl's": 'girl', "salesman's": 'salesman', "cook's": 'cook', "car's": 'car', 
            "offender's": 'offender', "detective's": 'detective', "librarian's": 'librarian', "caller's": 'caller', 
            "victim's": 'victim', "interviewer's": 'interviewer', "ship's": 'ship', "site's": 'site', 
            "chandelier's": 'chandelier', "bully's": 'bully', "river's": 'river', "puppy's": 'puppy', 
            "pilot's": 'pilot', "girlfriend's": 'girlfriend', "politician's": 'politician', "couple's": 'couple', 
            "son's": 'son', "actor's": 'actor', "neighbor's": 'neighbor', "nation's": 'nation', 
            "classmate's": 'classmate', "businessman's": 'businessman', "architect's": 'architect', 
            "imposter's": 'imposter', "kidnapper's": 'kidnapper', "colleague's": 'colleague', "flower's": 'flower',
            "bull's": 'bull', "employee's": 'employee', "wouldn't": 'wouldn', "team's": 'team', "other's": 'other', 
            "writer's": 'writer', "baby's": 'baby', "attacker's": 'attacker', "uncle's": 'uncle', "driver's": 'driver'}

In [9]:
def cut(s):
    """
    Word segmentation
    """
    pattern = r'''
              (?x)                   # set flag to allow verbose regexps 
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe  
            '''  
    return regexp_tokenize(s, pattern=pattern)

def clean(s):
    """
    Clean data 
    """
    for i in range(len(s)):
        if s[i] == "couldn't":
            s[i] = 'could'
            s.insert(i+1, 'not')
    s = [i for i in s if i != '']
    return [replDict.get(i.lower(), i.lower()) for i in s]

In [10]:
pWords = [clean(cut(s)) for s in premise]
aWords = [clean(cut(s)) for s in alternative]

In [11]:
MAX_P_LEN = 19
MAX_A_LEN = 11
SEED = 42

# Tokenize Text

In [12]:
from keras.preprocessing.text import Tokenizer
import numpy as np
import pickle

Using TensorFlow backend.


In [13]:
tokWords = []
tokWords.extend(pWords)
tokWords.extend(aWords)
tokenizer = Tokenizer(filters='')
tokTexts = [' '.join(i) for i in tokWords]
tokenizer.fit_on_texts(tokTexts)
word2index = tokenizer.word_index
index2word = {i: w for w, i in word2index.items()}
sentLens = np.array([len(i) for i in pWords])
print('Found %s unique tokens.' % len(word2index))
print('Distribution of premise lengths (number of words):')
print('Min: {:d}   Max: {:d}   Mean: {:.3f}   Med: {:.3f}'.format(np.min(sentLens), np.max(sentLens), np.mean(sentLens), np.median(sentLens)))
sentLens = np.array([len(i) for i in aWords])
print('Distribution of alternative lengths (number of words):')
print('Min: {:d}   Max: {:d}   Mean: {:.3f}   Med: {:.3f}'.format(np.min(sentLens), np.max(sentLens), np.mean(sentLens), np.median(sentLens)))

Found 3370 unique tokens.
Distribution of premise lengths (number of words):
Min: 8   Max: 19   Mean: 11.668   Med: 12.000
Distribution of alternative lengths (number of words):
Min: 2   Max: 11   Mean: 5.074   Med: 5.000


In [73]:
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/index.pkl', 'wb') as fp:
    pickle.dump((word2index), fp, -1)

# Create Word Embeddings with GloVe

In [16]:
SEED = 42
VOCAB_SIZE = 3371
EMBEDDING_DIM = 300

## Read Glove

In [17]:
#glove_n_symbols = !wc -l /Users/lizhn7/Downloads/DATA/Glove/glove.42B.300d.txt
#glove_n_symbols = int(glove_n_symbols[0].split()[0])
glove_n_symbols = 1917495

In [18]:
glove_index_dict = {}
glove_embedding_weights = np.empty((glove_n_symbols, EMBEDDING_DIM))
globale_scale = 0.1
with open('/Users/lizhn7/Downloads/DATA/Glove/glove.42B.300d.txt', 'r') as fp:
    index = 0
    for l in fp:
        l = l.strip().split()
        word = l[0]
        glove_index_dict[word] = index
        glove_embedding_weights[index, :] = [float(n) for n in l[1:]]
        index += 1
glove_embedding_weights *= globale_scale

## Use Glove to Initialize Embedding Matrix

In [19]:
from nltk import PorterStemmer, LancasterStemmer, WordNetLemmatizer

In [20]:
# Generate random embedding with same scale as glove
np.random.seed(SEED)
shape = (VOCAB_SIZE, EMBEDDING_DIM)
scale = glove_embedding_weights.std() * np.sqrt(12) / 2 
embedding = np.random.uniform(low=-scale, high=scale, size=shape)

In [21]:
wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [22]:
# Copy from glove weights of words that appear in index2word
count = 0 
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    g = glove_index_dict.get(w)
    if g is None:
        ww = wnl.lemmatize(w)
        g = glove_index_dict.get(ww)
    if g is None:
        ww = porter.stem(w)
        g = glove_index_dict.get(ww)
    if g is None:
        ww = lancaster.stem(w)
        g = glove_index_dict.get(ww)
    if g is not None:
        embedding[i, :] = glove_embedding_weights[g, :]
        count += 1
print('{num_tokens}-{per:.2f}% tokens in vocab found in glove and copied to embedding.'.format(num_tokens=count, per=count/float(VOCAB_SIZE)*100))

3370-99.97% tokens in vocab found in glove and copied to embedding.


In [25]:
'tattled' in glove_index_dict

True

In [31]:
glove_index_dict['unlaced']

356749

In [37]:
[i for i in sum(tokWords, []) if i[-2:] == "'t"]

["didn't", "didn't", "didn't"]

In [None]:
[i for i in replDict if i[-1] != 's']

# Build Dataset

In [9]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle

Using TensorFlow backend.


In [10]:
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/index.pkl', 'rb') as fp:
    word2index = pickle.load(fp)

In [15]:
pSeq = [[word2index.get(w, w) for w in s] for s in pWords]
aSeq = [[word2index.get(w, w) for w in s] for s in aWords]

xp = pad_sequences(pSeq, maxlen=MAX_P_LEN, padding='post', truncating='post')
xa = pad_sequences(aSeq, maxlen=MAX_A_LEN, padding='post', truncating='post')

xpTest = xp[:500]
xa1Test = xa[:500]
xa2Test =  xa[500:]
yTest = np.array(rawLabel)

In [14]:
pSeq = [[word2index.get(w, w) for w in s] for s in pWords]
aSeq = [[word2index.get(w, w) for w in s] for s in aWords]

xp = pad_sequences(pSeq, maxlen=MAX_P_LEN, padding='post', truncating='post')
xa = pad_sequences(aSeq, maxlen=MAX_A_LEN, padding='post', truncating='post')
y = np.array(label)

xpTrain, _, yTrain, _ = train_test_split(xp, y, test_size=0., random_state=SEED)
xaTrain, _ = train_test_split(xa, test_size=0., random_state=SEED)

In [336]:
pSeq = [[word2index.get(w, w) for w in s] for s in pWords]
aSeq = [[word2index.get(w, w) for w in s] for s in aWords]

xp = pad_sequences(pSeq, maxlen=MAX_P_LEN, padding='post', truncating='post')
xa = pad_sequences(aSeq, maxlen=MAX_A_LEN, padding='post', truncating='post')
y = np.array(label)


xpTrain1, xpVal1, yTrain1, yVal1 = train_test_split(xp[:500], y[:500], test_size=0.2, random_state=SEED)
xaTrain1, xaVal1 = train_test_split(xa[:500], test_size=0.2, random_state=SEED)
xpTrain2, xpVal2, yTrain2, yVal2 = train_test_split(xp[500:], y[500:], test_size=0.2, random_state=SEED)
xaTrain2, xaVal2 = train_test_split(xa[500:], test_size=0.2, random_state=SEED)

xpTrain = np.vstack((xpTrain1, xpTrain2))
xpVal = np.vstack((xpVal1, xpVal2))
xaTrain = np.vstack((xaTrain1, xaTrain2))
xaVal = np.vstack((xaVal1, xaVal2))
yTrain = np.concatenate((yTrain1, yTrain2))
yVal = np.concatenate((yVal1, yVal2))
xpTest = xpVal1
xa1Test = xaVal1
xa2Test = xaVal2
yTest = np.array([1 if yVal1[i] == 1 else 2 for i in range(len(yVal1))])

# Save Data

In [41]:
import h5py

In [70]:
fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/embedding.h5', 'w')
fh['embedding'] = embedding
fh.close()

In [None]:
fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/train.h5', 'w')
fh['xpTrain'] = xpTrain
fh['xaTrain'] = xaTrain
fh['yTrain'] = yTrain
fh.close()

In [19]:
fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/test.h5', 'w')
fh['xpTest'] = xpTest
fh['xa1Test'] = xa1Test
fh['xa2Test'] = xa2Test
fh['yTest'] = yTest
fh.close()

In [43]:
fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/train.h5', 'w')
fh['xpTrain'] = xpTrain
fh['xpVal'] = xpVal
fh['xaTrain'] = xaTrain
fh['xaVal'] = xaVal
fh['yTrain'] = yTrain
fh['yVal'] = yVal
fh['xpTest'] = xpTest
fh['xa1Test'] = xa1Test
fh['xa2Test'] = xa2Test
fh['yTest'] = yTest
fh.close()

OSError: Unable to create file (Unable to truncate a file which is already open)

# Checkpoint

In [340]:
import h5py

In [None]:
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/embedding.h5', 'r') as fh:
    embedding = fh['embedding'][:]

In [None]:
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Question_Answering_Hinge_Glove/data/train.h5', 'r') as fh:
    xpTrain = fh['xpTrain'][:]
    xpVal = fh['xpVal'][:]
    xaTrain = fh['xaTrain'][:]
    xaVal = fh['xaVal'][:]
    yTrain = fh['yTrain'][:]
    yVal = fh['yVal'][:]
    xpTest = fh['xpTest'][:]
    xa1Test = fh['xa1Test'][:]
    xa2Test = fh['xa2Test'][:]
    yTest = fh['yTest'][:]

# Load Model

## Set Hyperparameters

In [59]:
MAX_P_LEN = 19
MAX_A_LEN = 11
VOCAB_SIZE = 3371
SEED = 42
EMBEDDING_DIM = 300
TUNE = False
BATCH_SIZE = 1024
NUM_EPOCHS = 256
CNN_SIZE = 128
WINDOW_SIZE = 3
DROPOUT_RATE = 0.5

## Import Libraries

In [60]:
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, Lambda, dot
from keras.models import Model
import keras.backend as K
from keras.callbacks import*
from keras import losses
from keras import optimizers
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Build Graph

In [61]:
def build():
    """
    Build model
    """
    inputs = Input(shape=(None,))
    emb_seq = Embedding(VOCAB_SIZE, 
                    EMBEDDING_DIM, 
                    weights=[embedding], 
                    mask_zero=False, 
                    trainable=TUNE)(inputs)
    #dropout = Dropout(DROPOUT_RATE)(emb_seq)
    conv = Conv1D(CNN_SIZE, 
                  WINDOW_SIZE, 
                  padding='same', 
                  activation='elu')(emb_seq)
    pool = GlobalMaxPooling1D()(conv)
    dropout = Dropout(DROPOUT_RATE)(pool)
    outputs = Activation('tanh')(pool)
    model = Model(inputs=[inputs], outputs=[outputs])
    return model

def hinge_loss(y_true, y_pred):
    """
    Define hinge loss function
    """
    pos = K.sum(y_true * y_pred, axis=-1)
    neg = K.max((1. - y_true) * y_pred, axis=-1)
    return K.maximum(0., neg - pos + 0.009)

def squared_hinge(y_true, y_pred):
    return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)


def hinge(y_true, y_pred):
    return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)


def categorical_hinge(y_true, y_pred):
    pos = K.sum(y_true * y_pred, axis=-1)
    neg = K.max((1. - y_true) * y_pred, axis=-1)
    return K.maximum(0., neg - pos + 1.)

In [62]:
np.array([1, 2, 3]) * np.array([4, 5, 6])

array([ 4, 10, 18])

In [63]:
model = build()

In [64]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 300)         1011300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         115328    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
Total params: 1,126,628
Trainable params: 115,328
Non-trainable params: 1,011,300
_________________________________________________________________


In [65]:
p_seq, a_seq = Input(shape=(MAX_P_LEN,)), Input(shape=(MAX_A_LEN,))
p_out, a_out = model(p_seq), model(a_seq)
similarity = dot([p_out, a_out], axes=-1, normalize=True)
similarity = Lambda(lambda x: K.abs(x))(similarity)
qa = Model(inputs=[p_seq, a_seq], outputs=[similarity])
sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
qa.compile(loss=hinge_loss, optimizer='sgd', metrics=['accuracy'])

In [263]:
qa.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_86 (InputLayer)            (None, 19)            0                                            
____________________________________________________________________________________________________
input_87 (InputLayer)            (None, 11)            0                                            
____________________________________________________________________________________________________
model_53 (Model)                 (None, 128)           775328      input_86[0][0]                   
                                                                   input_87[0][0]                   
____________________________________________________________________________________________________
dot_25 (Dot)                     (None, 1)             0           model_53[1][0]          

# Train

In [184]:
#filepath = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/Sentence_Classification_Glove/cp_logs/weights.{epoch:03d}-{loss:.6f}.hdf5'
#checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1, save_best_only=True, mode='max')
log_string = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/Sentence_Classification_Glove/tb_logs/128_3_0.5'
tensorboard = TensorBoard(log_dir=log_string)
#callbacks_list = [checkpoint, tensorboard]
callbacks_list = [tensorboard]

In [265]:
history = qa.fit([xpTrain, xaTrain], 
                 yTrain,
                 batch_size=BATCH_SIZE,
                 epochs=NUM_EPOCHS,
                 #callbacks=callbacks_list,
                 validation_data=([xpVal, xaVal], yVal),
                 shuffle=True)

ValueError: Not a dataset (Not a dataset)