# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Data-Preprocess" data-toc-modified-id="Data-Preprocess-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Preprocess</a></div><div class="lev1 toc-item"><a href="#Build-Dataset" data-toc-modified-id="Build-Dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build Dataset</a></div><div class="lev2 toc-item"><a href="#Tokenize-Text" data-toc-modified-id="Tokenize-Text-21"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Tokenize Text</a></div><div class="lev2 toc-item"><a href="#Create-Word-Embeddings-with-GloVe" data-toc-modified-id="Create-Word-Embeddings-with-GloVe-22"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Create Word Embeddings with GloVe</a></div><div class="lev2 toc-item"><a href="#Split-Data" data-toc-modified-id="Split-Data-23"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Split Data</a></div><div class="lev1 toc-item"><a href="#Save-Data" data-toc-modified-id="Save-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Save Data</a></div>

# Data Preprocess

In [17]:
import json
import os
import spacy
from spacy.parts_of_speech import NOUN, VERB, ADJ, ADV, NUM, PROPN

In [18]:
nlp = spacy.load('en')
DEV_DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-dev.json'
TEST_DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-test.json'
#SEMEVAL_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/semeval.json'
SEMEVAL_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/semeval_all.json'

In [60]:
def load_data(path, name):
    """
    Load date from file
    """
    data = []
    input_file = os.path.join(path)
    with open(input_file) as f:
        lines = f.readlines()
    for line in lines:
        item = json.loads(line)
        data.append(item[name])
    return data

def isNoise(token):
    """
    Check if the token is a noise or not 
    """
    is_noise = False
    if token.pos not in [NOUN, VERB, ADJ, ADV, NUM, PROPN]:
        is_noise = True
    return is_noise

def del_stop(s):
    """
    Delete stop words
    """
    return [w for w in s if w not in stopWords and '-' not in w]

def clean(token):
    """
    Clean data
    """
    return token.lemma_

In [None]:
tokens.token.Token

In [20]:
semevalSent = load_data(SEMEVAL_DIR, 'sentence')
semevalLabel = load_data(SEMEVAL_DIR, 'label')

In [61]:
class Data:
    def __init__(self, path):
        self.rawPremise = load_data(path, 'premise')
        self.ask_for = load_data(path, 'asks-for')
        self.rawAlternative1 = load_data(path, 'alternative1')
        self.rawAlternative2 = load_data(path, 'alternative2')
        self.label = load_data(path, 'most-plausible-alternative')
        self.premise = [del_stop(s) for s in [[clean(i) for i in j if not isNoise(i)] for j in [nlp(i) for i in self.rawPremise]]]
        self.alternative1 = [del_stop(s) for s in [[clean(i) for i in j if not isNoise(i)] for j in [nlp(i) for i in self.rawAlternative1]]]
        self.alternative2 = [del_stop(s) for s in [[clean(i) for i in j if not isNoise(i)] for j in [nlp(i) for i in self.rawAlternative2]]]
            
    def train_data(self):
        t1 = [self.premise[i] + self.alternative1[i] for i in range(len(self.premise))]
        t2 = [self.premise[i] + self.alternative2[i] for i in range(len(self.premise))]
        t3 = [self.alternative1[i] + self.premise[i] for i in range(len(self.premise))]
        t4 = [self.alternative2[i] + self.premise[i] for i in range(len(self.premise))]
        l1, l2, l3, l4 = [], [], [], []
        for i in range(len(self.label)):
            if self.label[i] == '1':
                l1.append(1), l2.append(0), l3.append(1), l4.append(0);
            else:
                l1.append(0), l2.append(1), l3.append(0), l4.append(1);
        return t1+t2+t3+t4, l1+l2+l3+l4
        
    def test_data(self):
        v1, v2 = [], []
        for i in range(len(self.ask_for)):
            if self.ask_for[i] == 'cause':
                v1.append(self.alternative1[i] + self.premise[i])
                v2.append(self.alternative2[i] + self.premise[i])
            else:
                v1.append(self.premise[i] + self.alternative1[i])
                v2.append(self.premise[i] + self.alternative2[i])
        return v1, v2, [int(l) for l in self.label]            

In [39]:
type(trainData.premise[0][0])

str

In [7]:
a = trainData.rawAlternative1[331]

In [36]:
nlp(a)

He put out his back.

In [10]:
from nltk.corpus import stopwords

In [12]:
stopWords = stopwords.words('english')

In [16]:
'back' in stopWords

False

In [57]:
nlp('ABC')

ABC

In [59]:
for i in nlp('AB-VC'):
    print(i.lemma_)

ab
-
vc


In [56]:
for i in nlp(a):
    print(i.lemma_, i.pos_, i.is_stop)
    

-PRON- PRON True
put VERB True
out PART True
-PRON- ADJ True
back NOUN True
. PUNCT False


In [None]:
spacy.tokens.token.Token, got str

In [None]:
tokens.token.Token

# Build Dataset

In [47]:
from keras.preprocessing.text import Tokenizer
import numpy as np
from nltk import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [62]:
trainData = Data(DEV_DATA_DIR)
valData = Data(DEV_DATA_DIR)
testData = Data(TEST_DATA_DIR)

xT, yT = trainData.train_data()
x1Val, x2Val, yVal = valData.test_data()
x1Test, x2Test, yTest = testData.test_data()

In [64]:
trainData.rawAlternative1[331]

'He put out his back.'

In [76]:
[i for i in enumerate(trainData.premise) if len(i[-1]) == 1]

[(432, ['cough'])]

In [66]:
trainData.alternative2[331]

['scratch', 'back']

In [67]:
x1Val[1]

['woman',
 'know',
 'friend',
 'go',
 'hard',
 'time',
 'woman',
 'tolerate',
 'friend',
 'difficult',
 'behavior']

In [68]:
x2Val[1]

['woman',
 'feel',
 'friend',
 'take',
 'advantage',
 'kindness',
 'woman',
 'tolerate',
 'friend',
 'difficult',
 'behavior']

## Tokenize Text

In [29]:
tok_sentWords = x1Val+x2Val+x1Test+x2Test+semevalSent
tokTexts = [' '.join(i) for i in tok_sentWords]
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(tokTexts)
word2index = tokenizer.word_index
index2word = {i: w for w, i in word2index.items()}
sentLens = np.array([len(i) for i in tok_sentWords])
print('Number of sentences: \t{:d}'.format(len(sentLens)))
print('Distribution of sentence lengths (number of words):')
print('Min: {:d}   Max: {:d}   Mean: {:.3f}   Med: {:.3f}'.format(np.min(sentLens), np.max(sentLens), np.mean(sentLens), np.median(sentLens)))
print('Found %s unique tokens.' % len(word2index))

Number of sentences: 	11495
Distribution of sentence lengths (number of words):
Min: 2   Max: 12   Mean: 7.054   Med: 7.000
Found 12887 unique tokens.


## Create Word Embeddings with GloVe

In [30]:
SEED = 42
VOCAB_SIZE = 12888
EMBEDDING_DIM = 300

In [31]:
wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [32]:
glove_n_symbols = 1917495
glove_index_dict = {}
glove_embedding_weights = np.empty((glove_n_symbols, EMBEDDING_DIM))
globale_scale = 0.1
with open('/Users/lizhn7/Downloads/DATA/glove/glove.42B.300d.txt', 'r') as fp:
    index = 0
    for l in tqdm(fp):
        l = l.strip().split()
        word = l[0]
        glove_index_dict[word] = index
        glove_embedding_weights[index, :] = [float(n) for n in l[1:]]
        index += 1
glove_embedding_weights *= globale_scale

# Generate random embedding with same scale as glove
np.random.seed(SEED)
shape = (VOCAB_SIZE, EMBEDDING_DIM)
scale = glove_embedding_weights.std() * np.sqrt(12) / 2 
embedding = np.random.uniform(low=-scale, high=scale, size=shape)

# Copy from glove weights of words that appear in index2word
count = 0 
for i in tqdm(range(1, VOCAB_SIZE)):
    w = index2word[i]
    g = glove_index_dict.get(w)
    if g is None:
        ww = wnl.lemmatize(w)
        g = glove_index_dict.get(ww)
    if g is None:
        ww = porter.stem(w)
        g = glove_index_dict.get(ww)
    if g is None:
        ww = lancaster.stem(w)
        g = glove_index_dict.get(ww)
    if g is None:
        ww = w[:-1]
        glove_index_dict.get(ww)
    if g is None:
        ww = w[:-2]
        glove_index_dict.get(ww)
    if g is None:
        ww = w[:-3]
        glove_index_dict.get(ww)
    if g is not None:
        embedding[i, :] = glove_embedding_weights[g, :]
        count += 1
print('{num_tokens}-{per:.2f}% tokens in vocab found in glove and copied to embedding.'.format(num_tokens=count, per=count/float(VOCAB_SIZE)*100))

1917495it [03:00, 10608.39it/s]
100%|██████████| 12887/12887 [00:00<00:00, 80132.59it/s]

12710-98.62% tokens in vocab found in glove and copied to embedding.





## Split Data

In [33]:
MAX_LEN = 12

In [34]:
x = [[word2index[w] for w in s] for s in semevalSent]
x1Val = [[word2index[w] for w in s] for s in x1Val]
x2Val = [[word2index[w] for w in s] for s in x2Val]
x1Test = [[word2index[w] for w in s] for s in x1Test]
x2Test = [[word2index[w] for w in s] for s in x2Test]

x = pad_sequences(x, maxlen=MAX_LEN, padding='post', truncating='post')
x1Val = pad_sequences(x1Val, maxlen=MAX_LEN, padding='post', truncating='post')
x2Val = pad_sequences(x2Val, maxlen=MAX_LEN, padding='post', truncating='post')
x1Test = pad_sequences(x1Test, maxlen=MAX_LEN, padding='post', truncating='post')
x2Test = pad_sequences(x2Test, maxlen=MAX_LEN, padding='post', truncating='post')

y = np.array(semevalLabel)
yVal = np.array(yVal)
yTest = np.array(yTest)

xTrain, _, yTrain, _ = train_test_split(x, y, test_size=0., random_state=SEED)

In [35]:
sum(y), len(y)

(1148, 9495)

# Save Data

In [37]:
import pickle
import h5py

In [38]:
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/index.pkl', 'wb') as fp:
    pickle.dump((word2index, index2word), fp, -1)

fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/embedding.h5', 'w')
fh['embedding'] = embedding
fh.close()

fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/train.h5', 'w')
fh['xTrain'] = xTrain
fh['yTrain'] = yTrain
fh.close()

fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/val.h5', 'w')
fh['x1Val'] = x1Val
fh['x2Val'] = x2Val
fh['yVal'] = yVal
fh.close()

fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/FINAL/test.h5', 'w')
fh['x1Test'] = x1Test
fh['x2Test'] = x2Test
fh['yTest'] = yTest
fh.close()

In [41]:
embedding.shape

(12888, 300)