# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev1 toc-item"><a href="#Build-Co-occurence-Matrix" data-toc-modified-id="Build-Co-occurence-Matrix-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build Co-occurence Matrix</a></div><div class="lev1 toc-item"><a href="#Train-Glove" data-toc-modified-id="Train-Glove-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train Glove</a></div><div class="lev1 toc-item"><a href="#COPA-Preprocess" data-toc-modified-id="COPA-Preprocess-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>COPA Preprocess</a></div>

# Load Data

In [2]:
import json
import os

In [3]:
DATA_DIR = '/Users/lizhn7/Desktop/ShotaSasaki-csmwp/data'

In [33]:
class Data:
    def __init__(self):
        self.copa = json.load(open(os.path.join(DATA_DIR, 'word_comb_file_with_mwp.json'), 'r'))
        self.cooccur = json.load(open(os.path.join(DATA_DIR, 'freq_dic/c_r_dic_with_mwp.json'), 'r'))
        self.cause = json.load(open(os.path.join(DATA_DIR, 'freq_dic/c_dic_with_mwp.json'), 'r'))
        self.effect = json.load(open(os.path.join(DATA_DIR, 'freq_dic/r_dic_with_mwp.json'), 'r'))
        self.vc = open(os.path.join(DATA_DIR, 'vocabulary/vocab_c_with_mwp.tsv')).read().split()
        self.ve = open(os.path.join(DATA_DIR, 'vocabulary/vocab_r_with_mwp.tsv')).read().split()

In [34]:
data = Data()

# Build Co-occurence Matrix

In [85]:
from collections import Counter
from tqdm import tqdm

In [45]:
counts = Counter(sum([i.split(':') for i in data.cooccur], []))
vocab = sorted(counts, key=counts.get, reverse=True)
vocab2int = {word: ii for ii, word in enumerate(vocab, 0)}
int2vocab = {i: w for w, i in vocab2int.items()}
causal_pair = [[vocab2int[i] for i in p] for p in [i.split(':') for i in data.cooccur]]
causal_freq = [data.cooccur.get(i) for i in data.cooccur]

In [71]:
c = list(zip(a, b))

In [83]:
c[123][0]

2326

In [86]:
cooccur_matrix = {i: {c[0][1]: float(c[1]) for c in list(zip(a, b)) if c[0][0] == i} for i in tqdm(range(len(vocab)))}

100%|██████████| 2887/2887 [00:22<00:00, 128.58it/s]


# Train Glove

In [94]:
import glove
import pickle
import h5py
import numpy as np

In [125]:
model = glove.Glove(cooccur_matrix, d=300, alpha=0.75, x_max=100.0)

for epoch in tqdm(range(100)):
    err = model.train(step_size=0.05, workers=9, batch_size=50)
    print("Epoch %d \t Error %.3f" % (epoch+1, err), flush=True)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1 	 Error 4.718
Epoch 2 	 Error 1.477


  2%|▏         | 2/100 [00:00<00:06, 16.05it/s]

Epoch 3 	 Error 0.854
Epoch 4 	 Error 0.586


  4%|▍         | 4/100 [00:00<00:05, 16.40it/s]

Epoch 5 	 Error 0.441
Epoch 6 	 Error 0.354


  6%|▌         | 6/100 [00:00<00:05, 16.24it/s]

Epoch 7 	 Error 0.299
Epoch 8 	 Error 0.260


  8%|▊         | 8/100 [00:00<00:05, 16.78it/s]

Epoch 9 	 Error 0.232
Epoch 10 	 Error 0.211


 10%|█         | 10/100 [00:00<00:05, 16.46it/s]

Epoch 11 	 Error 0.192
Epoch 12 	 Error 0.175


 12%|█▏        | 12/100 [00:00<00:05, 17.09it/s]

Epoch 13 	 Error 0.159
Epoch 14 	 Error 0.145


 14%|█▍        | 14/100 [00:00<00:04, 17.84it/s]

Epoch 15 	 Error 0.133
Epoch 16 	 Error 0.121
Epoch 17 	 Error 0.111


 17%|█▋        | 17/100 [00:00<00:04, 18.81it/s]

Epoch 18 	 Error 0.101
Epoch 19 	 Error 0.092


 19%|█▉        | 19/100 [00:01<00:04, 18.16it/s]

Epoch 20 	 Error 0.083
Epoch 21 	 Error 0.076


 21%|██        | 21/100 [00:01<00:04, 17.16it/s]

Epoch 22 	 Error 0.069
Epoch 23 	 Error 0.063


 23%|██▎       | 23/100 [00:01<00:04, 17.02it/s]

Epoch 24 	 Error 0.058
Epoch 25 	 Error 0.053
Epoch 26 	 Error 0.049


 26%|██▌       | 26/100 [00:01<00:04, 17.95it/s]

Epoch 27 	 Error 0.045
Epoch 28 	 Error 0.041


 28%|██▊       | 28/100 [00:01<00:03, 18.31it/s]

Epoch 29 	 Error 0.038
Epoch 30 	 Error 0.036
Epoch 31 	 Error 0.033


 31%|███       | 31/100 [00:01<00:03, 19.50it/s]

Epoch 32 	 Error 0.031
Epoch 33 	 Error 0.029
Epoch 34 	 Error 0.027


 34%|███▍      | 34/100 [00:01<00:03, 20.34it/s]

Epoch 35 	 Error 0.025
Epoch 36 	 Error 0.024
Epoch 37 	 Error 0.023


 37%|███▋      | 37/100 [00:01<00:03, 20.48it/s]

Epoch 38 	 Error 0.021
Epoch 39 	 Error 0.020
Epoch 40 	 Error 0.019


 40%|████      | 40/100 [00:02<00:02, 20.42it/s]

Epoch 41 	 Error 0.018
Epoch 42 	 Error 0.017
Epoch 43 	 Error 0.016


 43%|████▎     | 43/100 [00:02<00:02, 20.97it/s]

Epoch 44 	 Error 0.016
Epoch 45 	 Error 0.015
Epoch 46 	 Error 0.014


 46%|████▌     | 46/100 [00:02<00:02, 21.16it/s]

Epoch 47 	 Error 0.014
Epoch 48 	 Error 0.013
Epoch 49 	 Error 0.013


 49%|████▉     | 49/100 [00:02<00:02, 19.96it/s]

Epoch 50 	 Error 0.012
Epoch 51 	 Error 0.012
Epoch 52 	 Error 0.011


 52%|█████▏    | 52/100 [00:02<00:02, 19.10it/s]

Epoch 53 	 Error 0.011
Epoch 54 	 Error 0.010
Epoch 55 	 Error 0.010


 55%|█████▌    | 55/100 [00:02<00:02, 19.80it/s]

Epoch 56 	 Error 0.010
Epoch 57 	 Error 0.009
Epoch 58 	 Error 0.009


 58%|█████▊    | 58/100 [00:03<00:02, 20.45it/s]

Epoch 59 	 Error 0.009
Epoch 60 	 Error 0.008
Epoch 61 	 Error 0.008


 61%|██████    | 61/100 [00:03<00:01, 21.03it/s]

Epoch 62 	 Error 0.008
Epoch 63 	 Error 0.008
Epoch 64 	 Error 0.007


 64%|██████▍   | 64/100 [00:03<00:01, 21.19it/s]

Epoch 65 	 Error 0.007
Epoch 66 	 Error 0.007
Epoch 67 	 Error 0.007


 67%|██████▋   | 67/100 [00:03<00:01, 21.05it/s]

Epoch 68 	 Error 0.007
Epoch 69 	 Error 0.006
Epoch 70 	 Error 0.006


 70%|███████   | 70/100 [00:03<00:01, 20.03it/s]

Epoch 71 	 Error 0.006
Epoch 72 	 Error 0.006
Epoch 73 	 Error 0.006


 73%|███████▎  | 73/100 [00:03<00:01, 18.57it/s]

Epoch 74 	 Error 0.006
Epoch 75 	 Error 0.005


 75%|███████▌  | 75/100 [00:03<00:01, 17.77it/s]

Epoch 76 	 Error 0.005
Epoch 77 	 Error 0.005
Epoch 78 	 Error 0.005


 78%|███████▊  | 78/100 [00:04<00:01, 18.91it/s]

Epoch 79 	 Error 0.005
Epoch 80 	 Error 0.005
Epoch 81 	 Error 0.005


 81%|████████  | 81/100 [00:04<00:00, 19.91it/s]

Epoch 82 	 Error 0.005
Epoch 83 	 Error 0.004
Epoch 84 	 Error 0.004


 84%|████████▍ | 84/100 [00:04<00:00, 20.37it/s]

Epoch 85 	 Error 0.004
Epoch 86 	 Error 0.004
Epoch 87 	 Error 0.004


 87%|████████▋ | 87/100 [00:04<00:00, 19.67it/s]

Epoch 88 	 Error 0.004
Epoch 89 	 Error 0.004


 89%|████████▉ | 89/100 [00:04<00:00, 19.54it/s]

Epoch 90 	 Error 0.004
Epoch 91 	 Error 0.004


 91%|█████████ | 91/100 [00:04<00:00, 19.19it/s]

Epoch 92 	 Error 0.004
Epoch 93 	 Error 0.004


 93%|█████████▎| 93/100 [00:04<00:00, 18.70it/s]

Epoch 94 	 Error 0.004
Epoch 95 	 Error 0.003


 95%|█████████▌| 95/100 [00:04<00:00, 18.57it/s]

Epoch 96 	 Error 0.003
Epoch 97 	 Error 0.003


 97%|█████████▋| 97/100 [00:05<00:00, 18.20it/s]

Epoch 98 	 Error 0.003
Epoch 99 	 Error 0.003


 99%|█████████▉| 99/100 [00:05<00:00, 18.67it/s]

Epoch 100 	 Error 0.003


100%|██████████| 100/100 [00:05<00:00, 19.27it/s]


In [117]:
causal_embedding = model.W

In [118]:
normed_embedding = causal_embedding/np.array([np.sqrt(np.dot(gweight, gweight)) for gweight in causal_embedding])[:, None] #1D -> 2D

In [119]:
def most_similar(w):
    v = normed_embedding[vocab2int[w]]
    sims = np.dot(normed_embedding, v)
    sort = sims.argsort()[::-1]
    sort = sort[sort > 0]
    return [(int2vocab[i],sims[i]) for i in sort[:10]]

In [197]:
most_similar('cast_a_shadow')

[('cast_a_shadow', 1.0),
 ('survive', 0.17208711702120535),
 ('daycare', 0.16283749348515927),
 ('cut_out', 0.15646058262898505),
 ('knock_over', 0.1544220281004646),
 ('grieve', 0.145969027078129),
 ('heartbeat', 0.14086283540459807),
 ('safely', 0.14018019831113687),
 ('elderly', 0.13976928333156818),
 ('nation', 0.1387353543701747)]

In [127]:
with open('/Users/lizhn7/Documents/Github/深度炼丹炉/COPA/Final/index.pkl', 'wb') as fp:
    pickle.dump((vocab2int, int2vocab), fp, -1)

In [128]:
fh = h5py.File('/Users/lizhn7/Documents/Github/深度炼丹炉/COPA/Final/causal_embedding.h5', 'w')
fh['causal_embedding'] = causal_embedding
fh.close()

# COPA Preprocess

In [44]:
import pickle

In [9]:
mwe = [i for i in sum([i.split(':') for i in data.cooccur], []) if '_' in i]

In [10]:
len(set(mwe))

290

In [11]:
cause = []
effect = []
for i in range(len(data.copa)):
    for j in [0, 1]:
        c = [i[0] for i in [i.split(':') for i in data.copa[i][j]]]
        cw = list(set(c))
        cw.sort(key=c.index)
        cause.append(cw)
    for j in [0, 1]:
        e = [i[1] for i in [i.split(':') for i in data.copa[i][j]]]
        ew = list(set(e))
        ew.sort(key=e.index)
        effect.append(ew)

In [12]:
[i[0] for i in enumerate(data.copa) if i[-1] == [[], []]]

[8, 370, 743]

In [13]:
word = set(sum([i.split(':') for i in sum(sum(data.copa, []), [])], []))

In [38]:
a = [i for i in word if '_' in i]
b = [i for i in data.cause if '_' in i]
c = [i for i in data.effect if '_' in i]
d = [i for i in data.vc if '_' in i]
e = [i for i in data.ve if '_' in i]

In [41]:
mwe = set(mwe + a + b + c + d + e)

In [45]:
with open('/Users/lizhn7/Documents/Github/深度炼丹炉/COPA/Final/mwe.pkl', 'wb') as fp:
    pickle.dump(mwe, fp, -1)