# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></div><div class="lev1 toc-item"><a href="#Word-Segmentation" data-toc-modified-id="Word-Segmentation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Word Segmentation</a></div><div class="lev1 toc-item"><a href="#Embedding" data-toc-modified-id="Embedding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Embedding</a></div><div class="lev1 toc-item"><a href="#Compute" data-toc-modified-id="Compute-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Compute</a></div>

#  Load Data

In [1]:
import json
import os

In [2]:
def load_data(path, name):
    """
    Load date from file
    """
    data = []
    input_file = os.path.join(path)
    with open(input_file) as f:
        lines = f.readlines()
    for line in lines:
        item = json.loads(line)
        data.append(item[name])
    return data

In [3]:
DATA_DIR = '/Users/lizhn7/Downloads/EXPERIMENT/COPA/LM/data/copa-test.json'

In [8]:
premise = load_data(DATA_DIR, 'premise')
alternative1 = load_data(DATA_DIR, 'alternative1')
alternative2 = load_data(DATA_DIR, 'alternative2')
label = [int(l) for l in load_data(DATA_DIR, 'most-plausible-alternative')]

# Word Segmentation

In [25]:
from nltk import regexp_tokenize

In [26]:
replDict = {"woman's": 'woman', "man's": 'man', "patient's": 'patient', "student's": 'student', "boy's": 'boy', 
            "friend's": 'friend', "enemy's": 'enemy', "parent's": 'parent', "humanitarian's": 'humanitarian', 
            "child's": 'child', "professor's": 'professor', "daughter's": 'daughter', "mother's": 'mother', 
            "children's": 'children', "teller's": 'teller', "company's": 'company', "group's": 'group', 
            "laptop's": 'laptop', "girl's": 'girl', "salesman's": 'salesman', "cook's": 'cook', "car's": 'car', 
            "offender's": 'offender', "detective's": 'detective', "librarian's": 'librarian', "caller's": 'caller', 
            "victim's": 'victim', "interviewer's": 'interviewer', "ship's": 'ship', "site's": 'site', 
            "chandelier's": 'chandelier', "bully's": 'bully', "river's": 'river', "puppy's": 'puppy', 
            "pilot's": 'pilot', "girlfriend's": 'girlfriend', "politician's": 'politician', "couple's": 'couple', 
            "son's": 'son', "actor's": 'actor', "neighbor's": 'neighbor', "nation's": 'nation', 
            "classmate's": 'classmate', "businessman's": 'businessman', "architect's": 'architect', 
            "imposter's": 'imposter', "kidnapper's": 'kidnapper', "colleague's": 'colleague', "flower's": 'flower',
            "bull's": 'bull', "employee's": 'employee', "team's": 'team', "other's": 'other', 
            "writer's": 'writer', "baby's": 'baby', "attacker's": 'attacker', "uncle's": 'uncle', "driver's": 'driver'}

In [27]:
def cut(s):
    """
    Word segmentation
    """
    pattern = r'''
              (?x)                   # set flag to allow verbose regexps 
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
              |\w+(?:[-&']\w+)*      # words w/ optional internal hyphens/apostrophe  
            '''  
    return regexp_tokenize(s, pattern=pattern)

def clean(s):
    """
    Clean data 
    """
    for i in range(len(s)):
        if s[i] == "couldn't":
            s[i] = 'could'
            s.insert(i+1, 'not')
        if s[i] == "wouldn't":
            s[i] = 'would'
            s.insert(i+1, 'not')
    s = [i for i in s if i != '']
    return [replDict.get(i.lower(), i.lower()) for i in s]

In [43]:
pWords = [clean(cut(s)) for s in premise]
a1Words = [clean(cut(s)) for s in alternative1]
a2Words = [clean(cut(s)) for s in alternative2]

# Embedding

In [77]:
import pickle
import h5py
import numpy as np

In [78]:
with open('/Users/lizhn7/Downloads/EXPERIMENT/COPA/Sentence_Classification_Glove/data/index.pkl', 'rb') as fp:
    word2index = pickle.load(fp)
    
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/RN/data/embedding.h5', 'r') as fh:
    embedding = fh['embedding'][:]

In [79]:
pSeq = [[word2index[w] for w in s] for s in pWords]
a1Seq = [[word2index[w] for w in s] for s in a1Words]
a2Seq = [[word2index[w] for w in s] for s in a2Words]

In [80]:
pEmb = np.array([[embedding[w] for w in s] for s in pSeq])
a1Emb = np.array([[embedding[w] for w in s] for s in a1Seq])
a2Emb = np.array([[embedding[w] for w in s] for s in a2Seq])

In [81]:
pEmb = [sum(s)/len(s) for s in pEmb]
a1Emb = [sum(s)/len(s) for s in a1Emb]
a2Emb = [sum(s)/len(s) for s in a2Emb]

# Compute

In [82]:
def cosine_similarity(x, y):
    """
    Compute_cosine_similarity
    """
    return np.dot(x, y) / (np.linalg.norm(x, 2) * np.linalg.norm(y, 2))

In [88]:
len(pEmb)

500

In [89]:
def accuracy(p, a1, a2, actu, show=True):
    """
    Calculate Accuracy
    """
    p1 = []
    p2 = []
    for i in range(len(a1)):
        p1.append(cosine_similarity(p[i], a1[i]))
        p2.append(cosine_similarity(p[i], a2[i]))
    pred = []
    for i in range(len(p1)):
        if p1[i] <= p2[i]:
            pred.append(1)
        else:
            pred.append(2)
    S = sum([1 for i in range(len(pred)) if pred[i] == actu[i]])
    ACC = S / len(actu)
    if show:
        print('Accuracy: \t%.3f' % (ACC))
    return ACC

In [90]:
accuracy(pEmb, a1Emb, a2Emb, label)

Accuracy: 	0.510


0.51