In [11]:
import utils
import os
import numpy as np
import data_loading
from sklearn.metrics import accuracy_score
from nltk.corpus import wordnet

In [2]:
# Build the project's MSR data wrapper and take its dev split. `dev` is
# iterated row by row for prediction and scored against its 'answer' column
# in the evaluation cells below.
msr = data_loading.MSR()
dev = msr.dev()

In [3]:
# Directory holding the GloVe 6B vector files. The path is relative, so it is
# resolved against the kernel's current working directory.
vsmdata_home = '../data/vsmdata'
glove_home = os.path.join(vsmdata_home, 'glove.6B')

In [4]:
# Read the 300-dimensional GloVe file into a word -> vector lookup
# (presumably a dict of numpy arrays; see utils.glove2dict to confirm).
glove_lookup = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.300d.txt'))
# Sorted list of every key in the lookup, i.e. every word that has a vector.
glove_vocab = sorted(set(glove_lookup))

In [5]:
# Shared fallback vector for out-of-vocabulary words, sized to match the
# length of the GloVe vector for 'the'.
# NOTE(review): utils.randvec presumably draws a random vector, and no seed is
# set anywhere in this notebook, so scores may differ between kernel runs --
# confirm and seed if reproducibility matters.
unk = utils.randvec(glove_lookup['the'].shape[0])

In [6]:
def get_word_vector(word, try_synonyms=False):
    """Look up the GloVe vector for `word`.

    Args:
        word: Token to look up in `glove_lookup`.
        try_synonyms: When true, a missing word yields None so the caller can
            fall back to a synonym; when false, a missing word yields the
            shared `unk` vector instead.

    Returns:
        The stored vector for `word`, or None / `unk` as described above when
        the word has no entry.
    """
    if not try_synonyms:
        # Default mode: unknown words silently map to the fallback vector.
        return glove_lookup.get(word, unk)
    # Synonym mode: report the miss explicitly with None.
    return glove_lookup.get(word)

In [7]:
def cosine_similarity(x, y):
    """Return the cosine of the angle between vectors `x` and `y`.

    Computed as the dot product divided by the product of the two Euclidean
    norms. No guard is applied for zero-length vectors, so a zero norm
    propagates whatever numpy produces for the division.
    """
    numerator = np.dot(x, y)
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    return numerator / denominator

In [8]:
# Sanity check: look up two related words and display their cosine
# similarity as the cell output.
mom = get_word_vector('mom')
dad = get_word_vector('dad')
cosine_similarity(mom, dad)

0.81658541566168974

In [13]:
def get_synonyms(word):
    """Collect the lemma names of every WordNet synset for `word`.

    Names come back in synset order, then lemma order within each synset.
    Nothing is filtered or de-duplicated, so the same name can appear more
    than once. Returns an empty list when WordNet has no synsets for the word.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

class GloveBaseline:
    """Multiple-choice sentence-completion baseline built on GloVe similarity.

    Each candidate word is scored by summing its cosine similarity to every
    word of the question (the blank placeholder excluded); the candidate with
    the highest total wins.

    With `try_synonyms` enabled, out-of-vocabulary words are not replaced by
    the shared unknown vector. Instead, a candidate without a vector is
    represented by its first synonym that has one, and question words without
    a vector contribute nothing to the score.
    """

    # Placeholder token that marks the gap in a question; it is never scored.
    BLANK = '_____'

    def __init__(self, try_synonyms=False):
        """Store the answer labels and the out-of-vocabulary strategy.

        Args:
            try_synonyms: Whether to fall back to synonyms (True) or to the
                shared unknown vector (False) for words without a vector.
        """
        self.index_to_label = ['a', 'b', 'c', 'd', 'e']
        self.try_synonyms = try_synonyms

    def answer(self, problem, try_synonyms=False):
        """Return the label ('a'..'e') of the best-scoring candidate.

        Args:
            problem: Mapping with a 'question' entry and one candidate word
                under each of the keys 'a)', 'b)', 'c)', 'd)' and 'e)'.
            try_synonyms: Ignored. Kept only so existing callers keep working;
                the setting given to the constructor is the one that applies.

        Returns:
            The label of the candidate with the highest total similarity.
            Ties go to the earliest label, as np.argmax returns the first
            maximum.
        """
        question = problem['question']
        scores = [
            self.totalsim(question, problem[label + ')'])
            for label in self.index_to_label
        ]
        return self.index_to_label[np.argmax(scores)]

    def sim(self, proposal, word):
        """Cosine similarity between two words, or None if either lacks a vector.

        None can only be returned when `self.try_synonyms` is true; otherwise
        missing words are mapped to the unknown vector by `get_word_vector`.
        """
        proposal_v = get_word_vector(proposal, self.try_synonyms)
        word_v = get_word_vector(word, self.try_synonyms)
        if word_v is None or proposal_v is None:
            return None
        return cosine_similarity(proposal_v, word_v)

    def _resolve_proposal(self, proposal):
        """Pick the word that stands in for `proposal` when scoring.

        Returns `proposal` itself when synonym fallback is off or the word has
        a vector; otherwise the first synonym that has a vector, or None when
        no synonym does. WordNet is only consulted in that last situation.
        """
        if not self.try_synonyms:
            return proposal
        if get_word_vector(proposal, True) is not None:
            return proposal
        for synonym in get_synonyms(proposal):
            if get_word_vector(synonym, True) is not None:
                return synonym
        return None

    def totalsim(self, sentence, proposal):
        """Sum the similarity of `proposal` to every word of `sentence`.

        The sentence is lower-cased and split on whitespace; the blank
        placeholder is skipped. Word pairs without a similarity (possible only
        in synonym mode) add nothing to the total.

        Args:
            sentence: Question text containing the blank placeholder.
            proposal: Candidate word to score.

        Returns:
            The summed cosine similarity, or 0 when nothing could be scored.
        """
        words = sentence.lower().split()
        # The stand-in depends only on the proposal, so it is resolved once
        # here rather than re-scanning the synonym list for every word.
        stand_in = self._resolve_proposal(proposal)
        score = 0
        if stand_in is None:
            return score
        for word in words:
            if word == self.BLANK:
                continue
            s = self.sim(stand_in, word)
            if s is not None:
                score += s
        return score

In [14]:
# Run the baseline (synonym fallback enabled) over every dev problem and
# collect the predicted labels in row order, ready for scoring.
model = GloveBaseline(try_synonyms=True)
print("Making predictions")
predictions = []
for _, problem in dev.iterrows():
    ans = model.answer(problem)
    predictions.append(ans)

Making predictions


In [15]:
# Fraction of dev problems whose predicted label matches the 'answer' column.
print(accuracy_score(dev.loc[:, 'answer'], predictions))

0.307692307692


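## Results
Accuracy of `GloveBaseline` on the MSR dev set, using the 300-dimensional GloVe 6B vectors:
* GloVe 300d, `try_synonyms=False`: 0.304487179487
* GloVe 300d, `try_synonyms=True`: 0.307692307692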