In [1]:
from nltk.corpus import wordnet
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
import vsm
import data_loading

In [2]:
# Build the MSR data wrapper and fetch the word-word co-occurrence matrix,
# requested with window=5 and vocab_size=10000. The cell output below shows
# that a previously computed matrix is loaded when one already exists.
msr = data_loading.MSR()
M = msr.train_word_word_cooccurance(window=5, vocab_size=10000)

Loading existing co-occurance matrix


In [3]:
# Sanity check: the matrix should be square, vocab_size x vocab_size.
M.shape

(10000, 10000)

In [4]:
# Development split: later cells iterate over its rows as problems and read
# its 'answer' column as the gold labels.
dev = msr.dev()

In [5]:
# Scale M due to the power-law distribution of word occurrences.
# NOTE: this rebinds M, so the cell is not idempotent -- re-running it without
# first re-running the loading cell applies the log scaling a second time.
M = np.log(1 + M)

In [6]:
# Reduce the scaled co-occurrence matrix with vsm.lsa. The result is looked up
# by word via .loc[word, :] in later cells.
# NOTE(review): presumably a truncated SVD using the helper's default number of
# dimensions -- confirm in vsm.
M_lsa = vsm.lsa(M)

In [7]:
def cosine_similarity(x, y):
    """Return the cosine of the angle between vectors ``x`` and ``y``.

    Parameters
    ----------
    x, y : 1-D array-likes of equal length.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. When either vector has zero norm the
        cosine is undefined; 0.0 is returned instead of NaN so that the value
        can be summed and passed to ``argmax`` without poisoning the result.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning); treat a zero vector as
        # having no similarity to anything.
        return 0.0
    return np.dot(x, y) / denominator

In [8]:
# Sanity check: two related words should have a high cosine similarity.
king_vec = M_lsa.loc["king", :]
queen_vec = M_lsa.loc["queen", :]
cosine_similarity(king_vec, queen_vec)

0.89019982465138436

In [9]:
def get_synonyms(word):
    """Collect the lemma names of every WordNet synset of ``word``.

    Names are returned in synset order and are not de-duplicated, so the list
    may contain repeats. A word with no synsets yields an empty list.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

class LSABaseline:
    """Sentence-completion baseline scored by summed cosine similarity.

    For each candidate answer, the candidate's vector is compared with the
    vector of every word in the question sentence; the candidate with the
    largest total similarity is chosen.

    Parameters
    ----------
    word_vecs : table of word vectors indexed by word (queried with
        ``.loc[word, :]``). Defaults to the notebook-level ``M_lsa``.
    try_synonyms : bool
        When True, a similarity that cannot be computed for the candidate is
        retried with the candidate's WordNet synonyms (see ``totalsim``).
    """

    def __init__(self, word_vecs=M_lsa, try_synonyms=False):
        # Store the vectors that were actually passed in; previously the
        # argument was ignored and the global M_lsa was always used.
        self.word_vecs = word_vecs
        self.index_to_label = ['a', 'b', 'c', 'd', 'e']
        self.try_synonyms = try_synonyms

    def answer(self, problem, try_synonyms=False):
        """Return the label ('a'..'e') of the best-scoring candidate.

        ``problem`` must provide the keys 'question' and 'a)' .. 'e)'.
        ``try_synonyms`` is accepted for backward compatibility but is not
        used; the instance-level ``self.try_synonyms`` controls the fallback.
        """
        question = problem['question']
        # Candidate columns are named 'a)', 'b)', ... in the problem record.
        scores = [
            self.totalsim(question, problem[label + ')'])
            for label in self.index_to_label
        ]
        return self.index_to_label[np.argmax(scores)]

    def sim(self, proposal, word):
        """Cosine similarity of two words, or None if either is out of vocabulary."""
        try:
            return cosine_similarity(self.word_vecs.loc[proposal, :], self.word_vecs.loc[word, :])
        except KeyError:
            return None

    def totalsim(self, sentence, proposal):
        """Sum the similarity between ``proposal`` and every word of ``sentence``.

        The sentence is lower-cased and split on whitespace; the blank marker
        '_____' is skipped. Word pairs whose similarity cannot be computed
        contribute 0. With ``self.try_synonyms`` set, such pairs are first
        retried with each synonym of ``proposal`` until one yields a score.
        """
        sentence = sentence.lower()
        score = 0
        # Only query WordNet when the fallback is enabled; always bind the name
        # so the loop below never depends on a conditionally defined variable.
        synonyms = get_synonyms(proposal) if self.try_synonyms else []
        for word in sentence.split():
            if word == '_____':
                continue
            s = self.sim(proposal, word)
            if s is None and self.try_synonyms:
                for syn in synonyms:
                    s = self.sim(syn, word)
                    if s is not None:
                        break
            score += s if s is not None else 0
        return score

In [10]:
# Answer every problem in the dev split with the LSA baseline (synonym
# fallback disabled) and collect the predicted labels in row order.
model = LSABaseline(M_lsa, try_synonyms=False)
print("Making predictions")
predictions = [model.answer(problem) for _, problem in dev.iterrows()]

Making predictions


In [11]:
# Fraction of dev problems whose predicted label matches the gold answer.
gold_answers = dev.loc[:, 'answer']
dev_accuracy = accuracy_score(gold_answers, predictions)
print(dev_accuracy)

0.272435897436


## Results
* word-doc, vocab=30000, synonyms=False, dim=100, np.log(1 + M) scaling: 0.269230769231
* word-doc, vocab=30000, synonyms=True, dim=100, np.log(1 + M) scaling: 0.266025641026
* word-doc, vocab=60000, synonyms=False, dim=100, np.log(1 + M) scaling: 0.267628205128
* word-doc, vocab=60000, synonyms=True, dim=100, np.log(1 + M) scaling: 0.266025641026
* word-word, vocab=10000, synonyms=False, dim=100, np.log(1 + M) scaling: 0.272435897436
* For reference, the GloVe baseline scored with the same summed cosine-similarity method gets 0.304487179487