In [6]:
import numpy as np
import os
import pandas as pd
import vsm
import data_loading
import itertools
import utils
# from nltk.corpus import wordnet

In [7]:
gre = data_loading.GRE()
dev = gre.dev_sentence_completion()

In [8]:
vsmdata_home = '../data/vsmdata'
glove_home = os.path.join(vsmdata_home, 'glove.6B')

In [9]:
glove_lookup = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.300d.txt'))
glove_vocab = sorted(set(glove_lookup))
unk = utils.randvec(glove_lookup['the'].shape[0])

In [10]:
def get_word_vector(word, try_synonyms=False):
    if try_synonyms:
        return glove_lookup.get(word, None)
    return glove_lookup.get(word, unk)

In [11]:
def cosine_similarity(x, y):
    return np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))

In [20]:
def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return synonyms
    
class GloveBaseline:
    def __init__(self, try_synonyms=False):
        self.try_synonyms = try_synonyms
    
    def answer(self, problem):
        n_blanks = problem['num_blanks']
        if n_blanks == 1:
            return self.answer1(problem)
        elif n_blanks == 2:
            return self.answer2(problem)
        else:
            return self.answer3(problem)
    
    def answer1(self, problem):
        scores = []
        for option1 in problem["candidates"]:
            scores += [self.score1(problem["question"], option1)]
        return [np.argmax(scores)]
        
    def answer2(self, problem):
        scores = []
        combos = list(itertools.product(problem["candidates"], problem["candidates_2"]))
        for option1, option2 in combos:
            scores += [self.score2(problem["question"], option1, option2)]
        ans1, ans2 = combos[np.argmax(scores)]
        return [problem["candidates"].index(ans1), problem["candidates_2"].index(ans2)]
        
    def answer3(self, problem):
        scores = []
        combos = list(itertools.product(problem["candidates"], problem["candidates_2"], problem["candidates_3"]))
        for option1, option2, option3 in combos:
            scores += [self.score3(problem["question"], option1, option2, option3)]
        ans1, ans2, ans3 = combos[np.argmax(scores)]
        return [problem["candidates"].index(ans1), problem["candidates_2"].index(ans2), problem["candidates_3"].index(ans3)]
            
    def sim(self, proposal, word):
        return cosine_similarity(get_word_vector(proposal, self.try_synonyms), get_word_vector(word, self.try_synonyms))
        
    def score1(self, sentence, option1):
        sentence = sentence.lower()
        score = 0
        for word in sentence.split():
            if word == '$BLANK_0':
                continue
            score += self.sim(option1, word)
        return score
        
    def score2(self, sentence, option1, option2):
        sentence = sentence.lower()
        score = 0
        for word in sentence.split():
            if word == '$BLANK_0' or '$BLANK_1':
                continue
            score += self.sim(option1, word)
            score += self.sim(option2, word)
        score += self.sim(option1, option2)
        return score
    
    def score3(self, sentence, option1, option2, option3):
        sentence = sentence.lower()
        score = 0
        for word in sentence.split():
            if word == '$BLANK_0' or '$BLANK_1' or '$BLANK_2':
                continue
            score += self.sim(option1, word)
            score += self.sim(option2, word)
            score += self.sim(option3, word)
        score += self.sim(option1, option2)
        score += self.sim(option1, option3)
        score += self.sim(option2, option3)
        return score

In [18]:
model = GloveBaseline()
predictions = []
for _, problem in dev.iterrows():
    ans = model.answer(problem)
    predictions.append(ans)

In [19]:
def accuracy_score(predictions, dev):
    n_correct = 0.0
    for i, (_, problem) in enumerate(dev.iterrows()):
        print(problem["solution_index"], "or", problem["solution_indices"], "==", predictions[i])
        if problem["solution_index"] == predictions[i] or problem["solution_indices"] == predictions[i]:
            n_correct += 1
    return n_correct / len(predictions)
        
accuracy_score(predictions, dev)

[0] or nan == [0]
[3] or nan == [0]
nan or [0, 1, 1] == [0, 2, 1]
[3] or nan == [3]
[1] or nan == [3]
[0] or nan == [3]
nan or [0, 1] == [1, 1]
nan or [2, 1, 2] == [0, 2, 1]
nan or [2, 2] == [2, 0]
nan or [0, 2] == [1, 1]
nan or [0, 1] == [2, 0]


0.18181818181818182

## Results
* Glove300, synonyms=False: 0.18181818181818182
* Glove300, synonyms=True: 