In [1]:
from nltk.corpus import wordnet
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import vsm
import data_loading
import nlu_utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the data
msr = data_loading.MSR()
dev = msr.dev()
gutenberg = msr.train_word_word_cooccurence(window=5, vocab_size=10000, load=True)

Loading existing co-occurence matrix


In [4]:
gutenberg.head()

Unnamed: 0,the,and,of,to,gonna,in,i,he,was,have,...,shouted.,organized,proposition,sustain,soldier.,mistress.,politeness,emperor's,"assistance,",fearfully
the,1135176,1500275,2286572,1012251,540124,838114,324499,413722,551703,450568,...,188,328,338,302,432,218,266,768,202,342
and,1500275,276919,712490,588648,504338,364917,243348,284888,294890,241676,...,68,186,100,164,112,82,242,160,238,264
of,2286572,712490,201441,366229,564927,347975,158080,178656,241087,200422,...,46,176,148,168,158,136,254,178,120,98
to,1012251,588648,366229,175760,332192,201193,284508,269588,238082,276274,...,38,84,238,400,88,144,176,100,266,66
gonna,540124,504338,564927,332192,131188,302648,157978,190644,240446,188484,...,48,172,190,128,264,76,86,58,64,80


In [5]:
# Calculate PPMI matrix
guten_ppmi = vsm.pmi(gutenberg)

In [6]:
guten_ppmi.head()

Unnamed: 0,the,and,of,to,gonna,in,i,he,was,have,...,shouted.,organized,proposition,sustain,soldier.,mistress.,politeness,emperor's,"assistance,",fearfully
the,0.0,0.121445,0.67024,0.0,0.0,0.290705,0.0,0.0,0.069809,0.0,...,0.0,0.242544,0.22311,0.089951,0.50136,0.0,0.0,1.090788,0.0,0.261282
and,0.121445,0.0,0.039136,0.0,0.025543,0.0,0.0,0.0,0.0,0.0,...,0.0,0.210216,0.0,0.01433,0.0,0.0,0.418647,0.057111,0.35876,0.53736
of,0.67024,0.039136,0.0,0.0,0.266382,0.07401,0.0,0.0,0.0,0.0,...,0.0,0.282343,0.059605,0.165817,0.157858,0.003439,0.594432,0.29111,0.0,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.092041,0.057488,0.0,0.096384,...,0.0,0.0,0.617152,1.115805,0.0,0.143086,0.31007,0.0,0.679863,0.0
gonna,0.0,0.025543,0.266382,0.0,0.0,0.266376,0.0,0.0,0.233549,0.0,...,0.0,0.591279,0.641343,0.225809,1.003138,0.0,0.0,0.0,0.0,0.0


In [9]:
class PPMIModel:
    def __init__(self, corpus_pmi, try_synonyms=True, verbose=False):
        self.corpus_pmi = corpus_pmi
        self.index_to_label = ['a', 'b', 'c', 'd', 'e']
        self.try_synonyms = try_synonyms
        self.verbose = verbose
    
    def answer(self, problem, try_synonyms=True):
        scores = []
        question = problem['question']
        scores.append(self.score(question, problem['a)'])) 
        scores.append(self.score(question, problem['b)'])) 
        scores.append(self.score(question, problem['c)'])) 
        scores.append(self.score(question, problem['d)'])) 
        scores.append(self.score(question, problem['e)']))
        return self.index_to_label[np.argmax(scores)], scores
    
    def approx_ppmi(self, proposal_token, proposal_synonyms, word_token):
        pos = nlu_utils.spacy_to_wn_tag(word_token.pos_)
        word_synonyms = nlu_utils.get_alternate_words(word_token.norm_, pos)
        # try matching using different versions of the proposal word
        for psyn in proposal_synonyms:
            score = self.ppmi(psyn, word_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym for proposal word: {} -> {}".format(proposal_token.text, psyn))
                return score
        # try matching using different versions of the non-proposal word
        for wsyn in word_synonyms:
            score = self.ppmi(wsyn, proposal_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym: {} -> {}".format(word_token.text, wsyn))
                return score
        # Next just try all combos
        for psyn in proposal_synonyms:
            for wsyn in word_synonyms:
                score = self.ppmi(psyn, word_token.norm_)
                if score is not None:
                    if self.verbose:
                        print("Used synonym: {} -> {} and {} -> {}".format(proposal_token.text, psyn, word_token.text, wsyn))
                    return score
        if self.verbose:
            print("UNABLE TO FIND ANY SYNONYMS IN VOCABULARY")
        return None

    def ppmi(self, proposal, word):
        try:
            return self.corpus_pmi.loc[proposal, word]
        except KeyError:
            return None

    def substitute(self, sentence, proposal):
        sentence_list = sentence.split()
        i = sentence_list.index('_____')
        sentence_list[i] = proposal
        return ' '.join(sentence_list)

    def score(self, sentence, proposal):
        full_sentence = self.substitute(sentence, proposal)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token = nlu_utils.get_token(doc, proposal)

        if self.try_synonyms:
            pos = nlu_utils.spacy_to_wn_tag(proposal_token.pos_)
            synonyms = nlu_utils.get_alternate_words(proposal_token.norm_, pos)

        tot_score = 0
        for token in doc:
            if token == proposal_token:  # !!! This is dubious (might be 'is', not ==)
                continue
            if token.is_punct or token.is_space:
                continue
            score = self.ppmi(proposal_token.norm_, token.norm_)
            if score is None and self.try_synonyms:
                score = self.approx_ppmi(proposal_token, synonyms, token)
            tot_score += score if score is not None else 0
        return tot_score

In [10]:
model = PPMIModel(guten_ppmi, try_synonyms=True, verbose=False)
print("Making predictions")
predictions = []
for i, (_, problem) in enumerate(dev.iterrows()):
    ans, scores = model.answer(problem)
    predictions.append(ans)
    if i % 25 == 0:
        print("------------------------------------------------------------------------------")
        print(problem['question'])
        print(problem)
        print(ans)
        print(scores)
        

Making predictions
--------------------------
His hair and whiskers were shot with gray , and his face was all crinkled and _____ like a withered apple.
question    His hair and whiskers were shot with gray , an...
a)                                                 chattering
b)                                                picturesque
c)                                                    hopeful
d)                                                   puckered
e)                                                 glistening
answer                                                      d
Name: 97, dtype: object
e
[8.41877136732423, 4.067458034711444, 3.382770316212859, 1.373420578134755, 15.855537113552309]
--------------------------
He has been very _____ to us , and hardly a day has passed that he has not called at the Hall to see how we were getting on.
question    He has been very _____ to us , and hardly a da...
a)                                                  difficult
b)             

--------------------------
Left his _____ at ten o'clock at night , and has not been heard of since.
question    Left his _____ at ten o'clock at night , and h...
a)                                                     defeat
b)                                                  impudence
c)                                                     finger
d)                                                   lodgings
e)                                                    affairs
answer                                                      d
Name: 222, dtype: object
d
[4.873234004086572, 3.0591862454641774, 4.727856785889174, 5.893753469216782, 2.4684052831182135]
--------------------------
They were the only signs of human life which I could see , save only those prehistoric _____ which lay thickly upon the slopes of the hills.
question    They were the only signs of human life which I...
a)                                                     forces
b)                                              

In [11]:
print(accuracy_score(dev.loc[:, 'answer'], predictions))

0.49198717948717946


## Results
* BASELINE: Window=5, Vocab=10000, synonyms=True: 0.482371794872
* Window=5, Vocab=10000, synonyms_POS=True, spacy.norm_=true: 0.49198717948717946