In [11]:
from nltk.corpus import wordnet
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import vsm
import data_loading
import nlu_utils

In [2]:
# Load the data
# `msr` is the project's data-loading wrapper; everything below is read through it.
msr = data_loading.MSR()
# Development-set problems. Later cells iterate it with `.iterrows()` and read the
# 'question', 'a)'..'e)' and 'answer' fields of each row.
dev = msr.dev()
# Word-word co-occurrence counts used as training data. The cell output shows that an
# existing matrix is loaded when one is available.
# NOTE(review): presumably `window` is the context size in tokens and `vocab_size` caps
# the vocabulary — confirm against data_loading.
gutenberg = msr.train_word_word_cooccurance(window=5, vocab_size=1000)

Loading existing co-occurance matrix


In [14]:
# Sanity check: (rows, columns) of the development set.
dev.shape

(624, 7)

In [3]:
# Calculate PPMI matrix
# The result is what PPMIModel looks association scores up in.
# NOTE(review): the call is `vsm.pmi` with default arguments; this assumes the default
# yields *positive* PMI — confirm against vsm.
guten_ppmi = vsm.pmi(gutenberg)

  expected = np.outer(row_totals, col_totals) / total


In [12]:
class PPMIModel:
    """Fill-in-the-blank model that ranks candidate words by summed PMI.

    For every candidate answer the blank in the question is replaced by the
    candidate, and the candidate's association score with every other
    (non-punctuation, non-whitespace) token of the sentence is summed. The
    candidate with the highest total wins.

    Args:
        corpus_pmi: Label-indexed score table queried as
            ``corpus_pmi.loc[row_word, column_word]`` (presumably a pandas
            DataFrame — confirm against the producer of the matrix).
        try_synonyms: When True, pairs missing from ``corpus_pmi`` are retried
            with alternate words obtained from ``nlu_utils.get_alternate_words``.
        verbose: When True, print which substitution (if any) was used.
    """

    # Placeholder that marks the missing word in a question.
    BLANK = '_____'

    def __init__(self, corpus_pmi, try_synonyms=False, verbose=False):
        self.corpus_pmi = corpus_pmi
        self.index_to_label = ['a', 'b', 'c', 'd', 'e']
        self.try_synonyms = try_synonyms
        self.verbose = verbose

    def answer(self, problem, try_synonyms=False):
        """Return the label ('a'..'e') of the best-scoring candidate.

        Args:
            problem: Mapping with a 'question' entry and one entry per
                candidate keyed '<label>)' (e.g. 'a)').
            try_synonyms: Unused; kept for backward compatibility. The
                behaviour is controlled by the ``try_synonyms`` value given
                to the constructor.

        Returns:
            The label of the first candidate with the maximal score.
        """
        question = problem['question']
        scores = [
            self.score(question, problem[label + ')'])
            for label in self.index_to_label
        ]
        return self.index_to_label[np.argmax(scores)]

    def approx_ppmi(self, proposal_token, proposal_synonyms, word_token):
        """Approximate a missing score by substituting alternate words.

        Tried in order, returning the first score found:
          1. an alternate of the proposal word paired with the context word,
          2. an alternate of the context word paired with the proposal word,
          3. an alternate of each.

        Args:
            proposal_token: Token of the candidate word (``norm_``/``text`` are read).
            proposal_synonyms: Alternate words for the candidate.
            word_token: Context token (``pos_``/``norm_``/``text`` are read).

        Returns:
            The first score found, or None when no combination is in the table.
        """
        pos = nlu_utils.spacy_to_wn_tag(word_token.pos_)
        # Both collections are walked more than once, so materialise them in
        # case a one-shot iterable was supplied.
        word_synonyms = list(nlu_utils.get_alternate_words(word_token.norm_, pos))
        proposal_synonyms = list(proposal_synonyms)

        # try matching using different versions of the proposal word
        for psyn in proposal_synonyms:
            score = self.ppmi(psyn, word_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym for proposal word: {} -> {}".format(proposal_token.text, psyn))
                return score
        # try matching using different versions of the non-proposal word
        for wsyn in word_synonyms:
            score = self.ppmi(wsyn, proposal_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym: {} -> {}".format(word_token.text, wsyn))
                return score
        # Next just try all combos. The lookup must pair the two *synonyms*;
        # pairing psyn with the original word would only repeat step 1.
        for psyn in proposal_synonyms:
            for wsyn in word_synonyms:
                score = self.ppmi(psyn, wsyn)
                if score is not None:
                    if self.verbose:
                        print("Used synonym: {} -> {} and {} -> {}".format(proposal_token.text, psyn, word_token.text, wsyn))
                    return score
        if self.verbose:
            print("UNABLE TO FIND ANY SYNONYMS IN VOCABULARY")
        return None

    def ppmi(self, proposal, word):
        """Look up the score for (proposal, word); None if either label is missing."""
        try:
            return self.corpus_pmi.loc[proposal, word]
        except KeyError:
            return None

    def substitute(self, sentence, proposal):
        """Return ``sentence`` with its blank replaced by ``proposal``.

        The sentence is split on whitespace and re-joined with single spaces.
        A stand-alone blank is preferred; if there is none, the first word
        that contains the blank (e.g. ``'_____.'``) has it replaced.

        Raises:
            ValueError: If the sentence contains no blank at all.
        """
        sentence_list = sentence.split()
        try:
            i = sentence_list.index(self.BLANK)
        except ValueError:
            # Blank glued to punctuation: replace it inside the first word holding it.
            for i, word in enumerate(sentence_list):
                if self.BLANK in word:
                    sentence_list[i] = word.replace(self.BLANK, proposal, 1)
                    break
            else:
                raise ValueError("no blank ({!r}) found in sentence: {!r}".format(self.BLANK, sentence))
        else:
            sentence_list[i] = proposal
        return ' '.join(sentence_list)

    def score(self, sentence, proposal):
        """Sum the candidate's scores against every other token of the sentence.

        Pairs with no score (even after the optional synonym fallback)
        contribute 0.

        Args:
            sentence: Question text containing the blank.
            proposal: Candidate word to place in the blank.

        Returns:
            The accumulated score (0 when nothing could be scored).
        """
        full_sentence = self.substitute(sentence, proposal)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token = nlu_utils.get_token(doc, proposal)

        # Always bound, so the fallback below never sees an undefined name.
        synonyms = []
        if self.try_synonyms:
            pos = nlu_utils.spacy_to_wn_tag(proposal_token.pos_)
            synonyms = nlu_utils.get_alternate_words(proposal_token.norm_, pos)

        tot_score = 0
        for token in doc:
            # NOTE(review): relies on token equality identifying the proposal's
            # position in the doc (spaCy compares tokens by position, not by
            # object identity) — confirm for the installed spaCy version.
            if token == proposal_token:
                continue
            if token.is_punct or token.is_space:
                continue
            score = self.ppmi(proposal_token.norm_, token.norm_)
            if score is None and self.try_synonyms:
                score = self.approx_ppmi(proposal_token, synonyms, token)
            if score is not None:
                tot_score += score
        return tot_score

In [None]:
# Answer every dev problem, echoing every 25th one (question, full row, prediction)
# as a progress / spot-check log.
model = PPMIModel(guten_ppmi, try_synonyms=True, verbose=False)
print("Making predictions")
predictions = []
for i, (_, problem) in enumerate(dev.iterrows()):
    ans = model.answer(problem)
    predictions.append(ans)
    if i % 25 != 0:
        continue
    print(
        "--------------------------",
        problem['question'],
        problem,
        ans,
        sep="\n",
    )
        

Making predictions
--------------------------
His hair and whiskers were shot with gray , and his face was all crinkled and _____ like a withered apple.
question    His hair and whiskers were shot with gray , an...
a)                                                 chattering
b)                                                picturesque
c)                                                    hopeful
d)                                                   puckered
e)                                                 glistening
answer                                                      d
Name: 97, dtype: object
a
--------------------------
He has been very _____ to us , and hardly a day has passed that he has not called at the Hall to see how we were getting on.
question    He has been very _____ to us , and hardly a da...
a)                                                  difficult
b)                                                  attentive
c)                                               

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/home/tromero1/anaconda2/envs/nlu/lib/python3.6/site-packages/tqdm/_tqdm.py", line 882, in __del__
    self.close()
  File "/home/tromero1/anaconda2/envs/nlu/lib/python3.6/site-packages/tqdm/_tqdm.py", line 1087, in close
    self._decr_instances(self)
  File "/home/tromero1/anaconda2/envs/nlu/lib/python3.6/site-packages/tqdm/_tqdm.py", line 439, in _decr_instances
    cls._instances.remove(instance)
  File "/home/tromero1/anaconda2/envs/nlu/lib/python3.6/_weakrefset.py", line 109, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x7fae36bda138; to 'tqdm' at 0x7fae24ab3e10>


--------------------------
During my long and _____ acquaintance with Mr. Sherlock Holmes I had never heard him refer to his relations , and hardly ever to his own early life.
question    During my long and _____ acquaintance with Mr....
a)                                                unalterable
b)                                                 inexorable
c)                                                  agreeable
d)                                                   intimate
e)                                                destructive
answer                                                      d
Name: 895, dtype: object
a
--------------------------
But a singular _____ brought us to a standstill.
question    But a singular _____ brought us to a standstill.
a)                                                 diligence
b)                                                  anecdote
c)                                                   fortune
d)                                         

In [None]:
# Fraction of dev problems whose predicted label equals the gold 'answer' column.
gold_labels = dev['answer']
print(accuracy_score(gold_labels, predictions))