In [5]:
from nltk.corpus import wordnet
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import vsm
import data_loading
import nlu_utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Load the data:
#   sat       - project dataset wrapper from data_loading
#   dev       - whatever split sat.dev() returns (presumably the SAT dev problems; confirm in data_loading)
#   gutenberg - word-word co-occurrence counts (5-token window, 10,000-word vocabulary).
#               load=True appears to reuse a previously saved matrix rather than recounting
#               (the cell output says an existing matrix was loaded) -- confirm in data_loading.
sat = data_loading.SAT()
dev = sat.dev()
gutenberg = sat.train_word_word_cooccurence(window=5, vocab_size=10000, load=True)

Loading existing co-occurence matrix


In [10]:
# Preview the first rows of the raw co-occurrence counts (rows and columns are vocabulary words).
gutenberg.head()

Unnamed: 0,the,and,of,to,gonna,in,i,he,was,have,...,strung,bounding,accomplishments,wee,inflict,denial,gratifying,arouse,clustered,acceptable
the,1150758,1546998,2303638,1025809,546682,857167,381270,431674,567239,459783,...,232,350,188,190,196,186,206,250,412,188
and,1546998,285101,732932,611510,521166,385879,282628,305280,309942,252834,...,182,232,198,168,102,112,106,122,212,112
of,2303638,732932,203382,372685,568559,353957,185758,187258,247891,205648,...,92,84,206,76,78,198,160,90,148,52
to,1025809,611510,372685,180858,335890,209907,333130,281440,245218,282880,...,108,82,98,72,254,82,180,308,40,218
gonna,546682,521166,568559,335890,132512,307890,184868,199682,246220,192138,...,118,70,66,244,104,146,82,68,82,70


In [11]:
# Calculate PPMI matrix from the raw co-occurrence counts.
# NOTE(review): this relies on vsm.pmi clipping negative values to zero by default
# (the preview below shows no negative entries) -- confirm against vsm.pmi's signature.
guten_ppmi = vsm.pmi(gutenberg)

In [12]:
# Preview the first rows of the reweighted (PPMI) matrix.
guten_ppmi.head()

Unnamed: 0,the,and,of,to,gonna,in,i,he,was,have,...,strung,bounding,accomplishments,wee,inflict,denial,gratifying,arouse,clustered,acceptable
the,0.0,0.095674,0.650401,0.0,0.0,0.272996,0.0,0.0,0.047458,0.0,...,0.052528,0.471021,0.0,0.0,0.0,0.0,0.0,0.106221,0.654601,0.0
and,0.095674,0.0,0.016762,0.0,0.009134,0.0,0.0,0.0,0.0,0.0,...,0.32135,0.571378,0.417991,0.315588,0.0,0.0,0.0,0.0,0.501717,0.0
of,0.650401,0.016762,0.0,0.0,0.252723,0.056644,0.0,0.0,0.0,0.0,...,0.0,0.0,0.614153,0.0,0.0,0.56721,0.344049,0.0,0.298896,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.088823,0.053915,0.0,0.109742,...,0.048194,0.0,0.0,0.0,0.886758,0.0,0.553999,1.075133,0.0,0.744425
gonna,0.0,0.009134,0.252723,0.0,0.0,0.250567,0.0,0.0,0.214365,0.0,...,0.377937,0.0,0.0,1.1787,0.235004,0.595905,0.008951,0.0,0.041759,0.0


In [None]:
class PPMIModel:
    """Pick sentence-completion answers by summing PPMI association scores.

    Each candidate is substituted into the blank of the question and scored as
    the sum of the PPMI values between the candidate word and every other
    (non-punctuation, non-whitespace) token of the completed sentence. The
    candidate with the highest total wins.

    Parameters
    ----------
    corpus_pmi : pandas.DataFrame
        Word-by-word association matrix, looked up with
        ``corpus_pmi.loc[row_word, column_word]``.
    try_synonyms : bool, default True
        When a word pair is missing from ``corpus_pmi``, retry the lookup with
        alternate words supplied by ``nlu_utils.get_alternate_words``.
    verbose : bool, default False
        Print which substitution (if any) was used for a fallback lookup.
    """

    def __init__(self, corpus_pmi, try_synonyms=True, verbose=False):
        self.corpus_pmi = corpus_pmi
        self.try_synonyms = try_synonyms
        self.verbose = verbose

    def answer(self, problem):
        """Dispatch on ``problem['num_blanks']`` to the matching solver.

        One-blank problems go to ``answer1``; everything else goes to
        ``answer2``. Returns whatever the chosen solver returns.
        """
        n_blanks = problem['num_blanks']
        if n_blanks == 1:
            return self.answer1(problem)
        else: # n_blanks == 2
            return self.answer2(problem)

    def answer1(self, problem):
        """Solve a one-blank problem.

        Reads ``problem["question"]`` (the sentence containing the blank) and
        ``problem["candidates"]`` (the answer options), scores every candidate
        with ``score1`` and returns a one-element list holding the index of
        the best-scoring candidate (first one wins on ties, per ``np.argmax``).
        """
        # The accumulator must exist before the first score is recorded;
        # building the list in one expression guarantees that.
        scores = [self.score1(problem["question"], option1)
                  for option1 in problem["candidates"]]
        return [np.argmax(scores)]

    def answer2(self, problem):
        """Solve a two-blank problem (not implemented).

        ``answer`` routes every problem that does not have exactly one blank
        here. No two-blank strategy exists yet, so fail with an explicit
        message; subclasses may override this method.
        """
        raise NotImplementedError(
            "PPMIModel cannot answer problems with {} blanks yet".format(
                problem['num_blanks']))

    def approx_ppmi(self, proposal_token, proposal_synonyms, word_token):
        """Fallback PPMI lookup using alternate forms of either word.

        Tried in order, returning the first score found:
          1. each proposal synonym against the context word,
          2. each context-word synonym against the proposal word,
          3. each proposal synonym against each context-word synonym.

        Parameters
        ----------
        proposal_token, word_token
            Token-like objects exposing ``text``, ``norm_`` and ``pos_``.
        proposal_synonyms : iterable of str
            Alternate words for the proposal, computed once by the caller.

        Returns the score, or ``None`` when no combination is in the matrix.
        """
        pos = nlu_utils.spacy_to_wn_tag(word_token.pos_)
        word_synonyms = nlu_utils.get_alternate_words(word_token.norm_, pos)
        # try matching using different versions of the proposal word
        for psyn in proposal_synonyms:
            score = self.ppmi(psyn, word_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym for proposal word: {} -> {}".format(proposal_token.text, psyn))
                return score
        # try matching using different versions of the non-proposal word
        # NOTE(review): the synonym is used as the row label here and the
        # proposal as the column; that only matches the other lookups if the
        # matrix is symmetric -- confirm.
        for wsyn in word_synonyms:
            score = self.ppmi(wsyn, proposal_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym: {} -> {}".format(word_token.text, wsyn))
                return score
        # Next just try all combos: both words replaced by a synonym. The
        # lookup must use the pair (psyn, wsyn); looking up the unmodified
        # context word again would only repeat the first pass.
        for psyn in proposal_synonyms:
            for wsyn in word_synonyms:
                score = self.ppmi(psyn, wsyn)
                if score is not None:
                    if self.verbose:
                        print("Used synonym: {} -> {} and {} -> {}".format(proposal_token.text, psyn, word_token.text, wsyn))
                    return score
        if self.verbose:
            print("UNABLE TO FIND ANY SYNONYMS IN VOCABULARY")
        return None

    def ppmi(self, proposal, word):
        """Return ``corpus_pmi.loc[proposal, word]``, or ``None`` if either label is missing."""
        try:
            return self.corpus_pmi.loc[proposal, word]
        except KeyError:
            return None

    def substitute(self, sentence, proposal):
        """Replace the first ``'_____'`` blank in ``sentence`` with ``proposal``.

        The sentence is split on whitespace and re-joined with single spaces.
        A blank that is attached to punctuation (e.g. ``'_____,'``) is filled
        in place, keeping the punctuation. Raises ``ValueError`` when the
        sentence contains no blank.
        """
        blank = '_____'
        sentence_list = sentence.split()
        for i, piece in enumerate(sentence_list):
            if blank in piece:
                # Replace only the blank itself so adjacent punctuation survives.
                sentence_list[i] = piece.replace(blank, proposal, 1)
                return ' '.join(sentence_list)
        raise ValueError("no {!r} blank found in sentence: {!r}".format(blank, sentence))

    def score1(self, sentence, proposal):
        """Score one candidate word for a one-blank sentence.

        Fills the blank with ``proposal``, parses the result with
        ``nlu_utils.get_spacy_doc`` and sums the PPMI between the proposal
        token and every other token that is neither punctuation nor
        whitespace. Pairs missing from the matrix contribute 0 unless
        ``try_synonyms`` is set, in which case ``approx_ppmi`` is tried first.
        """
        full_sentence = self.substitute(sentence, proposal)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token = nlu_utils.get_token(doc, proposal)

        # Computed once per candidate; only consulted when try_synonyms is on.
        synonyms = []
        if self.try_synonyms:
            pos = nlu_utils.spacy_to_wn_tag(proposal_token.pos_)
            synonyms = nlu_utils.get_alternate_words(proposal_token.norm_, pos)

        tot_score = 0
        for token in doc:
            # Skip the proposal itself. Compare positions within the document
            # rather than relying on object identity or Token.__eq__.
            if token.i == proposal_token.i:
                continue
            if token.is_punct or token.is_space:
                continue
            score = self.ppmi(proposal_token.norm_, token.norm_)
            if score is None and self.try_synonyms:
                score = self.approx_ppmi(proposal_token, synonyms, token)
            tot_score += score if score is not None else 0
        return tot_score

In [17]:
# Tokenization experiment for two candidate blank placeholders: in the printed
# output "BLANK_0" stays a single token while "$BLANK1" is split into "$" and "BLANK1".
doc = nlu_utils.get_spacy_doc("The BLANK_0 of the scientist's rebuttal of the hypothesis was startling even in the notoriously $BLANK1 world of nineteenth-century geology.")
for token in doc:
    print(token)


The
BLANK_0
of
the
scientist
's
rebuttal
of
the
hypothesis
was
startling
even
in
the
notoriously
$
BLANK1
world
of
nineteenth
-
century
geology
.
