In [1]:
from nltk.corpus import wordnet
import numpy as np
import os
import re
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import vsm
import data_loading
import nlu_utils


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the data
sat = data_loading.SAT()
dev = sat.dev()
dev.head()

Unnamed: 0,Difficulty,URL,candidates,id,num_blanks,question,solution_index,source
210,1,,"[concur with, rescind, object to, repeal, disa...",,2,"In the United States, social activists who str...",1,SAT 8-10 Section 3 Q2
183,5,,"[incantatory, economical, disaffected, unstint...",,1,African American poet Lucille Clifton writes i...,1,SAT 8-01 Section 4 Q8
271,4,,"[sycophant, pedant, pundit, nemesis, polymath]",,1,Benjamin Franklin was renowned for being a BLA...,4,SAT 9-10 Section 3 Q6
121,5,,"[obscure, deferential, discriminating, sanctim...",,1,The judges for the chili competition were BLAN...,2,SAT 7-05 (Sat) Section 4 Q8
143,3,,"[invalidates, manifesting, disregards, invigor...",,2,Contemporary Inuit sculpture merges traditiona...,4,SAT 7-05 (Sun) Section 7 Q3


In [5]:
giga = sat.train_word_word_cooccurence(window=5, vocab_size=15000, load=True)
giga.head()

Loading existing co-occurence matrix


Unnamed: 0,the,of,to,gonna,and,in,that,for,is,said,...,competitions,pitfalls,rebuffed,shuffle,sham,sweating,arabic,gte,niece,liberated
the,381695,703230,430971,254574,374314,429756,199226,180812,182038,122237,...,63,102,77,91,45,66,70,43,38,58
of,703230,56301,123879,216308,171127,137092,68649,55022,66483,44213,...,21,63,16,24,29,30,20,28,44,33
to,430971,123879,67819,168121,150230,107735,78527,60514,71422,43552,...,27,26,33,34,18,20,13,14,18,23
gonna,254574,216308,168121,56083,141958,156165,81791,85122,81302,51198,...,10,17,18,39,69,21,12,20,43,28
and,374314,171127,150230,141958,45313,128485,56921,63804,48494,26628,...,38,31,27,33,62,47,41,40,59,17


In [7]:
# Calculate PPMI matrix
ppmi = vsm.pmi(giga)
ppmi.head()

Unnamed: 0,the,of,to,gonna,and,in,that,for,is,said,...,competitions,pitfalls,rebuffed,shuffle,sham,sweating,arabic,gte,niece,liberated
the,0.0,0.452639,0.0,0.0,0.0,0.204499,0.0,0.065905,0.115311,0.0,...,0.0,0.462321,0.175213,0.364224,0.0,0.053162,0.145166,0.0,0.0,0.0
of,0.452639,0.0,0.0,0.191142,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.801615,0.0,0.0,0.0,0.085836,0.0,0.001668,0.498836,0.178093
to,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001575,0.0,...,0.0,0.0,0.149789,0.201599,0.0,0.0,0.0,0.0,0.0,0.0
gonna,0.0,0.191142,0.0,0.0,0.0,0.109679,0.007822,0.230017,0.22675,0.0,...,0.0,0.0,0.0,0.434411,0.957656,0.0,0.0,0.0,0.572199,0.110142
and,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.438433,0.223851,0.07976,0.302387,0.885715,0.666169,0.562758,0.489727,0.923567,0.0


In [9]:
class PPMIModel:
    def __init__(self, corpus_pmi, try_synonyms=True, verbose=False):
        self.corpus_pmi = corpus_pmi
        self.try_synonyms = try_synonyms
        self.verbose = verbose
    
    def answer(self, problem):
        n_blanks = problem['num_blanks']
        if n_blanks == 1:
            return self.answer1(problem)
        else: # n_blanks == 2
            return self.answer2(problem)
    
    def answer1(self, problem):
        scores = []
        for option in problem["candidates"]:
            scores += [self.score1(problem["question"], option)]
        return [np.argmax(scores)], scores
    
    def answer2(self, problem):
        scores = []
        for option in problem["candidates"]:
            cand0, cand1 = option.split(',')
            scores += [self.score2(problem["question"], cand0, cand1)]
        return [np.argmax(scores)], scores
    
    def score1(self, sentence, proposal):
        full_sentence = self.substitute1(sentence, proposal)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token = nlu_utils.get_token(doc, proposal)

        if self.try_synonyms:
            pos = nlu_utils.spacy_to_wn_tag(proposal_token.pos_)
            synonyms = nlu_utils.get_alternate_words(proposal_token.norm_, pos)

        tot_score = 0
        for token in doc:
            if token == proposal_token:
                continue
            if token.is_punct or token.is_space:
                continue
            score = self.ppmi(proposal_token.norm_, token.norm_)
            if score is None and self.try_synonyms:
                score = self.approx_ppmi(proposal_token, synonyms, token)
            tot_score += score if score is not None else 0
        return tot_score
    
    def score2(self, sentence, proposal0, proposal1):
        full_sentence = self.substitute2(sentence, proposal0, proposal1)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token0 = nlu_utils.get_token(doc, proposal0)
        _, proposal_token1 = nlu_utils.get_token(doc, proposal1)

        synonyms0 = None
        synonyms1 = None
        if self.try_synonyms:
            pos0 = nlu_utils.spacy_to_wn_tag(proposal_token0.pos_)
            synonyms0 = nlu_utils.get_alternate_words(proposal_token0.norm_, pos0)
            
            pos1 = nlu_utils.spacy_to_wn_tag(proposal_token1.pos_)
            synonyms1 = nlu_utils.get_alternate_words(proposal_token1.norm_, pos1)

        tot_score = 0
        for proposal_token, synonyms in [(proposal_token0, synonyms0), (proposal_token1, synonyms1)]:
            for token in doc:
                if token == proposal_token0:
                    continue
                if token == proposal_token1:
                    continue
                if token.is_punct or token.is_space:
                    continue
                score = self.ppmi(proposal_token.norm_, token.norm_)
                if score is None and self.try_synonyms:
                    score = self.approx_ppmi(proposal_token, synonyms, token)
                tot_score += score if score is not None else 0
            return tot_score
    
    def approx_ppmi(self, proposal_token, proposal_synonyms, word_token):
        pos = nlu_utils.spacy_to_wn_tag(word_token.pos_)
        word_synonyms = nlu_utils.get_alternate_words(word_token.norm_, pos)
        # try matching using different versions of the proposal word
        for psyn in proposal_synonyms:
            score = self.ppmi(psyn, word_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym for proposal word: {} -> {}".format(proposal_token.text, psyn))
                return score
        # try matching using different versions of the non-proposal word
        for wsyn in word_synonyms:
            score = self.ppmi(wsyn, proposal_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym: {} -> {}".format(word_token.text, wsyn))
                return score
        # Next just try all combos
        for psyn in proposal_synonyms:
            for wsyn in word_synonyms:
                score = self.ppmi(psyn, word_token.norm_)
                if score is not None:
                    if self.verbose:
                        print("Used synonym: {} -> {} and {} -> {}".format(proposal_token.text, psyn, word_token.text, wsyn))
                    return score
        if self.verbose:
            print("UNABLE TO FIND ANY SYNONYMS IN VOCABULARY")
        return None

    def ppmi(self, proposal, word):
        try:
            return self.corpus_pmi.loc[proposal, word]
        except KeyError:
            return None
        
    def substitute1(self, sentence, proposal):
        new_sentence = re.sub('BLANK', proposal, sentence)
        if 'BLANK' in new_sentence:
            print (sentence)
            assert False
            
        return new_sentence
    
    def substitute2(self, sentence, proposal0, proposal1):
        new_sentence = re.sub('BLANK0', proposal0, sentence)
        new_sentence = re.sub('BLANK1', proposal1, new_sentence)
        if new_sentence == sentence:
            print (sentence)
            assert False
            
        return new_sentence

In [15]:
model = PPMIModel(ppmi, try_synonyms=False, verbose=False)
print("Making predictions")
predictions = []
for i, (_, problem) in enumerate(dev.iterrows()):
    ans, scores = model.answer(problem)
    predictions.append(ans)
    if i % 50 == 0:
        print("------------------------------------------------------------------------------")
        print(problem['question'])
        print(problem)
        print(ans)
        print(scores)

Making predictions
------------------------------------------------------------------------------
In the United States, social activists who strongly BLANK0 a particular law can attempt to obtain a constitutional amendment to BLANK1 it.
Difficulty                                                        1
URL                                                                
candidates        [concur with, rescind, object to, repeal, disa...
id                                                             None
num_blanks                                                        2
question          In the United States, social activists who str...
solution_index                                                    1
source                                        SAT 8-10 Section 3 Q2
Name: 210, dtype: object
[2]
[0, 13.876409877668301, 16.47472762156393, 9.990117159522276, 0]
------------------------------------------------------------------------------
Ken took his BLANK obligations seriously, pati

In [16]:
print(accuracy_score(dev.loc[:, 'solution_index'], predictions))

0.18041237113402062
