In [31]:
from nltk.corpus import wordnet
import numpy as np
import os
import re
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import vsm
import data_loading
import nlu_utils


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
# Load the data
sat = data_loading.SAT()
dev = sat.dev()
gutenberg = sat.train_word_word_cooccurence(window=5, vocab_size=10000, load=True)

Loading existing co-occurence matrix


In [50]:
dev.head()

Unnamed: 0,Difficulty,URL,candidates,id,num_blanks,question,solution_index,source
210,1,,"[concur with, rescind, object to, repeal, disa...",,2,"In the United States, social activists who str...",1,SAT 8-10 Section 3 Q2
183,5,,"[incantatory, economical, disaffected, unstint...",,1,African American poet Lucille Clifton writes i...,1,SAT 8-01 Section 4 Q8
271,4,,"[sycophant, pedant, pundit, nemesis, polymath]",,1,Benjamin Franklin was renowned for being a BLA...,4,SAT 9-10 Section 3 Q6
121,5,,"[obscure, deferential, discriminating, sanctim...",,1,The judges for the chili competition were BLAN...,2,SAT 7-05 (Sat) Section 4 Q8
143,3,,"[invalidates, manifesting, disregards, invigor...",,2,Contemporary Inuit sculpture merges traditiona...,4,SAT 7-05 (Sun) Section 7 Q3


In [10]:
gutenberg.head()

Unnamed: 0,the,and,of,to,gonna,in,i,he,was,have,...,strung,bounding,accomplishments,wee,inflict,denial,gratifying,arouse,clustered,acceptable
the,1150758,1546998,2303638,1025809,546682,857167,381270,431674,567239,459783,...,232,350,188,190,196,186,206,250,412,188
and,1546998,285101,732932,611510,521166,385879,282628,305280,309942,252834,...,182,232,198,168,102,112,106,122,212,112
of,2303638,732932,203382,372685,568559,353957,185758,187258,247891,205648,...,92,84,206,76,78,198,160,90,148,52
to,1025809,611510,372685,180858,335890,209907,333130,281440,245218,282880,...,108,82,98,72,254,82,180,308,40,218
gonna,546682,521166,568559,335890,132512,307890,184868,199682,246220,192138,...,118,70,66,244,104,146,82,68,82,70


In [11]:
# Calculate PPMI matrix
guten_ppmi = vsm.pmi(gutenberg)

In [12]:
guten_ppmi.head()

Unnamed: 0,the,and,of,to,gonna,in,i,he,was,have,...,strung,bounding,accomplishments,wee,inflict,denial,gratifying,arouse,clustered,acceptable
the,0.0,0.095674,0.650401,0.0,0.0,0.272996,0.0,0.0,0.047458,0.0,...,0.052528,0.471021,0.0,0.0,0.0,0.0,0.0,0.106221,0.654601,0.0
and,0.095674,0.0,0.016762,0.0,0.009134,0.0,0.0,0.0,0.0,0.0,...,0.32135,0.571378,0.417991,0.315588,0.0,0.0,0.0,0.0,0.501717,0.0
of,0.650401,0.016762,0.0,0.0,0.252723,0.056644,0.0,0.0,0.0,0.0,...,0.0,0.0,0.614153,0.0,0.0,0.56721,0.344049,0.0,0.298896,0.0
to,0.0,0.0,0.0,0.0,0.0,0.0,0.088823,0.053915,0.0,0.109742,...,0.048194,0.0,0.0,0.0,0.886758,0.0,0.553999,1.075133,0.0,0.744425
gonna,0.0,0.009134,0.252723,0.0,0.0,0.250567,0.0,0.0,0.214365,0.0,...,0.377937,0.0,0.0,1.1787,0.235004,0.595905,0.008951,0.0,0.041759,0.0


In [46]:
class PPMIModel:
    def __init__(self, corpus_pmi, try_synonyms=True, verbose=False):
        self.corpus_pmi = corpus_pmi
        self.try_synonyms = try_synonyms
        self.verbose = verbose
    
    def answer(self, problem):
        n_blanks = problem['num_blanks']
        if n_blanks == 1:
            return self.answer1(problem)
        else: # n_blanks == 2
            return self.answer2(problem)
    
    def answer1(self, problem):
        scores = []
        for option in problem["candidates"]:
            scores += [self.score1(problem["question"], option)]
        return [np.argmax(scores)], scores
    
    def answer2(self, problem):
        scores = []
        for option in problem["candidates"]:
            cand0, cand1 = option.split(',')
            scores += [self.score2(problem["question"], cand0, cand1)]
        return [np.argmax(scores)], scores
    
    
    def score1(self, sentence, proposal):
        full_sentence = self.substitute1(sentence, proposal)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token = nlu_utils.get_token(doc, proposal)

        if self.try_synonyms:
            pos = nlu_utils.spacy_to_wn_tag(proposal_token.pos_)
            synonyms = nlu_utils.get_alternate_words(proposal_token.norm_, pos)

        tot_score = 0
        for token in doc:
            if token == proposal_token:  # !!! This is dubious (might be 'is', not ==)
                continue
            if token.is_punct or token.is_space:
                continue
            score = self.ppmi(proposal_token.norm_, token.norm_)
            if score is None and self.try_synonyms:
                score = self.approx_ppmi(proposal_token, synonyms, token)
            tot_score += score if score is not None else 0
        return tot_score
    
    
    def score2(self, sentence, proposal0, proposal1):
        full_sentence = self.substitute2(sentence, proposal0, proposal1)
        doc = nlu_utils.get_spacy_doc(full_sentence)
        _, proposal_token0 = nlu_utils.get_token(doc, proposal0)
        _, proposal_token1 = nlu_utils.get_token(doc, proposal1)

        synonyms0 = None
        synonyms1 = None
        if self.try_synonyms:
            pos0 = nlu_utils.spacy_to_wn_tag(proposal_token0.pos_)
            synonyms0 = nlu_utils.get_alternate_words(proposal_token0.norm_, pos0)
            
            pos1 = nlu_utils.spacy_to_wn_tag(proposal_token1.pos_)
            synonyms1 = nlu_utils.get_alternate_words(proposal_token1.norm_, pos1)

        tot_score = 0
        for proposal_token, synonyms in [(proposal_token0, synonyms0), (proposal_token1, synonyms1)]:
            for token in doc:
                if token == proposal_token0:  # !!! This is dubious (might be 'is', not ==)
                    continue
                if token == proposal_token1:
                    continue
                if token.is_punct or token.is_space:
                    continue
                score = self.ppmi(proposal_token.norm_, token.norm_)
                if score is None and self.try_synonyms:
                    score = self.approx_ppmi(proposal_token, synonyms, token)
                tot_score += score if score is not None else 0
            return tot_score

    
    def approx_ppmi(self, proposal_token, proposal_synonyms, word_token):
        pos = nlu_utils.spacy_to_wn_tag(word_token.pos_)
        word_synonyms = nlu_utils.get_alternate_words(word_token.norm_, pos)
        # try matching using different versions of the proposal word
        for psyn in proposal_synonyms:
            score = self.ppmi(psyn, word_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym for proposal word: {} -> {}".format(proposal_token.text, psyn))
                return score
        # try matching using different versions of the non-proposal word
        for wsyn in word_synonyms:
            score = self.ppmi(wsyn, proposal_token.norm_)
            if score is not None:
                if self.verbose:
                    print("Used synonym: {} -> {}".format(word_token.text, wsyn))
                return score
        # Next just try all combos
        for psyn in proposal_synonyms:
            for wsyn in word_synonyms:
                score = self.ppmi(psyn, word_token.norm_)
                if score is not None:
                    if self.verbose:
                        print("Used synonym: {} -> {} and {} -> {}".format(proposal_token.text, psyn, word_token.text, wsyn))
                    return score
        if self.verbose:
            print("UNABLE TO FIND ANY SYNONYMS IN VOCABULARY")
        return None

    def ppmi(self, proposal, word):
        try:
            return self.corpus_pmi.loc[proposal, word]
        except KeyError:
            return None

        
        
    def substitute1(self, sentence, proposal):
        new_sentence = re.sub('BLANK', proposal, sentence)
        if 'BLANK' in new_sentence:
            print (sentence)
            assert False
            
        return new_sentence
    
    
    
    def substitute2(self, sentence, proposal0, proposal1):
        new_sentence = re.sub('BLANK0', proposal0, sentence)
        new_sentence = re.sub('BLANK1', proposal1, new_sentence)
        if new_sentence == sentence:
            print (sentence)
            assert False
            
        return new_sentence

    
    
    
    
    
    

In [47]:
model = PPMIModel(guten_ppmi, try_synonyms=True, verbose=False)
print("Making predictions")
predictions = []
for i, (_, problem) in enumerate(dev.iterrows()):
    ans, scores = model.answer(problem)
    predictions.append(ans)
    if i % 25 == 0:
        print("------------------------------------------------------------------------------")
        print(problem['question'])
        print(problem)
        print(ans)
        print(scores)

Making predictions
------------------------------------------------------------------------------
In the United States, social activists who strongly BLANK0 a particular law can attempt to obtain a constitutional amendment to BLANK1 it.
Difficulty                                                        1
URL                                                                
candidates        [concur with, rescind, object to, repeal, disa...
id                                                             None
num_blanks                                                        2
question          In the United States, social activists who str...
solution_index                                                  [1]
source                                        SAT 8-10 Section 3 Q2
Name: 210, dtype: object
[1]
[5.505992992580338, 8.49334436989775, 5.63527103985558, 4.649730133904396, 6.7648380405032675]
------------------------------------------------------------------------------
No BLANK the cas

In [51]:
print(accuracy_score(dev.loc[:, 'solution_index'], predictions))

0.28865979381443296


In [80]:
giga = data_loading.GIGA()
nyt = giga.train_word_word_cooccurence(name='nyt', window=5, vocab_size=30000, load=True)

100%|██████████| 1/1 [00:00<00:00, 48.14it/s]
100%|██████████| 1/1 [00:00<00:00,  9.78it/s]

Loading vocab


NEW MAGAZINES ARE STARTING TO FLOW OUT OF TIME INC




New publications are coming from all directions at Time Inc
Indeed the companys new approach to start-ups has become a sort
of East Coast-West Coast shootout over which part of Time Inc is
developing magazines more aggressively


Two of the new publications In Style and Makeover are spinoffs
of People magazine In May the company gave the go-ahead to In
Style a celebrity magazine after a market test of three issues
And three weeks ago the company published Makeover as a
mass-market magazine for women although its future depends on the
success of its first issue


Another Time Inc magazine Money also has a spinoff Managing
Your Future a quarterly newsletter aimed at workers contributing
to 401k retirement plans The newsletter has seen its circulation
increase to 326000 from 80000 since the first issue in October


And this month the company will publish its second test issue
of Mouth 2 Mouth a magazine for teen-agers





Successfully saved co-occurence matrix


In [81]:
nyt.head()

Unnamed: 0,the,to,of,and,gonna,in,it,time,for,with,...,wire,racks,spray,mace,bring,guests,comatose,state,able,solid
the,12,19,24,23,13,23,6,7,10,14,...,0,0,1,1,0,0,0,0,0,0
to,19,2,5,8,9,3,2,3,3,2,...,0,0,1,1,1,1,1,1,1,1
of,24,5,4,12,13,5,1,5,0,1,...,0,0,0,0,0,1,1,1,1,0
and,23,8,12,2,4,8,4,4,0,3,...,1,1,0,0,0,0,1,1,1,1
gonna,13,9,13,4,2,4,4,3,6,3,...,0,0,0,0,0,1,1,1,1,0
