# A Baseline for the MSR dataset using gigaword and PPMI

In [1]:
from nltk.corpus import wordnet
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
import vsm
import data_loading


%load_ext autoreload
%autoreload 2

## Load the data

In [2]:
msr = data_loading.MSR()
dev = msr.dev()
dev.head()

Unnamed: 0_level_0,question,a),b),c),d),e),answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
97,"His hair and whiskers were shot with gray , an...",chattering,picturesque,hopeful,puckered,glistening,d
727,It was only after a painful and prolonged scen...,ejected,consumed,awakened,startled,softened,a
19,"I went up to the house with the _____ , and sa...",Moonstone,moonlight,empire,asylum,Inspector,e
846,Would she not have made an _____ queen.,oval,imaginary,enormous,admirable,amateur,d
903,"My clothes were all _____ with dew , and my co...",packed,charged,supplied,tattered,sodden,e


In [3]:
vsmdata = '../data/vsmdata'

In [4]:
giga5 = pd.read_csv(
    os.path.join(vsmdata, 'gigaword_window5-scaled.csv.gz'), index_col=0)

In [5]:
giga20 = pd.read_csv(
    os.path.join(vsmdata, 'gigaword_window20-flat.csv.gz'), index_col=0)

## Calculate PPMI matrix

In [6]:
giga5_pmi = vsm.pmi(giga5)

In [7]:
giga20_pmi = vsm.pmi(giga20)

In [8]:
giga20_pmi.head()

Unnamed: 0,!,"""",$,%,&AMP,&amp,',(,(8,),...,zagreb,zambia,zealand,zebra,zimbabwe,zinc,zloty,zombie,zone,zoo
!,5.841817,0.643911,0.0,3.185382,0.0,0.121197,0.880838,0.472697,0.364272,0.535979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.449382,0.0,0.045402
"""",0.643911,2.866995,0.0,0.0,0.0,1.181611,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.400573,0.0,0.0,0.0,1.642623,0.0,0.0
$,0.0,0.0,4.212617,0.784733,1.24755,0.808534,0.0,0.525137,0.666451,0.53002,...,0.0,0.0,0.0,0.401652,0.0,1.227381,0.509906,0.0,0.0,0.27382
%,3.185382,0.0,0.784733,8.278628,1.848348,1.247424,0.0,0.921025,0.806773,0.922745,...,0.0,0.0,0.26203,0.0,0.0,0.314046,0.0,0.0,0.0,0.0
&AMP,0.0,0.0,1.24755,1.848348,5.332599,0.0,0.431175,0.889812,0.933909,0.939811,...,0.0,0.0,0.0,0.013311,0.0,0.091865,0.0,0.0,0.0,0.0


In [9]:
giga20_pmi.loc["!", "oval"]

0.0

## PPMI Model
From Inkpen 2007

In [10]:
def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets("good"):
        for l in syn.lemmas():
            synonyms.append(l.name())
    return synonyms

In [11]:
class PPMIBaseline:
    def __init__(self, corpus_pmi):
        self.corpus_pmi = corpus_pmi
        self.index_to_label = ['a', 'b', 'c', 'd', 'e']
    
    def answer(self, problem):
        scores = []
        question = problem['question']
        scores.append(self.score(question, problem['a)'])) 
        scores.append(self.score(question, problem['b)'])) 
        scores.append(self.score(question, problem['c)'])) 
        scores.append(self.score(question, problem['d)'])) 
        scores.append(self.score(question, problem['e)']))
        return self.index_to_label[np.argmax(scores)]
    
    def ppmi(self, proposal, word):
        try:
            return self.corpus_pmi.loc[proposal, word]
        except KeyError:
            return None
    
    def score(self, sentence, proposal, try_synonyms=True):
        sentence = sentence.lower()
        score = 0
        synonyms = get_synonyms(proposal)
        for word in sentence.split():
            if word == '_____':
                continue
            s = self.ppmi(proposal, word)
            if s is None and try_synonyms:
                for syn in synonyms:
                    s = self.ppmi(syn, word)
                    if s is not None:
                        break 
            score += s if s is not None else 0
        return score

## Evaluation

In [12]:
model = PPMIBaseline(giga20_pmi)

predictions = []
for _, problem in dev.iterrows():
    ans = model.answer(problem)
    predictions.append(ans)

In [13]:
accuracy_score(dev.loc[:, 'answer'], predictions)

0.27403846153846156

Note: part of the reason this does so poorly is that the majority of the answers are not even in the gigaword vocabulary.
* Before adding synonym matching (giga20): 0.27884615384615385
* After adding synonym matching (giga20): 0.26442307692307693    :(

Thoughts on why:
Using synonyms gives more non-zero scores, but doesn't necessarily give more nonzero scores to the correct answer categories. Just the words that have common synonyms.