# Multinomial Count (Syntax-Context)

## 0a. Coding Info

* **Corpus**
    * Brown
    * McRae Norms
* **Parser**
    * SpaCy
* **Algorithm**
    * Count-based Multinomial Bayesian Updating (Erk 2016)
* **Evaluation**
    * Precision @1/@5
    * Mean Average Precision (MAP) https://www.youtube.com/watch?v=pM6DJ0ZZee0&index=12&list=PLBv09BD7ez_6nqE9YU9bQXpjJ5jJ1Kgr9

## 0b. Packages

In [2]:
import numpy as np
from collections import defaultdict
from itertools import chain
from __future__ import division
from sklearn.metrics import average_precision_score

## 0c. Input Preparation

In [54]:
import cPickle

In [55]:
concepts,cpt2ft,cpt2ftprob,features = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/qmr.p",'rb'))
triples = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/BROWN/brown_triples.p",'rb'))

In [56]:
len(concepts)==len(set(concepts))

True

In [57]:
# normalize ftprobs
for cpt in concepts:
    cpt2ftprob[cpt] = list(np.array(cpt2ftprob[cpt]) / np.array(cpt2ftprob[cpt]).sum())

In [58]:
f2i = {f:i for i,f in enumerate(features)}

In [59]:
cpt2ftprob_dic = defaultdict(dict)
for cpt in concepts:
    cpt2ftprob_dic[cpt] = {ft:ftprob for ft,ftprob in zip(cpt2ft[cpt],cpt2ftprob[cpt])}

In [60]:
cpt2ftprob_dic['book']

{'found_in_libraries': 0.030837004405286337,
 'found_in_schools': 0.030837004405286337,
 'found_on_shelves': 0.083700440528634346,
 'has_a_hard_cover': 0.030837004405286337,
 'has_a_soft_cover': 0.030837004405286337,
 'has_authors': 0.088105726872246687,
 'has_information': 0.088105726872246687,
 'has_page_numbers': 0.088105726872246687,
 'has_pages': 0.088105726872246687,
 'has_pictures': 0.030837004405286337,
 'has_words_in_it': 0.088105726872246687,
 'made_of_paper': 0.083700440528634346,
 'tells_stories': 0.030837004405286337,
 'used_by_reading': 0.088105726872246687,
 'used_for_acquiring/storing_knowledge': 0.088105726872246687,
 'used_for_learning': 0.030837004405286337}

## 0f. General Purpose Functions

In [61]:
def normalize(arr):
    return arr / arr.sum()

def partition(l, k):
    """ args: list, # partitions.
        returns: a list of k partitions.
    """
    k = max(1, k)
    chunk_size = len(l)//k
    return [l[i:i+chunk_size] for i in xrange(0, len(l), chunk_size)]

## I. Preprocessing

### A. Mappings

* Verb-Role $\mapsto$ Concept List/Set Mapping
* Concept $\mapsto$ Verb-Role List/Set Mapping

In [62]:
def preproc_mapping(concepts, triples):
    """ args: concepts, features, concept->feature mapping, concept->P(feature) mapping, dependency triples.
        returns: verb-role->concept-set mapping.
    """
    concepts_set = set(concepts) # for fast lookup.
    vr2c = defaultdict(set)
    c2vr = defaultdict(list)
    for word,dep,head in triples:
        if word in concepts_set and (dep=='nsubj' or dep=='dobj'):
            vr2c[head+'-'+dep].add(word)
            c2vr[word].append(head+'-'+dep)
    return vr2c, c2vr

In [63]:
vr2c, c2vr = preproc_mapping(concepts, triples)

In [64]:
vrs = vr2c.keys() # global verb-role pair indexing.
vr2i = {vr:i for i,vr in enumerate(vrs)}
print len(vrs)

955


In [65]:
concepts_subset = c2vr.keys() # ORDER DOESN'T MATTER
print len(concepts_subset)

311


### B. Train-Test Split

In [66]:
def preproc_split(concepts, k=5):
    """ arg: concepts, cv numbers.
        returns: k .8/.2 train-test splits.
    """
    # random.shuffle(concepts) # TODO: messing up with the indexing. will fix later.
    concept_chunks = partition(concepts, k)
    train_test_splits = []
    for i in range(k):
        train_cpts = list(chain.from_iterable([concept_chunk for j,concept_chunk in enumerate(concept_chunks)
                                               if j!=i]))
        test_cpts = concept_chunks[i]
        train_test_splits.append((train_cpts,test_cpts))
    return train_test_splits

In [67]:
train_test_splits = preproc_split(concepts_subset)

In [68]:
print train_test_splits[0][0]; print
print train_test_splits[0][1]

[u'wagon', u'bat', u'clam', u'peacock', u'tongs', u'level', u'cigar', u'bracelet', u'gun', u'pencil', u'swimsuit', u'axe', u'bag', u'microscope', u'hatchet', u'gloves', u'subway', u'rock', u'nightingale', u'mouse', u'garage', u'tray', u'trailer', u'lantern', u'turtle', u'butterfly', u'fence', u'cage', u'house', u'duck', u'crown', u'worm', u'lion', u'cellar', u'mirror', u'candle', u'marble', u'hammer', u'chicken', u'whip', u'barn', u'moth', u'peg', u'closet', u'clock', u'kettle', u'fox', u'revolver', u'vest', u'mug', u'stool', u'bureau', u'pistol', u'slingshot', u'knife', u'tent', u'cockroach', u'sheep', u'pepper', u'hose', u'umbrella', u'camel', u'pear', u'corn', u'cart', u'cork', u'guitar', u'spoon', u'grasshopper', u'sink', u'key', u'goldfish', u'comb', u'grenade', u'card', u'box', u'stone', u'drum', u'jet', u'cow', u'saddle', u'jeep', u'walrus', u'biscuit', u'anchor', u'magazine', u'cod', u'falcon', u'ox', u'coin', u'buckle', u'scarf', u'clamp', u'shovel', u'apple', u'spear', u'toil

** ================== NB: The objects above this line are globally referenced!! ==================**

## II. Learning

In [69]:
def to_ft_vec(cpt):
    """ args: a concept
        returns: a len(features) array of feature probs in numpy array.
    """
    return np.array([cpt2ftprob_dic[cpt][ft] if ft in cpt2ftprob_dic[cpt] else 0 for ft in features])

def learn(train_cpts):
    """ args: training concept set,
        returns: verb-role->len(features) feature weights vector in numpy array.
    """
    vr2ftdist = defaultdict(lambda : np.ones(len(features)))
    for vr in vrs:
        cpts = list(vr2c[vr])
        for cpt in filter(lambda cpt:cpt in train_cpts, cpts):
            vr2ftdist[vr] += to_ft_vec(cpt)
    return vr2ftdist

## III. Inference

In [70]:
def infer(train_test_splits):
    """ args: a list of train-test concept-list tuples.
        returns: concept->len(features) feature weights vector in numpy array.
    """
    cpt2ftdist = defaultdict(lambda : np.ones(len(features)))
    for train_cpts,test_cpts in train_test_splits:
        vr2ftdist = learn(train_cpts)
        for cpt in test_cpts:
            ftdist = np.ones(len(features))
            for vr in c2vr[cpt]:
                vr_ftdist = vr2ftdist[vr]
                for f_i in range(len(features)):
                    if vr_ftdist[f_i]>1: # means there's evidence at this feature.
                        ftdist[f_i] += vr_ftdist[f_i]
            cpt2ftdist[cpt] = ftdist # feature weights vector.
    return cpt2ftdist

In [71]:
%%time
cpt2ftdist = infer(train_test_splits)

CPU times: user 8.09 s, sys: 40.4 ms, total: 8.13 s
Wall time: 8.13 s


In [72]:
cpt2ftdist['book']

array([ 3.09892473,  8.97881891,  1.        , ...,  1.        ,
        1.        ,  1.        ])

## IV. Evaluation

In [73]:
def evaluate_prec(cpt2ftdist):
    precs_1, precs_5 = [], []
    for cpt in cpt2ftdist.iterkeys():
        true_fts = cpt2ft[cpt]
        pred_fts = map(lambda f_i:features[f_i], np.argsort(cpt2ftdist[cpt])[::-1][:5])
        precs_1.append(1 if pred_fts[0] in true_fts else 0)
        precs_5.append(sum(1 if pred_ft in true_fts else 0 for pred_ft in pred_fts) / 5)
    print "Average Prec @1: %.6f%%" % (np.mean(precs_1)*100)
    print "Average Prec @5: %.6f%%" % (np.mean(precs_5)*100)

In [74]:
evaluate_prec(cpt2ftdist)

Average Prec @1: 22.580645%
Average Prec @5: 14.967742%


In [75]:
def evaluate_map(cpt2ftdist):
    aps = []
    for cpt in cpt2ftdist.iterkeys():
        true_ftbin = np.array([1 if ft in cpt2ft[cpt] else 0 for ft in features])
        pred_ftdist = normalize(cpt2ftdist[cpt])
        aps.append(average_precision_score(true_ftbin, pred_ftdist))
    print "MAP: %.6f%%" % (np.mean(aps)*100)

In [76]:
evaluate_map(cpt2ftdist)

MAP: 11.844935%
