# Multinomial Count (Syntax-Context)

## 0a. Coding Info

* **Corpus**
    * Brown
    * McRae Norms
* **Parser**
    * SpaCy
* **Algorithm**
    * Count-based Multinomial Bayesian Updating (Erk 2016)
* **Evaluation**
    * Precision @1/@5
    * Mean Average Precision (MAP) https://www.youtube.com/watch?v=pM6DJ0ZZee0&index=12&list=PLBv09BD7ez_6nqE9YU9bQXpjJ5jJ1Kgr9

## 0b. Packages

In [97]:
import numpy as np
from collections import defaultdict
from itertools import chain
from __future__ import division
from sklearn.metrics import average_precision_score

## 0c. Input Preparation

In [2]:
import cPickle

In [3]:
concepts,cpt2ft,cpt2ftprob,features = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/mcare_norm_data.p",'rb'))
triples = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/BROWN/brown_triples.p",'rb'))

In [4]:
concepts = list(set(concepts))

In [98]:
f2i = {f:i for i,f in enumerate(features)}

In [99]:
cpt2ftprob_dic = defaultdict(dict)
for cpt in concepts:
    cpt2ftprob_dic[cpt] = {ft:ftprob for ft,ftprob in zip(cpt2ft[cpt],cpt2ftprob[cpt])}

In [100]:
cpt2ftprob_dic['book']

{'found_in_libraries': 0.074324324324324328,
 'found_in_schools': 0.0472972972972973,
 'found_on_shelves': 0.033783783783783786,
 'has_a_hard_cover': 0.067567567567567571,
 'has_a_soft_cover': 0.040540540540540543,
 'has_authors': 0.087837837837837843,
 'has_information': 0.033783783783783786,
 'has_page_numbers': 0.033783783783783786,
 'has_pages': 0.1554054054054054,
 'has_pictures': 0.033783783783783786,
 'has_words_in_it': 0.0945945945945946,
 'inbeh_-_tells_stories': 0.054054054054054057,
 'made_of_paper': 0.060810810810810814,
 'used_by_reading': 0.11486486486486487,
 'used_for_acquiring/storing_knowledge': 0.033783783783783786,
 'used_for_learning': 0.033783783783783786}

## 0f. General Purpose Functions

In [101]:
def normalize(arr):
    return arr / arr.sum()

def partition(l, k):
    """ args: list, # partitions.
        returns: a list of k partitions.
    """
    k = max(1, k)
    chunk_size = len(l)//k
    return [l[i:i+chunk_size] for i in xrange(0, len(l), chunk_size)]

## I. Preprocessing

### A. Mappings

* Verb-Role $\mapsto$ Concept List/Set Mapping
* Concept $\mapsto$ Verb-Role List/Set Mapping

In [146]:
def preproc_mapping(concepts, triples):
    """ args: concepts, features, concept->feature mapping, concept->P(feature) mapping, dependency triples.
        returns: verb-role->concept-set mapping.
    """
    concepts_set = set(concepts) # for fast lookup.
    vr2c = defaultdict(set)
    c2vr = defaultdict(list)
    for word,dep,head in triples:
        if word in concepts_set and (dep=='nsubj' or dep=='dobj'):
            vr2c[head+'-'+dep].add(word)
            c2vr[word].append(head+'-'+dep)
    return vr2c, c2vr

In [147]:
vr2c, c2vr = preproc_mapping(concepts, triples)

In [148]:
vrs = vr2c.keys() # global verb-role pair indexing.
vr2i = {vr:i for i,vr in enumerate(vrs)}
print len(vrs)

975


In [149]:
concepts_subset = c2vr.keys() # ORDER DOESN'T MATTER
print len(concepts_subset)

327


### B. Train-Test Split

In [134]:
def preproc_split(concepts, k=5):
    """ arg: concepts, cv numbers.
        returns: k .8/.2 train-test splits.
    """
    # random.shuffle(concepts) # TODO: messing up with the indexing. will fix later.
    concept_chunks = partition(concepts, k)
    train_test_splits = []
    for i in range(k):
        train_cpts = list(chain.from_iterable([concept_chunk for j,concept_chunk in enumerate(concept_chunks)
                                               if j!=i]))
        test_cpts = concept_chunks[i]
        train_test_splits.append((train_cpts,test_cpts))
    return train_test_splits

In [135]:
train_test_splits = preproc_split(concepts_subset)

In [136]:
print train_test_splits[0][0]; print
print train_test_splits[0][1]

[u'wagon', u'bat', u'clam', u'peacock', u'onion', u'level', u'cigar', u'bracelet', u'gun', u'glove', u'swimsuit', u'axe', u'bag', u'microscope', u'hatchet', u'subway', u'rock', u'nightingale', u'mouse', u'garage', u'tray', u'trailer', u'lantern', u'turtle', u'butterfly', u'fence', u'cage', u'house', u'duck', u'crown', u'tongs', u'worm', u'lion', u'trouser', u'cellar', u'mirror', u'candle', u'marble', u'hammer', u'chicken', u'whip', u'barn', u'moth', u'peg', u'closet', u'clock', u'kettle', u'fox', u'revolver', u'vest', u'mug', u'stool', u'bureau', u'pistol', u'pea', u'knife', u'tent', u'cockroach', u'sheep', u'pepper', u'hose', u'umbrella', u'pencil', u'camel', u'pear', u'corn', u'slingshot', u'cart', u'cork', u'guitar', u'sandal', u'spoon', u'grasshopper', u'bolt', u'key', u'goldfish', u'comb', u'grenade', u'card', u'box', u'stone', u'drum', u'jet', u'cow', u'saddle', u'jeep', u'walrus', u'biscuit', u'anchor', u'magazine', u'cod', u'falcon', u'ox', u'coin', u'buckle', u'scarf', u'sink'

** ================== NB: The objects above this line are globally referenced!! ==================**

## II. Learning

In [150]:
def to_ft_vec(cpt):
    """ args: a concept
        returns: a len(features) array of feature probs in numpy array.
    """
    return np.array([cpt2ftprob_dic[cpt][ft] if ft in cpt2ftprob_dic[cpt] else 0 for ft in features])

def learn(train_cpts):
    """ args: training concept set,
        returns: verb-role->len(features) feature weights vector in numpy array.
    """
    vr2ftdist = defaultdict(lambda : np.ones(len(features)))
    for vr in vrs:
        cpts = list(vr2c[vr])
        for cpt in filter(lambda cpt:cpt in train_cpts, cpts):
            vr2ftdist[vr] += to_ft_vec(cpt)
    return vr2ftdist

## III. Inference

In [173]:
def infer(train_test_splits):
    """ args: a list of train-test concept-list tuples.
        returns: concept->len(features) feature weights vector in numpy array.
    """
    cpt2ftdist = defaultdict(lambda : np.ones(len(features)))
    for train_cpts,test_cpts in train_test_splits:
        vr2ftdist = learn(train_cpts)
        for cpt in test_cpts:
            ftdist = np.ones(len(features))
            for vr in c2vr[cpt]:
                vr_ftdist = vr2ftdist[vr]
                for f_i in range(len(features)):
                    if vr_ftdist[f_i]>1: # means there's evidence at this feature.
                        ftdist[f_i] += vr_ftdist[f_i]
            cpt2ftdist[cpt] = ftdist # feature weights vector.
    return cpt2ftdist

In [174]:
%%time
cpt2ftdist = infer(train_test_splits)

CPU times: user 8.11 s, sys: 46.3 ms, total: 8.16 s
Wall time: 8.19 s


In [175]:
cpt2ftdist['book']

array([  3.1935453 ,  27.14836756,   1.        , ...,   1.        ,
         1.        ,   1.        ])

## IV. Evaluation

In [176]:
def evaluate_prec(cpt2ftdist):
    precs_1, precs_5 = [], []
    for cpt in cpt2ftdist.iterkeys():
        true_fts = cpt2ft[cpt]
        pred_fts = map(lambda f_i:features[f_i], np.argsort(cpt2ftdist[cpt])[::-1][:5])
        precs_1.append(1 if pred_fts[0] in true_fts else 0)
        precs_5.append(sum(1 if pred_ft in true_fts else 0 for pred_ft in pred_fts) / 5)
    print "Average Prec @1: %.6f%%" % (np.mean(precs_1)*100)
    print "Average Prec @5: %.6f%%" % (np.mean(precs_5)*100)

In [177]:
evaluate_prec(cpt2ftdist)

Average Prec @1: 21.538462%
Average Prec @5: 15.446154%


In [178]:
def evaluate_map(cpt2ftdist):
    aps = []
    for cpt in cpt2ftdist.iterkeys():
        true_ftbin = np.array([1 if ft in cpt2ft[cpt] else 0 for ft in features])
        pred_ftdist = normalize(cpt2ftdist[cpt])
        aps.append(average_precision_score(true_ftbin, pred_ftdist))
    print "MAP: %.6f%%" % (np.mean(aps)*100)

In [179]:
evaluate_map(cpt2ftdist)

MAP: 11.920285%
