# Independent Beta Count (Syntax-Context)

## 0a. Coding Info

* **Corpus**
    * Brown
    * McRae Norms
* **Parser**
    * SpaCy
* **Algorithm**
    * Count-based Indep. Beta Bayesian Updating (Erk 2016)
* **Evaluation**
    * Precision @1/@5
    * Mean Average Precision (MAP) https://www.youtube.com/watch?v=pM6DJ0ZZee0&index=12&list=PLBv09BD7ez_6nqE9YU9bQXpjJ5jJ1Kgr9

## 0b. Packages

In [216]:
import numpy as np
from collections import defaultdict
from itertools import chain
from __future__ import division
from sklearn.metrics import average_precision_score

## 0c. Input Preparation

In [217]:
import cPickle

In [218]:
concepts,cpt2ft,cpt2ftprob,features = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/mcare_norm_data_bern.p",'rb'))
triples = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/BROWN/brown_triples.p",'rb'))

In [219]:
concepts = list(set(concepts))

In [220]:
f2i = {f:i for i,f in enumerate(features)}

In [221]:
cpt2ftprob_dic = defaultdict(dict)
for cpt in concepts:
    cpt2ftprob_dic[cpt] = {ft:ftprob for ft,ftprob in zip(cpt2ft[cpt],cpt2ftprob[cpt])}

In [222]:
cpt2ftprob_dic['book']

{'found_in_libraries': 0.36666666666666664,
 'found_in_schools': 0.23333333333333334,
 'found_on_shelves': 0.16666666666666666,
 'has_a_hard_cover': 0.33333333333333331,
 'has_a_soft_cover': 0.20000000000000001,
 'has_authors': 0.43333333333333335,
 'has_information': 0.16666666666666666,
 'has_page_numbers': 0.16666666666666666,
 'has_pages': 0.76666666666666672,
 'has_pictures': 0.16666666666666666,
 'has_words_in_it': 0.46666666666666667,
 'inbeh_-_tells_stories': 0.26666666666666666,
 'made_of_paper': 0.29999999999999999,
 'used_by_reading': 0.56666666666666665,
 'used_for_acquiring/storing_knowledge': 0.16666666666666666,
 'used_for_learning': 0.16666666666666666}

## 0f. General Purpose Functions

In [238]:
def normalize(arr):
    return arr / arr.sum()

def partition(l, k):
    """ args: list, # partitions.
        returns: a list of k partitions.
    """
    k = max(1, k)
    chunk_size = len(l)//k
    if len(l)%k==0:
        return [l[i:i+chunk_size] for i in xrange(0, len(l), chunk_size)]
    return [l[i:i+chunk_size] if idx<k-1 else l[i:] for idx,i in enumerate(xrange(0, len(l), chunk_size))][:-1]

## I. Preprocessing

### A. Mappings

* Verb-Role $\mapsto$ Concept List/Set Mapping
* Concept $\mapsto$ Verb-Role List/Set Mapping

In [240]:
def preproc_mapping(concepts, triples):
    """ args: concepts, features, concept->feature mapping, concept->P(feature) mapping, dependency triples.
        returns: verb-role->concept-set mapping.
    """
    concepts_set = set(concepts) # for fast lookup.
    vr2c = defaultdict(set)
    c2vr = defaultdict(list)
    for word,dep,head in triples:
        if word in concepts_set and (dep=='nsubj' or dep=='dobj'):
            vr2c[head+'-'+dep].add(word)
            c2vr[word].append(head+'-'+dep)
    return vr2c, c2vr

In [241]:
vr2c, c2vr = preproc_mapping(concepts, triples)

In [242]:
vrs = vr2c.keys() # global verb-role pair indexing.
vr2i = {vr:i for i,vr in enumerate(vrs)}
print len(vrs)

975


In [243]:
concepts_subset = c2vr.keys() # ORDER DOESN'T MATTER
print len(concepts_subset)

327


### B. Train-Test Split

In [245]:
def preproc_split(concepts, k=5):
    """ arg: concepts, cv numbers.
        returns: k .8/.2 train-test splits.
    """
    # random.shuffle(concepts) # TODO: messing up with the indexing. will fix later.
    concept_chunks = partition(concepts, k)
    train_test_splits = []
    for i in range(k):
        train_cpts = list(chain.from_iterable([concept_chunk for j,concept_chunk in enumerate(concept_chunks)
                                               if j!=i]))
        test_cpts = concept_chunks[i]
        train_test_splits.append((train_cpts,test_cpts))
    return train_test_splits

In [246]:
train_test_splits = preproc_split(concepts_subset)

In [248]:
print train_test_splits[0][0]; print
print train_test_splits[0][1]; print
print len(train_test_splits)

[u'wagon', u'bat', u'clam', u'peacock', u'onion', u'level', u'cigar', u'bracelet', u'gun', u'glove', u'swimsuit', u'axe', u'bag', u'microscope', u'hatchet', u'subway', u'rock', u'nightingale', u'mouse', u'garage', u'tray', u'trailer', u'lantern', u'turtle', u'butterfly', u'fence', u'cage', u'house', u'duck', u'crown', u'tongs', u'worm', u'lion', u'trouser', u'cellar', u'mirror', u'candle', u'marble', u'hammer', u'chicken', u'whip', u'barn', u'moth', u'peg', u'closet', u'clock', u'kettle', u'fox', u'revolver', u'vest', u'mug', u'stool', u'bureau', u'pistol', u'pea', u'knife', u'tent', u'cockroach', u'sheep', u'pepper', u'hose', u'umbrella', u'pencil', u'camel', u'pear', u'corn', u'slingshot', u'cart', u'cork', u'guitar', u'sandal', u'spoon', u'grasshopper', u'bolt', u'key', u'goldfish', u'comb', u'grenade', u'card', u'box', u'stone', u'drum', u'jet', u'cow', u'saddle', u'jeep', u'walrus', u'biscuit', u'anchor', u'magazine', u'cod', u'falcon', u'ox', u'coin', u'buckle', u'scarf', u'sink'

** ================== NB: The objects above this line are globally referenced!! ==================**

## II. Learning

In [249]:
def to_ft_vec(cpt):
    """ args: a concept
        returns: a len(features) array of feature probs in numpy array.
    """
    return np.array([cpt2ftprob_dic[cpt][ft] if ft in cpt2ftprob_dic[cpt] else 0 for ft in features],dtype=np.float32)

def learn_stc(train_cpts):
    
    vr2ftabs = defaultdict(lambda : np.array([[1,1] for _ in range(len(features))],dtype=np.float32))
    for vr in vrs:
        cpts = list(vr2c[vr])
        for cpt in filter(lambda cpt:cpt in train_cpts, cpts):
            cpt_ftprobs = to_ft_vec(cpt)
            for f_i in range(len(features)):
                if cpt_ftprobs[f_i]!=0:
                    vr2ftabs[vr][f_i] = [vr2ftabs[vr][f_i][0]+cpt_ftprobs[f_i], 
                                         vr2ftabs[vr][f_i][1]+(1-cpt_ftprobs[f_i])]
                else:
                    vr2ftabs[vr][f_i] = [vr2ftabs[vr][f_i][0], 
                                         vr2ftabs[vr][f_i][1]+1]                    
    return vr2ftabs

# def learn_bin(train_cpts):

#     vr2ftabs = defaultdict(lambda : np.array([[1,1] for _ in range(len(features))],dtype=np.float32))
#     for vr in vrs:
#         cpts = list(vr2c[vr])
#         for cpt in filter(lambda cpt:cpt in train_cpts, cpts):
#             cpt_ftprobs = to_ft_vec(cpt)
#             for f_i in range(len(features)):
#                 if cpt_ftprobs[f_i]!=0:
#                     vr2ftabs[vr][f_i] = [vr2ftabs[vr][f_i][0]+1, vr2ftabs[vr][f_i][0]] # if pos, alpha+1
#                 else:
#                     vr2ftabs[vr][f_i] = [vr2ftabs[vr][f_i][0], vr2ftabs[vr][f_i][1]+1] # if neg, beta+1
#     return vr2ftabs

In [188]:
# t = learn_bin(train_test_splits[0][0])

In [199]:
# for i,(a,b) in enumerate(t['feed-dobj']):
#     if a!=1:
#         print a,b,features[i]

In [250]:
t = learn_stc(train_test_splits[0][0])

In [252]:
# for i,(a,b) in enumerate(t['feed-dobj']):
#     print a,b,features[i]

## III. Inference

In [253]:
def infer_stc(train_test_splits):
    
    cpt2ftabs = defaultdict(lambda : np.array([[1,1] for _ in range(len(features))],dtype=np.float32))
    for train_cpts,test_cpts in train_test_splits:
        vr2ftabs = learn_stc(train_cpts)
        for cpt in test_cpts:
            ftabs = np.array([[1,1] for _ in range(len(features))],dtype=np.float32)
            for vr in c2vr[cpt]:
                vr_ftabs = vr2ftabs[vr]
                for f_i in range(len(features)):
                    ftabs[f_i] = [ftabs[f_i][0]+vr_ftabs[f_i][0],
                                  ftabs[f_i][1]+vr_ftabs[f_i][1]]
            cpt2ftabs[cpt] = ftabs
    return cpt2ftabs

# def infer_bin(train_test_splits):
    
#     cpt2ftabs = defaultdict(lambda : np.array([[1,1] for _ in range(len(features))],dtype=np.float32))
#     for train_cpts,test_cpts in train_test_splits:
#         vr2ftabs = learn_bin(train_cpts)
#         for cpt in test_cpts:
#             ftabs = np.array([[1,1] for _ in range(len(features))],dtype=np.float32)
#             for vr in c2vr[cpt]:
#                 vr_ftabs = vr2ftabs[vr]
#                 for f_i in range(len(features)):
#                     if not (vr_ftabs[f_i][0]==1 and vr_ftabs[f_i][1]==1): # means there's evidence at this feature.
#                         ftabs[f_i] = [ftabs[f_i][0]+vr_ftabs[f_i][0],
#                                       ftabs[f_i][1]+vr_ftabs[f_i][1]]                       
#             cpt2ftabs[cpt] = ftabs
#     return cpt2ftabs

In [254]:
%%time
cpt2ftabs_stc = infer_stc(train_test_splits)

CPU times: user 2min 20s, sys: 750 ms, total: 2min 20s
Wall time: 2min 21s


In [194]:
# %%time
# cpt2ftabs_bin = infer_bin(train_test_splits)

CPU times: user 2min 59s, sys: 1.41 s, total: 3min
Wall time: 3min 1s


In [206]:
# for i,(a,b) in enumerate(cpt2ftabs_bin['cat']):
#     if a>b:
#         print a,b, a/b, features[i]

In [200]:
# for i,(a,b) in enumerate(cpt2ftabs_bin['cat']):
#     if a<b:
#         print a,b, a/b, features[i]

In [201]:
# for i,(a,b) in enumerate(cpt2ftabs_stc['cat']):
#     if a>b:
#         print a,b, a/b, features[i]

In [202]:
# for i,(a,b) in enumerate(cpt2ftabs_stc['cat']):
#     if a<b:
#         print a,b, a/b, features[i]

In [255]:
def to_probdist(cpt2ftabs):
    cpt2ftdist = defaultdict(lambda : np.zeros(len(features)))
    for cpt in cpt2ftabs.iterkeys():
        cpt2ftdist[cpt] = np.array([cpt2ftabs[cpt][f_i][0]/(cpt2ftabs[cpt][f_i][0]+cpt2ftabs[cpt][f_i][1])
                                    for f_i in range(len(features))])
    return cpt2ftdist

In [256]:
cpt2ftdist_stc = to_probdist(cpt2ftabs_stc)

In [209]:
# cpt2ftdist_bin = to_probdist(cpt2ftabs_bin)

## IV. Evaluation

In [257]:
def evaluate_prec(cpt2ftdist):
    precs_1, precs_5 = [], []
    for cpt in cpt2ftdist.iterkeys():
        true_fts = cpt2ft[cpt]
        pred_fts = map(lambda f_i:features[f_i], np.argsort(cpt2ftdist[cpt])[::-1][:5])
        precs_1.append(1 if pred_fts[0] in true_fts else 0)
        precs_5.append(sum(1 if pred_ft in true_fts else 0 for pred_ft in pred_fts) / 5)
    print "Average Prec @1: %.6f%%" % (np.mean(precs_1)*100)
    print "Average Prec @5: %.6f%%" % (np.mean(precs_5)*100)

In [258]:
def evaluate_map(cpt2ftdist):
    aps = []
    for cpt in cpt2ftdist.iterkeys():
        true_ftbin = np.array([1 if ft in cpt2ft[cpt] else 0 for ft in features])
        pred_ftdist = normalize(cpt2ftdist[cpt])
        aps.append(average_precision_score(true_ftbin, pred_ftdist))
    print "MAP: %.6f%%" % (np.mean(aps)*100)

In [259]:
evaluate_prec(cpt2ftdist_stc)
evaluate_map(cpt2ftdist_stc)

Average Prec @1: 19.571865%
Average Prec @5: 15.963303%
MAP: 12.274833%


In [260]:
# evaluate_prec(cpt2ftdist_bin)
# evaluate_map(cpt2ftdist_bin)