## 0. Load Data

In [115]:
import cPickle

In [116]:
# Location of the preprocessed McRae norm data (a pickled 4-tuple).
norm_path = "/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/mcare_norm_data.p"
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open(norm_path,'rb') as norm_file:
    # Downstream cells index norm2prop / norm2propprob by norm word and zip the two
    # per-norm lists together, so they are presumably parallel lists — confirm.
    norms,norm2prop,norm2propprob,props = cPickle.load(norm_file)

In [117]:
# Position lookups: each norm word -> its index in `norms`,
# and each property -> its index in `props` (a later duplicate would win).
n2i = dict((norm, idx) for idx, norm in enumerate(norms))
p2i = dict((prop, idx) for idx, prop in enumerate(props))

## 1. Corpus $\Rightarrow$ Parsed Sents

In [118]:
from nltk.corpus import brown
from spacy.en import STOPWORDS, English

In [100]:
def spacy_parse(sents):
    """Parse sentences and flatten them into dependency triples.

    A fresh `English()` parser is constructed on every call and applied to
    each element of `sents` in order.

    Returns a pair `(parsed_sents, triples)`:
      * parsed_sents -- list with one parser result per input sentence
      * triples      -- list of (token lemma, dependency label, head lemma)
                        tuples, one per token, in sentence and token order
    """
    nlp = English()
    # Parse everything first, then extract, so the parse results can be returned too.
    docs = list(map(nlp, sents))
    dep_triples = [
        (tok.lemma_, tok.dep_, tok.head.lemma_)
        for doc in docs
        for tok in doc
    ]
    return docs, dep_triples

In [9]:
%%time
# Re-join each tokenized Brown sentence into a single space-separated unicode string,
# then parse the whole corpus. This is the slow cell of the notebook: the recorded
# run below took about a minute and a half.
brown_sents = [unicode(' '.join(sent)) for sent in brown.sents()]
parsed_sents, triples = spacy_parse(brown_sents)

CPU times: user 1min 35s, sys: 1.66 s, total: 1min 36s
Wall time: 1min 36s


## 2. Parsed Sents $\Rightarrow$ Mappings

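* $W$: norm vocabulary; $VR$: verb-role pair vocabulary; $Q$: property vocabulary
* $f: VR \to 2^{W}$ — the set of norms observed filling each verb-role slot
* $g: W \to 2^{Q}$ — the properties listed for each norm (read from the preprocessed McRae set)
* $h: VR \to \mathbb{N}^{Q}$ — for each verb-role slot, how often each property occurs among the norms in $f(v,r)$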

In [119]:
from collections import defaultdict, Counter

In [131]:
def make_vr2norm(triples,norms):
    """Map each (head, dependency) slot to the set of norm words seen filling it.

    `triples` is an iterable of (word, dep, head) tuples. A triple is kept only
    when the word is one of `norms`, the dependency is 'nsubj' or 'dobj', and
    the head is not in STOPWORDS.

    Returns a defaultdict(set) keyed by (head, dep).
    """
    target_deps = {'nsubj', 'dobj'}
    known_norms = frozenset(norms)
    slot_fillers = defaultdict(set)
    for lemma, relation, governor in triples:
        # Guard clauses run in the same order as the original conjunction.
        if lemma not in known_norms:
            continue
        if relation not in target_deps:
            continue
        if governor in STOPWORDS:
            continue
        slot_fillers[(governor, relation)].add(lemma)
    return slot_fillers

In [132]:
%%time
# Index which norm words were observed in each (head, nsubj/dobj) slot of the parsed corpus.
vr2norm = make_vr2norm(triples,norms)

CPU times: user 125 ms, sys: 7.67 ms, total: 133 ms
Wall time: 127 ms


In [136]:
def make_vr2pcdic(vr2norm,norm2prop,norm2propprob,out_word,mode='binary'):
    """Build the property-count dictionary ("pcdic") for every verb-role slot.

    For each slot in `vr2norm`, the properties of all norms filling that slot
    are summed into one Counter. The held-out word is skipped so its own
    properties never contribute.

    Parameters:
      vr2norm       -- mapping slot -> collection of norm words filling it
      norm2prop     -- mapping norm -> list of its properties
      norm2propprob -- mapping norm -> list of weights, zipped with
                       norm2prop[norm] (only read when mode != 'binary')
      out_word      -- norm to leave out of every slot's counts
      mode          -- 'binary' adds 1 per property per norm; any other value
                       adds the weight from norm2propprob instead.
                       Only 'binary' is used in this notebook, to compare
                       directly with Fagarasan et al. (2015).

    Returns a defaultdict(Counter) keyed by slot. Slots whose only filler is
    `out_word` get no entry.
    """
    vr2pcdic = defaultdict(Counter)
    # Iterate the mapping directly: `.iterkeys()` exists only on Python 2 dicts.
    for vr in vr2norm:
        for norm in vr2norm[vr]:
            if norm == out_word:
                continue
            if mode == 'binary':
                contribution = Counter(norm2prop[norm])
            else:
                contribution = Counter(dict(zip(norm2prop[norm], norm2propprob[norm])))
            # In-place Counter addition keeps only positive totals.
            vr2pcdic[vr] += contribution

    return vr2pcdic

In [137]:
# Held-out target word: make_vr2pcdic skips it when summing property counts per slot.
out_word = 'cat'

In [138]:
%%time
# Sum binary property counts per slot, excluding the held-out word `out_word`.
vr2pcdic = make_vr2pcdic(vr2norm,norm2prop,norm2propprob,out_word,mode='binary')

CPU times: user 70.8 ms, sys: 5.63 ms, total: 76.5 ms
Wall time: 72.4 ms


## 3. Property Learning

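* Eval 1: predict from the single most frequent $(v,r)$ pair observed with the target word $u$ in the corpus
* Eval 2: predict from the 3 most frequent pairs
* Eval 3: predict from the 5 most frequent pairs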

**NB**: with a larger corpus such as the BNC, $k = 1, 5, 10$ may be a better choice.

In [139]:
import numpy as np
from __future__ import division

In [140]:
def make_norm2vr(triples,norms):
    """Map each norm word to every (head, dependency) slot it was observed in.

    `triples` is an iterable of (word, dep, head) tuples. A triple is kept only
    when the word is one of `norms`, the dependency is 'nsubj' or 'dobj', and
    the head is not in STOPWORDS.

    Returns a defaultdict(list) keyed by norm word. Repeated observations are
    kept as repeated list entries, in corpus order.
    """
    wanted_deps = {'nsubj', 'dobj'}
    norm_vocab = frozenset(norms)
    occurrences = defaultdict(list)
    for lemma, relation, governor in triples:
        keep = lemma in norm_vocab and relation in wanted_deps and governor not in STOPWORDS
        if keep:
            occurrences[lemma].append((governor, relation))
    return occurrences

In [141]:
%%time
# Collect, per norm word, every (head, nsubj/dobj) slot it occurs in (repeats kept for frequency counting).
norm2vr = make_norm2vr(triples,norms)

CPU times: user 117 ms, sys: 3.97 ms, total: 121 ms
Wall time: 119 ms


In [142]:
def get_topk_vrs(u, norm2vr, k=1):
    """Return the k (head, dep) pairs most frequently observed with word `u`.

    Parameters:
      u       -- word to look up
      norm2vr -- mapping word -> list of (head, dep) pairs, one entry per
                 observation (repeats encode frequency)
      k       -- maximum number of pairs to return

    Returns a list of (head, dep) tuples, most frequent first. The list is
    empty when `u` has no entry.
    """
    # Use .get() rather than indexing: with a defaultdict, indexing an unseen
    # word would silently insert an empty entry for it.
    observed = norm2vr.get(u, ())
    # A comprehension replaces `lambda (vr, c): vr`, whose tuple-parameter
    # unpacking is Python 2-only syntax, and always yields a list.
    return [vr for vr, _count in Counter(observed).most_common(k)]

In [153]:
def learn_topk_props(u, norm2vr, norm2prop, props, vr2pcdic, k=1):
    """Predict properties for word `u` from its k most frequent verb-role slots.

    The property counts of the top-k slots (from `get_topk_vrs`) are summed on
    top of a baseline of 1 per property. Every property whose total exceeds
    the baseline is predicted.

    Parameters:
      u         -- target word
      norm2vr   -- mapping word -> list of observed (head, dep) slots
      norm2prop -- mapping word -> list of gold properties
      props     -- property vocabulary; fixes the order of the result
      vr2pcdic  -- mapping slot -> Counter of property counts
      k         -- number of most frequent slots to use

    Prints one summary line. The figure labelled "Accuracy" is the fraction of
    the gold properties of `u` that were predicted, reported as 0 when `u` has
    no gold properties.

    Returns the predicted properties as a list, in `props` order.
    """
    # Derive positions from the `props` argument rather than a notebook-level
    # lookup table, so the function is correct for whatever vocabulary it gets.
    prop_index = {prop: idx for idx, prop in enumerate(props)}
    prop_vec = np.ones(len(props))
    for vr in get_topk_vrs(u, norm2vr, k):
        # .get() avoids inserting empty Counters when vr2pcdic is a defaultdict.
        counts = vr2pcdic.get(vr)
        if not counts:
            continue
        for prop, count in counts.items():
            idx = prop_index.get(prop)
            if idx is not None:  # ignore properties outside the vocabulary
                prop_vec[idx] += count

    # Filter directly instead of using a 'none' placeholder string, which
    # could collide with a real property name.
    pred_props = [prop for prop, score in zip(props, prop_vec) if score > 1]

    gold_props = norm2prop[u]
    hits = set(pred_props).intersection(gold_props)
    # float() keeps true division without relying on a __future__ import;
    # the guard avoids ZeroDivisionError for a word with no gold properties.
    accuracy = float(len(hits)) / len(gold_props) if gold_props else 0.0
    print("Target Word: %s | k = %d | Accuracy = %.2f%%" % (u,k,accuracy*100))
    return pred_props
    

In [160]:
get_topk_vrs('cat',norm2vr,100)

[(u'like', u'dobj'),
 (u'manage', u'nsubj'),
 (u'watch', u'dobj'),
 (u'feed', u'dobj'),
 (u'complement', u'nsubj'),
 (u'meet', u'nsubj'),
 (u'send', u'dobj'),
 (u'dye', u'dobj'),
 (u'come', u'nsubj')]

In [163]:
# NOTE(review): the inline note here used to read "100 because its seen", which matches
# neither this call (k=1) nor the 0.00% recorded below; presumably left over from an
# earlier run — confirm the intended k and expectation.
learn_topk_props('cat',norm2vr,norm2prop,props,vr2pcdic,k=1)

Target Word: cat | k = 1 | Accuracy = 0.00%


['is_loud',
 'used_by_the_police',
 'is_black',
 'is_dangerous',
 'has_a_trigger',
 'used_for_war',
 'inbeh_-_fires',
 'used_for_protection',
 'a_weapon',
 'eg_-_rifle',
 'inbeh_-_fires_bullets',
 'made_of_metal',
 'used_for_hunting',
 'used_for_shooting',
 'used_for_crimes',
 'used_for_killing']

In [155]:
norm2vr['cat']

[(u'come', u'nsubj'),
 (u'like', u'dobj'),
 (u'feed', u'dobj'),
 (u'complement', u'nsubj'),
 (u'like', u'dobj'),
 (u'meet', u'nsubj'),
 (u'watch', u'dobj'),
 (u'dye', u'dobj'),
 (u'manage', u'nsubj'),
 (u'send', u'dobj')]

In [158]:
vr2norm[('feed','dobj')]

{u'cat', u'corn', u'crocodile', u'horse', u'pig', u'rice'}

In [156]:
vr2pcdic[('feed','dobj')]

Counter({'a_baby_is_a_piglet': 1,
         'a_food': 1,
         'a_grain': 1,
         'a_mammal': 1,
         'a_reptile': 1,
         'a_vegetable': 1,
         'an_animal': 3,
         'associated_with_China': 1,
         'associated_with_saddles': 1,
         'beh_-_eats': 2,
         'beh_-_eats_animals': 1,
         'beh_-_eats_people': 1,
         'beh_-_lays_eggs': 1,
         'beh_-_oinks': 1,
         'beh_-_squeals': 1,
         'beh_-_swims': 1,
         'comes_on_cobs': 1,
         'eaten_as_bacon': 1,
         'eaten_as_ham': 1,
         'eaten_as_popcorn': 1,
         'eaten_as_pork': 1,
         'eaten_by_cooking': 1,
         'eaten_with_butter': 1,
         'grows_in_fields': 1,
         'grows_in_water': 1,
         'has_4_legs': 3,
         'has_a_curly_tail': 1,
         'has_a_mane': 1,
         'has_a_snout': 1,
         'has_a_tail': 3,
         'has_ears': 1,
         'has_hooves': 1,
         'has_husks': 1,
         'has_kernels': 1,
         'has_legs': 2,


In [113]:
norm2prop['cat']

['a_baby_is_a_kitten',
 'a_feline',
 'a_mammal',
 'a_pet',
 'an_animal',
 'beh_-_eats',
 'beh_-_eats_mice',
 'beh_-_kills',
 'beh_-_meows',
 'beh_-_purrs',
 'has_4_legs',
 'has_a_tail',
 'has_claws',
 'has_eyes',
 'has_fur',
 'has_legs',
 'has_paws',
 'has_whiskers',
 'is_domestic',
 'is_independent']