## Description

* Count-Based Word Property Learning

## I. Prepare Input

* norms: list of all McRae norms (541)
* props: list of all McRae properties (2526)
* norm2prop: norm -> prop mapping
* corpus triples: (word, dep, head)

In [3]:
import cPickle

In [8]:
norms,norm2prop,norm2propprob,props = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/mcare_norm_data.p",'rb'))
dep_triples = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/BROWN/brown_triples.p",'rb'))

## II. Facilities

In [36]:
import random
import numpy as np
from itertools import chain
from collections import defaultdict
from __future__ import division

In [61]:
# HELPERS

def partition(l, k):
    k = max(1, k)
    chunk_size = len(l)//k
    return [l[i:i+chunk_size] for i in xrange(0, len(l), chunk_size)]

def cosine(norm2vr): # should take ppmi matrix as input
    norm2vr_norm = norm2vr / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r)), 1, norm2vr)[:,np.newaxis]
    return np.dot(norm2vr_norm, norm2vr_norm.T)
    
def ppmi(norm2vr):
    def iszero(m): return m==0
    row_sums, col_sums, total_sums = norm2vr.sum(axis=1), norm2vr.sum(axis=0), norm2vr.sum()
    pwi, pwj, ppmi_mat = row_sums/total_sums, col_sums/total_sums, norm2vr/total_sums
    ppmi_mat /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmi_mat /= pwj # * 1/pwj by col.
    ppmi_mat[iszero(ppmi_mat)] = 1e-10 # handling logging-on-0 issue.
    ppmi_mat = np.log(ppmi_mat) # compute pmi.
    ppmi_mat = np.maximum(ppmi_mat, 0) # compute ppmi.
    return ppmi_mat

In [132]:
# EVALUATORS

def topk_evaluate(norm2propwgts, k): # test norms.
    accuracies = []
    for norm in norm2propwgts.iterkeys():
        true_props = norm2prop[norm]
        pred_props = map(lambda idx:props[idx], np.argsort(norm2propwgts[norm])[::-1][:k])
        accuracy = 0.
        for pred_prop in pred_props:
            if pred_prop in true_props: 
                accuracy = 1. # gold-standard prop retrieved, count 'accuracy'.
                break
        accuracies.append(accuracy)
    print "Percentage Accurate (Test Norms) in Top %d Predictions: %.6f%%" % (k, np.mean(accuracies)*100)

def map_evaluate(norm2propwgts):
    avg_precs = []
    for norm in norm2propwgts.iterkeys():
        true_props = norm2prop[norm]
        pred_props = map(lambda idx:props[idx], np.argsort(norm2propwgts[norm])[::-1])
        num_correct, precs = 0, []
        for i,pred_prop in enumerate(pred_props):
            if pred_prop in true_props:
                num_correct += 1
                precs.append(num_correct/(i+1))
        avg_precs.append(np.mean(precs))
    print "MAP: %.6f%%" % (np.mean(avg_precs)*100)

In [182]:
# MAIN CLASS

class WPCount:
    
    def __init__(self, norms, props, norm2prop, triples):
        self.all_norms = norms
        self.props = props
        self.f2i = {f:i for i,f in enumerate(props)}
        self.norm2prop = norm2prop
        self.triples = triples
    
    def learn(self):
        # build (norm,vr) dict
        all_norms_set = set(self.all_norms)
        norms, vrs = set(), set()
        normvr2count = defaultdict(int)
        for word,dep,head in self.triples:
            if word in all_norms_set and (dep=='nsubj' or dep=='dobj'):
                vr = head+'-'+dep
                norms.add(word)
                vrs.add(vr)
                normvr2count[(word, vr)] += 1
        self.norms = list(norms)
        self.vrs = list(vrs)
        self.n2i = {n:i for i,n in enumerate(self.norms)}
        self.vr2i = {vr:i for i,vr in enumerate(self.vrs)}
        # build (norm,vr) master matrix
        norm2vr = np.zeros((len(self.norms),len(self.vrs)))
        for (norm,vr),count in normvr2count.iteritems():
            norm2vr[self.n2i[norm],self.vr2i[vr]] = count
        self.norm2vr = norm2vr
        # build (norm,norm) similarity matrix
        self.norm2vr_ppmi = ppmi(self.norm2vr)
        self.norm2norm_sim = cosine(self.norm2vr_ppmi)
    
    def infer(self, cv=5): 
        # norm set chunking
        random.shuffle(self.norms)
        norm_chunks = partition(self.norms, cv)
        # keep record for cv results
        results = []
        for i in range(cv): # CV
            # train-test split
            train_norms = list(chain.from_iterable([norm_chunk for j,norm_chunk in enumerate(norm_chunks)
                                                    if j!=i])) # flatten.
            test_norms = norm_chunks[i]
            test_norms_ids = set(map(self.n2i.get, test_norms))
            # get top similar norms for each test norm
            norm2mostsim = defaultdict(list)
            for test_norm in test_norms:
                sim_vec = self.norm2norm_sim[self.n2i[test_norm]]
                topk_ids = filter(lambda idx:idx not in test_norms_ids, np.argsort(sim_vec)[::-1])
                topk_sims = map(lambda idx:self.norms[idx], topk_ids)
                norm2mostsim[test_norm] = topk_sims   
            results.append(norm2mostsim)
        self.results = results
    
    def evaluate(self):
        for i,result in enumerate(self.results):
            test_norm2propwgts = defaultdict(list)
            for test_norm in result.iterkeys():
                prop_wgts = np.ones(len(self.props))
                pred_props = list(chain.from_iterable([self.norm2prop[norm] for norm in result[test_norm]]))
                for pred_prop in pred_props:
                    prop_wgts[self.f2i[pred_prop]] += 1
                test_norm2propwgts[test_norm] = prop_wgts
            print "CV round %d results:" % (i+1)
            topk_evaluate(test_norm2propwgts, k=1)
            topk_evaluate(test_norm2propwgts, k=5)
            topk_evaluate(test_norm2propwgts, k=10)
            topk_evaluate(test_norm2propwgts, k=20)
            map_evaluate(test_norm2propwgts)
            print
      

In [187]:
wpc = WPCount(norms, props, norm2prop, dep_triples)
wpc.learn()
wpc.infer(cv=5)
wpc.evaluate()

CV round 1 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 27.692308%
Percentage Accurate (Test Norms) in Top 5 Predictions: 67.692308%
Percentage Accurate (Test Norms) in Top 10 Predictions: 87.692308%
Percentage Accurate (Test Norms) in Top 20 Predictions: 92.307692%
MAP: 10.852882%

CV round 2 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 26.153846%
Percentage Accurate (Test Norms) in Top 5 Predictions: 67.692308%
Percentage Accurate (Test Norms) in Top 10 Predictions: 90.769231%
Percentage Accurate (Test Norms) in Top 20 Predictions: 93.846154%
MAP: 10.486773%

CV round 3 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 27.692308%
Percentage Accurate (Test Norms) in Top 5 Predictions: 70.769231%
Percentage Accurate (Test Norms) in Top 10 Predictions: 92.307692%
Percentage Accurate (Test Norms) in Top 20 Predictions: 96.923077%
MAP: 11.575330%

CV round 4 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 35.384615%
Perce

In [188]:
wpc = WPCount(norms, props, norm2prop, dep_triples)
wpc.learn()
wpc.infer(cv=10)
wpc.evaluate()

CV round 1 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 21.875000%
Percentage Accurate (Test Norms) in Top 5 Predictions: 59.375000%
Percentage Accurate (Test Norms) in Top 10 Predictions: 81.250000%
Percentage Accurate (Test Norms) in Top 20 Predictions: 87.500000%
MAP: 9.323167%

CV round 2 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 34.375000%
Percentage Accurate (Test Norms) in Top 5 Predictions: 78.125000%
Percentage Accurate (Test Norms) in Top 10 Predictions: 90.625000%
Percentage Accurate (Test Norms) in Top 20 Predictions: 93.750000%
MAP: 10.208668%

CV round 3 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 28.125000%
Percentage Accurate (Test Norms) in Top 5 Predictions: 68.750000%
Percentage Accurate (Test Norms) in Top 10 Predictions: 93.750000%
Percentage Accurate (Test Norms) in Top 20 Predictions: 100.000000%
MAP: 13.114250%

CV round 4 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 18.750000%
Perce