## Description

* Multinomial Bimodal Topic Model (Properties $\sim$ Multinomial)

## I. Prepare Input

* norms: list of all McRae norms (541)
* props: list of all McRae properties (2526)
* norm2prop: norm -> prop mapping
* corpus triples: (word, dep, head)

In [1]:
import cPickle

In [2]:
norms,norm2prop,norm2propprob,props = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/mcare_norm_data.p",'rb'))
dep_triples = cPickle.load(open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/BROWN/brown_triples.p",'rb'))

In [3]:
norms = list(set(norms)) # ignore polysemy.

## II. Facilities

In [24]:
import numpy as np
import random
from copy import deepcopy
from itertools import chain
from __future__ import division
from operator import add
from collections import Counter, defaultdict

In [5]:
# HELPERs

def normalize(arr):
    return arr / arr.sum()

def partition(l, k):
    k = max(1, k)
    chunk_size = len(l)//k
    return [l[i:i+chunk_size] for i in xrange(0, len(l), chunk_size)]

In [6]:
# EVALUATORS

def topk_evaluate(norm2propdist, k): # test norms.
    accuracies = []
    for norm in norm2propdist.iterkeys():
        true_props = norm2prop[norm]
        pred_props = map(lambda idx:props[idx], np.argsort(norm2propdist[norm])[::-1][:k])
        accuracy = 0.
        for pred_prop in pred_props:
            if pred_prop in true_props: 
                accuracy = 1. # gold-standard prop retrieved, count 'accuracy'.
                break
        accuracies.append(accuracy)
    print "Percentage Accurate (Test Norms) in Top %d Predictions: %.6f%%" % (k, np.mean(accuracies)*100)

def map_evaluate(norm2propdist):
    avg_precs = []
    for norm in norm2propdist.iterkeys():
        true_props = norm2prop[norm]
        pred_props = map(lambda idx:props[idx], np.argsort(norm2propdist[norm])[::-1])
        num_correct, precs = 0, []
        for i,pred_prop in enumerate(pred_props):
            if pred_prop in true_props:
                num_correct += 1
                precs.append(num_correct/(i+1))
        avg_precs.append(np.mean(precs))
    print "MAP: %.6f%%" % (np.mean(avg_precs)*100)

In [35]:
class WPMBTM:
    
    def __init__(self, norms, props, norm2prop, norm2propprob, triples):
        self.all_norms = norms # train-test split later.
        self.props = props
        self.n2i = {n:i for i,n in enumerate(self.all_norms)} # indexing pseudo-docs, each corresponds to 1 train norm.
        self.f2i = {f:i for i,f in enumerate(self.props)}
        self.norm2prop = norm2prop
        self.norm2propprob = norm2propprob
        self.triples = triples
        self.preprocessing()
 
    def preprocessing(self):
        # construct norm2vr and norm2f mappings.
        all_norms_set = set(self.all_norms) # for fast lookup.
        self.norm2vr_f = defaultdict(lambda : defaultdict(list))
        for norm,dep,head in self.triples:
            if norm in all_norms_set and (dep=='nsubj' or dep=='dobj'):
                vr = head+'-'+dep
                f = np.random.choice(self.norm2prop[norm],p=self.norm2propprob[norm])
                self.norm2vr_f[norm]['vr'].append(vr)
                self.norm2vr_f[norm]['f'].append(f)
        self.all_norms = self.norm2vr_f.keys() # narrow it down to norms appearing in the current corpus.
        
    def learn(self, train_norms, topics):
        # make pseudo-documents.
        vrs = set() 
        norm2vr_f = deepcopy(self.norm2vr_f) # avoid subtle changes to the original mapping dict.
        word_docs, prop_docs = [], [] # word_docs: for v-r pairs.
        for norm in train_norms:
            word_docs.append(norm2vr_f[norm]['vr'])
            prop_docs.append(norm2vr_f[norm]['f'])
            vrs = vrs.union(set(norm2vr_f[norm]['vr']))
        vrs = list(vrs)
        w2i = {vr:i for i,vr in enumerate(vrs)} 
        for word_doc,prop_doc in zip(word_docs,prop_docs):
            for i,(word,prop) in enumerate(zip(word_doc,prop_doc)):
                sampled_topic = np.random.choice(topics)
                word_doc[i] = (word,sampled_topic)
                prop_doc[i] = (prop,sampled_topic)
        # topic modeling
        print "... Running Topic Model"
        W, F, D, T = len(vrs), len(self.props), len(word_docs), len(topics)
        alpha, beta, gamma = 50/2, .001, .001
        alpha_arr = np.array([alpha for _ in range(T)]); Talpha_arr = np.array([alpha*T for _ in range(T)])
        beta_arr = np.array([beta for _ in range(T)]); Wbeta_arr = np.array([beta*W for _ in range(T)])
        gamma_arr = np.array([gamma for _ in range(T)]); Fgamma_arr = np.array([gamma*F for _ in range(T)])
        C_WT, C_FT, C_DT = np.zeros((W,T)), np.zeros((F,T)), np.zeros((D,T))
        wt_counts = reduce(add,[Counter(word_doc) for word_doc in word_docs])
        ft_counts = reduce(add,[Counter(prop_doc) for prop_doc in prop_docs])
        for (w,t_w),wt_count in wt_counts.iteritems(): C_WT[w2i[w]][t_w] = wt_count 
        for (f,t_f),ft_count in ft_counts.iteritems(): C_FT[self.f2i[f]][t_f] = ft_count
        for i,doc in enumerate(word_docs): # word,property pairs have the same topic assignments.
            dt_counts = Counter([t for w,t in doc])
            for j in range(T):
                C_DT[i][j] = dt_counts[j]
        if not C_WT.sum()==C_FT.sum()==C_DT.sum():
            raise Exception, "Error in counts in pseudo-document."
        def sample_topic(w_i,f_i,d): # defined locally to avoid passing around big matrices.
            P_num_arr = (C_WT[w_i,:]+beta_arr) * (C_FT[f_i,:]+gamma_arr) * (C_DT[d,:]+alpha_arr)
            P_denom_arr = (np.apply_along_axis(sum,0,C_WT)+Wbeta_arr) * \
                          (np.apply_along_axis(sum,0,C_FT)+Fgamma_arr) * \
                          (T * (C_DT[d,:].sum()+T*alpha))
            P = normalize(P_num_arr / P_denom_arr)
            return np.random.choice(np.array(topics),p=normalize(P)) 
        def gibbs(n_iters=100, verbose_freq=20): # 30: experimentally where the convergence is achieved.
            for e in range(n_iters):
                if e!=0 and e%verbose_freq==0: print "@ %dth iteration" % e
                for d,(word_doc,prop_doc) in enumerate(zip(word_docs,prop_docs)):
                    for (w,t),(f,_) in zip(word_doc,prop_doc): # a (w,f) pair have the same topic.
                        if C_WT[w2i[w]][t]==0 or C_FT[self.f2i[f]][t]==0 or C_DT[d][t]==0: continue
                        C_WT[w2i[w]][t] -= 1
                        C_FT[self.f2i[f]][t] -= 1
                        C_DT[d][t] -= 1
                        new_t = sample_topic(w2i[w],self.f2i[f],d)
                        C_WT[w2i[w]][new_t] += 1
                        C_FT[self.f2i[f]][new_t] += 1
                        C_DT[d][new_t] += 1
        gibbs()
        return C_WT, C_FT, w2i
        
    def infer(self, cv=5, topics=range(100)): # cv >= 2
        # norm set chunking
        random.shuffle(self.all_norms)
        norm_chunks = partition(self.all_norms, cv)
        self.results = []
        for i in range(cv):
            print "... Running CV round %d" % (i+1)
            train_norms = list(chain.from_iterable([norm_chunk for j,norm_chunk in enumerate(norm_chunks)
                                                    if j!=i])) # flatten.
            test_norms = norm_chunks[i]
            C_WT, C_FT, w2i = self.learn(train_norms, topics)
            def p_z_given_vr(z, vr):
                return C_WT[w2i[vr]][z] / C_WT[w2i[vr],:].sum()
            def p_f_given_z(f, z):
                return C_FT[self.f2i[f]][z] / C_FT[:,z].sum()
            norm2propdist = defaultdict(list)
            for norm in test_norms:
                vrs = list(set(self.norm2vr_f[norm]['vr']))
                p_z_arr = [sum(p_z_given_vr(z,vr) if vr in w2i else 0. for vr in vrs) for z in topics]
                p_f_arr = [np.dot([p_f_given_z(f,z) for z in topics],p_z_arr) for f in self.props]
                norm2propdist[norm] = p_f_arr
            self.results.append(norm2propdist)
    
    def evaluate(self):
        for i,norm2propdist in enumerate(self.results):
            print "CV round %d results:" % (i+1)
            topk_evaluate(norm2propdist, k=1)
            topk_evaluate(norm2propdist, k=5)
            topk_evaluate(norm2propdist, k=10)
            topk_evaluate(norm2propdist, k=20)
            map_evaluate(norm2propdist)
            print            


##### CV = 5

In [36]:
%%time
wpmbtm = WPMBTM(norms, props, norm2prop, norm2propprob, dep_triples)
wpmbtm.infer(cv=5)

... Running CV round 1
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 2
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 3
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 4
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 5
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
CPU times: user 46min 26s, sys: 19 s, total: 46min 45s
Wall time: 46min 35s


In [37]:
%%time
wpmbtm.evaluate()

CV round 1 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 6.153846%
Percentage Accurate (Test Norms) in Top 5 Predictions: 21.538462%
Percentage Accurate (Test Norms) in Top 10 Predictions: 35.384615%
Percentage Accurate (Test Norms) in Top 20 Predictions: 56.923077%
MAP: 3.325931%

CV round 2 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 9.230769%
Percentage Accurate (Test Norms) in Top 5 Predictions: 33.846154%
Percentage Accurate (Test Norms) in Top 10 Predictions: 55.384615%
Percentage Accurate (Test Norms) in Top 20 Predictions: 63.076923%
MAP: 3.697320%

CV round 3 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 9.230769%
Percentage Accurate (Test Norms) in Top 5 Predictions: 30.769231%
Percentage Accurate (Test Norms) in Top 10 Predictions: 43.076923%
Percentage Accurate (Test Norms) in Top 20 Predictions: 58.461538%
MAP: 3.653193%

CV round 4 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 9.230769%
Percentage A

##### CV = 10

In [38]:
%%time
wpmbtm = WPMBTM(norms, props, norm2prop, norm2propprob, dep_triples)
wpmbtm.infer(cv=10)

... Running CV round 1
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 2
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 3
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 4
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 5
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 6
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 7
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 8
... Running Topic Model
@ 20th iteration
@ 40th iteration
@ 60th iteration
@ 80th iteration
... Running CV round 9
... Running Topic Model
@ 20th iteration
@ 40th iteration

In [39]:
%%time
wpmbtm.evaluate()

CV round 1 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 9.375000%
Percentage Accurate (Test Norms) in Top 5 Predictions: 28.125000%
Percentage Accurate (Test Norms) in Top 10 Predictions: 37.500000%
Percentage Accurate (Test Norms) in Top 20 Predictions: 62.500000%
MAP: 3.453744%

CV round 2 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 3.125000%
Percentage Accurate (Test Norms) in Top 5 Predictions: 31.250000%
Percentage Accurate (Test Norms) in Top 10 Predictions: 50.000000%
Percentage Accurate (Test Norms) in Top 20 Predictions: 59.375000%
MAP: 3.348909%

CV round 3 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 9.375000%
Percentage Accurate (Test Norms) in Top 5 Predictions: 31.250000%
Percentage Accurate (Test Norms) in Top 10 Predictions: 46.875000%
Percentage Accurate (Test Norms) in Top 20 Predictions: 53.125000%
MAP: 4.508788%

CV round 4 results:
Percentage Accurate (Test Norms) in Top 1 Predictions: 12.500000%
Percentage 