## 0. Prepare Objects

### A. Parse BNC

In [3]:
from nltk.corpus import BNCCorpusReader
import re

In [2]:
bnc_path = "/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/CORPORA/BNC/Texts/"

In [5]:
bnc = BNCCorpusReader(root=bnc_path,fileids=r'\w*.xml')

In [7]:
type(bnc)

nltk.corpus.reader.bnc.BNCCorpusReader

In [9]:
%%time
bnc_sents = bnc.sents()

CPU times: user 4.25 s, sys: 210 ms, total: 4.46 s
Wall time: 4.69 s


In [10]:
len(bnc_sents)

6026276

In [24]:
from spacy.en import English

In [27]:
parser = English()

In [28]:
%%time
# Join each tokenized sentence into one space-separated unicode string.
# NOTE: this rebinds bnc_sents, so the cell is not idempotent -- re-running it
# would space-separate the characters of the already-joined strings.
bnc_sents = [unicode(' '.join(sent)) for sent in bnc_sents]

CPU times: user 27min 8s, sys: 37.2 s, total: 27min 46s
Wall time: 27min 51s


In [29]:
%%time
# Run the spaCy pipeline on every sentence string and keep all parsed results in
# one list; the timing recorded for this cell is roughly 2h20m of wall time.
parsed_sents = [parser(sent) for sent in bnc_sents]

CPU times: user 2h 12min 52s, sys: 4min 11s, total: 2h 17min 4s
Wall time: 2h 20min 29s


### B. Prepare Norms

In [32]:
def make_token2lemma(parsed_sents):
    """Build a surface-form -> lemma lookup from parsed sentences.

    parsed_sents -- iterable of parsed sentences; each sentence is an iterable
                    of tokens exposing `orth_` (surface string) and `lemma_`.

    Returns a dict keyed by `token.orth_` with `token.lemma_` as the value.
    When the same surface form occurs with different lemmas, the lemma of the
    last occurrence wins.
    """
    token2lemma = {}
    for parsed_sent in parsed_sents:
        for token in parsed_sent:
            token2lemma[token.orth_] = token.lemma_

    return token2lemma

In [33]:
%%time
token2lemma = make_token2lemma(parsed_sents)

CPU times: user 1min 32s, sys: 1min 20s, total: 2min 52s
Wall time: 4min 9s


In [34]:
import pandas as pd
import cPickle

In [35]:
# Load the pre-computed norm data (norm list, norm -> properties, norm -> property
# probabilities, property list). The file is opened in a `with` block so the handle
# is closed deterministically. Unpickling can execute arbitrary code, so only load
# this file from a trusted source.
with open("/Users/jacobsw/Desktop/UNIV/FALL_2016/LIN389C_RSCH_COMPLING/BAYESIAN/DATA/MCRAE/mcare_norm_data.p",'rb') as norm_file:
    norms,norm2prop,norm2propprob,props = cPickle.load(norm_file)

## I. Pipeline

### A. $f: w\rightarrow (v,r)$

In [36]:
from collections import defaultdict
from spacy.en import STOPWORDS

In [37]:
def make_norm2vr(norms,token2lemma,parsed_sents):
    """Collect, for every norm, the (head lemma, relation) pairs it occurs in.

    norms        -- iterable of norm lemmas to look for.
    token2lemma  -- accepted for interface compatibility; not read here.
    parsed_sents -- iterable of parsed sentences whose tokens expose `lemma_`,
                    `dep_` and `head`.

    A token contributes a pair when its lemma is a norm, its head's lemma is
    not in STOPWORDS, and its dependency label ends in 'subj' or 'obj'.
    Returns a defaultdict(list): norm lemma -> list of (head lemma, 'subj'|'obj').
    """
    wanted = set(norms)  # set membership is cheaper than scanning the list
    pairs_by_norm = defaultdict(list)

    for sentence in parsed_sents:
        for tok in sentence:
            lemma = tok.lemma_
            if lemma not in wanted:
                continue

            head_lemma = tok.head.lemma_
            if head_lemma in STOPWORDS:
                continue

            # Collapse the dependency label to a coarse relation; 'subj' is
            # checked first, exactly as in the label test order.
            label = tok.dep_
            if label.endswith('subj'):
                relation = 'subj'
            elif label.endswith('obj'):
                relation = 'obj'
            else:
                continue

            pairs_by_norm[lemma].append((head_lemma, relation))

    return pairs_by_norm

In [39]:
%%time
norm2vr = make_norm2vr(norms,token2lemma,parsed_sents)

CPU times: user 47.3 s, sys: 1min 2s, total: 1min 49s
Wall time: 2min 47s


In [40]:
print len(norm2vr) # 523 of the 532 unique norms yield (verb, relation) pairs in the BNC (the corpus parsed above)

523


In [43]:
print norm2vr['alligator']

[(u'row', 'obj'), (u'remount', 'obj'), (u'shoot', 'obj'), (u'wrestle', 'obj'), (u'include', 'obj'), (u'stand', 'subj'), (u'saw', 'subj'), (u'kill', 'obj'), (u'fee', 'obj'), (u'wee', 'subj'), (u"'", 'subj'), (u'like', 'obj'), (u'swim', 'subj'), (u'disturb', 'obj'), (u'pull', 'subj'), (u'sum', 'subj'), (u'want', 'obj'), (u'receive', 'obj'), (u'jive', 'subj'), (u'escape', 'subj'), (u'm', 'obj'), (u'resemble', 'obj'), (u'like', 'obj'), (u'frighten', 'obj')]


### B. Make Pseudo-Docs

In [44]:
import numpy as np

In [45]:
def sample_prop(norm,norm2prop,norm2propprob):
    """Draw one property of `norm` at random.

    norm2propprob[norm] supplies the probabilities for a single multinomial
    trial; the position of the drawn outcome indexes into norm2prop[norm].
    """
    one_hot_draw = np.random.multinomial(1, norm2propprob[norm])
    chosen_idx = np.argmax(one_hot_draw)  # position of the single 1 in the draw
    candidates = norm2prop[norm]
    return candidates[chosen_idx]

In [46]:
def make_pseudo_docs(norm2vr,norm2prop,norm2propprob):
    """Build one pseudo-document per norm.

    For every (verb, relation) pair recorded for a norm, the document gets two
    tokens in order: a property sampled via sample_prop, then the string
    "<verb>-<relation>". Documents are returned in the iteration order of
    norm2vr's keys.
    """
    pseudo_docs = []
    for concept in norm2vr.iterkeys():
        words = []
        for pair in norm2vr[concept]:
            # Sample first, then emit the verb-relation token, for each pair.
            words.append(sample_prop(concept,norm2prop,norm2propprob))
            words.append(pair[0] + '-' + pair[1])
        pseudo_docs.append(words)

    return pseudo_docs

In [47]:
%%time
docs = make_pseudo_docs(norm2vr,norm2prop,norm2propprob)

CPU times: user 1.04 s, sys: 1.04 s, total: 2.08 s
Wall time: 4.6 s


In [48]:
print len(docs)

523


In [49]:
print docs[1]

['used_for_music', u'skirl-subj', 'is_plaid', u'start-obj', 'a_musical_instrument', u'play-obj', 'has_a_bag', u'place-obj', 'used_for_music', u'play-obj', 'a_musical_instrument', u'play-obj', 'has_pipes', u'play-obj', 'associated_with_Scotland', u'bring-obj', 'a_musical_instrument', u'play-obj', 'requires_air', u'bring-obj', 'a_musical_instrument', u'mention-obj', 'has_a_bag', u'pick-obj', 'associated_with_Scotland', u'play-obj', 'a_musical_instrument', u'start-obj', 'used_in_marching_bands', u'play-obj', 'has_pipes', u'fetch-obj', 'used_in_marching_bands', u'pick-obj', 'a_musical_instrument', u'carry-obj', 'used_for_music', u'carry-obj', 'has_pipes', u'play-subj', 'used_in_marching_bands', u'play-obj', 'a_musical_instrument', u'play-obj', 'associated_with_Scotland', u'start-obj', 'a_musical_instrument', u'contain-obj', 'has_a_bag', u'contain-obj', 'associated_with_Scotland', u'assemble-obj', 'has_a_bag', u'play-obj', 'associated_with_Scotland', u'cover-obj', 'has_pipes', u'play-subj',

### C. Topic Modeling

In [50]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [66]:
from scipy.stats import entropy
from operator import itemgetter
from __future__ import division

In [111]:
class TopicModel(object):
    """LDA topic model over pseudo-documents, plus reporting and evaluation helpers.

    The model is trained in __init__; the other public methods print reports
    and, for evaluate_topic_model, also return per-document statistics.
    """

    def __init__(self, documents, num_topics=100, num_iters=5):
        """Train an LDA model on `documents`.

        documents  -- list of documents, each a list of string tokens.
        num_topics -- number of topics passed to LdaModel.
        num_iters  -- passed to LdaModel as `passes`.
        """
        dic = corpora.Dictionary(documents)
        self.documents = [dic.doc2bow(document) for document in documents]
        self.num_topics = num_topics
        self.lda = LdaModel(self.documents, num_topics=self.num_topics, id2word=dic, passes=num_iters)

    def _topic_terms(self, topicid, **kwargs):
        """Return the topic's (term string, prob) pairs; kwargs go to get_topic_terms."""
        return [(self.lda.id2word[termid], prob)
                for termid, prob in self.lda.get_topic_terms(topicid, **kwargs)]

    @staticmethod
    def _docid2norm(norm2vr):
        """List the norms in norm2vr's key order, so position i names document i.

        This assumes the pseudo-docs were built by iterating the same,
        unmodified norm2vr -- verify against the caller.
        """
        return list(norm2vr)

    def _top_props(self, topicid, props_set, k=5):
        """Return the first k terms of the topic's full ranking that are properties."""
        top = []
        for term, _ in self._topic_terms(topicid, topn=len(self.lda.id2word)):
            if len(top) == k:
                break
            if term in props_set:
                top.append(term)
        return top

    def summarize_topics(self, num_words=20):
        """Print the `num_words` highest-probability terms of every topic."""
        print("Statistics by Topic")
        print("")
        for i in range(self.num_topics):
            print("Most %d-Frequent Words in Topic %d:" % (num_words, i))
            print(self._topic_terms(i, topn=num_words))
            print("")

    def summarize_documents(self, norm2vr):
        """Print each document's topic distribution and the terms of each listed topic."""
        docid2norm = self._docid2norm(norm2vr)
        print("Statistics by Documents")
        print("")
        for i, document in enumerate(self.documents):
            print("Topic Distribution in Document %d (correspond to %s)" % (i, docid2norm[i]))
            for topicid, prob in self.lda.get_document_topics(document):
                print("%s %s" % (topicid, prob))
                print(self._topic_terms(topicid))
            print("")

    def evaluate_topic_model(self, norm2vr, norm2prop, props):
        """Score how well each document's top topic predicts its norm's properties.

        norm2vr   -- mapping whose key order names the documents (see _docid2norm).
        norm2prop -- mapping norm -> iterable of its properties.
        props     -- iterable of all property strings; other terms are ignored
                     when picking a topic's top five.

        Prints a per-document report and returns
        (accuracies1, scores1, accuracies2, scores2, topic_ents), one entry per
        document in each list.
        """
        docid2norm = self._docid2norm(norm2vr)
        props_set = set(props)  # for faster look up.
        accuracies1, accuracies2, scores1, scores2 = [], [], [], []
        topic_ents = []
        # The top-5 properties depend only on the topic, so rank each topic's
        # full vocabulary once instead of once per document.
        top5_by_topic = {}
        for i, document in enumerate(self.documents):
            norm = docid2norm[i]  # the norm corresponding to the doc.
            topicidprob_pairs = self.lda.get_document_topics(document, minimum_probability=0)  # return all.
            topic_ent = entropy([prob for _, prob in topicidprob_pairs])  # measures topic concentration.
            topic_ents.append(topic_ent)
            # First pair with the highest probability (same tie-breaking as a
            # stable descending sort).
            top_topic, top_prob = max(topicidprob_pairs, key=itemgetter(1))
            inv_wgt = 1.0 / top_prob
                # the higher the prob, the smaller this weight.
                # therefore a low-confident top would have higher weight to balance out its prop predictions.
            if top_topic not in top5_by_topic:
                top5_by_topic[top_topic] = self._top_props(top_topic, props_set)
            top5 = top5_by_topic[top_topic]
            num_hits = len(set(top5).intersection(norm2prop[norm]))
            # scoring scheme 1: top 5, each makes up 20%
            accuracy1 = num_hits * .2
            score1 = accuracy1 * inv_wgt
            accuracies1.append(accuracy1)
            scores1.append(score1)
            # scoring scheme 2: top 5, either 1 gives 100%
            accuracy2 = 1. if num_hits else 0.
            score2 = accuracy2 * inv_wgt
            accuracies2.append(accuracy2)
            scores2.append(score2)
            # report
            print("Document %d (correspond to %s)" % (i, norm))
            print("")
            print("Top Topic: %d (prob=%.6f%%)" % (top_topic, top_prob * 100))
            print("Entropy for Topic Distribution:  %s" % topic_ent)
            print("Prediction Accuracy & Score (Scheme 1): %.6f%%, %.6f" % (accuracy1 * 100, score1))
            print("Prediction Accuracy & Score (Scheme 2): %.6f%%, %.6f" % (accuracy2 * 100, score2))
        return accuracies1, scores1, accuracies2, scores2, topic_ents
            
                                 

In [112]:
%%time
tm = TopicModel(docs)

CPU times: user 22.5 s, sys: 455 ms, total: 23 s
Wall time: 22.9 s


In [113]:
tm.summarize_topics()

Statistics by Topic

Most 20-Frequent Words in Topic 0:
[(u'made_of_metal', 0.01597430472264658), (u'used_for_transportation', 0.012737787994824001), (u'used_for_passengers', 0.011743516906459292), (u'used_on_rail_tracks', 0.010648065313182513), (u'has_wheels', 0.01043481427391808), (u'has_windows', 0.010434172086995501), (u'catch-obj', 0.0093804298293431766), (u'is_round', 0.0091654239741218932), (u'has_doors', 0.008573326991180517), (u'made_of_wood', 0.007836562045746464), (u'has_a_caboose', 0.0074365474462036287), (u'has_an_engine', 0.0069684594738258056), (u'made_of_brick', 0.0069076120636950523), (u'is_fast', 0.0060670391201919803), (u'run-subj', 0.0054597928765472998), (u'is_large', 0.0053002639162437696), (u'stop-subj', 0.0052693746411553973), (u'come-subj', 0.0049272871143913712), (u'is_expensive', 0.0048299736084594256), (u'is_tall', 0.0046416968170053885)]

Most 20-Frequent Words in Topic 1:
[(u'found_on_water', 0.061141289840650245), (u'made_of_wood', 0.045124161074476421), 

In [53]:
tm.summarize_documents(norm2vr)

Statistics by Documents

Topic Distribution in Document 0 (correspond to fawn)
30 0.285604381748
[(u'used_by_sitting_on', 0.044175469948635242), (u'has_4_legs', 0.042829583446239439), (u'is_white', 0.040198978914558189), (u'has_legs', 0.036414326814258824), (u'like-obj', 0.022802392038389928), (u'an_animal', 0.022183621143558829), (u'push-obj', 0.021958778503408168), (u'beh_-_bahs', 0.021768826345033184), (u'used_for_wool', 0.021344887011905473), (u'beh_-_eats_grass', 0.021171839060403444)]
34 0.368872551906
[(u'wear-obj', 0.12635626989168938), (u'clothing', 0.071284418805822439), (u'worn_by_women', 0.068400137558583055), (u'is_long', 0.039384157977748344), (u'made_of_material', 0.038788698851419029), (u'is_short', 0.036288354742804278), (u'is_formal', 0.026893880716255369), (u'different_styles', 0.026725921794278632), (u'is_feminine', 0.024120243200965005), (u'worn_by_girls', 0.020297096450396703)]
45 0.159908576604
[(u'like-obj', 0.049201779140040265), (u'an_animal', 0.03659567802469

In [107]:
%%time
acc1, scr1, acc2, scr2, ents = tm.evaluate_topic_model(norm2vr, norm2prop, props)

Document 0 (correspond to fawn)

Top Topic: 10 (prob=58.405347%)
Entropy for Topic Distribution:  0.922105706599
Prediction Accuracy & Score (Scheme 1): 0.000000%, 0.000000
Prediction Accuracy & Score (Scheme 2): 0.000000%, 0.000000
Document 1 (correspond to bagpipe)

Top Topic: 13 (prob=98.887640%)
Entropy for Topic Distribution:  0.11221732337
Prediction Accuracy & Score (Scheme 1): 20.000000%, 0.202250
Prediction Accuracy & Score (Scheme 2): 100.000000%, 1.011249
Document 2 (correspond to pheasant)

Top Topic: 33 (prob=69.540092%)
Entropy for Topic Distribution:  0.851705358948
Prediction Accuracy & Score (Scheme 1): 100.000000%, 1.438019
Prediction Accuracy & Score (Scheme 2): 100.000000%, 1.438019
Document 3 (correspond to chain)

Top Topic: 90 (prob=99.298149%)
Entropy for Topic Distribution:  0.0500406064014
Prediction Accuracy & Score (Scheme 1): 100.000000%, 1.007068
Prediction Accuracy & Score (Scheme 2): 100.000000%, 1.007068
Document 4 (correspond to gown)

Top Topic: 81 (p

In [114]:
print "Scheme 1"
print np.mean(acc1)*100
print np.mean(scr1)
print "Scheme 2"
print np.mean(acc2)*100
print np.mean(scr2)
print "Avg Ent"
print np.mean(ents)
print np.std(ents)

Scheme 1
46.8068833652
0.65630350233
Scheme 2
80.3059273423
1.17819174853
Avg Ent
0.715120891179
0.541728548244


In [115]:
entropy([1/100]*100)

4.6051701859880918

In [117]:
1*1

1