## Bi-TM with raw Python

* **Collapsed Gibbs Sampling** (Steyvers & Griffiths, 2007)
* **Online Variational Bayes** (Hoffman, Blei & Bach, 2010)
* **Bimodal Topic Model** (Wang, Roller and Erk, 2017)

NB: the proof-of-concept demo works with the Quantitative McRae (QMR, Herbelot & Vecchi, 2015) in a bimodal clustering task. There does not seem to be direct extension with Variational methods (as opposed to Gibbs, which does), so Gibbs can even run faster and faster as the topics concentrate. **The clustering looks reasonably even after only 10 Gibbs rounds**.

In [175]:
from __future__ import division

import time
import random
import cPickle, dill
import pandas as pd
import numpy as np

from collections import defaultdict, Counter
from itertools import chain
from operator import add
from functools import partial
from sklearn.metrics import average_precision_score
from scipy.stats.stats import pearsonr, spearmanr

In [225]:
# Helpers

class Indexer(object):
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}
    def __repr__(self):
        return str([str(self.get_object(i)) for i in xrange(0, len(self))])
    def __len__(self):
        return len(self.objs_to_ints)
    def get_object(self, index):
        if (index not in self.ints_to_objs):
            return None
        else:
            return self.ints_to_objs[index]
    def contains(self, object):
        return self.index_of(object) != -1
    def index_of(self, object):
        if (object not in self.objs_to_ints):
            return -1
        else:
            return self.objs_to_ints[object]
    def get_index(self, object, add=True):
        if not add:
            return self.index_of(object)
        if (object not in self.objs_to_ints):
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = new_idx
            self.ints_to_objs[new_idx] = object
        return self.objs_to_ints[object]
    
def normalize(arr):
    if type(arr)==list: arr = np.array(arr)
    return arr / arr.sum()

def safe_log(x, minval=0.0000000001):
    return np.log(x.clip(min=minval))

$\large{\textbf{Collapsed Gibbs}}$

In [177]:
# Load data

# data_path = '/home/jacobsuwang/Documents/CS TRAINING/VI/DATA/BI-TM/ad.p'
data_path = '/home/jacobsuwang/Documents/CS TRAINING/VI/DATA/BI-TM/qmr-freq.p'
concepts, cpt2ft, cpt2ftprob, features = cPickle.load(open(data_path, 'r'))
# cutoff = int(0.5*len(concepts))
# train_concepts, test_concepts = concepts[:cutoff], concepts[cutoff:]

# Organize input

num_topics = 10

concepts = set(concepts)
cpt_indexer = Indexer()
vr_indexer = Indexer()
ft_indexer = Indexer()
cpt2doc = defaultdict(list)

bnc_path = '/home/jacobsuwang/Documents/CS TRAINING/VI/DATA/BI-TM/bnc-triples.txt'

with open(bnc_path, 'r') as f:
    for line in f:
        line = line.strip().split()
        if len(line)!=3: continue
        cpt,dep,v = line
        if cpt in concepts:
            cpt_id = cpt_indexer.get_index(cpt)
            vr_id = vr_indexer.get_index(v+'-'+dep)
            ft = np.random.choice(cpt2ft[cpt], p=normalize(np.array(cpt2ftprob[cpt])))
            ft_id = ft_indexer.get_index(ft)
            topic = np.random.randint(0, num_topics)
            cpt2doc[cpt_id].append((vr_id, ft_id, topic))

concepts = set(cpt_indexer.objs_to_ints.keys()) # update concepts
            
print '#concepts = ' + repr(len(cpt_indexer))
print '#documents = ' + repr(len(cpt2doc))
print '#verb-roles = ' + repr(len(vr_indexer))
print '#features = ' + repr(len(ft_indexer))

#concepts = 500
#documents = 500
#verb-roles = 29124
#features = 220


In [171]:
W, F, D, T = len(vr_indexer), len(ft_indexer), len(cpt_indexer), num_topics

C_WT, C_FT, C_DT = np.zeros((W,T)), np.zeros((F,T)), np.zeros((D,T))

for cpt_id in cpt2doc.iterkeys():
    for vr_id,ft_id,topic in cpt2doc[cpt_id]:
        C_WT[vr_id,topic] += 1
        C_FT[ft_id,topic] += 1
        C_DT[cpt_id,topic] += 1
        
beta, gamma, alpha = .1, .1, .1
beta_vec = np.array([beta]*T); W_beta = W*beta_vec
gamma_vec = np.array([gamma]*T); F_gamma = F*gamma_vec
alpha_vec = np.array([alpha]*T); T_alpha = T*alpha_vec

def gibbs_sample(w_i, f_i, d): # TODO: need to offset by priors!
    a = (C_WT[w_i,:] + beta_vec) / (C_WT.sum(axis=0) + W_beta)
    b = (C_FT[f_i,:] + gamma_vec) / (C_FT.sum(axis=0) + F_gamma)
    c = (C_DT[d,:] + alpha_vec) / (C_DT[d,:].sum() + T_alpha)
    d = a * b * c
    p_z = normalize(d)
    return np.random.choice(np.arange(T), p=p_z)

def document_loglikelihood(d):
    a = C_WT / C_WT.sum(axis=0)
    b = C_DT[d,:] / C_DT[d,:].sum()
    m = a * b
    return safe_log(m.sum(axis=1)).sum() # log(W|d)

def data_loglikelihood(): # sum_d log(W|d)
    return np.array([document_loglikelihood(d) for d in cpt2doc.iterkeys()]).sum()

def predict(d, k=5):
    a = C_FT / C_FT.sum(axis=0)
    b = C_DT[d,:] / C_DT[d,:].sum()
    p_f = (a * b).sum(axis=1)
    return [ft_indexer.get_object(f_id) for f_id in np.argsort(p_f)[::-1][:k]]

def gibbs_loop(n_iter=10, n_print=1):
    for e in range(n_iter):
        start = time.time()
        for cpt_id in cpt2doc.iterkeys(): # for each doc
            for vr_id,ft_id,topic in cpt2doc[cpt_id]: # for each entry
                if C_WT[vr_id,topic]==0 or C_FT[ft_id,topic]==0 or C_DT[cpt_id,topic]==0:
                    continue
                C_WT[vr_id,topic] -= 1
                C_FT[ft_id,topic] -= 1
                C_DT[cpt_id,topic] -= 1
                new_topic = gibbs_sample(vr_id,ft_id,cpt_id)
                C_WT[vr_id,new_topic] += 1
                C_FT[ft_id,new_topic] += 1
                C_DT[cpt_id,new_topic] += 1
        if e % n_print == 0:
            print 'Epoch ' + repr(e+1) + ' time = ' + repr(time.time()-start)
            print repr(document_loglikelihood(0))

In [172]:
print "Before"
for cpt_id in cpt2doc.keys()[:5]:
    print cpt_indexer.get_object(cpt_id) + ' ' + repr(predict(cpt_id))
print
print "Training"
gibbs_loop()
print
print "After"
for cpt_id in cpt2doc.keys()[:5]:
    print cpt_indexer.get_object(cpt_id) + ' ' + repr(predict(cpt_id))

Before
level [u'made_of_paper', u'made_of_wood', u'made_of_metal', u'a_tool', u'is_round']
book [u'made_of_paper', u'made_of_wood', u'made_of_metal', u'a_tool', u'is_round']
certificate [u'made_of_paper', u'made_of_wood', u'made_of_metal', u'a_tool', u'is_round']
card [u'made_of_paper', u'made_of_wood', u'made_of_metal', u'a_tool', u'is_round']
telephone [u'made_of_paper', u'made_of_wood', u'made_of_metal', u'a_tool', u'is_round']

Training
Epoch 1 time = 210.933434009552
-346433.28793915332
Epoch 2 time = 168.16139888763428
-349299.93902262847
Epoch 3 time = 106.8126711845398
-367112.80546143511
Epoch 4 time = 60.34106707572937
-401908.83980329288
Epoch 5 time = 41.439839124679565
-460935.82489537925
Epoch 6 time = 33.05083417892456
-497566.87269232917
Epoch 7 time = 28.408526182174683
-526617.91637018719
Epoch 8 time = 25.34993600845337
-528625.00486221421
Epoch 9 time = 23.177672863006592
-530670.47874918918
Epoch 10 time = 21.459289073944092
-532923.07565391448

After
level [u'a_to

$\large{\textbf{Online Variational Bayes}}$

In [227]:
from __future__ import division

import time
import random
import numpy as np

from sklearn.preprocessing import normalize as sk_normalize
from scipy.special import gammaln
from scipy.special import psi

In [235]:
# Load data

# data_path = '/home/jacobsuwang/Documents/CS TRAINING/VI/DATA/BI-TM/ad.p'
data_path = '/home/jacobsuwang/Documents/CS TRAINING/VI/DATA/BI-TM/qmr-freq.p'
concepts, cpt2ft, cpt2ftprob, features = cPickle.load(open(data_path, 'r'))
# cutoff = int(0.5*len(concepts))
# train_concepts, test_concepts = concepts[:cutoff], concepts[cutoff:]

# Organize input

num_topics = 10

concepts = set(concepts)
cpt_indexer = Indexer()
vr_indexer = Indexer()
ft_indexer = Indexer()
cpt2vrdoc = defaultdict(list)
cpt2ftdoc = defaultdict(list)

bnc_path = '/home/jacobsuwang/Documents/CS TRAINING/VI/DATA/BI-TM/bnc-triples.txt'

with open(bnc_path, 'r') as f:
    for line in f:
        line = line.strip().split()
        if len(line)!=3: continue
        cpt,dep,v = line
        if cpt in concepts:
            cpt_id = cpt_indexer.get_index(cpt)
            vr_id = vr_indexer.get_index(v+'-'+dep)
            ft = np.random.choice(cpt2ft[cpt], p=normalize(np.array(cpt2ftprob[cpt])))
            ft_id = ft_indexer.get_index(ft)
            cpt2vrdoc[cpt_id].append(vr_id)
            cpt2ftdoc[cpt_id].append(ft_id)

concepts = set(cpt_indexer.objs_to_ints.keys()) # update concepts
            
print '#concepts = ' + repr(len(cpt_indexer))
print '#documents = ' + repr(len(cpt2vrdoc)); assert len(cpt2vrdoc)==len(cpt2ftdoc)
print '#verb-roles = ' + repr(len(vr_indexer))
print '#features = ' + repr(len(ft_indexer))

#concepts = 500
#documents = 500
#verb-roles = 29124
#features = 220


In [236]:
D = len(cpt_indexer)
K = num_topics
W = len(vr_indexer)
F = len(ft_indexer)

alpha = 0.1
eta   = 0.01
zeta  = 0.01
tau0  = 1.0
kappa = 0.5

def rho(t):
    return np.power(tau0 + t, -kappa)

alpha_v = np.array([alpha]*K)[:,np.newaxis] # (K,1)
eta_m   = np.zeros((K, W)); eta_m.fill(eta)
zeta_m  = np.zeros((K, F)); zeta_m.fill(eta)

# todo: term4 (NB: haven't worked out the ELBO)
#       if this works, monitor MAP score of property predictions
#       instead!

lambda_m = np.random.gamma(100, 1/100, (K, W))
xi_m     = np.random.gamma(100, 1/100, (K, F))

# todo: predict p(f|d) (require repackaging, because ...)
def predict(topic, k=5):
    w_ids = np.argsort(lambda_m[topic]/lambda_m[topic].sum())[::-1][:k]
    f_ids = np.argsort(xi_m[topic]/xi_m[topic].sum())[::-1][:k]
    vrs   = [vr_indexer.get_object(w_id) for w_id in w_ids]
    fts   = [ft_indexer.get_object(f_id) for f_id in f_ids]
    return vrs, fts
# todo: print before results

num_epochs    = 5
num_iters     = 5
epsilon       = 1e-3
docidxs       = np.arange(D)
# todo: batch matrix (may be not able to do batch)
batch_size    = 10
t             = 0 # time step
for e in range(num_epochs):
    
    start = time.time()
    
    for _ in range(int(D/batch_size)):
        batch_docidxs = np.random.choice(docidxs,batch_size,replace=False)
        phi_w_m = np.zeros((K, W)) # batch cache of phi info
        phi_f_m = np.zeros((K, F))    
        for d in batch_docidxs:
            wf_pairs = zip(cpt2vrdoc[d],cpt2ftdoc[d])
            N_d      = len(wf_pairs)
            shift    = 0.0
            gamma_d_v = np.random.gamma(100, 1/100, (K, 1))
            for _ in range(num_iters):
                phi_d_m = np.zeros((K,N_d)) # (K,N_d)
                for i,(w,f) in enumerate(wf_pairs): # d,w,f are int ids
                    Eq_logtheta_d = psi(gamma_d_v) - psi(gamma_d_v.sum()) # (K,1)
                    Eq_logbeta_w  = psi(lambda_m[:,w][:,np.newaxis]) \
                                    - psi(lambda_m.sum(axis=1)[:,np.newaxis]) # (K,1)
                    Eq_logdelta_f = psi(xi_m[:,f][:,np.newaxis]) \
                                    - psi(xi_m.sum(axis=1)[:,np.newaxis]) # (K,1)
                    phi_d_m[:,i]  = np.exp(Eq_logtheta_d+Eq_logbeta_w+Eq_logdelta_f).T # update the ith column
                    phi_i_v = sk_normalize([phi_d_m[:,i]], 'l1')[0] # (K,), phi info for the ith entry
                    phi_d_m[:,i] = phi_i_v # replace the original with normalized
                    phi_w_m[:,w] += phi_i_v # update for corresponding word
                    phi_f_m[:,f] += phi_i_v # update for corresponding feature
                # todo: select cols to update 
                gamma_d_v_next = alpha_v + phi_d_m.sum(axis=1)[:,np.newaxis] # (K,1)
                shift = (1/K)*np.abs(gamma_d_v - gamma_d_v_next).sum()
                if shift < epsilon:
                    break
                gamma_d_v = gamma_d_v_next
                
        t            += 1
        lambda_hat_m = eta_m + (D/batch_size)*phi_w_m
        lambda_m     = (1-rho(t))*lambda_m + rho(t)*lambda_hat_m
        xi_hat_m     = zeta_m + (D/batch_size)*phi_f_m
        xi_m         = (1-rho(t))*xi_m + rho(t)*xi_hat_m
    
    print 'Epoch ' + repr(e+1) + ' (time elapsed: ' + "%.2f secs" % (time.time()-start) + ')'    

Epoch 1 (time elapsed: 268.46 secs)
Epoch 2 (time elapsed: 254.09 secs)
Epoch 3 (time elapsed: 298.25 secs)
Epoch 4 (time elapsed: 254.21 secs)
Epoch 5 (time elapsed: 261.84 secs)


In [237]:
for topic in range(K):
    vrs,fts = predict(topic)
    print 'Topic ' + repr(topic)
    print 'contexts: ' + repr(vrs)
    print 'properties: ' + repr(fts)
    print

Topic 0
contexts: ['high-amod', 'low-amod', 'sea-compound', 'local-amod', 'different-amod']
properties: [u'a_tool', u'made_of_wood', u'used_for_construction', u'used_for_carpentry', u'is_hard']

Topic 1
contexts: ['little-amod', 'old-amod', 'young-amod', 'large-amod', 'come-nsubj']
properties: [u'an_animal', u'a_mammal', u'has_legs', u'has_4_legs', u'has_a_tail']

Topic 2
contexts: ['hit-dobj', 'come-nsubj', 'use-dobj', 'new-amod', 'catch-dobj']
properties: [u'is_fun', u'used_for_transportation', u'a_tool', u'made_of_wood', u'has_wheels']

Topic 3
contexts: ['dressing-compound', 'stone-compound', 'new-amod', 'big-amod', 'white-amod']
properties: [u'found_in_houses', u'used_for_protection', u'made_of_wood', u'has_buttons', u'is_electrical']

Topic 4
contexts: ['read-dobj', 'write-dobj', 'new-amod', 'seat-compound', 'frying-compound']
properties: [u'made_of_paper', u'has_a_lid', u'used_for_holding_things', u'is_round', u'is_transparent']

Topic 5
contexts: ['play-dobj', 'diamond-compound