## Hoffman et al. (2010)

* **Batch Variational Bayes**
* **Online Variational Bayes (Stochastic)**
* **Online Variational Bayes (Mini-batch)**

In [1]:
from nltk.corpus import brown
from string import punctuation as punc
from spacy.en import STOPWORDS
from sklearn.preprocessing import normalize
from collections import Counter

In [3]:
# Corpus-wide frequency of every lowercased Brown token; `strip` consults
# these counts to discard very frequent and very rare words.
word2count = Counter(word.lower() for word in brown.words())

In [90]:
def strip(word):
    """Return the lowercased form of `word` if it is kept, otherwise False.

    A word is rejected when it is a stop word, is found inside the
    punctuation string, occurs more than 1000 or fewer than 10 times in the
    corpus (according to `word2count`), or is not purely alphabetic.
    """
    token = word.lower()
    if token in STOPWORDS:
        return False
    if token in punc:
        return False
    freq = word2count[token]
    if freq > 1000 or freq < 10:
        return False
    if not token.isalpha():
        return False
    return token
def strip_doc(doc):
    """Clean a document given as a list of raw words.

    Every word is passed through `strip`; rejected words are dropped and the
    cleaned forms of the remaining words are returned in their original order.
    """
    cleaned = (strip(raw_word) for raw_word in doc)
    return [token for token in cleaned if token]

In [91]:
vocab = list(set(word for word in brown.words() if strip(word)))
fileids = brown.fileids()
i2w = {i:w for i,w in enumerate(vocab)}
w2i = {w:i for i,w in i2w.iteritems()}
print 'vocab size:', len(vocab)
print '# files:', len(fileids)

vocab size: 11554
# files: 500


In [92]:
brown_docs = [strip_doc(brown.words(fileids=fileid)) for fileid in fileids] 
print len(brown_docs)

500


In [144]:
import numpy as np
import random
import time
import pandas as pd
from __future__ import division
from scipy.special import digamma
from scipy.special import gammaln

### Batch Variational Bayes

In [147]:
# Dense bag-of-words representation: N[d] is a count vector for document d
# whose entries follow the order of `vocab`.
N = {}
for d, doc in enumerate(brown_docs):
    w2c = Counter(doc)
    N[d] = np.array([w2c[w] for w in vocab])

# Total number of (kept) tokens in the corpus; used to turn the ELBO into a
# per-token perplexity.
num_tokens = sum(doc_counts.sum() for doc_counts in N.values())

In [149]:
# --- model size ---
K = 10            # number of topics
W = len(vocab)    # vocabulary size

# --- symmetric Dirichlet hyperparameters ---
alpha = 0.1       # added to the expected topic counts when updating Gamma_d
eta   = 0.01      # added to the expected word counts when updating Lambda

# --- training schedule ---
num_docs   = len(N)
num_epochs = 5    # full passes over the corpus
num_iters  = 10   # maximum E-step iterations per document

In [151]:
%%time

Alpha = np.ones(K); Alpha.fill(alpha)
Eta   = np.ones((K, W)); Eta.fill(eta)

term4 = gammaln(K*alpha) - K*gammaln(alpha) + (gammaln(W*eta) - W*gammaln(eta))/num_docs

Lambda = np.random.gamma(100., 1./100, (K, W))

for e in range(num_epochs):
    
    print 'Epoch', e+1 
    L = 0
    
    t = time.time()
    
    n_Phi = np.zeros((K, W))
    for d in range(num_docs):
        
        n_d = N[d]

        shift = np.inf
        epsilon = 1e-3

        Gamma_d = np.random.gamma(100., 1./100, K)                  # K
        for _ in range(num_iters):

            sum_Gamma_d = Gamma_d.sum()                             # 1
            sum_Lambda  = Lambda.sum(axis=1)                        # K x 1
            
            Eq_logtheta_d = digamma(Gamma_d) - digamma(sum_Gamma_d)               # K - 1 = K
            Eq_logbeta    = digamma(Lambda) - digamma(sum_Lambda[:,np.newaxis])   # K x W - K x 1 = K x W
            
            Eq_logtheta_add_Eq_logbeta = Eq_logtheta_d[:,np.newaxis] + Eq_logbeta # for likelihood computation later.

            Phi_d = np.exp(Eq_logtheta_add_Eq_logbeta)  # K x W
            Phi_d = normalize(Phi_d, norm='l1', axis=0) # each word relates to a mult. over topics
            n_Phi_d = Phi_d * n_d # K x W

            Gamma_delta = Alpha + n_Phi_d.sum(axis=1) # K

            shift = (1/K)*abs(Gamma_d - Gamma_delta).sum()
            if shift < epsilon:
                break
                
            Gamma_d = Gamma_delta
        
        if d%50==0:
            print '... current shift =', shift

        n_Phi += n_Phi_d
        
        # COMPUTE LIKELIHOOD (at some point)
        term1 = (n_d * (Phi_d * (Eq_logtheta_add_Eq_logbeta - np.log(Phi_d))).sum(axis=0)).sum()
        term2 = -gammaln(sum_Gamma_d) + ((Alpha-Gamma_d)*Eq_logtheta_d + gammaln(Gamma_d)).sum()
        term3 = (-gammaln(sum_Lambda) + ((Eta-Lambda)*Eq_logbeta + gammaln(Lambda)).sum(axis=1)).sum() / num_docs 
        l = term1 + term2 + term3 + term4 
        L += l

    Lambda = Eta + n_Phi # K x W

    print "@epoch", e+1
    print "   ELBO =", L
    print "   Perplexity =", np.exp(-(L/num_tokens))

    print '(time elapsed:', str(time.time()-t)+')'


Epoch 1
... current shift = 0.572711781583
... current shift = 0.414873927668
... current shift = 0.74968012405
... current shift = 0.767346051529
... current shift = 0.561619147587
... current shift = 0.676928498971
... current shift = 0.675965892509
... current shift = 0.563462110808
... current shift = 0.708754030714
... current shift = 0.537567183138
@epoch 1
   ELBO = -3277734.55452
   Perplexity = 15217.8956769
(time elapsed: 51.6999080181)
Epoch 2
... current shift = 1.35892358408
... current shift = 1.05407248658
... current shift = 1.95318664544
... current shift = 1.36361188849
... current shift = 0.921860171214
... current shift = 1.1463568151
... current shift = 2.03817040101
... current shift = 1.62714043656
... current shift = 1.18262902962
... current shift = 1.18801256657
@epoch 2
   ELBO = -2665063.41491
   Perplexity = 2515.31692128
(time elapsed: 47.8911709785)
Epoch 3
... current shift = 2.81271704324
... current shift = 2.2642581148
... current shift = 3.6772839987

In [152]:
def top_words(topic, topk=20):
    word_dist = Lambda[topic] / Lambda[topic].sum()
    print map(lambda i:i2w[i], np.argsort(word_dist)[::-1][:topk])

In [153]:
for topic in range(K):
    print 'Topic:', topic
    top_words(topic, 20)

Topic: 0
[u'world', u'life', u'social', u'great', u'human', u'way', u'people', u'group', u'experience', u'men', u'years', u'sense', u'fact', u'society', u'work', u'art', u'power', u'war', u'good', u'feed']
Topic: 1
[u'college', u'people', u'school', u'years', u'students', u'course', u'stress', u'student', u'class', u'service', u'medical', u'year', u'stations', u'high', u'world', u'national', u'work', u'fact', u'good', u'president']
Topic: 2
[u'church', u'use', u'school', u'area', u'clay', u'economic', u'social', u'god', u'costs', u'people', u'life', u'index', u'churches', u'world', u'members', u'program', u'action', u'cost', u'electronic', u'place']
Topic: 3
[u'state', u'law', u'tax', u'year', u'policy', u'states', u'number', u'program', u'development', u'public', u'years', u'local', u'property', u'government', u'problem', u'fiscal', u'business', u'world', u'general', u'economic']
Topic: 4
[u'states', u'united', u'general', u'use', u'government', u'shall', u'business', u'cost', u'small

### Online Variational Bayes (Stochastic)

In [155]:
# Dense bag-of-words representation: N[d] is a count vector for document d
# whose entries follow the order of `vocab`.
N = {}
for d, doc in enumerate(brown_docs):
    w2c = Counter(doc)
    N[d] = np.array([w2c[w] for w in vocab])

# Total number of (kept) tokens in the corpus; used to turn the ELBO into a
# per-token perplexity.
num_tokens = sum(doc_counts.sum() for doc_counts in N.values())

In [156]:
# --- model size ---
K = 10            # number of topics
W = len(vocab)    # vocabulary size

# --- symmetric Dirichlet hyperparameters ---
alpha = 0.1
eta   = 0.01

# --- step-size schedule: rho(t) = (tau0 + t) ** (-kappa) ---
tau0  = 1.0       # offset added to t
kappa = 0.5       # decay exponent

def rho(t):
    """Weight given to the new Lambda estimate when blending it into Lambda."""
    return np.power(tau0 + t, -kappa)

# --- training schedule ---
num_docs   = len(N)
num_epochs = 5    # need not exceed 1: with a very large corpus a single pass is enough
num_iters  = 10   # maximum E-step iterations per document

# Document indices; the training cell shuffles this list in place each epoch.
doc_ids = list(range(num_docs))

In [158]:
%%time

Alpha = np.ones(K); Alpha.fill(alpha)
Eta   = np.ones((K, W)); Eta.fill(eta)

term4 = gammaln(K*alpha) - K*gammaln(alpha) + (gammaln(W*eta) - W*gammaln(eta))/num_docs

Lambda = np.random.gamma(100., 1./100, (K, W))

for e in range(num_epochs):
    
    print 'Epoch:', e+1
    L = 0

    random.shuffle(doc_ids)
    for d in doc_ids:

        n_d = N[d]

        shift = np.inf
        epsilon = 1e-3

        Gamma_d = np.random.gamma(100., 1./100, K)                  # K
        for _ in range(num_iters):

            sum_Gamma_d = Gamma_d.sum()                             # 1
            sum_Lambda  = Lambda.sum(axis=1)                        # K x 1

            Eq_logtheta_d = digamma(Gamma_d) - digamma(sum_Gamma_d)               # K - 1 = K
            Eq_logbeta    = digamma(Lambda) - digamma(sum_Lambda[:,np.newaxis])   # K x W - K x 1 = K x W
            
            Eq_logtheta_add_Eq_logbeta = Eq_logtheta_d[:,np.newaxis] + Eq_logbeta # for likelihood computation later.

            Phi_d = np.exp(Eq_logtheta_add_Eq_logbeta)  # K x W
            Phi_d = normalize(Phi_d, norm='l1', axis=0) # each word relates to a mult. over topics
            n_Phi_d = Phi_d * n_d # K x W

            Gamma_delta = Alpha + n_Phi_d.sum(axis=1) # K

            shift = (1/K)*abs(Gamma_d - Gamma_delta).sum()
            if shift < epsilon:
                break

            Gamma_d = Gamma_delta

        if d%50==0:
            print '... current shift =', shift

        Lambda_hat = Eta + num_docs*n_Phi_d # K x W
        Lambda = (1 - rho(d))*Lambda + rho(d)*Lambda_hat
        
        # COMPUTE LIKELIHOOD (at some point)
        term1 = (n_d * (Phi_d * (Eq_logtheta_add_Eq_logbeta - np.log(Phi_d))).sum(axis=0)).sum()
        term2 = -gammaln(sum_Gamma_d) + ((Alpha-Gamma_d)*Eq_logtheta_d + gammaln(Gamma_d)).sum()
        term3 = (-gammaln(sum_Lambda) + ((Eta-Lambda)*Eq_logbeta + gammaln(Lambda)).sum(axis=1)).sum() / num_docs 
        l = term1 + term2 + term3 + term4
        L += l
    
    print "@epoch", e+1
    print "   ELBO =", L
    print "   Perplexity =", np.exp(-(L/num_tokens))    

    print '(time elapsed:', str(time.time()-t)+')'

Epoch: 1
... current shift = 0.362500538985
... current shift = 0.127364215178
... current shift = 0.000908639833014
... current shift = 0.000838237428591
... current shift = 0.000543284603562
... current shift = 0.808451832755
... current shift = 0.172544761524
... current shift = 0.237097432929
... current shift = 0.0166639313605
... current shift = 0.0209491011069
@epoch 1
   ELBO = -4554554.33239
   Perplexity = 647982.274947
(time elapsed: 753.123986006)
Epoch: 2
... current shift = 0.00104451531615
... current shift = 0.000445520845658
... current shift = 0.000896192262166
... current shift = 0.000423511232513
... current shift = 0.000792958741359
... current shift = 0.000542594748382
... current shift = 0.000687762180361
... current shift = 0.000780057671401
... current shift = 0.000212051581171
... current shift = 0.00572045628016
@epoch 2
   ELBO = -4447472.03846
   Perplexity = 473072.009268
(time elapsed: 798.888237)
Epoch: 3
... current shift = 0.00974466854595
... current 

In [159]:
def top_words(topic, topk=20):
    word_dist = Lambda[topic] / Lambda[topic].sum()
    print map(lambda i:i2w[i], np.argsort(word_dist)[::-1][:topk])

In [160]:
for topic in range(K):
    print 'Topic:', topic
    top_words(topic, 20)

Topic: 0
[u'state', u'candidate', u'committee', u'democratic', u'issue', u'court', u'administration', u'aid', u'announced', u'nomination', u'president', u'department', u'federal', u'political', u'governor', u'chairman', u'senate', u'legislation', u'plans', u'congress']
Topic: 1
[u'primary', u'trial', u'think', u'open', u'help', u'land', u'going', u'care', u'april', u'dead', u'safety', u'red', u'right', u'showed', u'ahead', u'muscle', u'small', u'proud', u'knew', u'later']
Topic: 2
[u'number', u'medium', u'areas', u'research', u'cells', u'conservation', u'normal', u'major', u'health', u'provide', u'degree', u'available', u'activity', u'obtained', u'costs', u'plastics', u'increased', u'use', u'increase', u'results']
Topic: 3
[u'home', u'night', u'asked', u'school', u'house', u'hospital', u'day', u'old', u'days', u'sheets', u'held', u'told', u'death', u'mother', u'took', u'long', u'office', u'ask', u'place', u'sign']
Topic: 4
[u'president', u'tax', u'government', u'medical', u'statements'

### Online Variational Bayes (Mini-Batch)

In [161]:
# Dense bag-of-words representation: N[d] is a count vector for document d
# whose entries follow the order of `vocab`.
N = {}
for d, doc in enumerate(brown_docs):
    w2c = Counter(doc)
    N[d] = np.array([w2c[w] for w in vocab])

# Total number of (kept) tokens in the corpus; used to turn the ELBO into a
# per-token perplexity.
num_tokens = sum(doc_counts.sum() for doc_counts in N.values())

In [162]:
# --- model size ---
K = 10            # number of topics
W = len(vocab)    # vocabulary size

# --- symmetric Dirichlet hyperparameters ---
# NOTE(review): alpha is 0.01 here but 0.1 in the batch and stochastic
# sections -- confirm whether the difference is intentional.
alpha = 0.01
eta   = 0.01

# --- step-size schedule: rho(t) = (tau0 + t) ** (-kappa) ---
tau0  = 1.0       # offset added to t
kappa = 0.5       # decay exponent

def rho(t):
    """Weight given to the new Lambda estimate when blending it into Lambda."""
    return np.power(tau0 + t, -kappa)

# --- training schedule ---
num_docs   = len(N)
num_epochs = 5    # need not exceed 1: with a very large corpus a single pass is enough
num_iters  = 10   # maximum E-step iterations per document

batch_size = 10   # documents accumulated before each Lambda update

# Document indices; the training cell shuffles this list in place each epoch.
doc_ids = list(range(num_docs))

In [164]:
%%time

Alpha = np.ones(K); Alpha.fill(alpha)
Eta   = np.ones((K, W)); Eta.fill(eta)

term4 = gammaln(K*alpha) - K*gammaln(alpha) + (gammaln(W*eta) - W*gammaln(eta))/num_docs

Lambda = np.random.gamma(100., 1./100, (K, W))

doc_count = 0
n_Phi = np.zeros((K, W))

for e in range(num_epochs):
    
    print 'Epoch:', e+1
    L = 0
    
    random.shuffle(doc_ids)
    for d in doc_ids:
        
        doc_count += 1

        n_d = N[d]

        shift = np.inf
        epsilon = 1e-3

        Gamma_d = np.random.gamma(100., 1./100, K)                  # K
        for _ in range(num_iters):

            sum_Gamma_d = Gamma_d.sum()                             # 1
            sum_Lambda  = Lambda.sum(axis=1)[:,np.newaxis]          # K x 1

            Eq_logtheta_d = digamma(Gamma_d) - digamma(sum_Gamma_d) # K - 1 = K
            Eq_logbeta    = digamma(Lambda) - digamma(sum_Lambda)   # K x W - K x 1 = K x W

            Phi_d = np.exp(Eq_logtheta_d[:,np.newaxis] + Eq_logbeta) # K x W
            Phi_d = normalize(Phi_d, norm='l1', axis=0) # each word relates to a mult. over topics
            n_Phi_d = Phi_d * n_d # K x W

            Gamma_delta = Alpha + n_Phi_d.sum(axis=1) # K

            shift = (1/K)*abs(Gamma_d - Gamma_delta).sum()
            if shift < epsilon:
                break

            Gamma_d = Gamma_delta
            
        if d%50==0:
            print '... current shift =', shift

        n_Phi += n_Phi_d
        
        # COMPUTE LIKELIHOOD (at some point)
        term1 = (n_d * (Phi_d * (Eq_logtheta_add_Eq_logbeta - np.log(Phi_d))).sum(axis=0)).sum()
        term2 = -gammaln(sum_Gamma_d) + ((Alpha-Gamma_d)*Eq_logtheta_d + gammaln(Gamma_d)).sum()
        term3 = (-gammaln(sum_Lambda) + ((Eta-Lambda)*Eq_logbeta + gammaln(Lambda)).sum(axis=1)).sum() / num_docs 
        l = term1 + term2 + term3 + term4
        L += l        
        
        if doc_count%batch_size==0:      
            Lambda_hat = Eta + (num_docs/batch_size)*n_Phi # K x W
            Lambda = (1 - rho(d))*Lambda + rho(d)*Lambda_hat
            n_Phi = np.zeros((K, W))

    print "@epoch", e+1
    print "   ELBO =", L
    print "   Perplexity =", np.exp(-(L/num_tokens))             
            
    print '(time elapsed:', str(time.time()-t)+')'

Epoch: 1
... current shift = 0.918569088914
... current shift = 2.09215378584
... current shift = 2.28966296829
... current shift = 3.25613512494
... current shift = 1.29245904396
... current shift = 0.863406840169
... current shift = 1.12765526966
... current shift = 1.80294429832
... current shift = 0.139416608273
... current shift = 0.377625763314
@epoch 1
   ELBO = -28516065.3403
   Perplexity = 2.43331079158e+36
(time elapsed: 1047.57516408)
Epoch: 2
... current shift = 0.475328863889
... current shift = 0.293460987258
... current shift = 0.288433391785
... current shift = 0.592354977604
... current shift = 0.0729993856156
... current shift = 0.340496535977
... current shift = 0.176994585004
... current shift = 0.0733331505887
... current shift = 0.194059968369
... current shift = 0.106563928432
@epoch 2
   ELBO = -25963585.4265
   Perplexity = 1.34666702412e+33
(time elapsed: 1096.67845511)
Epoch: 3
... current shift = 0.153179627433
... current shift = 0.364054970768
... current

In [167]:
def top_words(topic, topk=20):
    word_dist = Lambda[topic] / Lambda[topic].sum()
    print map(lambda i:i2w[i], np.argsort(word_dist)[::-1][:topk])

In [168]:
for topic in range(K):
    print 'Topic:', topic
    top_words(topic, 20)

Topic: 0
[u'state', u'states', u'united', u'government', u'community', u'local', u'development', u'program', u'public', u'business', u'year', u'department', u'general', u'national', u'areas', u'act', u'service', u'cent', u'society', u'fiscal']
Topic: 1
[u'years', u'year', u'day', u'city', u'john', u'home', u'week', u'school', u'good', u'house', u'men', u'days', u'board', u'high', u'president', u'radio', u'farm', u'great', u'north', u'war']
Topic: 2
[u'music', u'labor', u'radiation', u'moon', u'level', u'opportunity', u'earth', u'stock', u'programs', u'condition', u'estate', u'individual', u'observations', u'systems', u'stations', u'appeal', u'ship', u'station', u'decision', u'marine']
Topic: 3
[u'values', u'number', u'class', u'order', u'point', u'different', u'analysis', u'change', u'theory', u'value', u'image', u'lines', u'larger', u'line', u'particular', u'skywave', u'obvious', u'general', u'paper', u'space']
Topic: 4
[u'world', u'life', u'people', u'great', u'fact', u'way', u'sense