<a href="https://colab.research.google.com/github/tanthongtan/ptm/blob/master/vptm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameters

In [None]:
# Experiment selection.
num_topic = 50
dataset = 'nips'
method = 'vptm'

# vptm hyperparameters
alpha_scalar = 50.0 / num_topic  # fill value for the (1, num_topic) alpha tensor
v = 0.01   # handed to the joint distribution together with mu0
c = 0.005  # handed to the joint distribution together with alpha
prior_mu = 'neg'  # possible options: neg, pos, mean

# GMC hyperparameters
num_samples = 1   # iterations accumulated after burn-in
num_burn = 5000   # iterations run before any sample is accumulated
S = 25000         # maximum number of documents per minibatch / evaluation chunk
L = 20            # passed to GeodesicMonteCarlo; presumably steps per transition (confirm)
# eta_* are the initial step sizes (the sampling loop decays them as
# (i + 1) ** (-1/5)); rho_* are passed unchanged to the geodesic constructors.
eta_theta, rho_theta = 1e-1, 1e-1
eta_mu, rho_mu = 5e-4, 1e-1
eta_kappa, rho_kappa = 5e-1, 1e-1

# Run GMC Inference

In [None]:
#only for google colab
import sys
import os
# Colab-only bootstrap: the whole branch is skipped when the google.colab
# module has not been loaded into this interpreter.
if 'google.colab' in sys.modules:
    # Print the nvidia-smi report so the log shows which GPU this session got.
    !nvidia-smi
    # Clone the ptm repository and make it the working directory
    # (presumably so the project-local imports that follow can be resolved).
    !git clone https://github.com/tanthongtan/ptm.git
    %cd '/content/ptm'
    # Extract the reference corpus only if the wiki_final directory is missing.
    # NOTE(review): the archive is read from a Google Drive path, but nothing
    # here mounts Drive -- presumably it is mounted beforehand; confirm.
    if not os.path.isdir('wiki_final'):
        !unzip -q "/content/drive/My Drive/wiki_final.zip"
 
import torch
import torch.nn.functional as F
from geodesic import GeodesicMonteCarlo
from dataset import load_data, csr_to_torchsparse
import geodesic as g
import distributions as D
from tqdm.notebook import tqdm
import torch.distributions as dist
import numpy as np
from utils import print_topics, get_topics, vmf_perplexity, clustering_metrics_20news, print_summary
 
# Choose the default tensor type once for the whole run:
#   * CUDA available -> torch.cuda.FloatTensor (GPU, single precision)
#   * otherwise      -> torch.DoubleTensor     (CPU, double precision)
# `gpu` records which of the two was selected.
gpu = torch.cuda.is_available()
torch.set_default_tensor_type(torch.cuda.FloatTensor if gpu else torch.DoubleTensor)
 
# Load the train/test splits, vocabulary and sizes for the selected dataset.
# The preprocessing flags are forwarded to load_data as keyword arguments.
data_tr, data_te, vocab, vocab_size, num_tr = load_data(
    use_tfidf=True,
    sublinear=False,
    normalize=True,
    dataset=dataset,
)
# Torch sparse versions of both splits (test first, then train).
tensor_te = csr_to_torchsparse(data_te, gpu)
tensor_tr = csr_to_torchsparse(data_tr, gpu)

# Declare tensor hyperparameters.
# alpha: (1, num_topic) tensor with every entry equal to alpha_scalar.
alpha = torch.full((1, num_topic), alpha_scalar)

# mu0: L2-normalised vector of length vocab_size, selected by `prior_mu`:
#   'neg'  -> all entries equal and negative
#   'pos'  -> all entries equal and positive
#   'mean' -> the dim-0 sum of the training tensor, normalised
# The options are mutually exclusive, and an unrecognised value is rejected
# here rather than surfacing later as a NameError on `mu0`.
if prior_mu == 'neg':
    mu0 = F.normalize(torch.full((vocab_size,), -1.0), dim=-1)
elif prior_mu == 'pos':
    mu0 = F.normalize(torch.full((vocab_size,), 1.0), dim=-1)
elif prior_mu == 'mean':
    mu0 = F.normalize(torch.sparse.sum(tensor_tr, dim=0).to_dense(), dim=-1)
else:
    raise ValueError(
        "prior_mu must be one of 'neg', 'pos' or 'mean', got %r" % (prior_mu,)
    )
 
# Random starting point for the sampler. The three randn draws happen in the
# same order as before (theta, then mu, then kappa).
theta = torch.randn(num_tr, num_topic - 1)

# mu: small Gaussian noise (scaled by 1 / sqrt(vocab_size)) around mu0,
# with each row renormalised to unit L2 norm.
_sqrt_vocab = vocab_size ** 0.5
_mu_noise = torch.randn(num_topic, vocab_size) / _sqrt_vocab
mu = F.normalize(_mu_noise + mu0, p=2, dim=-1)

# kappa: one value per topic, centred on 50 * sqrt(vocab_size) with scale 50.
kappa = 50. * _sqrt_vocab + 50 * torch.randn(num_topic, 1)
 
# Declare the GMC transition kernel and per-parameter state.
kernel = GeodesicMonteCarlo(L)

# Current parameter values, keyed by name.
params = {'theta': theta, 'mu': mu, 'kappa': kappa}

# Initial step sizes; the sampling loop rescales geodesics[name].eta from these.
init_etas = {'theta': eta_theta, 'mu': eta_mu, 'kappa': eta_kappa}

# One geodesic object per parameter, each built with its own eta/rho.
geodesics = {
    'theta': g.RnGeodesic(eta=eta_theta, rho=rho_theta),
    'mu': g.SphericalGeodesic(eta=eta_mu, rho=rho_mu),
    'kappa': g.PositiveGeodesic(eta=eta_kappa, rho=rho_kappa),
}

# Initial velocities: i.i.d. standard-normal noise with the same shape as each
# parameter, passed through that parameter's geodesic projection.
# torch.randn_like draws the same distribution as a MultivariateNormal with
# zero mean and identity covariance, without building (and factorising) a
# dense d x d identity matrix, which for 'mu' is vocab_size x vocab_size.
vs = {
    name: geodesics[name].projection(params[name], torch.randn_like(params[name]))
    for name in params
}
 
#start sampling loop
t = tqdm(range(num_samples+num_burn))

# Running sums of the post-burn-in draws. They start as the int 0 and become
# tensors on the first accumulation; if the loop stops before burn-in ends
# they are still 0.
theta_samples = 0
mu_samples = 0
kappa_samples = 0

for i in t:

    # Random minibatch of at most S training rows.
    idx = torch.randperm(num_tr)[:S]
    x_batch = csr_to_torchsparse(data_tr[idx.cpu()], gpu)

    # The kernel only sees the minibatch rows of theta and of its velocity;
    # the full tensors are kept aside and updated after the transition.
    theta = params['theta']
    v_theta = vs['theta']
    params['theta'] = theta[idx]
    vs['theta'] = v_theta[idx]

    # Decay every step size as init_eta * (i + 1) ** (-1/5).
    step_decay = (i+1) ** (-1./5.)
    for name in geodesics:
        geodesics[name].eta = init_etas[name] * step_decay
    params, vs = kernel.stochastic_transition(params, vs, geodesics, D.VptmJointDistributionWithStickDirConjugatePrior(x_batch, alpha, c, mu0, v))

    # Write the updated minibatch rows back into the full tensors and
    # reinstate the full tensors in the state dicts.
    theta[idx] = params['theta']
    v_theta[idx] = vs['theta']
    params['theta'] = theta
    vs['theta'] = v_theta

    kappa = params['kappa']
    mu = params['mu']

    # Stop as soon as kappa contains a NaN, and say so instead of ending the
    # loop silently.
    if torch.isnan(kappa).any():
        print("\nstopping at iteration", i, "because kappa contains NaN")
        break

    # Accumulate draws once burn-in is over.
    if i >= num_burn:
        theta_samples += theta
        mu_samples += mu
        kappa_samples += kappa

    # Periodic diagnostics.
    if i % 100 == 0:
        print("\ncurrent iteration:", i)
        print("kappa mean",kappa.mean())
        print("kappas",kappa)
        pi = dist.StickBreakingTransform()(theta)
        print("mu norms", mu.norm(dim=-1).sum(), num_topic)
        print("sparsity",(torch.abs(mu)**2.).norm(dim=-1))
        print("sparsitymean",(torch.abs(mu)**2.).norm(dim=-1).mean())
        print("pi sums", pi.sum(dim=-1).sum(), num_tr)

        # Training log-likelihood and cosine similarity, computed over
        # consecutive chunks of S rows (the last chunk may be shorter).
        sum_ll = 0.0
        sum_cs = 0.0
        for start in range(0, num_tr, S):
            curr_pi = pi[start:start+S]
            curr_tensor_tr = csr_to_torchsparse(data_tr[start:start+S], gpu)
            curr_avg = torch.matmul(curr_pi,kappa*mu)
            sum_ll += D.log_prob_von_mises_fisher(curr_avg, curr_tensor_tr).sum()
            curr_avg = F.normalize(curr_avg,dim=-1)
            sum_cs += D.sparse_dense_dot(curr_tensor_tr, curr_avg).sum()

        print("log likelihood", sum_ll / num_tr)
        print("cosine similarity", sum_cs / num_tr)
        print("perplexity", vmf_perplexity(tensor_te, mu, kappa, alpha, N=1000))

        # Mean dot product over all unordered pairs of distinct topic vectors:
        # the strict upper triangle of mu @ mu^T holds each pair exactly once.
        count_cs = num_topic * (num_topic - 1) // 2
        sum_cs_spread = torch.triu(torch.matmul(mu, mu.t()), diagonal=1).sum()
        print("mean cs spread", sum_cs_spread / count_cs,"\n")

    # Less frequent: print the current topics.
    if i % 1000 == 0:
        emb = mu.cpu().numpy()
        print_topics(get_topics(emb,vocab))
        print("")


Fri Dec  4 09:16:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    27W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


current iteration: 0
kappa mean tensor(5172.3379)
kappas tensor([[5083.4648],
        [5119.6885],
        [5017.8979],
        [5009.2305],
        [5119.1484],
        [5061.9297],
        [5106.7583],
        [5270.1182],
        [5100.1440],
        [5122.5825],
        [5028.4761],
        [5111.4448],
        [5136.0039],
        [5128.9897],
        [5065.4106],
        [5229.4136],
        [5189.0845],
        [5225.4443],
        [5139.1753],
        [5151.6855],
        [5183.7974],
        [5111.7266],
        [5113.9839],
        [5139.1299],
        [5101.2417],
        [5145.9336],
        [5209.9116],
        [5141.1841],
        [5234.7539],
        [5195.6099],
        [5229.4707],
        [5139.3750],
        [5161.4214],
        [5270.7144],
        [5187.8486],
        [5162.5488],
        [5210.6133],
        [5242.1304],
        [5191.7227],
        [5233.1021],
        [5257.4390],
        [5191.1816],
        [5266.6650],
        [5155.2026],
        [5227.0430

# Get Topic Coherence

In [None]:
# Average the accumulated draws over the number of collected samples.
mu_final, kappa_final, theta_final = (
    total / num_samples for total in (mu_samples, kappa_samples, theta_samples)
)

# Perplexity of the averaged parameters on the test tensor.
print("final perplexity", vmf_perplexity(tensor_te, mu_final, kappa_final, alpha, N=1000))

# Topic summary built from the averaged mu.
emb = mu_final.cpu().numpy()
topics = get_topics(emb, vocab)
print('prior_mu:', prior_mu)
print_summary(topics, method, dataset)

# Clustering metrics are only computed for the 20news dataset, using the
# stick-breaking transform of the averaged theta.
if dataset == '20news':
    pi = dist.StickBreakingTransform()(theta_final).cpu().numpy()
    clustering_metrics_20news(pi)

final perplexity tensor(-38096.4688)
prior_mu: neg

Method  = vptm
Number of topics = 50
Dataset = nips 

 NPMI       TU         Topic
 0.04860    0.77000    topic topics lda document documents dirichlet word words latent hdp
 0.08867    0.78333    matrix rank pca matrices subspace principal entries completion singular covariance
 0.08778    0.73095    clustering cluster clusters data spectral algorithm means points partition cut
 0.11839    0.65333    variational posterior gaussian bayesian inference prior log latent likelihood covariance
 0.09632    0.52929    kernel kernels svm learning space hilbert feature function data regression
 0.05045    0.67833    units layer hidden deep unit layers rbm learning training weights
 0.03557    0.63667    policy policies action state reward function gradient value reinforcement iteration
 0.05889    0.67262    unlabeled label supervised labeled learning labels semi data classification task
 0.09674    0.86429    manifold data points tangent embe