<a href="https://colab.research.google.com/github/tanthongtan/ptm/blob/master/vptm_used_in_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameters

In [None]:
# ---- experiment identity ----
num_topic = 100       # number of topics (rows of mu / entries of kappa)
dataset = 'wiki'      # forwarded to load_data(dataset=...)
method = 'vptm'       # label used in the summary and in the saved run name

# ---- vptm hyperparameters ----
alpha_scalar = 0.5    # fill value of the (1, num_topic) alpha tensor
v = 0.05              # passed to VptmJointDistributionWithStickDirConjugatePrior
c = 0.005             # passed to VptmJointDistributionWithStickDirConjugatePrior
prior_mu = 'neg'      # how mu0 is built; possible options: neg, pos, mean

# ---- GMC hyperparameters ----
num_samples = 1       # iterations accumulated after burn-in
num_burn = 5000       # iterations run before accumulation starts
S = 25000             # documents per mini-batch (also the evaluation chunk size)
L = 20                # handed to GeodesicMonteCarlo(L); presumably integration steps per transition - confirm in geodesic.py
# (eta, rho) per parameter: eta is the initial step size, shrunk every iteration by
# (i+1)^(-1/5) in the sampling loop; rho is forwarded unchanged to the geodesic.
eta_theta, rho_theta = 1e-1, 1e-1
eta_mu, rho_mu = 5e-4, 1e-1
eta_kappa, rho_kappa = 5e-1, 1e-1

# Run GMC Inference

In [None]:
#only for google colab
import sys
import os
if 'google.colab' in sys.modules:
    #lets see what gpu we were given
    !nvidia-smi
    #get repository
    !git clone https://github.com/tanthongtan/ptm.git
    %cd '/content/ptm'
    #get ref corp if doesn't exist
    if not os.path.isdir('wiki_final'):
        !unzip -q "/content/drive/My Drive/wiki_final.zip"
 
import torch
import torch.nn.functional as F
from geodesic import GeodesicMonteCarlo
from dataset import load_data, csr_to_torchsparse
import geodesic as g
import distributions as D
from tqdm.notebook import tqdm
import torch.distributions as dist
import numpy as np
import time
from utils import print_topics, get_topics, vmf_perplexity, clustering_metrics_20news, print_summary, get_invalid_topics
 
#every tensor created from here on is double precision, and lives on the GPU when one is present
gpu = torch.cuda.is_available()
torch.set_default_tensor_type(torch.cuda.DoubleTensor if gpu else torch.DoubleTensor)
 
#Load Data
#train/test document matrices, the vocabulary with its size, and the number of training documents
data_tr, data_te, vocab, vocab_size, num_tr = load_data(
    dataset=dataset,
    use_tfidf=True,
    sublinear=False,
    normalize=True,
)
#torch sparse versions of both splits (test first, then train); the gpu flag is forwarded to the converter
tensor_te, tensor_tr = (csr_to_torchsparse(split, gpu) for split in (data_te, data_tr))

Wed Apr 12 13:18:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture cap 

#declare tensor hyperparameters
#alpha: (1, num_topic) tensor with every entry equal to alpha_scalar
alpha = torch.full((1,num_topic), alpha_scalar)
#mu0: unit-norm prior direction of length vocab_size, chosen by prior_mu
if prior_mu == 'neg':
    #all entries equal and negative
    mu0 = F.normalize(torch.full((vocab_size,),-1.0),dim=-1)
elif prior_mu == 'pos':
    #all entries equal and positive
    mu0 = F.normalize(torch.full((vocab_size,),1.0),dim=-1)
elif prior_mu == 'mean':
    #direction of the column sums of the training matrix
    mu0 = F.normalize(torch.sparse.sum(tensor_tr,dim=0).to_dense(),dim=-1)
else:
    #fail here with a clear message instead of a NameError on mu0 further down
    raise ValueError("unknown prior_mu %r; possible options: neg, pos, mean" % (prior_mu,))
 
#randomly initialize model parameters
#(draw order theta -> mu -> kappa is kept, so the random stream is consumed exactly as before)
sqrt_vocab = vocab_size ** 0.5
#theta: one row per training document, num_topic-1 columns (mapped to pi by StickBreakingTransform later)
theta = torch.randn(num_tr, num_topic - 1)
#mu: one unit-norm row per topic, a small gaussian perturbation around the prior direction mu0
mu = F.normalize(mu0 + torch.randn(num_topic, vocab_size) / sqrt_vocab, p=2, dim=-1)
#kappa: one value per topic, gaussian with mean 50*sqrt(vocab_size) and standard deviation 50
kappa = 50. * sqrt_vocab + torch.randn(num_topic, 1) * 50
 
#declare GMC transition kernels
kernel = GeodesicMonteCarlo(L)
#the three sampled blocks, keyed by name; the same keys index the step sizes,
#the geodesics and the momenta below
params = {'theta': theta, 'mu':mu, 'kappa': kappa}
#initial step sizes, kept separately because the sampling loop overwrites geodesics[name].eta every iteration
init_etas = {'theta': eta_theta, 'mu':eta_mu, 'kappa': eta_kappa}
geodesics = {'theta': g.RnGeodesic(eta = eta_theta, rho = rho_theta), 'mu': g.SphericalGeodesic(eta = eta_mu, rho = rho_mu), 'kappa': g.PositiveGeodesic(eta = eta_kappa, rho = rho_kappa)}
#initial momenta: i.i.d. standard-normal noise with the shape of each parameter,
#passed through that parameter's geodesic projection.
#torch.randn_like draws the same distribution as MultivariateNormal(0, I).sample([n])
#without building (and Cholesky-factorising) a d x d identity matrix, which for
#'mu' is vocab_size x vocab_size.
vs = {name: geodesics[name].projection(params[name], torch.randn_like(params[name])) for name in params}
 
#start sampling loop
#Runs num_burn + num_samples stochastic GMC transitions on mini-batches of S
#documents, prints diagnostics every 100 iterations (and at the iterations in
#sampling_its), then averages the post-burn-in draws and reports the final
#perplexity / topic summary. Everything printed here ends up in cap.stdout.
t = tqdm(range(num_samples+num_burn))
#running sums of the post-burn-in draws; they start as the int 0 and become
#tensors on the first accumulation
theta_samples = 0
mu_samples = 0
kappa_samples = 0
num_collected = 0 #number of draws actually added to the running sums
mu_save_collection_samples = [] #this is to test for coherence by time
kappa_save_collection_samples = [] #in case we need to test perp or something
start_time = time.time() #get start time
sampling_its = {50, 100, 200, 400, 800, 1600, 3200, 5000} #iterations at which mu/kappa snapshots are kept
num_topic_pairs = num_topic * (num_topic - 1) // 2 #number of distinct topic pairs (j < k)
for i in t:

    #pick a random mini-batch of documents and swap the full theta / momentum
    #tensors for the rows of that batch while the kernel runs
    idx = torch.randperm(num_tr)[:S]
    x_batch = csr_to_torchsparse(data_tr[idx.cpu()], gpu)
    theta = params['theta']
    params['theta'] = theta[idx]
    v_theta = vs['theta']
    vs['theta'] = v_theta[idx]

    #step size decays as (i+1)^(-1/5) from its initial value
    for name in geodesics:
        geodesics[name].eta = init_etas[name] * ((i+1) ** (-1./5.))
    params, vs = kernel.stochastic_transition(params, vs, geodesics, D.VptmJointDistributionWithStickDirConjugatePrior(x_batch, alpha, c, mu0, v))

    #write the updated batch rows back and restore the full tensors
    theta[idx] = params['theta']
    v_theta[idx] = vs['theta']
    params['theta'] = theta
    vs['theta'] = v_theta

    kappa = params['kappa']
    mu = params['mu']

    #NaN is the only value that differs from itself: stop as soon as kappa diverges
    if torch.any(kappa != kappa):
        print("\nNaN detected in kappa at iteration", i, "- stopping sampling early")
        break

    if i >= num_burn:
        theta_samples += theta
        mu_samples += mu
        kappa_samples += kappa
        num_collected += 1

    if i % 100 == 0 or i in sampling_its:
        print("\ncurrent iteration:", i)
        print("elapsed time", time.time() - start_time)
        print("kappa mean",kappa.mean())
        print("kappas",torch.flatten(kappa))
        pi = dist.StickBreakingTransform()(theta)
        print("mu norms", mu.norm(dim=-1).sum(), num_topic)
        print("sparsity",(torch.abs(mu)**2.).norm(dim=-1))
        print("sparsitymean",(torch.abs(mu)**2.).norm(dim=-1).mean())
        print("pi sums", pi.sum(dim=-1).sum(), num_tr)

        #training log likelihood and cosine similarity, computed in chunks of S documents
        sum_ll = 0.0
        sum_cs = 0.0
        kappa_mu = kappa*mu #same for every chunk, so computed once
        for j in range(int(np.ceil(num_tr/S))):
            curr_pi = pi[j*S:j*S+S]
            curr_tensor_tr = csr_to_torchsparse(data_tr[j*S:j*S+S], gpu)
            curr_avg = torch.matmul(curr_pi,kappa_mu)
            sum_ll += D.log_prob_von_mises_fisher(curr_avg, curr_tensor_tr).sum()
            curr_avg = F.normalize(curr_avg,dim=-1)
            sum_cs += D.sparse_dense_dot(curr_tensor_tr, curr_avg).sum()
        
        print("log likelihood", sum_ll / num_tr)        
        print("cosine similarity", sum_cs / num_tr)
        print("perplexity", vmf_perplexity(tensor_te, mu, kappa, alpha, N=1000))

        #mean dot product over all distinct topic pairs: one (num_topic x num_topic)
        #Gram matrix, summed strictly above the diagonal, instead of a python loop
        #over every pair
        sum_cs_spread = torch.triu(torch.matmul(mu, mu.t()), diagonal=1).sum()
        print("mean cs spread", sum_cs_spread / num_topic_pairs,"\n")

        pi_cpu = pi.cpu().numpy()
        kappa_cpu = kappa.cpu().numpy()
        print("invalid topics")
        print("normal thres", get_invalid_topics(pi_cpu, kappa_cpu))
        print("thres 2x", get_invalid_topics(pi_cpu, kappa_cpu, 1/(num_topic*2)))
        print("thres 10x", get_invalid_topics(pi_cpu, kappa_cpu, 1/(num_topic*10)))
        print()
        
        
    if i % 1000 == 0:
        emb = mu.cpu().numpy()
        print_topics(get_topics(emb,vocab))
        print("")
      
    if i in sampling_its:
        mu_save_collection_samples.append(mu.cpu().numpy())
        kappa_save_collection_samples.append(kappa.cpu().numpy())

#average the accumulated draws over the number actually collected; this equals
#num_samples unless sampling stopped early on a NaN
if num_collected == 0:
    raise RuntimeError("no post-burn-in samples were collected, cannot compute the final estimates")
if num_collected < num_samples:
    print("\nwarning: only", num_collected, "of", num_samples, "samples were collected; averaging over those")
mu_final = mu_samples / num_collected
kappa_final = kappa_samples / num_collected
theta_final = theta_samples / num_collected

#get topic coherence
print('\nalpha = ', alpha_scalar)
print("final perplexity", vmf_perplexity(tensor_te, mu_final, kappa_final, alpha, N=1000))
print('prior_mu:', prior_mu)
print('v:', v)
print('c:', c)
emb = mu_final.cpu().numpy()
topics = get_topics(emb, vocab)
print_summary(topics,method,dataset)

#clustering metrics are only computed for the 20news dataset
if dataset == '20news':
    pi = dist.StickBreakingTransform()(theta_final)
    pi = pi.cpu().numpy()
    clustering_metrics_20news(pi)

In [None]:
import random

#random id so that repeated runs with identical settings get distinct folders
run = random.randint(0,100000)

#"key=value, key=value, ..." label shared by the folder and every file name
run_fields = [("method", method), ("alpha", alpha_scalar), ("K", num_topic), ("dataset", dataset),
              ("prior_mu", prior_mu), ("v", v), ("c", c), ("run", run)]
current_run_name = ", ".join(key + "=" + str(value) for key, value in run_fields)

#results live under <root>/<v>/<K>/<alpha>/<dataset>/<run name>/
dir_first_part = "/".join(str(part) for part in (v, num_topic, alpha_scalar, dataset)) + "/"
dirname = '/content/drive/My Drive/masters_results/' + dir_first_part + current_run_name + "/"
os.makedirs(dirname, exist_ok=False) #never write into an existing run folder


def _save_npy(suffix, array):
    """Write one array to '<dirname><current_run_name><suffix>' with np.save."""
    with open(dirname + current_run_name + suffix, 'wb') as handle:
        np.save(handle, array)


def _save_npz(suffix, arrays):
    """Write a list of arrays to one compressed .npz archive (stored positionally)."""
    with open(dirname + current_run_name + suffix, 'wb') as handle:
        np.savez_compressed(handle, *arrays)


#text captured by the %%capture cell
filename = current_run_name +".txt"
with open(dirname + filename, 'w') as f:
     f.write(cap.stdout)

#save mus
_save_npy(" emb.npy", emb)

#save kappas
_save_npy(" kappa.npy", kappa_final.cpu().numpy())

#save means
_save_npy(" mean.npy", mu0.cpu().numpy())

#save pis
pi = dist.StickBreakingTransform()(theta_final).cpu().numpy()
_save_npy(" pi.npy", pi)

#save sample collections
_save_npz(" mucollection.npz", mu_save_collection_samples)
_save_npz(" kappacollection.npz", kappa_save_collection_samples)

In [None]:
import sys

#release the Colab runtime once the results are saved; guarded the same way as
#the setup cell so the notebook does not fail on this import outside Colab
if 'google.colab' in sys.modules:
    from google.colab import runtime
    runtime.unassign()