Upload the 5 Python scripts (text_processing_utils.py, variational_inference_utils.py, variational_inference_sLDA_E_step.py, variational_inference_sLDA_M_step_diagnostics.py, parallelized_sLDA_E_step.py) to the same (temporary runtime) directory as this Google Colab notebook. Also upload the movie review/rating data (cleaned_ratings.pickle, cleaned_reviews.pickle, vocabulary_dict.pickle) to your Google Drive home directory, using the structure "data/scaledata/xxx.pickle".

In [None]:
# Mount Google Drive at /content/drive (forcing a remount) so that the pickled
# data files read later from /content/drive/MyDrive are accessible.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import pickle
import numpy as np
from text_processing_utils import *
from variational_inference_utils import *
import glob
import datetime as dt
import math
from variational_inference_sLDA_M_step_diagnostics import *
# Wall-clock reference point: the EM loop measures elapsed time against this
# value to enforce its run-time limit.
start_time = dt.datetime.now()

In [None]:
# Load the pre-cleaned movie-review data from Google Drive.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been unpickled, instead of staying open until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open("/content/drive/MyDrive/data/scaledata/cleaned_ratings.pickle", "rb") as fh:
    # converted to an ndarray so it can be indexed by the train/test index arrays
    cleaned_ratings = np.array(pickle.load(fh))
with open("/content/drive/MyDrive/data/scaledata/cleaned_reviews.pickle", "rb") as fh:
    cleaned_reviews = pickle.load(fh)
with open("/content/drive/MyDrive/data/scaledata/vocabulary_dict.pickle", "rb") as fh:
    vocabulary_dict = pickle.load(fh)

In [None]:
# Seeded random 80:20 partition of the movie reviews into training and
# testing sets.
np.random.seed(54321)
n_docs = len(cleaned_ratings)
all_indices = np.arange(n_docs)
n_train = int(n_docs * 0.8)
train_indices = np.random.choice(all_indices, n_train, replace=False)
# every document not drawn for training goes to the test set
test_indices = np.setdiff1d(all_indices, train_indices)
print(len(train_indices), len(test_indices))

# bag-of-words representation of each partition
train_reviews = [cleaned_reviews[idx] for idx in train_indices]
test_reviews = [cleaned_reviews[idx] for idx in test_indices]
train_bow = convert_bow(train_reviews)
test_bow = convert_bow(test_reviews)

# matching response variables (ratings)
train_y, test_y = cleaned_ratings[train_indices], cleaned_ratings[test_indices]

4004 1002


In [None]:
# Model dimensions used by the initialization and the EM loop.
K = 24 # number of topics
V = len(vocabulary_dict) # vocabulary size (number of entries in vocabulary_dict)

In [6]:
## initialization of the global parameters and run settings
np.random.seed(12345)

# uniform starting values: every entry of alpha is 1/K, every entry of xi is 1/V
new_alpha = np.full(K, 1/K)
new_xi = np.full(V, 1/V)
# eta starts evenly spaced on [-1, 1], one value per topic
new_eta = np.linspace(-1, 1, K)
# delta starts at the sample variance (ddof=1) of the training responses
new_delta = np.var(train_y, ddof=1)
# Lambda (K x V) starts as absolute values of N(0, 0.1) draws, i.e. small
# half-normal noise.
# NOTE(review): the earlier comment said this noise is "added to 1", but no 1
# is added here -- confirm which initialization is intended.
new_Lambda = np.abs(np.random.normal(loc=0, scale=0.1, size=K*V)).reshape((K, V))

# Names below are presumably read by parallelized_sLDA_E_step.py, which the EM
# loop runs in this namespace via `%run -i` -- verify against that script.
input_data_x = train_bow
input_data_y = train_y
fpath = "fragmented_output_files_2/" # where to store the temporary fragmented files during parallelized E steps
epsilon = 1e-4 # stopping criteria for convergence of local parameters in E step and for convergence of alpha and xi in M step
predict = False

# start from an empty fragment directory: clear it if present, create it if not
fragment_dir = fpath[:-1]
if os.path.exists(fragment_dir):
    delete_all_files(fragment_dir)
else:
    os.makedirs(fragment_dir)
## Run batch mode variational EM
elbo_vs_time = [-math.inf]
improve_in_elbo = math.inf
time_elapsed = 0
j = 0
while improve_in_elbo > 0.01 and time_elapsed < 23.5 * 3600: # there's a time limit to the Google Colab Pro+

    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)

    ### Run one iteration of M step
    m_step = batch_VI_sLDA_M_Step(K, train_bow, train_y,
                                  new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                                  new_gamma, new_phi,
                                  len(train_bow), 1e-5,
                                  closed_form=False)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta, new_elbo = m_step.run()
    improve_in_elbo = pct_diff(elbo_vs_time[-1], new_elbo)
    elbo_vs_time.append(new_elbo)
    current_time = dt.datetime.now()
    time_elapsed = (current_time - start_time).seconds
    j += 1
    print("variational EM iteration {}: elbo =".format(j), new_elbo)

## save final results to Google Drive
# Each object is written to <output_dir>/<name>.pickle. Files are written in
# `with` blocks so they are flushed and closed immediately, and the objects
# are listed explicitly instead of being looked up by name through eval().
output_dir = "/content/drive/MyDrive/data/scaledata/diagnostic_1"
os.makedirs(output_dir, exist_ok=True)
final_results = {
    'Lambda': new_Lambda,
    'alpha': new_alpha,
    'xi': new_xi,
    'eta': new_eta,
    'delta': new_delta,
    'elbo_vs_time': elbo_vs_time,
    'time_elapsed': time_elapsed,
}
for name, value in final_results.items():
    with open(output_dir + "/{}.pickle".format(name), "wb") as fh:
        pickle.dump(value, fh)

variational EM iteration 1: elbo = -6507523.201547623
variational EM iteration 2: elbo = -5941896.600916386
variational EM iteration 3: elbo = -5752574.858434677
variational EM iteration 4: elbo = -5649482.200006008
variational EM iteration 5: elbo = -5578930.944798708
variational EM iteration 6: elbo = -5528985.242948532
variational EM iteration 7: elbo = -5491995.564368486
variational EM iteration 8: elbo = -5463712.74181962
variational EM iteration 9: elbo = -5440926.913976431
variational EM iteration 10: elbo = -5422121.318188429
variational EM iteration 11: elbo = -5406165.707626581
variational EM iteration 12: elbo = -5392572.939574242
variational EM iteration 13: elbo = -5380844.025789499
variational EM iteration 14: elbo = -5370661.517929077
variational EM iteration 15: elbo = -5361584.172087431
variational EM iteration 16: elbo = -5353675.1738398075
variational EM iteration 17: elbo = -5346686.429689169
variational EM iteration 18: elbo = -5340186.2337681055
variational EM ite