Upload the 5 Python scripts (text_processing_utils.py, variational_inference_utils.py, variational_inference_sLDA_E_step.py, variational_inference_sLDA_M_step.py, parallelized_sLDA_E_step.py) to the same (temporary runtime) directory as this Google Colab notebook. Also upload the movie review/rating data to the home directory of your Google Drive (MyDrive), using the path structure "data/scaledata/xxx.pickle".

In [2]:
# Mount Google Drive at /content/drive; the pickled inputs are read from, and the
# final results written to, paths under /content/drive/MyDrive in later cells.
# force_remount=True asks Colab to mount again even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import os
import pickle
import numpy as np
from text_processing_utils import *
import glob
import datetime as dt
import math
from variational_inference_sLDA_M_step import *
# Wall-clock reference point: the variational EM loop measures its elapsed time
# against this value to enforce its run-time limit.
start_time = dt.datetime.now()

In [4]:
# Load the preprocessed movie-review data from Google Drive.
# Each file is opened in a `with` block so its handle is closed right after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.

# Ratings are wrapped in a NumPy array so they can be indexed with index arrays later on.
with open("/content/drive/MyDrive/data/scaledata/cleaned_ratings.pickle", "rb") as fh:
    cleaned_ratings = np.array(pickle.load(fh))

# Reviews are kept as unpickled (they are indexed one element at a time when splitting).
with open("/content/drive/MyDrive/data/scaledata/cleaned_reviews.pickle", "rb") as fh:
    cleaned_reviews = pickle.load(fh)

# Vocabulary container; its length is used as the vocabulary size V.
with open("/content/drive/MyDrive/data/scaledata/vocabulary_dict.pickle", "rb") as fh:
    vocabulary_dict = pickle.load(fh)

In [5]:
# Random 80/20 train/test partition of the movie reviews (fixed seed for reproducibility).
np.random.seed(54321)
num_docs = len(cleaned_ratings)
all_indices = np.arange(num_docs)

# Draw the training documents without replacement; every index not drawn goes to the test set.
train_indices = np.random.choice(all_indices, int(num_docs*0.8), replace=False)
test_indices = np.setdiff1d(all_indices, train_indices)
print(len(train_indices), len(test_indices))

# Bag-of-words inputs for each partition (training set converted first, then test set).
train_bow, test_bow = (
    convert_bow([cleaned_reviews[i] for i in subset])
    for subset in (train_indices, test_indices)
)

# Matching rating targets.
train_y, test_y = cleaned_ratings[train_indices], cleaned_ratings[test_indices]

4004 1002


In [6]:
# Model dimensions; they size the parameter arrays created in the initialization cell.
K = 12 # number of topics
V = len(vocabulary_dict) # vocabulary size

In [None]:
## initialization
np.random.seed(12345)
new_alpha = np.array([1/K]*K)
new_xi = np.array([1/V]*V)
new_eta = np.linspace(-1,1,K)
new_delta = np.var(train_y, ddof=1)
new_Lambda = np.abs(np.random.normal(loc=0, scale=0.1, size=K*V)).reshape((K,V)) # initialize Lambda randomly (add a small half-normal distribution to 1)
input_data_x = train_bow
input_data_y = train_y
fpath = "fragmented_output_files/" # where to store the temporary fragmented files during parallelized E steps
if not os.path.exists(fpath[:-1]):
    os.makedirs(fpath[:-1])
epsilon = 1e-4 # stopping criteria for convergence in E step

## Run batch mode variational EM
elbo_vs_time = [-math.inf]
improve_in_elbo = math.inf
time_elapsed = 0
j = 0
while improve_in_elbo > 0.01 and time_elapsed < 23.5 * 3600: # there's a time limit to the Google Colab Pro+

    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)

    ### Run one iteration of M step
    m_step = batch_VI_sLDA_M_Step(K, train_bow, train_y,
                                  new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                                  new_gamma, new_phi,
                                  len(train_bow), 1e-4)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta, new_elbo = m_step.run()
    improve_in_elbo = pct_diff(elbo_vs_time[-1], new_elbo)
    elbo_vs_time.append(new_elbo)
    current_time = dt.datetime.now()
    time_elapsed = (current_time - start_time).seconds
    j += 1
    print("variational EM iteration {}: elbo =".format(j), new_elbo)

## save final results to Google Drive
for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
    pickle.dump(eval("new_"+var), open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/{}.pickle".format(var), "wb"))
pickle.dump(elbo_vs_time, open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/{}.pickle".format("elbo_vs_time"), "wb"))
pickle.dump(time_elapsed, open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/{}.pickle".format("time_elapsed"), "wb"))

variational EM iteration 1: elbo = -5908122.871837378
variational EM iteration 2: elbo = -5630797.479478836
variational EM iteration 3: elbo = -5535514.019721031
variational EM iteration 4: elbo = -5485360.767313361
variational EM iteration 5: elbo = -5452580.504116774
variational EM iteration 6: elbo = -5428502.491692662
variational EM iteration 7: elbo = -5409659.490482688
variational EM iteration 8: elbo = -5394283.701215148
variational EM iteration 9: elbo = -5381420.426229715
variational EM iteration 10: elbo = -5370430.354780674
variational EM iteration 11: elbo = -5360796.517108142
variational EM iteration 12: elbo = -5352409.469674647
variational EM iteration 13: elbo = -5345080.30678916
variational EM iteration 14: elbo = -5338489.6775407195
variational EM iteration 15: elbo = -5332661.142350972
variational EM iteration 16: elbo = -5327512.476150274
variational EM iteration 17: elbo = -5322884.21004504
variational EM iteration 18: elbo = -5318650.788538456
variational EM itera