In [None]:
# Attach Google Drive to the Colab runtime: every data path and the output
# directory used in this notebook live under /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import pickle
import numpy as np
from text_processing_utils import *
from variational_inference_utils import *
import glob
import math
from variational_inference_sLDA_M_step_diagnostics import *

In [None]:
# Load the pre-processed movie-review data set from the mounted Drive.
# Each pickle is read inside a `with` block so its file handle is closed as
# soon as the object has been loaded (the bare `open(...)` form leaked them).
# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load pickles that were produced by this project.
with open("/content/drive/MyDrive/data/scaledata/cleaned_ratings.pickle", "rb") as _fh:
    # Wrapped in an ndarray so it can be indexed with arrays of positions later on.
    cleaned_ratings = np.array(pickle.load(_fh))
with open("/content/drive/MyDrive/data/scaledata/cleaned_reviews.pickle", "rb") as _fh:
    cleaned_reviews = pickle.load(_fh)
with open("/content/drive/MyDrive/data/scaledata/vocabulary_dict.pickle", "rb") as _fh:
    vocabulary_dict = pickle.load(_fh)

In [None]:
# Reproducible 80:20 train/test partition of the movie reviews.
np.random.seed(54321)
n_reviews = len(cleaned_ratings)
all_review_indices = np.arange(n_reviews)
# 80% of the positions are drawn without replacement for training;
# whatever is left over (returned sorted by setdiff1d) becomes the test set.
train_indices = np.random.choice(all_review_indices, int(n_reviews*0.8), replace=False)
test_indices = np.setdiff1d(all_review_indices, train_indices)
print(len(train_indices), len(test_indices))
# Bag-of-words representation of each partition
# (convert_bow is not defined in this notebook -- presumably from text_processing_utils).
train_reviews = [cleaned_reviews[idx] for idx in train_indices]
train_bow = convert_bow(train_reviews)
test_reviews = [cleaned_reviews[idx] for idx in test_indices]
test_bow = convert_bow(test_reviews)
# Responses (ratings) aligned with the two partitions.
train_y, test_y = cleaned_ratings[train_indices], cleaned_ratings[test_indices]

4004 1002


In [None]:
K = 24 # number of topics
V = len(vocabulary_dict) # vocabulary size
# Where the fitted parameters are pickled (on Google Drive).
output_dir = "/content/drive/MyDrive/data/scaledata/diagnostic_4"
# Where to store the temporary fragmented files during parallelized E steps.
# The trailing "/" is kept because the path is later used as a glob prefix;
# it is stripped (fpath[:-1]) whenever the directory itself is needed.
fpath = "fragmented_output_files_4/"
# Start every run from empty directories: create each one if it is missing,
# otherwise clear out whatever a previous run left behind.
for work_dir in (output_dir, fpath[:-1]):
    if os.path.exists(work_dir):
        delete_all_files(work_dir)
    else:
        os.makedirs(work_dir)

In [None]:
## Initialization of parameters
# Seed for the warm-up subsample draw below; that draw is currently disabled,
# so nothing is sampled before the generator is re-seeded further down.
np.random.seed(1234567)
initial_batch_size = 1000
# --- disabled: draw a moderately large random subsample for the warm-up ---
# NOTE(review): if this is re-enabled, index train_bow with `i` (the sampled
# position) rather than `new_i`, otherwise the documents do not line up with
# train_y[sample_indices].
# sample_indices = np.random.choice(np.arange(len(train_bow)), initial_batch_size, replace=False)
# train_bow_sample = {new_i:train_bow[new_i] for new_i,i in enumerate(sample_indices)}
# train_y_sample = train_y[sample_indices]
# input_data_x = train_bow_sample
# input_data_y = train_y_sample
predict = False  # not read in this cell; presumably consumed by the E-step script run with %run -i
np.random.seed(12345)
new_alpha = np.full(K, 1/K)          # length-K vector, every entry 1/K
new_xi = np.full(V, 1/V)             # length-V vector, every entry 1/V
new_eta = np.linspace(-1, 1, K)      # K evenly spaced values from -1 to 1
new_delta = np.var(train_y, ddof=1)  # sample variance of the training responses
# Random K x V initialization of Lambda: absolute values of N(0, 0.1^2) draws.
# NOTE(review): the previous comment said these are added to 1, but no offset
# is applied in the code -- confirm which one is intended.
new_Lambda = np.abs(np.random.normal(loc=0, scale=0.1, size=(K, V)))
epsilon = 1e-4
elbo_epsilon = 0.01 # percentage

# --- disabled: batch-mode variational EM warm-up ---
# Runs batch variational EM on the subsample above until the ELBO improvement
# drops below elbo_epsilon, to initialize the stochastic procedure.
# elbo_vs_time = [-math.inf]
# improve_in_elbo = math.inf
# j = 0 # record total # of warm-up variational EM iterations
# while improve_in_elbo > elbo_epsilon:
#     ### Run one iteration of E step (parallelized)
#     %run -i "parallelized_sLDA_E_step.py"
#     all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
#     new_gamma_dict = merge_dict(all_gamma)
#     new_gamma = create_gamma_matrix(new_gamma_dict)
#     all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
#     new_phi = merge_dict(all_phi)
#     ### Run one iteration of M step
#     m_step = batch_VI_sLDA_M_Step(K, input_data_x, input_data_y,
#                                   new_alpha, new_xi, new_eta, new_delta, new_Lambda,
#                                   new_gamma, new_phi,
#                                   len(input_data_y), epsilon)
#     new_Lambda, new_alpha, new_xi, new_eta, new_delta, new_elbo = m_step.run(optimize_alpha_and_xi=False)
#     improve_in_elbo = pct_diff(elbo_vs_time[-1], new_elbo)
#     elbo_vs_time.append(new_elbo)
#     j += 1
#     for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
#         pickle.dump(eval("new_"+var), open(output_dir + "/warmup_{0}_{1}.pickle".format(var, j), "wb"))
#     print("Batch mode variational EM iteration {}: elbo =".format(j), new_elbo)
# With the warm-up disabled, zero warm-up iterations have been run.
j = 0

In [None]:
## Run minibatch (stochastic) mode variational EM
np.random.seed(654321)
delete_all_files(fpath[:-1])
S = len(train_bow) # set the minibatch size to be equal to the actual batch size
n_iter_batch = len(pickle.load(open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/K_{}/elbo_vs_time.pickle".format(K), "rb")))
n_iter = int((3*n_iter_batch+1) * len(train_bow) / S) # total # of iterations of minibatch variational EM: equivalent to 3 times of the number of full passes of training data needed for the batch variational EM to converge
temp = S * np.arange(1,n_iter+1) // len(train_bow)
check_points = [np.arange(1,n_iter+1)[temp == i][0] for i in range(1, 3*n_iter_batch+1)]  # minibatch iterations that correspond to each iteration in batch mode
switch_point = int(j * initial_batch_size / S) + 1    ## save parameters to Google Drive
check_points = np.array(check_points)
check_points = check_points[check_points >= switch_point]
check_points = {int(val):(i+1) for i,val in enumerate(check_points)}
kappa = 0.6 # "forgetting rate"
tau = 1 # "delay"

for t in range(switch_point, n_iter+1):

    ### randomly sample a minibatch with size S
    sample_indices = np.random.choice(np.arange(len(train_bow)), S, replace=False)
    train_bow_sample = {new_i:train_bow[new_i] for new_i,i in enumerate(sample_indices)}
    train_y_sample = train_y[sample_indices]
    input_data_x = train_bow_sample
    input_data_y = train_y_sample

    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)

    ### Run one iteration of M step
    rho = step_size(t,tau,kappa) # rho_t
    m_step = VI_sLDA_M_Step(K, train_bow_sample, train_y_sample,
                            new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                            new_gamma, new_phi,
                            len(train_bow), rho)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta = m_step.run(update_alpha_and_xi = True)
    if t in check_points:
       for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
           pickle.dump(eval("new_"+var), open(output_dir + "/{0}_{1}.pickle".format(var, check_points[t]), "wb"))
    print("Stochastic (minibatch) variational EM iteration {} complete!".format(t-switch_point+1))
for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
    pickle.dump(eval("new_"+var), open(output_dir + "/{}.pickle".format(var), "wb"))

Stochastic (minibatch) variational EM iteration 1 complete!
Stochastic (minibatch) variational EM iteration 2 complete!
Stochastic (minibatch) variational EM iteration 3 complete!
Stochastic (minibatch) variational EM iteration 4 complete!
Stochastic (minibatch) variational EM iteration 5 complete!
Stochastic (minibatch) variational EM iteration 6 complete!
Stochastic (minibatch) variational EM iteration 7 complete!
Stochastic (minibatch) variational EM iteration 8 complete!
Stochastic (minibatch) variational EM iteration 9 complete!
Stochastic (minibatch) variational EM iteration 10 complete!
Stochastic (minibatch) variational EM iteration 11 complete!
Stochastic (minibatch) variational EM iteration 12 complete!
Stochastic (minibatch) variational EM iteration 13 complete!
Stochastic (minibatch) variational EM iteration 14 complete!
Stochastic (minibatch) variational EM iteration 15 complete!
Stochastic (minibatch) variational EM iteration 16 complete!
Stochastic (minibatch) variationa