In [20]:
import os
import pickle
import numpy as np
from text_processing_utils import *
import glob
import datetime as dt
import math
from variational_inference_sLDA_M_step import *
start_time = dt.datetime.now()

In [22]:
cleaned_ratings = np.array(pickle.load(open("data/scaledata/cleaned_ratings.pickle", "rb")))
cleaned_reviews = pickle.load(open("data/scaledata/cleaned_reviews.pickle", "rb"))
vocabulary_dict = pickle.load(open("data/scaledata/vocabulary_dict.pickle", "rb"))

In [25]:
# randomly split the movie reviews data into training/testing parts (80:20)
np.random.seed(54321)
train_indices = np.random.choice(np.arange(len(cleaned_ratings)), int(len(cleaned_ratings)*0.1), replace=False)
test_indices = np.setdiff1d(np.arange(len(cleaned_ratings)), train_indices)
print(len(train_indices), len(test_indices))
train_bow = convert_bow([cleaned_reviews[i] for i in train_indices])
test_bow = convert_bow([cleaned_reviews[i] for i in test_indices])
train_y = cleaned_ratings[train_indices]
test_y = cleaned_ratings[test_indices]

500 4506


In [26]:
K = 12 # number of topics
V = len(vocabulary_dict) # vocabulary size

In [27]:
## initialization
np.random.seed(12345)
new_alpha = np.array([1/K]*K)
new_xi = np.array([1/V]*V)
new_eta = np.linspace(-1,1,K)
new_delta = np.var(train_y, ddof=1)
new_Lambda = np.abs(np.random.normal(loc=0, scale=0.1, size=K*V)).reshape((K,V)) # initialize Lambda randomly (add a small half-normal distribution to 1)
input_data_x = train_bow
input_data_y = train_y
fpath = "fragmented_output_files/" # where to store the temporary fragmented files during parallelized E steps
if not os.path.exists(fpath[:-1]):
    os.makedirs(fpath[:-1])
epsilon = 1e-4 # stopping criteria for convergence in E step

## Run batch mode variational EM
elbo_vs_time = [-math.inf]
improve_in_elbo = math.inf
time_elapsed = 0
while improve_in_elbo > 0.01 and time_elapsed < 23.5 * 3600:
    
    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)
    
    ### Run one iteration of M step
    m_step = batch_VI_sLDA_M_Step(K, train_bow, train_y,
                                  new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                                  new_gamma, new_phi,
                                  len(train_bow), 1e-4)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta, new_elbo = m_step.run()
    improve_in_elbo = pct_diff(elbo_vs_time[-1], new_elbo)
    elbo_vs_time.append(new_elbo)
    current_time = dt.datetime.now()
    time_elapsed = (current_time - start_time).seconds

## save final results to Google Drive
for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
    pickle.dump(eval("new_"+var), open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/{}.pickle".format(var), "wb"))
pickle.dump(elbo_vs_time, open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/{}.pickle".format("elbo_vs_time"), "wb"))
pickle.dump(time_elapsed, open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/{}.pickle".format("time_elapsed"), "wb"))

[-1.         -0.81818182 -0.63636364 -0.45454545 -0.27272727 -0.09090909
  0.09090909  0.27272727  0.45454545  0.63636364  0.81818182  1.        ]
[0.31003397 0.38466031 0.46049951 0.42435579 0.66895848 0.82385343
 0.55517983 0.71290784 0.5003519  0.73620155 0.67127189 0.78223165]
[0.43759085 0.43685975 0.45601401 0.44761257 0.71268622 0.91113265
 0.56679989 0.73485994 0.45278773 0.68957464 0.62365497 0.61290834]


KeyboardInterrupt: 

[0.45895859 0.4306704  0.42090926 0.45069844 0.72746269 0.96979401
 0.59261699 0.73306759 0.45643796 0.67794526 0.62905784 0.54293733]


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/batch_VI_sLDA_movie_rating/Lambda.pickle'