In [5]:
import os
import pickle
import numpy as np
from text_processing_utils import *
from variational_inference_utils import *
import glob
import math
from variational_inference_sLDA_M_step import *

In [6]:
# Load the preprocessed movie-review corpus from local pickles.
# Each file is opened in a `with` block so its handle is closed right after
# loading instead of being left for the garbage collector.
with open("data/scaledata/cleaned_ratings.pickle", "rb") as fh:
    cleaned_ratings = np.array(pickle.load(fh))  # ratings as an ndarray so they can be fancy-indexed later
with open("data/scaledata/cleaned_reviews.pickle", "rb") as fh:
    cleaned_reviews = pickle.load(fh)
with open("data/scaledata/vocabulary_dict.pickle", "rb") as fh:
    vocabulary_dict = pickle.load(fh)

In [7]:
# 80:20 train/test split of the movie reviews, reproducible through a fixed seed.
np.random.seed(54321)
n_docs = len(cleaned_ratings)
n_train = int(n_docs*0.8)
all_indices = np.arange(n_docs)
# Training documents are drawn without replacement; whatever is left is the test set.
train_indices = np.random.choice(all_indices, n_train, replace=False)
test_indices = np.setdiff1d(all_indices, train_indices)
print(len(train_indices), len(test_indices))
# Responses for each split, selected by the same index arrays as the documents.
train_y = cleaned_ratings[train_indices]
test_y = cleaned_ratings[test_indices]
# Bag-of-words representation for each split (built by the project helper convert_bow).
train_reviews = [cleaned_reviews[i] for i in train_indices]
train_bow = convert_bow(train_reviews)
test_reviews = [cleaned_reviews[i] for i in test_indices]
test_bow = convert_bow(test_reviews)

4004 1002


In [17]:
## Directory (relative to the working directory) that receives the pickled parameter checkpoints
# NOTE(review): K must already exist in the kernel when this cell runs; in this
# notebook it is assigned in a later cell, so a fresh top-to-bottom run fails here.
output_dir = "data/scaledata/stochastic_K_{}".format(K)
if os.path.exists(output_dir):
    # Reuse the existing directory but clear its contents first.
    delete_all_files(output_dir)
else:
    os.makedirs(output_dir)

In [13]:
K = 24 # number of topics
V = len(vocabulary_dict) # vocabulary size
fpath = "fragmented_output_files/" # where to store the temporary fragmented files during parallelized E steps
if not os.path.exists(fpath[:-1]):
    os.makedirs(fpath[:-1])
else:
    delete_all_files(fpath[:-1])
    
## Initialization
np.random.seed(1234567)
initial_batch_size = 1000
sample_indices = np.random.choice(np.arange(len(train_bow)), initial_batch_size, replace=False)
train_bow_sample = {new_i:train_bow[new_i] for new_i,i in enumerate(sample_indices)}
train_y_sample = train_y[sample_indices]
input_data_x = train_bow_sample
input_data_y = train_y_sample
predict = False
np.random.seed(12345)
new_alpha = np.array([1/K]*K)
new_xi = np.array([1/V]*V)
new_eta = np.linspace(-1,1,K)
new_delta = np.var(train_y, ddof=1)
new_Lambda = np.abs(np.random.normal(loc=0, scale=0.1, size=K*V)).reshape((K,V)) # initialize Lambda randomly (add a small half-normal distribution to 1)
epsilon = 1e-4
elbo_epsilon=0.1 # percentage

## Run batch mode variational EM
elbo_vs_time = [-math.inf]
improve_in_elbo = math.inf
j = 0
while improve_in_elbo > elbo_epsilon:
    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)
    ### Run one iteration of M step
    m_step = batch_VI_sLDA_M_Step(K, input_data_x, input_data_y,
                                  new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                                  new_gamma, new_phi,
                                  len(input_data_y), epsilon)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta, new_elbo = m_step.run()
    improve_in_elbo = pct_diff(elbo_vs_time[-1], new_elbo)
    elbo_vs_time.append(new_elbo)
    j += 1
    for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
        pickle.dump(eval("new_"+var), open(output_dir + "/warmup_{0}_{1}.pickle".format(var, check_points[t]), "wb")) 
    print("Batch mode variational EM iteration {}: elbo =".format(j), new_elbo)

Batch mode variational EM iteration 1: elbo = -300934.3569049835
Batch mode variational EM iteration 2: elbo = -208433.97087574005
Batch mode variational EM iteration 3: elbo = -186011.03345251083
Batch mode variational EM iteration 4: elbo = -177071.1480937004
Batch mode variational EM iteration 5: elbo = -172514.37959241867
Batch mode variational EM iteration 6: elbo = -169803.0082168579


In [7]:
np.random.seed(654321)
delete_all_files(fpath[:-1])
## Run minibatch (stochastic) mode variational EM
S = 500 # minibatch size
n_iter_batch = len(pickle.load(open("/content/drive/MyDrive/batch_VI_sLDA_movie_rating/K_{}/elbo_vs_time.pickle".format(K), "rb")))
n_iter = int((n_iter_batch+1) * len(train_bow) / S) # total # of iterations of minibatch variational EM: equivalent to # of full passes of training data needed for the batch variational EM to converge
temp = S * np.arange(1,n_iter+1) // len(train_bow)
check_points = [np.arange(1,n_iter+1)[temp == i][0] for i in range(1, n_iter_batch+1)]  # minibatch iterations that correspond to each iteration in batch mode
switch_point = check_points[int(j * initial_batch_size / S)] + 1    ## save parameters to Google Drive
check_points = np.array(check_points)
check_points = check_points[check_points >= switch_point]
check_points = {val:(i+1) for i,val in enumerate(check_points)}
kappa = 0.6 # "forgetting rate"
tau = 1 # "delay"

for t in range(switch_point, n_iter+1):

    ### randomly sample a minibatch with size S
    sample_indices = np.random.choice(np.arange(len(train_bow)), S, replace=False)
    train_bow_sample = {new_i:train_bow[new_i] for new_i,i in enumerate(sample_indices)}
    train_y_sample = train_y[sample_indices]
    input_data_x = train_bow_sample
    input_data_y = train_y_sample
    
    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)

    ### Run one iteration of M step
    rho = step_size(t,tau,kappa) # rho_t
    m_step = VI_sLDA_M_Step(K, train_bow_sample, train_y_sample,
                            new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                            new_gamma, new_phi,
                            len(train_bow), rho)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta = m_step.run()
    print(new_eta)
    print(new_delta)
    if t in check_points:
        for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
            pickle.dump(eval("new_"+var), open(output_dir + "/{0}_{1}.pickle".format(var, check_points[t]), "wb"))        
    print("Complete minibatch variational EM iteration {}!".format(t-switch_point+1))

[0.44969256 0.50547697 0.44973926 0.26830226 0.53737989 0.62371218
 0.61031766 0.77913044 0.48074952 0.73049012 0.65799448 0.81802608]
0.03199283682215594


In [22]:
# Recompute, from the quantities stored on the last m_step, the gradient and
# Hessian of the objective with respect to (eta, delta), then multiply the
# inverse Hessian by the gradient.
phi_bar_times_y = m_step.y.dot(m_step.phi_bar)  # K-dimensional vector
expect_x_x_t_times_eta = m_step.expect_x_x_t.dot(m_step.eta)  # K-dimensional vector
y_t_y = (m_step.y**2).sum()
temp_var = np.dot(m_step.eta, phi_bar_times_y - expect_x_x_t_times_eta/2)  # scalar dot product
residual_ss = y_t_y - 2*temp_var  # term shared by the delta derivatives below

# Gradient: K entries for eta followed by one entry for delta, scaled based on minibatch size.
g_eta = (1/m_step.delta)*(phi_bar_times_y - expect_x_x_t_times_eta)
g_delta = -m_step.D/2/m_step.delta + 1/2/m_step.delta**2 * residual_ss
g = m_step.scale_factor * np.hstack((g_eta, [g_delta]))

# Hessian blocks: eta-eta (K x K), eta-delta mixed partials (K-vector), delta-delta (scalar).
h_11 = -m_step.expect_x_x_t/m_step.delta
h_21 = -g_eta / m_step.delta
h_22 = m_step.D/2/m_step.delta**2 - 1/m_step.delta**3 * residual_ss

# Assemble the symmetric (K+1) x (K+1) Hessian from its blocks, then apply the same scaling as the gradient.
h = np.block([
    [h_11, h_21.reshape(-1, 1)],
    [h_21.reshape(1, -1), np.array([[h_22]])],
])
h = m_step.scale_factor * h
h_inv = np.linalg.inv(h)
eta_delta_hat = h_inv.dot(g)