In [1]:
import os
import pickle
import numpy as np
from text_processing_utils import *
from variational_inference_utils import *
import glob
import math
from variational_inference_sLDA_M_step import *

In [2]:
# Load the preprocessed corpus from local pickle files.
# `with` guarantees each file handle is closed as soon as its payload is read.
# NOTE: pickle can execute arbitrary code while loading -- only open files produced by this project.
with open("data/scaledata/cleaned_ratings.pickle", "rb") as pickle_file:
    # converted to a NumPy array so it can be indexed with index arrays later on
    cleaned_ratings = np.array(pickle.load(pickle_file))
with open("data/scaledata/cleaned_reviews.pickle", "rb") as pickle_file:
    cleaned_reviews = pickle.load(pickle_file)  # indexable collection of reviews, one per rating (presumably tokenized -- confirm)
with open("data/scaledata/vocabulary_dict.pickle", "rb") as pickle_file:
    vocabulary_dict = pickle.load(pickle_file)  # its length is used as the vocabulary size

In [3]:
# Hold out 20% of the movie reviews for testing (80:20 split).
# The fixed seed makes the random partition reproducible.
np.random.seed(54321)
n_docs = len(cleaned_ratings)
doc_ids = np.arange(n_docs)
# draw the training ids without replacement; every remaining id goes to the test set
train_indices = np.random.choice(doc_ids, int(n_docs*0.8), replace=False)
test_indices = np.setdiff1d(doc_ids, train_indices)
print(len(train_indices), len(test_indices))
# responses, selected with the same index arrays as the documents below
train_y, test_y = cleaned_ratings[train_indices], cleaned_ratings[test_indices]
# documents, converted by convert_bow (imported helper) in the same order as the index arrays
train_bow = convert_bow([cleaned_reviews[doc_id] for doc_id in train_indices])
test_bow = convert_bow([cleaned_reviews[doc_id] for doc_id in test_indices])

4004 1002


In [4]:
# Model dimensions shared by the initialization and training cells.
K = 12 # number of topics
V = len(vocabulary_dict) # vocabulary size (number of entries in the loaded vocabulary dict)

In [7]:
## initialization
np.random.seed(12345)
new_alpha = np.array([1/K]*K)
new_xi = np.array([1/V]*V)
new_eta = np.linspace(-1,1,K)
new_delta = np.var(train_y, ddof=1)
new_Lambda = np.abs(np.random.normal(loc=0, scale=0.1, size=K*V)).reshape((K,V)) # initialize Lambda randomly (add a small half-normal distribution to 1)
input_data_x = train_bow
input_data_y = train_y
fpath = "fragmented_output_files/" # where to store the temporary fragmented files during parallelized E steps
predict = False
if not os.path.exists(fpath[:-1]):
    os.makedirs(fpath[:-1])
else:
    delete_all_files(fpath[:-1])
epsilon = 1e-4 # stopping criteria for convergence of local parameters in E step
### Run one iteration of batch E step (parallelized)
%run -i "parallelized_sLDA_E_step.py"
all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
new_gamma_dict = merge_dict(all_gamma)
new_gamma = create_gamma_matrix(new_gamma_dict)
all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
new_phi = merge_dict(all_phi)
### Run one iteration of batch M step
m_step = batch_VI_sLDA_M_Step(K, train_bow, train_y,
                              new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                              new_gamma, new_phi,
                              len(train_bow), 1e-4)
new_Lambda, new_alpha, new_xi, new_eta, new_delta, new_elbo = m_step.run()
print(new_eta)
print(new_delta)

[0.44969256 0.50547697 0.44973926 0.26830226 0.53737989 0.62371218
 0.61031766 0.77913044 0.48074952 0.73049012 0.65799448 0.81802608]
0.03199283682215594


In [11]:
# Clear the temporary E-step fragment directory (fpath without its trailing "/")
# so the globs in the minibatch loop do not pick up files left by the batch E step.
delete_all_files(fpath[:-1])

In [12]:
## Run minibatch (stochastic) mode variational EM
S = 100 # minibatch size
# Number of entries recorded by the batch-mode run for this K (presumably one ELBO value per batch iteration).
# `with` closes the file once it has been read.
with open("data/scaledata/K_{}/elbo_vs_time.pickle".format(K), "rb") as elbo_file:
    n_iter_batch = len(pickle.load(elbo_file))
n_iter = int((n_iter_batch+1) * len(train_bow) / S) # total # of iterations of minibatch variational EM: equivalent to # of full passes of training data needed for the batch variational EM to converge
temp = S * np.arange(1,n_iter+1) // len(train_bow) # number of complete passes over the training set after each minibatch iteration
# first minibatch iteration at which i full passes have been completed, for i = 1..n_iter_batch
check_points = [np.arange(1,n_iter+1)[temp == i][0] for i in range(1, n_iter_batch+1)]  # minibatch iterations that correspond to each iteration in batch mode
switch_point = check_points[0] # iteration that completes the first full pass
# map each remaining checkpoint iteration to a 1-based label used in the saved file names
check_points = {val:(i+1) for i,val in enumerate(check_points[1:])}
kappa = 0.6 # "forgetting rate"
tau = switch_point + 1 # "delay"

In [None]:
for t in range(1, n_iter+1):

    ### randomly sample a minibatch with size S
    sample_indices = np.random.choice(np.arange(len(train_bow)), S, replace=False)
    train_bow_sample = {new_i:train_bow[new_i] for new_i,i in enumerate(sample_indices)}
    train_y_sample = train_y[sample_indices]
    input_data_x = train_bow_sample
    input_data_y = train_y_sample
    
    ### Run one iteration of E step (parallelized)
    %run -i "parallelized_sLDA_E_step.py"
    all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
    new_gamma_dict = merge_dict(all_gamma)
    new_gamma = create_gamma_matrix(new_gamma_dict)
    all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
    new_phi = merge_dict(all_phi)

    ### Run one iteration of M step
    rho = step_size(t,tau,kappa) # rho_t
    m_step = VI_sLDA_M_Step(K, train_bow_sample, train_y_sample,
                            new_alpha, new_xi, new_eta, new_delta, new_Lambda,
                            new_gamma, new_phi,
                            len(train_bow), rho)
    new_Lambda, new_alpha, new_xi, new_eta, new_delta = m_step.run()
    print(new_eta)
    print(new_delta)
    if t in check_points:
        ## save final results to Google Drive
        output_dir = "data/scaledata/stochastic_K_{}".format(K)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        for var in ['Lambda', 'alpha', 'xi', 'eta', 'delta']:
            pickle.dump(eval("new_"+var), open(output_dir + "/{0}_{1}.pickle".format(var, check_points[t]), "wb"))        
    print("Complete minibatch variational EM iteration {}!".format(t))


## save final results to Google Drive
output_dir = "/content/drive/MyDrive/batch_VI_sLDA_movie_rating/stochastic_K_{}".format(K)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

[0.49388029 0.49906426 0.45276773 0.2604802  0.5164606  0.61634337
 0.63179114 0.84425719 0.41218956 0.82165665 0.60394125 0.76271743]
0.03135951573580121
Complete minibatch variational EM iteration 1!
[0.44597377 0.55358922 0.4024345  0.27987654 0.49890882 0.54091868
 0.67075294 0.8105408  0.4596579  0.80835544 0.62158059 0.77320979]
0.03061122671254577
Complete minibatch variational EM iteration 2!
[0.52550652 0.5166626  0.43821679 0.39658554 0.5814124  0.53844027
 0.5922236  0.82578376 0.39065335 0.78878315 0.56341547 0.71889057]
0.02927642665436051
Complete minibatch variational EM iteration 3!
[0.5538499  0.51253916 0.46380618 0.38711009 0.61468216 0.4902054
 0.64340239 0.79054151 0.40377304 0.75437107 0.57061783 0.69588783]
0.029613850641196513
Complete minibatch variational EM iteration 4!
[0.62428336 0.49625381 0.53403712 0.3404159  0.63722618 0.52397789
 0.75173505 0.7325581  0.42207042 0.75194442 0.52222377 0.64656611]
0.028741672793136646
Complete minibatch variational EM it

KeyboardInterrupt: 

[0.61900968 0.60905984 0.53339034 0.60720331 0.62108294 0.58060788
 0.58674932 0.60884945 0.60400867 0.51931438 0.55844132 0.56122249]
0.023536195269300956
Complete minibatch variational EM iteration 160!
[0.62640476 0.61489498 0.52463038 0.60253556 0.63330996 0.56509603
 0.59557279 0.61152251 0.59743215 0.50560905 0.58063775 0.55479321]
0.02340949791118042
Complete minibatch variational EM iteration 161!
[0.61527907 0.60488074 0.51205725 0.60761124 0.66019421 0.55229587
 0.59307546 0.60893163 0.59204806 0.52155278 0.58195988 0.562717  ]
0.023385678589686498
Complete minibatch variational EM iteration 162!


In [32]:
# Spot-check the step size rho_t produced by step_size at iteration t = 20 with the current tau and kappa.
step_size(20, tau, kappa)

0.16094164024930613

In [21]:
# Display the expect_x_x_t matrix held by the current m_step object.
m_step.expect_x_x_t

array([[14.65728224, 12.30921185, 11.77309185, 10.8060656 , 12.59470989,
        12.18734841, 11.34577679, 10.8131603 , 14.02680931, 12.90502614,
        13.95533069, 13.96690943],
       [12.30921185, 17.48810712, 13.02572319, 11.654981  , 13.86735706,
        13.08048636, 12.42066635, 11.889831  , 15.35946799, 14.17032383,
        15.38340621, 15.16669453],
       [11.77309185, 13.02572319, 16.415678  , 11.07309199, 13.33428991,
        12.76211989, 11.86867422, 11.31805046, 15.05656871, 13.50034963,
        14.77214419, 14.56388597],
       [10.8060656 , 11.654981  , 11.07309199, 14.28129462, 11.95145929,
        11.63088667, 10.6603755 , 10.34975444, 13.50931318, 12.39633525,
        13.33264388, 13.22306773],
       [12.59470989, 13.86735706, 13.33428991, 11.95145929, 18.9683695 ,
        13.9894664 , 12.85793225, 12.48213683, 15.96556471, 14.72645402,
        15.94515025, 15.64768185],
       [12.18734841, 13.08048636, 12.76211989, 11.63088667, 13.9894664 ,
        17.31238708, 1

In [22]:
# Manual reconstruction of the gradient, Hessian and Newton-type step for (eta, delta),
# using the quantities stored on the current m_step object.

# --- shared building blocks ---
phi_bar_times_y = m_step.y.dot(m_step.phi_bar) # K-dimensional vector
expect_x_x_t_times_eta = m_step.expect_x_x_t.dot(m_step.eta) # K-dimensional vector
y_t_y = np.sum(m_step.y**2) # sum of squared responses
temp_var = (phi_bar_times_y - expect_x_x_t_times_eta/2).dot(m_step.eta) # dot product with eta
residual_ss = y_t_y - 2*temp_var # term shared by the delta gradient and the delta-delta Hessian entry

# --- gradient: K entries for eta followed by one entry for delta ---
g_eta = (phi_bar_times_y - expect_x_x_t_times_eta) * (1/m_step.delta) # K-dimensional vector
g_delta = -m_step.D/2/m_step.delta + 1/2/m_step.delta**2 * residual_ss
g = m_step.scale_factor * np.append(g_eta, g_delta) # (K+1)-dimensional, scaled by m_step.scale_factor

# --- Hessian blocks ---
h_11 = -m_step.expect_x_x_t/m_step.delta # eta-eta block, K x K
h_21 = -g_eta / m_step.delta # mixed partial derivatives: K-dimensional vector
h_22 = m_step.D/2/m_step.delta**2 - 1/m_step.delta**3 * residual_ss # delta-delta entry

# --- assemble the (K+1) x (K+1) Hessian: eta block top-left, delta in the last row/column ---
h = np.zeros((m_step.K+1, m_step.K+1))
h[:-1, :-1] = h_11
h[-1, :-1] = h_21
h[:-1, -1] = h_21
h[-1, -1] = h_22
h *= m_step.scale_factor # same scaling as the gradient

# --- step: inverse Hessian applied to the gradient ---
h_inv = np.linalg.inv(h)
eta_delta_hat = h_inv @ g

In [28]:
# Display the scale factor that multiplies both the gradient g and the Hessian h in the Newton-step check.
m_step.scale_factor

2.002

In [27]:
# Step for eta alone: inverse of the eta-eta Hessian block applied to the unscaled eta gradient
# (ignores the delta row/column of the full Hessian).
np.linalg.inv(h_11) @ g_eta

array([-1.31084345, -1.17858552, -1.06127419, -0.82333022, -0.7281923 ,
       -0.63460334, -0.61099773, -0.46087697, -0.2898978 , -0.12512447,
        0.2027727 ,  0.17439202])

In [24]:
# Display the joint step h_inv @ g: the first K entries belong to eta, the last entry to delta.
eta_delta_hat

array([-13.57333026, -12.2038451 , -10.98912691,  -8.52529941,
        -7.54017918,  -6.57109793,  -6.32667001,  -4.77222149,
        -3.00179137,  -1.2956206 ,   2.09964111,   1.80576898,
         0.30489289])

In [18]:
# Determinant of expect_x_x_t -- presumably a check that the matrix is not singular before it is inverted.
np.linalg.det(m_step.expect_x_x_t)

1154768469.6294944

In [20]:
# Determinant of the inverse of expect_x_x_t; mathematically the reciprocal of det(expect_x_x_t).
np.linalg.det(np.linalg.inv(m_step.expect_x_x_t))

8.659744583438833e-10