In [73]:
import os
import pickle
import numpy as np
from text_processing_utils import *
import glob
import math
from scipy.stats import pearsonr

In [74]:
# Load the preprocessed corpus artifacts from disk: the ratings (converted to a
# numpy array), the cleaned reviews, and the vocabulary dictionary.
# Each file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
with open("data/scaledata/cleaned_ratings.pickle", "rb") as fh:
    cleaned_ratings = np.array(pickle.load(fh))
with open("data/scaledata/cleaned_reviews.pickle", "rb") as fh:
    cleaned_reviews = pickle.load(fh)
with open("data/scaledata/vocabulary_dict.pickle", "rb") as fh:
    vocabulary_dict = pickle.load(fh)

In [75]:
# 80:20 random partition of the movie reviews into training and testing parts.
# The seed is fixed so the same partition is drawn on every run.
np.random.seed(7654321)
num_reviews = len(cleaned_ratings)
train_indices = np.random.choice(np.arange(num_reviews), int(num_reviews*0.8), replace=False)
# everything that was not drawn for training goes to the test set
test_indices = np.setdiff1d(np.arange(num_reviews), train_indices)
print(train_indices.size, test_indices.size)
# bag-of-words inputs for each part
train_bow = convert_bow([cleaned_reviews[doc] for doc in train_indices])
test_bow = convert_bow([cleaned_reviews[doc] for doc in test_indices])
# matching response values (ratings) for each part
train_y, test_y = cleaned_ratings[train_indices], cleaned_ratings[test_indices]

4004 1002


In [76]:
# Model dimensions used by the cells below.
K = 36 # number of topics
V = len(vocabulary_dict) # vocabulary size (number of entries in vocabulary_dict)

In [77]:
## Loading the optimized global parameters from the model training phase
np.random.seed(12345)
new_alpha = pickle.load(open("data/scaledata/K_{}/alpha.pickle".format(K), "rb"))
new_xi = pickle.load(open("data/scaledata/K_{}/xi.pickle".format(K), "rb"))
new_eta = pickle.load(open("data/scaledata/K_{}/eta.pickle".format(K), "rb"))
new_delta = pickle.load(open("data/scaledata/K_{}/delta.pickle".format(K), "rb"))
new_Lambda = pickle.load(open("data/scaledata/K_{}/Lambda.pickle".format(K), "rb"))
input_data_x = test_bow
input_data_y = test_y
fpath = "fragmented_output_files/" # where to store the temporary fragmented files during parallelized E steps
if not os.path.exists(fpath[:-1]):
    os.makedirs(fpath[:-1])
else:
    for fn in glob.glob(fpath + "*"):
        os.remove(fn)
epsilon = 1e-4 # stopping criteria for convergence in E step
predict = True # prediction mode for evaluating the test set

## Run batch mode variational EM
elbo_vs_time = [-math.inf]
improve_in_elbo = math.inf
time_elapsed = 0
    
## Run one iteration of unsupervised E step (parallelized) to identify the optimal local variational parameters for all documents in the test set.
## The optimized phi identified from the unsupervised batch VI will be used to generate the predictions for response variable y in the test set
%run -i "parallelized_sLDA_E_step.py" # unsupervised batch VI for sLDA is way faster than supervised one
all_gamma = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "gamma*")]
new_gamma_dict = merge_dict(all_gamma)
new_gamma = create_gamma_matrix(new_gamma_dict)
all_phi = [pickle.load(open(fn, "rb")) for fn in glob.glob(fpath + "phi*")]
new_phi = merge_dict(all_phi)

In [78]:
# For every topic learned by the trained sLDA model, show its ten highest-weighted
# words (by the topic's row of Lambda) next to the topic's regression coefficient (eta).
inverse_vocabulary_dict = dict((index, word) for word, index in vocabulary_dict.items())
for t in range(K):
    # argsort is ascending, so reverse it and keep the first ten positions
    ranked_word_ids = np.argsort(new_Lambda[t,:])[::-1]
    top_words = [inverse_vocabulary_dict[word_id] for word_id in ranked_word_ids[:10]]
    print("Topic {}:".format(t+1), top_words, new_eta[t])

Topic 1: ['emotional', 'art', 'visual', 'producers', 'despite', 'however', 'nearly', 'color', 'aspect', 'deep'] 1.31036044964282
Topic 2: ['movies', 'humor', 'why', 'ever', 'then', 'sex', 'comes', 'sexual', 'gives', 'old'] 0.3611204456296484
Topic 3: ['line', 'subject', 'men', 'style', 'subscribe', 'details', 'women', 'find', 'word', 'attempt'] 0.7674899959713208
Topic 4: ['seemed', 'clear', 'plenty', 'williams', 'fairly', 'chris', 'parker', 'kiss', 'creepy', 'terribly'] -0.686860373922249
Topic 5: ['through', 'where', 'feels', 'narrative', 'stories', 'yet', 'adult', 'mark', 'times', 'modern'] 0.9095796544160848
Topic 6: ['right', 'half', 'actually', 'watching', 'lines', 'works', 'going', 'along', 'believe', 'whose'] 0.4034043438733903
Topic 7: ['kids', 'son', 'pg', 'jeffrey', '10', 'laugh', 'thought', 'joke', 'call', 'bunch'] 0.5537137774476548
Topic 8: ['de', 'rich', 'oscar', 'colors', 'force', 'today', 'images', 'contrast', 'road', 'intense'] 2.2689603737657578
Topic 9: ['funny', 'p

In [79]:
# Predict the response variable y for every document in the test set.
# For Gaussian response, y_hat = E[phi_bar^T eta], where phi_bar is each
# document's phi averaged over axis 0.
phi_bar = {}
for doc_key, doc_phi in new_phi.items():
    phi_bar[doc_key] = doc_phi.mean(axis=0)
# one prediction per test document, looked up by its position 0..len(test_y)-1
pred_y = np.fromiter(
    (np.dot(phi_bar[position], new_eta) for position in range(len(test_y))),
    dtype=float,
    count=len(test_y),
)

In [80]:
# predictive R^2
def predictive_R2(test_y, pred_y):
    """Return the predictive R^2 of predictions against observed responses.

    Computed as ``1 - mean((test_y - pred_y)**2) / var(test_y)``.

    Parameters
    ----------
    test_y : array-like
        Observed response values.
    pred_y : array-like
        Predicted response values; must have the same shape as ``test_y``.

    Returns
    -------
    float
        1.0 for perfect predictions, 0.0 when the mean squared error equals the
        variance of ``test_y``, negative when it is larger. If ``test_y`` has zero
        variance the ratio is undefined and numpy's division rules apply.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Accept plain lists/tuples as well as numpy arrays.
    test_y = np.asarray(test_y)
    pred_y = np.asarray(pred_y)
    # Guard against silent broadcasting: e.g. (n,) minus (n, 1) would yield an
    # (n, n) matrix and a meaningless score instead of an error.
    if test_y.shape != pred_y.shape:
        raise ValueError(
            "test_y and pred_y must have the same shape, got {} and {}".format(
                test_y.shape, pred_y.shape
            )
        )
    return 1 - np.mean((test_y - pred_y)**2) / np.var(test_y)
# Score the test-set predictions; as the last expression of the cell, the value is displayed.
predictive_R2(test_y, pred_y)

0.1869059803755514

In [81]:
# Pearson correlation coefficient between the observed test ratings and the predictions
# (scipy.stats.pearsonr; the displayed result carries the statistic and its p-value)
pearsonr(test_y, pred_y)

PearsonRResult(statistic=0.43325864600685615, pvalue=4.1492091221226707e-47)