In [113]:
import numpy as np
import pandas as pd
import time
from collections import defaultdict
import seaborn as sns
import warnings
from IPython.display import clear_output

warnings.filterwarnings('ignore')

np.random.seed(1)

% matplotlib inline

In [14]:
ratings = pd.read_csv('../data/reviews_sample_100.csv').drop(['Unnamed: 0', 'reviewTime'], axis = 1)
ratings.columns = ['item', 'user', 'rating']
ratings = ratings[['user', 'item', 'rating']]
ratings['rating'] = ratings['rating'].astype(int)
ratings['rating'] = (ratings['rating'] >= 4).astype(bool)
ratings.head()

Unnamed: 0,user,item,rating
0,A1JH5J1KQAUBMP,B00005U0JX,True
1,A1RSXP7MB772E3,B001DHXT1G,True
2,AJGP5XYKKBGBG,0792840054,True
3,A2SQJPUCZNHMZE,B005LAIHSG,True
4,A3QVAKVRAH657N,B00005K3OU,False


In [140]:
class PLSI():
    
    def __init__(self, n_factors = 5, n_iters = 5, threshold = 0.5, optimize_threshold = False,
                 verbose = False, user = 'user', item = 'item', rating = 'rating'):
        self.n_items = 0
        self.n_users = 0
        self.n_latent_factors = n_factors
        self.n_iters = n_iters
        
        self.verbose = verbose
        self.threshold = threshold
        self.b_optimize_threshold = optimize_threshold
        
        self.user = user
        self.item = item
        self.rating = rating
        
    def _make_bool_matrix(self):
        self.data.loc[:,self.rating] = self.data[self.rating].astype(bool)
        data_matrix = self.data.pivot_table(index=self.user, columns=self.item).fillna(False)
        return data_matrix
        
    def _train_initialize(self, data):
        
        self.data = data[data[self.rating] > 0]
        self.data_zero = data.copy()
        self.data_matrix = self._make_bool_matrix()
        print('num users:', self.data_matrix.shape[0])
        print('num items:', self.data_matrix.shape[1])
        
        print('proportion positive:', len(self.data) / len(self.data_zero))

        self.user_array = np.array(self.data_matrix.index)
        self.item_array = np.array(self.data_matrix.columns.levels[-1])
        
        self.n_users = len(self.user_array)
        self.n_items = len(self.item_array)
        self.n_impl_ratings = self.data[self.rating].sum()
        
        self.items_dict = defaultdict(list)
        for index, value in enumerate(self.item_array):
            self.items_dict[value] = index
            
        self.users_dict = defaultdict(list)
        for index, value in enumerate(self.user_array):
            self.users_dict[value] = index
            
        print('')
            
    def _param_initialize(self):
        self.prob_z_given_user = np.random.rand(self.n_users, self.n_latent_factors)
        self.prob_item_given_z = np.random.rand(self.n_latent_factors, self.n_items)
        self.prob_z_given_user_item = np.random.rand(self.n_latent_factors)
        
        user_group = self.data.groupby(self.user)
        self.count_user = user_group[self.rating].sum()
        self.prob_user = self.count_user / self.n_impl_ratings
        
        item_group = self.data.groupby(self.item)
        self.count_item = item_group[self.rating].sum()
    
    def _update_params(self):
        
        self.prob_item_given_user = np.dot(self.prob_z_given_user, self.prob_item_given_z)
        
        for z in range(self.n_latent_factors):
            print('\n===========Z={}==========='.format(z))
            
            print('\n=======E-step=======')
            
            prob_z_given_user_item = np.dot(self.prob_z_given_user[:,z].reshape(1, -1).T,\
                                            self.prob_item_given_z[z,:].reshape(1, -1)) / \
                                            self.prob_item_given_user
            
            data_matrix_z = self.data_matrix * prob_z_given_user_item
            
            print('\n=======LOOP 1=======')
            prob_z_given_user_num_array = data_matrix_z.sum(axis = 1)
            prob_z_given_user_den_array = self.data_matrix.sum(axis = 1)
            
            prob_z_given_user_array = prob_z_given_user_num_array.values / prob_z_given_user_den_array
            self.prob_z_given_user[:,z] = prob_z_given_user_array
            
            print('\n=======LOOP 2=======')
            prob_item_given_z_den = data_matrix_z.sum().sum()
            prob_item_given_z_num_array = data_matrix_z.sum(axis = 0)
            prob_item_given_z_array = prob_item_given_z_num_array / prob_item_given_z_den
            self.prob_item_given_z[z, :] = prob_item_given_z_array
            clear_output()

    def _calc_log_likelihood(self):

        summand_1 = (self.data_matrix * np.log(self.prob_item_given_user)).sum().sum()
        prob_user_array = (self.data_matrix.sum(axis = 1) / self.data_matrix.sum().sum()).values
        count_user_array = self.data_matrix.sum(axis = 1)
        
        summand_2 = (count_user_array * np.log(prob_user_array)).sum()
        log_likelihood = summand_1 + summand_2
        
        return log_likelihood
    
    def _calc_reconstruction(self):
        
        total_instances = self.data_matrix.sum().sum()
        
        prob_user_array = (self.data_matrix.sum(axis = 1) / self.data_matrix.sum().sum()).values.reshape(1, -1).T
        self.prob_joint_user_item = self.prob_item_given_user * prob_user_array * total_instances
        return self.prob_joint_user_item
    
    def _train_predict(self):
        
        self._calc_reconstruction()
        pred_list = list()
        for row_index in range(len(self.data_zero)):
            row = self.data_zero.iloc[row_index, :]
            user = row[self.user]
            item = row[self.item]
            rating = row[self.rating]
            
            item_index = self.items_dict[item]
            user_index = self.users_dict[user]
            
            if isinstance(item_index, list) or isinstance(user_index, list):
                rating_pred = 0
            else:
                rating_pred = self.prob_joint_user_item[user_index][item_index]
            pred_list.append(rating_pred)
            
        pred_array = np.array(pred_list)
        ratings_array = np.array(self.data_zero[self.rating])
        
        return pred_array, ratings_array
    
    def _display_metrics(self):
        pred_array, ratings_array = self._train_predict()
        
        mae = np.abs(self.data_matrix - self.prob_joint_user_item).sum().sum() / \
            self.data_matrix.size
        precision = sum((pred_array >= self.threshold) & (ratings_array == 1)) / sum(pred_array >= self.threshold)
        recall = sum((pred_array >= self.threshold) & (ratings_array == 1)) / sum(ratings_array)
        accuracy = sum((pred_array >= self.threshold) == (ratings_array == 1)) / len(ratings_array)
        f1_score = (2 * precision * recall) / (precision + recall)

        print('mae:', mae)
        print('accuracy:', accuracy)
        print('f1-score:', f1_score)
        print('precision:', precision)
        print('recall:', recall)
    
    def optimize_threshold(self, data, upper = 0.1, steps = 100, target_precision = 0.9):
        print('\n=========optimizing threshold=========')
        ratings_array = data[self.rating]
        pred_array = self.predict_proba(data)
        threshold_array = np.linspace(0, upper, steps)[1:]
        
        best_threshold = 0
        for threshold in threshold_array:
            precision = sum((pred_array >= threshold) & (ratings_array == 1)) / sum(pred_array >= threshold)
            recall = sum((pred_array >= threshold) & (ratings_array == 1)) / sum(ratings_array)
            f1_score = (2 * precision * recall) / (precision + recall)
            if precision >= target_precision:
                best_threshold = threshold
                break
                
        self.threshold = best_threshold
        print('optimal threshold:', best_threshold)
        
    def fit(self, data):
        self._train_initialize(data)
        self._param_initialize()
        
        for i in range(self.n_iters):
            self._update_params()
        
            if self.verbose:
                print('\n==================ITER {}=================='.format(i+1))
                self._display_metrics()
                
                log_l = self._calc_log_likelihood()
                print('log-likelihood:', log_l)
                
        if self.b_optimize_threshold:
            self._optimize_threshold()
        if not self.verbose:
            self._display_metrics()
    
    def predict_proba(self, data):
        pred_list = list()
        for row_index in range(len(data)):
            row = data.iloc[row_index, :]
            user = row[self.user]
            item = row[self.item]
            #rating = row[self.rating]
            
            item_index = self.items_dict[item]
            user_index = self.users_dict[user]
            
            if isinstance(item_index, list) or isinstance(user_index, list):
                rating_pred = 0
            else:
                rating_pred = self.prob_joint_user_item[user_index][item_index]
            pred_list.append(rating_pred)
            
        pred_array = np.array(pred_list)
        #ratings_array = np.array(self.data_zero[self.rating])
        
        return pred_array#, ratings_array
        
    def predict(self, data):
        pred_array = self.predict_proba(data)
        return pred_array >= self.threshold 

In [143]:
ratings_train = ratings[:10000]

plsi = PLSI(n_factors = 10, n_iters = 10, threshold = 0.001, verbose = False)
plsi.fit(ratings_train)

mae: 0.0010037195998
accuracy: 0.9776
f1-score: 0.984577251446
precision: 0.971203477316
recall: 0.998324490366


In [167]:
plsi.optimize_threshold(ratings_test, steps=100, upper=0.0001)


optimal threshold: 1.0101010101e-06


In [None]:
results == ratings_test['rating']

In [134]:
x = plsi._calc_reconstruction()

In [138]:
ratings_test['rating'].sum()

35807

In [178]:
plsi.threshold = 0.02
ratings_test = ratings[10000:13000]
results = plsi.predict(ratings_test)
print('accuracy')
print(sum(results == ratings_test['rating']) / len(ratings_test))
print('precision')
print(sum(results & (ratings_test['rating'] == True)) / sum(results))
print('recall')
print(sum(results & (ratings_test['rating'] == True)) / sum(ratings_test['rating'] == True))

accuracy
0.287666666667
precision
0.869565217391
recall
0.00928505106778


In [85]:
np.random.seed(2)

df_6 = pd.DataFrame([[1,1,1,0,0,0],
                [1,1,1,0,0,0],
                [1,1,1,0,0,0],
                [0,0,0,1,1,0],
                [0,0,0,1,1,0],
                [0,0,0,1,1,0],
                [0,0,0,0,0,0]],
                columns = list('abcdef'), index = list('qrstuvw')).reset_index()
df_6 = df_6.melt(value_vars=list('abcdef'), id_vars = 'index')
df_6.columns = ['user', 'item', 'rating']
df_6['rating'] = df_6['rating'].astype(bool)
#df_6 = df_6.sample(38, replace=False)

plsi = PLSI(n_factors = 2, n_iters = 8)
plsi.fit(df_6)

np.round(plsi._calc_reconstruction(), 2)

num users: 6
num items: 5
proportion positive: 0.35714285714285715

mae: 0.0358382091568
accuracy: 1.0
f1-score: 1.0
precision: 1.0
recall: 1.0


array([[ 0.94,  0.94,  0.92,  0.1 ,  0.1 ],
       [ 1.  ,  0.99,  0.97,  0.02,  0.02],
       [ 1.  ,  0.99,  0.97,  0.02,  0.02],
       [ 0.02,  0.03,  0.04,  0.96,  0.96],
       [ 0.02,  0.03,  0.04,  0.96,  0.96],
       [ 0.02,  0.03,  0.04,  0.96,  0.96]])

In [89]:
plsi.predict(df_6) == df_6['rating']

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
20    True
21    True
22    True
23    True
24    True
25    True
26    True
27    True
28    True
29    True
30    True
31    True
32    True
33    True
34    True
35    True
36    True
37    True
38    True
39    True
40    True
41    True
Name: rating, dtype: bool

In [494]:
(plsi.calc_reconstruction() >= 0.5).sum()

4

In [123]:
len(ratings) / 10000 * (end - begin)

316.606014110446

In [124]:
total_prob = 0
for i in range(plsi.n_latent_factors):
    total_prob += plsi.prob_z_given_user_item[i]['A1GHUN5HXMHZ89']['076400459X']
print(total_prob)

1.0
