In [100]:
import numpy as np
import pandas as pd
import time
from collections import defaultdict
import seaborn as sns

np.random.seed(1)

% matplotlib inline

In [29]:
ratings = pd.read_csv('../data/reviews_sample_100.csv').drop(['Unnamed: 0', 'reviewTime'], axis = 1)
ratings.columns = ['item', 'user', 'rating']
ratings = ratings[['user', 'item', 'rating']]
ratings['rating'] = ratings['rating'].astype(int)
ratings['rating'] = (ratings['rating'] >= 4).astype(int)

In [30]:
ratings['rating'].sum()

316891

In [155]:
x = np.zeros([7500,7500]).astype(bool)
pd.DataFrame(x).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Columns: 7500 entries, 0 to 7499
dtypes: bool(7500)
memory usage: 53.6 MB


In [160]:
ratings['rating'] = ratings['rating'].astype(bool)
ratings.pivot_table(index='user', columns='item').fillna(False)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
item,0307142493,0307514161,0310263662,0764001035,0764003828,0764005022,0764005685,0764005693,0764005707,0764006770,...,B00J4LMHMK,B00J5LXST0,B00JA3RPAG,B00JAQJMJ0,B00JBBJJ24,B00JKPHUE0,B00K2CHVJ4,B00K2CHWOI,B00KM9LY3Q,B00L4IDS4W
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A02755422E9NI29TCQ5W3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A08324222HTYZDE4L3F8Z,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A100JCBNALJFAW,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A100RW34WSLTUW,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A1010QRG4BH51B,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A10175AMUHOQC4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A102B8D74H64TO,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A102RDJLOHWS0W,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A10386251WFUSDQRAMLL1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
A103EXN5Q7HX6Z,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [171]:
class PLSI():
    
    def __init__(self, n_factors = 5, n_iters = 5, user = 'user', item = 'item', rating = 'rating'):
        self.n_items = 0
        self.n_users = 0
        self.n_latent_factors = n_factors
        self.n_iters = n_iters
        
        self.user = user
        self.item = item
        self.rating = rating
        
    def _make_bool_matrix(self):
        self.data[self.rating] = self.data[self.rating].astype(bool)
        data_matrix = self.data.pivot_table(index=self.user, columns=self.item).fillna(False)
        return data_matrix
        
    def _train_initialize(self, data):
        
        self.data = data[data[self.rating] > 0]
        self.data_zero = data.copy()
        self.data_matrix = self._make_bool_matrix()
        
        self.user_array = self.data[self.user].unique()
        self.item_array = self.data[self.item].unique()
        
        self.n_users = len(self.user_array)
        self.n_items = len(self.item_array)
        self.n_impl_ratings = self.data[self.rating].sum()
        
        self.items_dict = defaultdict(list)
        for index, value in enumerate(self.item_array):
            self.items_dict[value] = index
            
        self.users_dict = defaultdict(list)
        for index, value in enumerate(self.user_array):
            self.users_dict[value] = index
            
    def _param_initialize(self):
        self.prob_z_given_user = np.random.rand(self.n_users, self.n_latent_factors)
        self.prob_item_given_z = np.random.rand(self.n_latent_factors, self.n_items)
        self.prob_z_given_user_item = np.random.rand(self.n_latent_factors)
        
        user_group = self.data.groupby(self.user)
        self.count_user = user_group[self.rating].sum()
        self.prob_user = self.count_user / self.n_impl_ratings
        
        item_group = self.data.groupby(self.item)
        self.count_item = item_group[self.rating].sum()
        
#         self.prob_item_given_user = dict()
#         for user in self.user_array:
#             self.prob_item_given_user[user] = dict()
            
        self.prob_z_given_user_item = dict()
        for z in range(self.n_latent_factors):
            self.prob_z_given_user_item[z] = dict()
            for user in self.user_array:
                self.prob_z_given_user_item[z][user] = dict()
                
        self.prob_z_given_item_user = dict()
        for z in range(self.n_latent_factors):
            self.prob_z_given_item_user[z] = dict()
            for item in self.item_array:
                self.prob_z_given_item_user[z][item] = dict()
        
    def E_step(self):
        
#         for row_index in range(len(self.data)):
#             row = self.data.iloc[row_index, :]
#             user = row[self.user]
#             item = row[self.item]
#             rating = row[self.rating]
            
#             item_index = self.items_dict[item]
#             user_index = self.users_dict[user]
            
#             self.prob_item_given_user[user][item] = \
#                 np.dot(self.prob_z_given_user[user_index,:], self.prob_item_given_z[:,item_index])
                
#             for z in range(self.n_latent_factors):
#                 self.prob_z_given_user_item[z][user][item] = \
#                     self.prob_item_given_z[z, item_index] * self.prob_z_given_user[user_index, z] / \
#                     self.prob_item_given_user[user][item]
                    
#                 self.prob_z_given_item_user[z][item][user] = \
#                     self.prob_item_given_z[z, item_index] * self.prob_z_given_user[user_index, z] / \
#                     self.prob_item_given_user[user][item]

        for user in self.user_array:
            for item in self.item_array:
                item_index = self.items_dict[item]
                user_index = self.users_dict[user]
                
                self.prob_item_given_user[user][item] = \
                    np.dot(self.prob_z_given_user[user_index,:], self.prob_item_given_z[:,item_index])
                    
                for z in range(self.n_latent_factors):
                    self.prob_z_given_user_item[z][user][item] = \
                        self.prob_item_given_z[z, item_index] * self.prob_z_given_user[user_index, z] / \
                        self.prob_item_given_user[user][item]

                    self.prob_z_given_item_user[z][item][user] = \
                        self.prob_item_given_z[z, item_index] * self.prob_z_given_user[user_index, z] / \
                        self.prob_item_given_user[user][item]
        
        print('z|user')
        print(self.prob_z_given_user)
        print('item|z')
        print(self.prob_item_given_z)
        
        print('z=0|user,item')
        print(self.prob_z_given_user_item[0])
        print('z=0|item,user')
        print(self.prob_z_given_item_user[0])
        
        print('z=1|user,item')
        print(self.prob_z_given_user_item[1])
        print('z=1|item,user')
        print(self.prob_z_given_item_user[0])
            
        return
    
    def M_step(self):
        
        self.prob_item_given_user = np.dot(self.prob_z_given_user, self.prob_item_given_z)
        
        for z in range(self.n_latent_factors):
            
            #E-step
            prob_z_given_user_item = np.dot(self.prob_z_given_user[:,z], self.prob_item_given_z[z,:])
            
            data_matrix_z = self.data_matrix * prob_z_given_user_item
            
            #print(data_matrix_z)
            
            #print('loop 1')
            prob_item_given_z_den = 0
            for item in self.item_array:
                prob_item_given_z_den += sum(self.prob_z_given_item_user[z][item].values())
            
            print('prob_item_given_z_den, z={}:'.format(z), prob_item_given_z_den)
            
            #print('loop 2')
            for item in self.item_array:                
                prob_item_given_z_num = sum(self.prob_z_given_item_user[z][item].values())
                self.prob_item_given_z[z][self.items_dict[item]] = prob_item_given_z_num / prob_item_given_z_den
                
            #print('loop 3')
            for user in self.user_array:
                prob_z_given_user_num = sum(self.prob_z_given_user_item[z][user].values())
                self.prob_z_given_user[self.users_dict[user]][z] = prob_z_given_user_num / self.count_user[user]

    def calc_log_likelihood(self):
        log_likelihood = 0
        
        for user in self.user_array:
            prob_user = np.log(self.prob_user[user])
            prob_user *= self.count_user[user]
            summand = sum(np.log(list(self.prob_item_given_user[user].values())))
            
            log_likelihood += prob_user + summand
            
        return log_likelihood
    
    def calc_metrics(self):
        
        pred_list = list()
        for row_index in range(len(self.data_zero)):
            row = self.data_zero.iloc[row_index, :]
            user = row[self.user]
            item = row[self.item]
            #rating = 1 if row[self.rating] >= 4 else 0
            rating = row[self.rating]
            
            item_index = self.items_dict[item]
            user_index = self.users_dict[user]
            
            if isinstance(item_index, list) or isinstance(user_index, list):
                rating_pred = 0
            else:
                rating_pred = np.dot(self.prob_z_given_user[user_index, :], self.prob_item_given_z[:, item_index])
            
            pred_list.append(rating_pred)
            
        pred_array = np.array(pred_list)
        ratings_array = np.array(self.data_zero[self.rating])# >= 4
        
        precision = sum((pred_array >= 0.5) & (ratings_array == 1)) / sum(pred_array >= 0.5)
        recall = sum((pred_array >= 0.5) & (ratings_array == 1)) / sum(ratings_array)
        accuracy = sum((pred_array >= 0.5) == (ratings_array == 1)) / len(ratings_array)
            
        return accuracy, precision, recall
        
    def train(self, data):
        self._train_initialize(data)
        self._param_initialize()
        
        for i in range(self.n_iters):
#             print('iter', i)
            
#             print('e-step')
#             self.E_step()
            
            print('m-step')
            self.M_step()
            
#             log_l = self.calc_log_likelihood()
#             print(log_l)
            
#             print(np.dot(self.prob_z_given_user, self.prob_item_given_z))

#             accuracy, precision, recall = self.calc_metrics()
#             print(accuracy, precision, recall)

In [172]:
df_6 = pd.DataFrame([[1,1,1,0,0,0],
                [1,1,1,0,0,0],
                [1,1,1,0,0,0],
                [0,0,0,1,1,1],
                [0,0,0,1,1,1],
                [0,0,0,1,1,1]],
                columns = ['a','b','c','d','e','f'], index = list('qrstuv')).reset_index()
df_6 = df_6.melt(value_vars=['a','b','c','d','e','f'], id_vars = 'index')
df_6.columns = ['user', 'item', 'rating']
df_6.head()

Unnamed: 0,user,item,rating
0,q,a,1
1,r,a,1
2,s,a,1
3,t,a,0
4,u,a,0


In [173]:
np.random.seed(2)

plsi = PLSI(n_factors = 2, n_iters = 1)
plsi.train(df_6)
plsi.data_matrix

m-step
        rating                                                  
item         a         b         c         d         e         f
user                                                            
q     1.142106  1.142106  1.142106  0.000000  0.000000  0.000000
r     1.142106  1.142106  1.142106  0.000000  0.000000  0.000000
s     1.142106  1.142106  1.142106  0.000000  0.000000  0.000000
t     0.000000  0.000000  0.000000  1.142106  1.142106  1.142106
u     0.000000  0.000000  0.000000  1.142106  1.142106  1.142106
v     0.000000  0.000000  0.000000  1.142106  1.142106  1.142106
prob_item_given_z_den, z=0: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


ZeroDivisionError: division by zero

In [65]:
ratings_test = ratings[:10000]

plsi = PLSI(n_factors = 400, n_iters = 20)
plsi.train(ratings_test)

iter 0
-23515.2243062
iter 1
-111963.582965
iter 2
-109215.4194
iter 3
-104119.158405
iter 4
-96005.9792216
iter 5
-87290.1396422
iter 6
-81576.1443498
iter 7
-78695.9903795
iter 8
-77289.5329979
iter 9
-76549.103263
iter 10
-76108.4585402
iter 11
-75841.0631767
iter 12
-75672.6936806
iter 13
-75549.6142656
iter 14
-75455.2296552
iter 15
-75385.0226722
iter 16
-75332.2919763
iter 17
-75291.9931119
iter 18
-75263.7474682
iter 19
-75243.8713755
0.2862 1.0 0.00335101926836


In [161]:
begin = time.time()
print(plsi.calc_log_likelihood(ratings_test))
end = time.time()
print(end - begin)

nan
0.02793288230895996


In [123]:
len(ratings) / 10000 * (end - begin)

316.606014110446

In [124]:
total_prob = 0
for i in range(plsi.n_latent_factors):
    total_prob += plsi.prob_z_given_user_item[i]['A1GHUN5HXMHZ89']['076400459X']
print(total_prob)

1.0
