In [1]:
import numpy as np
import pandas as pd
import time
from collections import defaultdict
import seaborn as sns
import warnings
from IPython.display import clear_output

warnings.filterwarnings('ignore')

np.random.seed(1)

% matplotlib inline

In [2]:
ratings = pd.read_csv('../data/reviews_sample_100.csv').drop(['Unnamed: 0', 'reviewTime'], axis = 1)
ratings.columns = ['item', 'user', 'rating']
ratings = ratings[['user', 'item', 'rating']]
ratings['rating'] = ratings['rating'].astype(int)
ratings['rating'] = (ratings['rating'] >= 4).astype(bool)
ratings.head()

Unnamed: 0,user,item,rating
0,A1JH5J1KQAUBMP,B00005U0JX,True
1,A1RSXP7MB772E3,B001DHXT1G,True
2,AJGP5XYKKBGBG,0792840054,True
3,A2SQJPUCZNHMZE,B005LAIHSG,True
4,A3QVAKVRAH657N,B00005K3OU,False


In [37]:
class PLSI():
    
    def __init__(self, n_factors = 5, n_iters = 5,
                 verbose = False, user = 'user', item = 'item', rating = 'rating'):
        self.n_items = 0
        self.n_users = 0
        self.n_latent_factors = n_factors
        self.n_iters = n_iters
        
        self.verbose = verbose
        
        self.user = user
        self.item = item
        self.rating = rating
        
    def _make_bool_matrix(self):
        self.data.loc[:,self.rating] = self.data[self.rating].astype(bool)
        data_matrix = self.data.pivot_table(index=self.user, columns=self.item).fillna(False)
        return data_matrix
        
    def _train_initialize(self, data):
        
        self.data = data[data[self.rating] > 0]
        self.data_zero = data.copy()
        self.data_matrix = self._make_bool_matrix()
        print('num users:', self.data_matrix.shape[0])
        print('num items:', self.data_matrix.shape[1])
        
        print('proportion positive:', len(self.data) / len(self.data_zero))

        self.user_array = np.array(self.data_matrix.index)
        self.item_array = np.array(self.data_matrix.columns.levels[-1])
        
        self.n_users = len(self.user_array)
        self.n_items = len(self.item_array)
        self.n_impl_ratings = self.data[self.rating].sum()
        
        self.items_dict = defaultdict(list)
        for index, value in enumerate(self.item_array):
            self.items_dict[value] = index
            
        self.users_dict = defaultdict(list)
        for index, value in enumerate(self.user_array):
            self.users_dict[value] = index
            
        print('')
            
    def _param_initialize(self):
        self.prob_z_given_user = np.random.rand(self.n_users, self.n_latent_factors)
        self.prob_item_given_z = np.random.rand(self.n_latent_factors, self.n_items)
        self.prob_z_given_user_item = np.random.rand(self.n_latent_factors)
        
        user_group = self.data.groupby(self.user)
        self.count_user = user_group[self.rating].sum()
        self.prob_user = self.count_user / self.n_impl_ratings
        
        item_group = self.data.groupby(self.item)
        self.count_item = item_group[self.rating].sum()
    
    def _update_params(self):
        
        self.prob_item_given_user = np.dot(self.prob_z_given_user, self.prob_item_given_z)
        
        for z in range(self.n_latent_factors):
            #print('\n===========Z={}==========='.format(z))
            
            #print('\n=======E-step=======')
            
            prob_z_given_user_item = np.dot(self.prob_z_given_user[:,z].reshape(1, -1).T,\
                                            self.prob_item_given_z[z,:].reshape(1, -1)) / \
                                            self.prob_item_given_user
            
            data_matrix_z = self.data_matrix * prob_z_given_user_item
            
            #print('\n=======LOOP 1=======')
            prob_z_given_user_num_array = data_matrix_z.sum(axis = 1)
            prob_z_given_user_den_array = self.data_matrix.sum(axis = 1)
            
            prob_z_given_user_array = prob_z_given_user_num_array.values / prob_z_given_user_den_array
            self.prob_z_given_user[:,z] = prob_z_given_user_array
            
            #print('\n=======LOOP 2=======')
            prob_item_given_z_den = data_matrix_z.sum().sum()
            prob_item_given_z_num_array = data_matrix_z.sum(axis = 0)
            prob_item_given_z_array = prob_item_given_z_num_array / prob_item_given_z_den
            self.prob_item_given_z[z, :] = prob_item_given_z_array
            #clear_output()

    def _calc_log_likelihood(self):

        summand_1 = (self.data_matrix * np.log(self.prob_item_given_user)).sum().sum()
        prob_user_array = (self.data_matrix.sum(axis = 1) / self.data_matrix.sum().sum()).values
        count_user_array = self.data_matrix.sum(axis = 1)
        
        summand_2 = (count_user_array * np.log(prob_user_array)).sum()
        log_likelihood = summand_1 + summand_2
        
        return log_likelihood
    
    def _calc_joint_matrix(self):
        
        total_instances = self.data_matrix.sum().sum()
        
        prob_user_array = (self.data_matrix.sum(axis = 1) / self.data_matrix.sum().sum()).values.reshape(1, -1).T
        self.prob_joint_user_item = self.prob_item_given_user * prob_user_array * total_instances
        return self.prob_joint_user_item
        
    def fit(self, data):
        self._train_initialize(data)
        self._param_initialize()
        
        for i in range(self.n_iters):
            self._update_params()
        
            if self.verbose:
                print('\n==================ITER {}=================='.format(i+1))
                
                log_l = self._calc_log_likelihood()
                print('log-likelihood:', log_l)
        self._calc_joint_matrix()
    
    def predict_proba(self, data):
        pred_list = list()
        for row_index in range(len(data)):
            row = data.iloc[row_index, :]
            user = row[self.user]
            item = row[self.item]
            #rating = row[self.rating]
            
            item_index = self.items_dict[item]
            user_index = self.users_dict[user]
            
            if isinstance(item_index, list) or isinstance(user_index, list):
                rating_pred = 0
            else:
                rating_pred = self.prob_item_given_user[user_index][item_index]
            pred_list.append(rating_pred)
            
        pred_array = np.array(pred_list)
        
        return pred_array
    
    def recommend_top_k(self, user, k = 10):
        user_index = self.users_dict[user]
        item_list = self.data_matrix.columns.levels[-1]
        probas = self.prob_item_given_user[user_index,:]
        indices = np.argsort(probas)[::-1]
        return item_list[indices], probas[indices]
    
    def calc_precision(self, data, k = 10):
        user_array = data[self.user].unique()
        
        precision_list = list()
        i = 0
        for user in user_array:
            print(i, user)
            i += 1
            recommendations, probas = self.recommend_top_k(user, k)
            
            precision_list_user = list()
            for recommendation in recommendations:
                query = data[(data[self.user] == user) & (data[self.item] == recommendation)]
                if len(query) > 0:
                    precision_list_user.append(query[self.rating].values[0])
                if len(precision_list_user) >= k:
                    break
            if len(precision_list_user):
                precision_list.append(sum(precision_list_user) / len(precision_list_user))
                print(sum(precision_list_user), len(precision_list_user))
        if len(precision_list):
            return sum(precision_list) / len(precision_list)
        else:
            return None

In [78]:
df_4_orig = pd.DataFrame([[1,1,0,0],
                    [1,1,0,0],
                    [0,0,1,1],
                    [0,0,1,1]], columns = list('abcd'), index=list('qrst'))
df_4 = df_4_orig.reset_index()
df_4 = df_4.melt(value_vars=list('abcd'), id_vars = 'index')
df_4.columns = ['user', 'item', 'rating']

plsi = PLSI(n_factors = 2, n_iters = 20)
plsi.fit(df_4)
pred = plsi.prob_joint_user_item
print((pred - df_4_orig).values.mean())

num users: 4
num items: 4
proportion positive: 0.5

0.0


In [80]:
np.random.seed(2)

df_6_orig = pd.DataFrame([[1,1,1,0,0,0],
                    [1,1,1,0,0,0],
                    [1,1,0,0,1,0],
                    [0,1,0,1,1,1],
                    [0,1,0,1,0,1],
                    [0,0,0,1,1,1]],
                    columns = list('abcdef'), index = list('qrstuv'))
df_6 = df_6_orig.reset_index()
df_6 = df_6.melt(value_vars=list('abcdef'), id_vars = 'index')
df_6.columns = ['user', 'item', 'rating']

plsi = PLSI(n_factors = 2, n_iters = 20)
plsi.fit(df_6)
pred = plsi.prob_joint_user_item
print((pred - df_6_orig).values.mean())

num users: 6
num items: 6
proportion positive: 0.5277777777777778

-6.7846962616e-17


In [None]:
ratings_train = ratings[:10000]

plsi = PLSI(n_factors = 20, n_iters = 10, verbose = True)
plsi.fit(ratings_train)

In [7]:
y_pred = plsi.predict_proba(ratings_train)
y_train = ratings_train['rating']

In [12]:
y_train.mean()

0.71619999999999995

In [8]:
np.abs(y_pred - y_train).mean()

0.7127523675420432