In [1]:
#https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html
import pandas as pd
import numpy as np
import datetime

from sklearn.metrics.pairwise import pairwise_distances
from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from math import sqrt

class CF():
    def __init__(self, path_read= "data.csv", path_write= 'rating_rank.csv'):
        self.path_read= path_read
        self.path_write= path_write
        
    def read_data(self, path_read):
        df= pd.read_csv(path_read)
        print( df.head() )
        return df

    def write_data(self, data_numpy, path_write): #numpy
        df= pd.DataFrame(data_numpy)
        df.to_csv(path_write)
        #print( df.head() )
        print(path_write)
        return df    
    
    def split_data(self, df):
        n_users = df.user_id.unique().shape[0]
        n_items = df.item_id.unique().shape[0]
        print ('Number of users = ' + str(n_users) + ' | Number of apts = ' + str(n_items) )

        train_data, test_data = cv.train_test_split(df, test_size=0.2)
        train_data.to_csv('pipenv/data/train.csv',index=False)
        test_data.to_csv('pipenv/data/test.csv', index=False)        
        return n_users, n_items, train_data, test_data
    
    def sparsity_level(self, df, n_users, n_items):
        sparsity=round(1.0-len(df)/float(n_users*n_items),3)
        print ('The sparsity level is ' +  str(sparsity*100) + '%')

    def generate_user_item_matrix(self, n_users, n_items, data):
        #Create two user-item matrices, one for training and another for testing
        data_matrix = np.zeros((n_users, n_items))
        for row in data.itertuples():
            data_matrix[row[1]-1, row[2]-1] = row[3]        
        return data_matrix
        
    def generate_sim(self, train_data_matrix):
        user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
        item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
        return user_similarity, item_similarity

    def rmse(self, prediction, ground_truth):
        prediction = prediction[ground_truth.nonzero()].flatten()
        ground_truth = ground_truth[ground_truth.nonzero()].flatten()
        return (mean_squared_error(prediction, ground_truth))**.5   
    
    def predict_memory_based(self, ratings, similarity, type='user'):
        if type == 'user':
            mean_user_rating = ratings.mean(axis=1)
            #You use np.newaxis so that mean_user_rating has same format as ratings
            ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
            pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        elif type == 'item':
            pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        return pred

    def predict_svd(self, train_data_matrix): #user based
        #get SVD components from train matrix. Choose k.
        u, s, vt = svds(train_data_matrix, k = 20)
        s_diag_matrix=np.diag(s)
        pred = np.dot(np.dot(u, s_diag_matrix), vt)
        return pred
    
    def predict(self, model='normal'):
        df= self.read_data(self.path_read)    
        n_users, n_items, train_data, test_data= self.split_data(df)  
        self.sparsity_level(df, n_users, n_items) 

        train_data_matrix= self.generate_user_item_matrix(n_users, n_items, train_data)
        test_data_matrix= self.generate_user_item_matrix(n_users, n_items, test_data) 
        
        print('\ntime evalation: ')
        start_time= datetime.datetime.now()                
        user_similarity, item_similarity= self.generate_sim(train_data_matrix)
        print("generate similarity time: ", (datetime.datetime.now()-start_time), "h:m:s")        
        
        start_time= datetime.datetime.now() 
        svd_prediction  = self.predict_svd(train_data_matrix)
        print("User-based SVD prediction time: ", (datetime.datetime.now()-start_time), "h:m:s")        
        
        if model=='normal': return svd_prediction
        
        #-------------------analysis---------------------------
        start_time= datetime.datetime.now()        
        item_prediction = self.predict_memory_based(train_data_matrix, item_similarity, type='item')
        print("Item-based CF prediction time: ", (datetime.datetime.now()-start_time), "h:m:s")
                
        start_time= datetime.datetime.now() 
        user_prediction = self.predict_memory_based(train_data_matrix, user_similarity, type='user')     
        print("Item-User CF prediction time: ", (datetime.datetime.now()-start_time), "h:m:s")
 
        print('\nmetrics evalation: ')
        print ( 'User-based SVD CF RMSE: '   + str(self.rmse(svd_prediction, test_data_matrix )) )
        print ( 'User-based CF RMSE: '      + str(self.rmse(user_prediction, test_data_matrix)) )
        print ( 'Item-based CF RMSE: '      + str(self.rmse(item_prediction, test_data_matrix)) ) 

        #select SVD and save it
        df_svd_pred= self.write_data(svd_prediction, self.path_write)
 
        return pd.DataFrame(test_data_matrix), df_svd_pred 
        

if __name__ == "__main__": 
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('path_read_data', help='path_read_data')
        parser.add_argument('path_write_data', help='path_write_data')          
        args = parser.parse_args()
        path_read_data= args.path_read_data
        path_write_data= args.path_write_data 
        
    except:
        path_read_data= 'pipenv/data/data.csv'
        path_write_data= 'pipenv/result/rating_rank.csv'    
    
df_true, df_pred= CF(path_read_data, path_write_data).predict('analysis')    



   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
Number of users = 943 | Number of apts = 1682
The sparsity level is 93.7%

time evalation: 
generate similarity time:  0:00:00.435255 h:m:s
User-based SVD prediction time:  0:00:00.282325 h:m:s
Item-based CF prediction time:  0:00:00.106003 h:m:s
Item-User CF prediction time:  0:00:00.076989 h:m:s

metrics evalation: 
User-based SVD CF RMSE: 2.6591159284440264
User-based CF RMSE: 3.099161799976676
Item-based CF RMSE: 3.4477960848660216
pipenv/result/rating_rank.csv


In [2]:
df_pred.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
938,1.868818,-0.282111,0.292818,-0.24295,0.23406,0.097612,0.919129,0.282898,1.37442,0.154453,...,0.010724,0.01415,0.002332,0.001555,0.014886,-0.00514,-0.015419,-0.01028,0.0,0.001389
939,0.205127,-0.077635,0.029684,1.616899,0.095754,0.036495,1.869024,0.846353,1.291662,-0.028745,...,0.016183,0.007509,5e-05,3.3e-05,-0.005664,0.009079,0.027237,0.018158,0.0,-0.017878
940,1.484907,0.028483,0.375796,-0.133522,-0.100966,0.031121,1.105262,0.150616,0.606985,0.162601,...,-0.004439,0.000929,0.005599,0.003732,0.011975,-0.001646,-0.004939,-0.003293,0.0,-8.9e-05
941,1.179122,0.115087,-0.647073,0.352824,-0.21105,-0.129424,-0.407211,0.88266,-0.265515,-0.316056,...,0.023457,0.026153,0.011557,0.007705,-0.004462,0.009871,0.029613,0.019742,0.0,-0.029647
942,1.270242,1.823104,0.76695,2.117326,1.314199,-0.014187,2.727828,1.074376,2.438255,-0.497862,...,0.001061,-0.042455,-0.006228,-0.004152,-0.002907,-0.004588,-0.013763,-0.009175,0.0,0.023157


In [3]:
df_true

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
