# 初始化数据

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [21]:
header = ['user_id', 'item_id','rating', 'timestamp']

In [22]:
df = pd.read_csv('data/recommender_system.data', sep='\t', names=header)

In [23]:
user_number = df.user_id.unique().shape[0]

In [24]:
item_number = df.item_id.unique().shape[0]

In [25]:
print('Number of users is: %i, Number of item is %i'% (user_number,item_number))

Number of users is: 943, Number of item is 1682


# 分离样本的训练集和验证集

In [26]:
from sklearn import cross_validation

In [27]:
train_data, test_data = cross_validation.train_test_split(df, test_size=0.3)

In [28]:
train_matrix = np.zeros((user_number,item_number))

In [29]:
for row in train_data.itertuples():
    train_matrix[row[1]-1,row[2]-1] = row[3]

In [30]:
test_matrix = np.zeros((user_number,item_number))

In [31]:
for row in test_data.itertuples():
    test_matrix[row[1]-1,row[2]-1] = row [3]

In [32]:
train_matrix

array([[ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [33]:
test_matrix

array([[ 0.,  3.,  4., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

# 推荐

In [34]:
from sklearn.metrics.pairwise import pairwise_distances as pairdis

In [35]:
user_similarity = pairdis(train_matrix, metric='cosine')

In [36]:
item_similarity = pairdis(train_matrix.T, metric='cosine')

In [37]:
mean_user_rating = train_matrix.mean(axis=1)

In [38]:
ratings_diff = (train_matrix - mean_user_rating[:,np.newaxis])

In [39]:
user_based_prediction = mean_user_rating[:,np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_based_prediction

array([[ 1.48466306,  0.52915746,  0.44836066, ...,  0.26549171,
         0.26298553,  0.26537561],
       [ 1.24721491,  0.26345505,  0.14171869, ..., -0.05802702,
        -0.05997408, -0.05662606],
       [ 1.28010412,  0.23288549,  0.12009011, ..., -0.08736594,
        -0.08899521, -0.08559455],
       ..., 
       [ 1.18641069,  0.19476484,  0.07621989, ..., -0.11630005,
        -0.11847175, -0.11532217],
       [ 1.30056996,  0.28407907,  0.18782669, ..., -0.01548953,
        -0.01755495, -0.01439748],
       [ 1.33897329,  0.34826858,  0.27346428, ...,  0.09111681,
         0.08867613,  0.09116272]])

In [40]:
item_based_prediction = train_matrix.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])
item_based_prediction

array([[ 0.33444292,  0.35576666,  0.37469504, ...,  0.40764396,
         0.40571089,  0.39612763],
       [ 0.082002  ,  0.09408052,  0.09067276, ...,  0.09603233,
         0.09756098,  0.09749222],
       [ 0.0690916 ,  0.07046979,  0.06879647, ...,  0.06690415,
         0.06960143,  0.07037731],
       ..., 
       [ 0.03000795,  0.03471791,  0.03283393, ...,  0.03797959,
         0.03807258,  0.03742809],
       [ 0.11869197,  0.12522957,  0.13101813, ...,  0.13370035,
         0.13444378,  0.13428967],
       [ 0.19006687,  0.18871901,  0.2082593 , ...,  0.23412461,
         0.23259964,  0.22650464]])

# 结果评估

In [41]:
from sklearn.metrics import mean_squared_error

In [42]:
from math import sqrt

In [43]:
user_based_prediction = user_based_prediction[test_matrix.nonzero()].flatten()

In [44]:
test_matrix_flatten = test_matrix[test_matrix.nonzero()].flatten()

In [45]:
sqrt(mean_squared_error(user_based_prediction, test_matrix_flatten))

3.155816750329327

In [46]:
item_based_prediction = item_based_prediction[test_matrix.nonzero()].flatten()

In [47]:
sqrt(mean_squared_error(item_based_prediction, test_matrix_flatten))

3.4615757467039345

# 基于SVD的最大似然法

In [48]:
#稀疏度
sparsity = 1.0-(len(df)/float(user_number*item_number))
sparsity

0.9369533063577546

In [49]:
from scipy.sparse.linalg import svds

In [50]:
u, s, v = svds(train_matrix, k=30)

In [51]:
s_matrix = np.diag(s)

In [52]:
SVD_prediction = np.dot(np.dot(u, s_matrix), v)

In [53]:
SVD_prediction = SVD_prediction[test_matrix.nonzero()].flatten()

In [54]:
sqrt(mean_squared_error(SVD_prediction, test_matrix_flatten))

2.868767677605884

In [55]:
SVD_prediction

array([ 1.02842755,  0.48607747,  0.61409702, ...,  0.35520634,
        0.57598903,  0.26415906])

In [56]:
np.dot(np.dot(u, s_matrix), v)

array([[  6.52989416e+00,   1.02842755e+00,   4.86077466e-01, ...,
         -1.93725941e-02,   0.00000000e+00,   3.76396292e-02],
       [  2.35497392e+00,  -3.53985663e-02,  -3.02997367e-02, ...,
          9.76522087e-03,   0.00000000e+00,  -8.06179166e-03],
       [  3.87049801e-02,  -4.14799804e-02,   3.29515490e-02, ...,
          1.71175031e-02,   0.00000000e+00,   2.77705362e-03],
       ..., 
       [  9.36432901e-01,  -1.44691206e-01,   3.68017974e-01, ...,
         -2.18081980e-03,   0.00000000e+00,  -2.45209496e-03],
       [ -1.27691497e-01,   1.29606246e-01,  -4.93986250e-03, ...,
          1.00372016e-02,   0.00000000e+00,  -3.40191801e-02],
       [ -6.91830217e-01,   1.31094932e+00,   8.67003782e-01, ...,
         -6.76844336e-03,   0.00000000e+00,   2.45127548e-02]])