# 初始化数据

In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [67]:
header = ['user_id', 'item_id','rating', 'timestamp']

In [68]:
df = pd.read_csv('data/recommender_system.data', sep='\t', names=header)

In [69]:
user_number = df.user_id.unique().shape[0]

In [70]:
item_number = df.item_id.unique().shape[0]

In [71]:
print('Number of users is: %i, Number of item is %i'% (user_numb,item_numer))

Number of users is: 943, Number of item is 1682


# 分离样本的训练集和验证集

In [72]:
from sklearn import cross_validation

In [73]:
train_data, test_data = cross_validation.train_test_split(df, test_size=0.3)

In [74]:
train_matrix = np.zeros((user_numb,item_numer))

In [75]:
for row in train_data.itertuples():
    train_matrix[row[1]-1,row[2]-1] = row[3]

In [76]:
test_matrix = np.zeros((user_numb,item_numer))

In [77]:
for row in test_data.itertuples():
    test_matrix[row[1]-1,row[2]-1] = row [3]

In [78]:
train_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [79]:
test_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

# 推荐

In [80]:
from sklearn.metrics.pairwise import pairwise_distances as pairdis

In [81]:
user_similarity = pairdis(train_matrix, metric='cosine')

In [82]:
item_similarity = pairdis(train_matrix.T, metric='cosine')

In [83]:
mean_user_rating = train_matrix.mean(axis=1)

In [84]:
ratings_diff = (train_matrix - mean_user_rating[:,np.newaxis])

In [85]:
user_based_prediction = mean_user_rating[:,np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

In [86]:
item_based_prediction = train_matrix.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])

# 结果评估

In [87]:
from sklearn.metrics import mean_squared_error

In [88]:
from math import sqrt

In [89]:
user_based_prediction = user_based_prediction[test_matrix.nonzero()].flatten()

In [90]:
test_matrix_flatten = test_matrix[test_matrix.nonzero()].flatten()

In [91]:
sqrt(mean_squared_error(user_based_prediction, test_matrix_flatten))

3.1645514389073783

In [92]:
item_based_prediction = item_based_prediction[test_matrix.nonzero()].flatten()

In [93]:
sqrt(mean_squared_error(item_based_prediction, test_matrix_flatten))

3.471888586102751

# 基于SVD的最大似然法

In [94]:
#稀疏度
sparsity = 1.0-(len(df)/float(user_number*item_number))
sparsity

0.9369533063577546

In [95]:
from scipy.sparse.linalg import svds

In [96]:
u, s, v = svds(train_matrix, k=30)

In [97]:
s_matrix = np.diag(s)

In [98]:
SVD_prediction = np.dot(np.dot(u, s_matrix), v)

In [99]:
SVD_prediction = SVD_prediction[test_matrix.nonzero()].flatten()

In [101]:
sqrt(mean_squared_error(SVD_prediction, test_matrix_flatten))

2.88342788086035

In [102]:
SVD_prediction

array([ 0.95898978, -0.2230337 ,  1.34469774, ...,  0.26087589,
        0.60748926,  0.15531035])

In [103]:
np.dot(np.dot(u, s_matrix), v)

array([[  3.85190272e+00,   1.69976231e+00,   1.07794478e+00, ...,
         -1.90213770e-02,   0.00000000e+00,   8.39664731e-02],
       [  2.78668124e+00,  -1.12757779e-01,   2.47213273e-01, ...,
          6.54086113e-03,   0.00000000e+00,  -3.17054436e-02],
       [ -4.45805804e-01,   4.72402121e-02,   1.53585948e-01, ...,
          1.71396500e-02,   0.00000000e+00,   2.82809250e-03],
       ..., 
       [  2.27495862e+00,   2.76869050e-01,   1.69744221e-01, ...,
         -4.28162870e-03,   0.00000000e+00,  -9.50223169e-03],
       [  2.77408126e-01,   3.13400422e-02,  -1.45324752e-01, ...,
          1.26125382e-02,   0.00000000e+00,  -3.06516362e-02],
       [  1.62917748e+00,   1.41637410e+00,   8.24824591e-01, ...,
         -1.00857679e-02,   0.00000000e+00,   6.12382934e-02]])