In [12]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Douban movie catalogue (columns seen below: movieId, title).
movies = pd.read_csv('./data/douban/movies.csv')

# Boolean mask of rows whose title is missing; computed once and reused.
title_missing = movies.title.isnull()

# Printed labels: movies with a title / without a title / in total.
print('电影数目（有名称）：%d' % (~title_missing).sum())
print('电影数目（没有名称）：%d' % title_missing.sum())
print('电影数目（总计）：%d' % len(movies))

# Show a random handful of rows as the cell output.
movies.sample(10)

电影数目（有名称）：33258
电影数目（没有名称）：24166
电影数目（总计）：57424


Unnamed: 0,movieId,title
43061,43061,
28477,28477,Du er ikke alene
46119,46119,大明奇才
52907,52907,
34785,34785,ハンチョウ〜神南署安積班〜
36043,36043,Plus Belle La Vie
54048,54048,
55479,55479,
8137,8137,
39433,39433,Amor Só de Mãe


In [3]:
# Load the user -> movie ratings table.
ratings = pd.read_csv('./data/douban/ratings.csv')

# Printed labels: distinct users / distinct movies / number of rating rows.
print('用户数据：%d' % len(ratings['userId'].unique()))
print('电影数据：%d' % len(ratings['movieId'].unique()))
print('评分数目：%d' % len(ratings))

# Preview the first rows as the cell output.
ratings.head()

用户数据：28718
电影数据：57424
评分数目：2828500


Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,5,1318222486
1,0,1,4,1313813583
2,0,2,5,1313458035
3,0,3,5,1313327802
4,0,4,3,1312126734


In [4]:
# Attach each rating's movie title (join on movieId), then discard the
# timestamp column, which is not used by the rest of the analysis here.
combine_movie_rating = (
    ratings
    .merge(movies, on='movieId')
    .drop('timestamp', axis=1)
)
print(len(combine_movie_rating))
combine_movie_rating.head()

2828500


Unnamed: 0,userId,movieId,rating,title
0,0,0,5,
1,529,0,4,
2,1247,0,5,
3,1335,0,5,
4,1397,0,5,


In [5]:
# Keep only the ratings whose movie has a title; untitled rows are dropped.
has_title = combine_movie_rating['title'].notnull()
combine_movie_rating = combine_movie_rating[has_title]
print(len(combine_movie_rating))
combine_movie_rating.head()

2604995


Unnamed: 0,userId,movieId,rating,title
22,0,1,4,Harry Potter and the Deathly Hallows: Part II
23,21,1,4,Harry Potter and the Deathly Hallows: Part II
24,25,1,5,Harry Potter and the Deathly Hallows: Part II
25,34,1,4,Harry Potter and the Deathly Hallows: Part II
26,36,1,5,Harry Potter and the Deathly Hallows: Part II


In [6]:
# Number of ratings each movie received, as a two-column frame
# (movieId, totalRatingCount).
ratings_per_movie = combine_movie_rating.groupby('movieId')['rating'].count()
movie_rating_count = (
    ratings_per_movie
    .reset_index()
    .rename(columns={'rating': 'totalRatingCount'})
)
movie_rating_count.head()

Unnamed: 0,movieId,totalRatingCount
0,1,1703
1,2,1080
2,4,1898
3,5,2218
4,10,4981


In [7]:
# Render floats with three decimals from here on (global display option).
pd.set_option('display.float_format', '{:.3f}'.format)

# Summary statistics of the per-movie rating counts.
rating_count_summary = movie_rating_count['totalRatingCount'].describe()
print(rating_count_summary)

count   33258.000
mean       78.327
std       262.606
min         1.000
25%         3.000
50%        10.000
75%        38.000
max      6574.000
Name: totalRatingCount, dtype: float64


In [8]:
# Inspect the top decile of rating counts in 1% steps (0.90 up to 0.99).
upper_quantiles = np.arange(.9, 1, .01)
print(movie_rating_count['totalRatingCount'].quantile(upper_quantiles))

0.900    158.000
0.910    184.000
0.920    211.440
0.930    253.000
0.940    303.580
0.950    375.150
0.960    462.000
0.970    590.000
0.980    814.860
0.990   1298.860
Name: totalRatingCount, dtype: float64


In [9]:
# Attach each movie's total rating count to every one of its rating rows.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_rating_count,
    on='movieId',
)

# Both lengths are printed so the reader can compare row counts
# before and after the join.
print(len(combine_movie_rating))
print(len(rating_with_totalRatingCount))
rating_with_totalRatingCount.head()

2604995
2604995


Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,0,1,4,Harry Potter and the Deathly Hallows: Part II,1703
1,21,1,4,Harry Potter and the Deathly Hallows: Part II,1703
2,25,1,5,Harry Potter and the Deathly Hallows: Part II,1703
3,34,1,4,Harry Potter and the Deathly Hallows: Part II,1703
4,36,1,5,Harry Potter and the Deathly Hallows: Part II,1703


In [10]:
# Roughly 10% of the movies have been rated more than 158 times
# (158 is the 0.90 quantile of totalRatingCount printed above).
popular_threshold = 158

# Keep only the ratings that belong to those "popular" movies.
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popular_threshold
rating_popular_movies = rating_with_totalRatingCount[is_popular]
rating_popular_movies.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,0,1,4,Harry Potter and the Deathly Hallows: Part II,1703
1,21,1,4,Harry Potter and the Deathly Hallows: Part II,1703
2,25,1,5,Harry Potter and the Deathly Hallows: Part II,1703
3,34,1,4,Harry Potter and the Deathly Hallows: Part II,1703
4,36,1,5,Harry Potter and the Deathly Hallows: Part II,1703


# KNN

In [11]:
# Movie x user rating matrix: one row per movieId, one column per userId.
# Missing (movie, user) pairs are filled with 0.
ratings_pivot = (
    rating_popular_movies
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Sparse (CSR) copy of the same values, used to fit the neighbour model.
ratings_pivot_sparse = csr_matrix(ratings_pivot.values)

In [12]:
# Brute-force nearest-neighbour index over movie rows, using cosine distance.
# fit() returns the estimator itself, so the assignment can be chained.
model_nn_binary = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
).fit(ratings_pivot_sparse)

# Display the fitted estimator as the cell output.
model_nn_binary

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [75]:
# Pick one popular movie at random and print its 10 nearest neighbours
# under the cosine-distance KNN model fitted above.
query_index = np.random.choice(ratings_pivot.shape[0])

# `ratings_pivot.iloc[query_index, :]` is a pandas Series; Series.reshape is
# deprecated/removed in pandas, so reshape the underlying ndarray instead.
# kneighbors expects a 2-D array of shape (1, n_users) for a single query.
query_vector = ratings_pivot.iloc[query_index, :].values.reshape(1, -1)

# 11 neighbours are requested: the loop below treats position 0 as the query
# movie itself, which leaves 10 recommendations.
distances, indices = model_nn_binary.kneighbors(query_vector, n_neighbors = 11)

# Loop-invariant values hoisted out of the loop.
movieId = ratings_pivot.index[query_index]
flat_distances = distances.flatten()
flat_indices = indices.flatten()

for i in range(0, len(flat_distances)):
    # Map the neighbour's row position in the matrix back to its movieId.
    likelymovieId = ratings_pivot.index[flat_indices[i]]
    if i == 0:
        # First line: the title of the query movie.
        print('当前电影:',movies[movies.movieId==movieId]['title'].values[0])
    else:
        # Remaining lines: rank, recommended title, cosine distance.
        print('推荐电影{0}: {1}, 距离为:{2}'.format(i, movies[movies.movieId==likelymovieId]['title'].values[0],
                                                    flat_distances[i]))

当前电影: 少年张三丰
推荐电影1: 机灵小不懂, 距离为:0.4828775024976728
推荐电影2: 春光灿烂猪八戒, 距离为:0.5471577246480848
推荐电影3: 绝代双骄, 距离为:0.5736253341325162
推荐电影4: 粉红女郎, 距离为:0.5761728959194434
推荐电影5: 少年包青天, 距离为:0.578347365754289
推荐电影6: 少年英雄方世玉, 距离为:0.5805705904995269
推荐电影7: 少年包青天2, 距离为:0.5844965129491082
推荐电影8: 小李飞刀, 距离为:0.5905323242444609
推荐电影9: 穿越时空的爱恋, 距离为:0.5968249361452425
推荐电影10: 還珠格格第二部, 距离为:0.6047020203715006


  


In [16]:
# Nearest neighbours for one specific movie, identified by its movieId.
movieId = 2550

# Select the query row through the `movieId` variable rather than repeating
# the literal ID in a query string, so the printed title and the neighbours
# always refer to the same movie. The double brackets keep the result 2-D
# (shape (1, n_users)), which is what kneighbors expects.
query_vector = ratings_pivot.loc[[movieId]].values

# 11 neighbours are requested: the loop below treats position 0 as the query
# movie itself, which leaves 10 recommendations.
distances, indices = model_nn_binary.kneighbors(query_vector, n_neighbors = 11)

flat_distances = distances.flatten()
flat_indices = indices.flatten()

for i in range(0, len(flat_distances)):
    # Map the neighbour's row position in the matrix back to its movieId.
    likelymovieId = ratings_pivot.index[flat_indices[i]]
    if i == 0:
        # First line: the title of the query movie.
        print('当前电影:',movies[movies.movieId==movieId]['title'].values[0])
    else:
        # Remaining lines: rank, recommended title, cosine distance.
        print('推荐电影{0}: {1}, 距离为:{2}'.format(i, movies[movies.movieId==likelymovieId]['title'].values[0],
                                                    flat_distances[i]))

当前电影: 黃飛鴻之三獅王爭霸
推荐电影1: 黃飛鴻之二男兒當自強, 距离为:0.2236617772379993
推荐电影2: 黃飛鴻, 距离为:0.2771905809331011
推荐电影3: 方世玉, 距离为:0.3036596141407937
推荐电影4: 太极张三丰, 距离为:0.3454806002742725
推荐电影5: 方世玉续集, 距离为:0.3489783330675462
推荐电影6: 精武英雄, 距离为:0.38593751478880156
推荐电影7: 新少林五祖, 距离为:0.4035055605033421
推荐电影8: 倚天屠龍記之魔教教主, 距离为:0.4150684119775777
推荐电影9: 中南海保镖, 距离为:0.45784800244154567
推荐电影10: 我是谁, 距离为:0.45895083257042657


# SVD

In [103]:
# User x movie rating matrix (transposed orientation of the KNN pivot):
# one row per userId, one column per movieId, unrated pairs filled with 0.
ratings_pivot2 = (
    rating_popular_movies
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Sparse (CSR) copy of the same values.
ratings_pivot2_sparse = csr_matrix(ratings_pivot2.values)

print(ratings_pivot2.shape)
ratings_pivot2.head()

(27895, 3329)


movieId,1,2,4,5,10,12,13,15,17,18,...,12612,12634,13346,14821,15721,15741,15826,16155,16323,16660
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,5.0,3.0,4.0,5.0,4.0,2.0,4.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,3.0,4.0,3.0,0.0,5.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Transpose to movies x users so that each row describes one movie.
X = np.transpose(ratings_pivot2.values)
X.shape

(3329, 27895)

In [107]:
from sklearn.decomposition import TruncatedSVD

# Reduce every movie's rating vector to 10 latent components.
# random_state is fixed so the decomposition is repeatable.
svd = TruncatedSVD(
    n_components=10,
    random_state=17,
)
matrix = svd.fit_transform(X)
print(matrix.shape)

(3329, 10)


In [108]:
import warnings

# Silence RuntimeWarning for the rest of the session (global filter).
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# Pairwise correlation between the rows of `matrix`
# (one row per movie, so the result is movies x movies).
corr = np.corrcoef(matrix)
print(corr.shape)
corr

(3329, 3329)


array([[ 1.        ,  0.88719504,  0.94722286, ...,  0.80781994,
         0.7544022 ,  0.5269623 ],
       [ 0.88719504,  1.        ,  0.86930487, ...,  0.61777898,
         0.5339515 ,  0.67181041],
       [ 0.94722286,  0.86930487,  1.        , ...,  0.79992334,
         0.5870368 ,  0.45608533],
       ..., 
       [ 0.80781994,  0.61777898,  0.79992334, ...,  1.        ,
         0.74405944,  0.38598591],
       [ 0.7544022 ,  0.5339515 ,  0.5870368 , ...,  0.74405944,
         1.        ,  0.33090199],
       [ 0.5269623 ,  0.67181041,  0.45608533, ...,  0.38598591,
         0.33090199,  1.        ]])

In [124]:
# Movie whose most-correlated neighbours we want to list.
example_movieId = 2550

# Column labels of the user x movie pivot are the movieIds; their order is
# used here to index the rows of `corr`.
movieIds = ratings_pivot2.columns
movieIds_list = list(movieIds)
movieId_index = movieIds_list.index(example_movieId)

# Correlations of the example movie against every movie.
movieId_vec = corr[movieId_index]

# Positions of the 11 largest correlations, in descending order
# (argsort on the negated vector sorts from high to low).
argsort_idx = (-movieId_vec).argsort()[:11]
coff = movieId_vec[argsort_idx]
similar_movie_Ids = movieIds[argsort_idx]

print(similar_movie_Ids.values)
print('--------------------------------------------------------------')
print(coff)

[2550 3874 2552 3143 3732 2553  639 2547 2555 2956 2551]
--------------------------------------------------------------
[ 1.          0.99637533  0.99598866  0.99554685  0.99453251  0.99409538
  0.99386483  0.99091663  0.98941127  0.98634359  0.98620206]


In [111]:
# Print the example movie followed by its most-correlated movies.
for idx, mId in enumerate(similar_movie_Ids):
    # Look up the title for this movieId in the catalogue.
    name = movies[movies.movieId == mId]['title'].values[0]
    if idx == 0:
        # Position 0 is printed as the current (query) movie.
        print('当前电影:',name)
        continue
    # Remaining positions: rank, recommended title, correlation coefficient.
    print('推荐电影{0}: {1}, 相关系数:{2}'.format(idx,name, coff[idx]))

当前电影: 黃飛鴻之三獅王爭霸
推荐电影1: 太极张三丰, 相关系数:0.9963753317262949
推荐电影2: 黃飛鴻之二男兒當自強, 相关系数:0.9959886617593083
推荐电影3: 黃飛鴻, 相关系数:0.9955468545480034
推荐电影4: 方世玉续集, 相关系数:0.9945325050301708
推荐电影5: 新少林五祖, 相关系数:0.9940953835981864
推荐电影6: 方世玉, 相关系数:0.9938648308354907
推荐电影7: 倚天屠龍記之魔教教主, 相关系数:0.9909166322984471
推荐电影8: 赌神, 相关系数:0.9894112727822967
推荐电影9: 红番区, 相关系数:0.9863435881824448
推荐电影10: 冒險王, 相关系数:0.9862020560988378
