In [1]:
import pandas as pd
import numpy as np 
from surprise import SVD
from surprise import KNNBaseline
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict

### 数据在此下载 ###
https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/ez_douban/intro.ipynb

In [3]:
movies = pd.read_csv( './data/douban/movies.csv')
ratings = pd.read_csv('./data/douban/ratings.csv')

combine_movie_rating= pd.merge(ratings,movies,on='movieId',how='inner')
combine_movie_rating=combine_movie_rating.drop(['timestamp'],axis = 1)
combine_movie_rating = combine_movie_rating.dropna(axis = 0 ,subset=['title'])
print('评分数量：%d' % len(combine_movie_rating))
combine_movie_rating.sample(10)

评分数量：2604995


Unnamed: 0,userId,movieId,rating,title
1949349,7240,4916,4,彼女と彼女の猫
968003,8830,1134,3,女人不坏
673749,27837,745,5,The Godfather
64703,9605,89,4,Prison Break
1519063,6554,2459,3,The Darjeeling Limited
342066,2198,352,4,The Sisterhood of the Traveling Pants
1972748,485,5049,3,Loves Me... Loves Me Not
1005036,4188,1212,3,Saw IV
2325556,687,8572,4,ナナ
672169,5216,745,5,The Godfather


In [4]:
movie_rating_count=pd.DataFrame(combine_movie_rating.
                    groupby(['movieId'])['rating'].
                    count().
                    reset_index().
                    rename(columns={'rating':'totalRatingCount'})                   
                   )
rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count,left_on='movieId',right_on='movieId')
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,0,1,4,Harry Potter and the Deathly Hallows: Part II,1703
1,21,1,4,Harry Potter and the Deathly Hallows: Part II,1703
2,25,1,5,Harry Potter and the Deathly Hallows: Part II,1703
3,34,1,4,Harry Potter and the Deathly Hallows: Part II,1703
4,36,1,5,Harry Potter and the Deathly Hallows: Part II,1703


In [5]:
rating_with_totalRatingCount['totalRatingCount'].quantile(np.arange(.9,1,.01))

0.90    2351.0
0.91    2441.0
0.92    2654.0
0.93    2814.0
0.94    2958.0
0.95    3062.0
0.96    3330.0
0.97    3731.0
0.98    4432.0
0.99    5072.0
Name: totalRatingCount, dtype: float64

In [6]:
#取10%的最热门的电影
popular_threshold=2351 
rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count,left_on='movieId',right_on='movieId')
popular_movies_rating= rating_with_totalRatingCount.query('totalRatingCount>=@popular_threshold')
print('热门电影数据量：%d' % len(popular_movies_rating))

热门电影数据量：262455


In [7]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(popular_movies_rating[['userId', 'movieId', 'rating']], reader)
train, test = train_test_split(data, test_size=.25, random_state=0)
svd_model = SVD(random_state=0)
svd_model.fit(train)
predict = svd_model.test(test)   
print("RMSE: ",accuracy.mae(predict, verbose=False))
print("MAE: ",accuracy.rmse(predict, verbose=False))
predict[:10]

RMSE:  0.5501276131831262
MAE:  0.6968515836030541


[Prediction(uid=1426, iid=1130, r_ui=5.0, est=3.6235014684425333, details={'was_impossible': False}),
 Prediction(uid=9899, iid=609, r_ui=4.0, est=4.427492924992114, details={'was_impossible': False}),
 Prediction(uid=10837, iid=150, r_ui=5.0, est=4.827915792544979, details={'was_impossible': False}),
 Prediction(uid=4643, iid=995, r_ui=4.0, est=4.211769405268239, details={'was_impossible': False}),
 Prediction(uid=9769, iid=247, r_ui=5.0, est=4.2549007653247655, details={'was_impossible': False}),
 Prediction(uid=5582, iid=170, r_ui=4.0, est=4.31240196218265, details={'was_impossible': False}),
 Prediction(uid=6605, iid=156, r_ui=5.0, est=4.245027154626259, details={'was_impossible': False}),
 Prediction(uid=9884, iid=21, r_ui=4.0, est=3.824987946672277, details={'was_impossible': False}),
 Prediction(uid=17492, iid=96, r_ui=4.0, est=4.72638009370838, details={'was_impossible': False}),
 Prediction(uid=25829, iid=738, r_ui=5.0, est=4.788603891699279, details={'was_impossible': False})

In [8]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [9]:
# trainset = data.build_full_trainset()
# svd_model.fit(trainset)

#从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影，即该测试集中的数据不在训练集中
# testset = trainset.build_anti_testset()
# predictions = svd_model.test(testset)

# top_n = get_top_n(predictions, n=8)

# for uid, user_ratings in top_n.items():
#      print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])

In [10]:
#定义留一法交叉验证,在测试集中每个用户只保留一条评分记录
LOOCV = LeaveOneOut(n_splits=1, random_state=1)

for trainSet, testSet in LOOCV.split(data):
    
    #在训练集上训练模型
    svd_model.fit(trainSet)
    #在测试集上预测
    leftOutPredictions = svd_model.test(testSet)
    
    #从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影，即该测试集中的数据不在训练集中
    bigTestSet = trainSet.build_anti_testset()
    
    #得到所有用户没有看过的所有电影的预测评分
    allPredictions = svd_model.test(bigTestSet)
    
    #从每个用户的未看过的电影的预测评分中抽取前10个得分最高的电影
    topNPredicted = get_top_n(allPredictions, n=10)

#打印为每个用户推荐的10部电影和对它们的评分
# for uid, user_ratings in topNPredicted.items():
#       print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])

### 整体命中率

In [11]:

def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0
 
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == int(movieID)):
                hit = True
                break
        if (hit) :
            hits += 1

        total += 1

    return hits/total
print("整体命中率: ", HitRate(topNPredicted, leftOutPredictions))

整体命中率:  0.1979780960404381


### 评分命中率

In [12]:
def RatingHitRate(topNPredicted, leftOutPredictions):
    hits = defaultdict(float)
    total = defaultdict(float)    
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == movieID):
                hit = True
                break
        if (hit) :
            hits[actualRating] += 1
        total[actualRating] += 1

    for rating in sorted(hits.keys()):
        print(rating, hits[rating] / total[rating])
print("评分的命中率: ")
RatingHitRate(topNPredicted, leftOutPredictions)

评分的命中率: 
1.0 0.06930693069306931
2.0 0.06529209621993128
3.0 0.09394964299135664
4.0 0.15139442231075698
5.0 0.25259798707143444


### 累积命中率 ###

In [13]:
def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
    hits = 0
    total = 0

    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:

        if (actualRating >= ratingCutoff):            
            hit = False
            for movieID, predictedRating in topNPredicted[int(userID)]:
                if (int(leftOutMovieID) == movieID):
                    hit = True
                    break
            if (hit) :
                hits += 1
            total += 1
    return hits/total
print("累积命中率 (rating >= 4): ", CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))

累积命中率 (rating >= 4):  0.21482051282051282


### 平均互惠命中排名 ###

In [14]:
def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
    summation = 0
    total = 0
        
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        
        hitRank = 0
        rank = 0
        for movieID, predictedRating in topNPredicted[int(userID)]:
            rank = rank + 1
            if (int(leftOutMovieID) == movieID):
                hitRank = rank
                break
        if (hitRank > 0) :
                summation += 1.0 / hitRank

        total += 1

    return summation / total

print("平均互惠命中排名: ", AverageReciprocalHitRank(topNPredicted, leftOutPredictions))

平均互惠命中排名:  0.06069266116709869
