## python推荐系统库Surprise

![](./Surprise.png)

In [17]:
"""
以下的程序段告诉大家如何在协同过滤算法建模以后，根据一个item取回相似度最高的item，主要是用到algo.get_neighbors()这个函数
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset


def read_item_names():
    """
    获取电影名到电影id 和 电影id到电影名的映射
    """

    file_name = 'ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# 首先，用算法计算相互间的相似度
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x11a8d5518>

In [18]:
# 获取电影名到电影id 和 电影id到电影名的映射
rid_to_name, name_to_rid = read_item_names()

In [19]:
# 拿出来Toy Story这部电影对应的item id
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_raw_id

'1'

In [20]:
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id

24

In [21]:
# 找到最近的10个邻居
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors

[433, 101, 302, 309, 971, 95, 26, 561, 816, 347]

In [22]:
# 从近邻的id映射回电影名称
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)


The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


In [23]:
# 拿出来Toy Story这部电影对应的item id
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# 找到最近的10个邻居
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# 从近邻的id映射回电影名称
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)


The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


#### 1.2 音乐预测的例子

In [32]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io

from surprise import KNNBaseline, Reader
from surprise import Dataset

import _pickle as cPickle

id_name_dic = cPickle.load(open('popular_playlist.pkl','rb'), encoding='utf-8')
print("加载歌单id到歌单名的映射字典完成...")

name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("加载歌单名到歌单id的映射字典完成...")


file_path = 'popular_music_suprise_format.txt'

reader = Reader(line_format='user item rating timestamp', sep=',')

music_data = Dataset.load_from_file(file_path, reader=reader)

print("构建数据集...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

加载歌单id到歌单名的映射字典完成...
加载歌单名到歌单id的映射字典完成...
构建数据集...


In [36]:
trainset.n_items

50539

In [37]:
trainset.n_users

1076

#### 1.2.1 模板之查找最近的user(在这里是歌单)

In [40]:
print("Start training model...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNBaseline()
algo.fit(trainset)

current_playlist = list(name_id_dic.keys())[39]
print("歌单名称", current_playlist)

# 取出近邻
# 映射名字到id
playlist_id = name_id_dic[current_playlist]
print("歌单id", playlist_id)
# 取出来对应的内部user id => to_inner_uid
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("内部id", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# 把歌曲id转成歌曲名字
# to_raw_uid映射回去
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id] for playlist_id in playlist_neighbors)

print()
print("和歌单 《", current_playlist, "》 最接近的10个歌单为：\n")
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]))

Start training model...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
歌单名称 周杰伦公举歌曲大全(专辑发行顺序)，循环专
歌单id 92187045
内部id 839

和歌单 《 周杰伦公举歌曲大全(专辑发行顺序)，循环专 》 最接近的10个歌单为：

【情怀录】泱泱华夏，千古风华 0
当过千评论的华语翻唱遇上“原唱”【更新】 1
第四季中国好声音第三期原唱及翻唱 2
【华语】暖心物语 纯白思念 3
〖循环〗单曲循环是强迫症吗？ 4
周杰伦地表最强演唱会2017520南京站曲目 5
简单的爱总是那么吸引人 6
别让时光偷走你上扬的嘴角 7
大神爱翻唱1：华语篇 8
国语VS闽南语 给你最强听觉冲击 9


In [41]:
# 
id1 = '118663384'
print (id_name_dic[id1])
# 当过千评论的华语翻唱遇上“原唱”【更新】

当过千评论的华语翻唱遇上“原唱”【更新】


In [43]:
import _pickle as cPickle
# 重建歌曲id到歌曲名的映射字典
song_id_name_dic = cPickle.load(open('popular_song.pkl','rb'), encoding='utf-8')
print("加载歌曲id到歌曲名的映射字典完成...")
# 重建歌曲名到歌曲id的映射字典
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("加载歌曲名到歌曲id的映射字典完成...")

加载歌曲id到歌曲名的映射字典完成...
加载歌曲名到歌曲id的映射字典完成...


In [47]:
#内部编码的50号用户
user_inner_id = 50
user_rating = trainset.ur[user_inner_id]
items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(user_inner_id, song, r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)])

user: 50         item: 5026       r_ui = 1.00   est = 1.00   {'was_impossible': False} Hymn For The Weekend	Coldplay
user: 50         item: 4648       r_ui = 1.00   est = 1.00   {'was_impossible': False} Faded	Alan Walker
user: 50         item: 4634       r_ui = 1.00   est = 1.00   {'was_impossible': False} I Took A Pill In Ibiza (SeeB Remix)	SeeB
user: 50         item: 5027       r_ui = 1.00   est = 1.00   {'was_impossible': False} Cheap Thrills	Sia
user: 50         item: 137        r_ui = 1.00   est = 1.00   {'was_impossible': False} 小幸运（Cover 田馥甄）	金玟岐
user: 50         item: 5028       r_ui = 1.00   est = 1.00   {'was_impossible': False} Sofia	Alvaro Soler
user: 50         item: 5029       r_ui = 1.00   est = 1.00   {'was_impossible': False} Lay It All On Me	Rudimental
user: 50         item: 5030       r_ui = 1.00   est = 1.00   {'was_impossible': False} 恋の中	新山詩織
user: 50         item: 5031       r_ui = 1.00   est = 1.00   {'was_impossible': False} Heathens	twenty one pilots
user: 50

### 2.用矩阵分解进行预测

In [10]:
### 使用NMF
from surprise import NMF, evaluate
from surprise import Dataset

file_path = os.path.expanduser('popular_music_suprise_format.txt')
# 指定文件格式
reader = Reader(line_format='user item rating timestamp', sep=',')
# 从文件读取数据
music_data = Dataset.load_from_file(file_path, reader=reader)
# 构建数据集和建模
algo = NMF()
trainset = music_data.build_full_trainset()
algo.train(trainset)

In [17]:
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(algo.trainset.to_raw_uid(user_inner_id), algo.trainset.to_raw_iid(song), r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)])

user: 92509527   item: 27724082   r_ui = 1.00   est = 1.00   {u'was_impossible': False} 听见下雨的声音	魏如昀
user: 92509527   item: 167916     r_ui = 1.00   est = 1.00   {u'was_impossible': False} 梦一场	萧敬腾
user: 92509527   item: 408307325  r_ui = 1.00   est = 1.00   {u'was_impossible': False} 干杯	西瓜Kune
user: 92509527   item: 394618     r_ui = 1.00   est = 1.00   {u'was_impossible': False} 给自己的歌 (Live) - live	纵贯线
user: 92509527   item: 421423806  r_ui = 1.00   est = 1.00   {u'was_impossible': False} 小半	陈粒
user: 92509527   item: 394485     r_ui = 1.00   est = 1.00   {u'was_impossible': False} 思念是一种病(Live) - live	张震岳
user: 92509527   item: 5239563    r_ui = 1.00   est = 1.00   {u'was_impossible': False} 可以不可以	丁当
user: 92509527   item: 30635613   r_ui = 1.00   est = 1.00   {u'was_impossible': False} 秋酿	房东的猫
user: 92509527   item: 185884     r_ui = 1.00   est = 1.00   {u'was_impossible': False} 退后	周杰伦
user: 92509527   item: 276936     r_ui = 1.00   est = 1.00   {u'was_impossible': False} 阴天	莫文蔚
user:

## 模型存储

In [None]:
import surprise
surprise.dump.dump('./recommendation.model', algo=algo)
# 可以用下面的方式载入
algo = surprise.dump.load('./recommendation.model')

## 不同的推荐系统算法评估

### 首先载入数据

In [23]:
import os
from surprise import Reader, Dataset
# 指定文件路径
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# 指定文件格式
reader = Reader(line_format='user item rating timestamp', sep=',')
# 从文件读取数据
music_data = Dataset.load_from_file(file_path, reader=reader)
# 分成5折
music_data.split(n_folds=5)

In [None]:
music_data

In [None]:
music_data.raw_ratings[:20]

In [None]:
### 使用NormalPredictor
from surprise import NormalPredictor, evaluate
algo = NormalPredictor()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用BaselineOnly
from surprise import BaselineOnly, evaluate
algo = BaselineOnly()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用基础版协同过滤
from surprise import KNNBasic, evaluate
algo = KNNBasic()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用均值协同过滤
from surprise import KNNWithMeans, evaluate
algo = KNNWithMeans()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用协同过滤baseline
from surprise import KNNBaseline, evaluate
algo = KNNBaseline()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用SVD
from surprise import SVD, evaluate
algo = SVD()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用SVD++
from surprise import SVDpp, evaluate
algo = SVDpp()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

In [None]:
### 使用NMF
from surprise import NMF
algo = NMF()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])
print_perf(perf)