In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:

# 读取数据
movies_path = './data/ml-1m/movies.dat'
ratings_path = './data/ml-1m/ratings.dat'

movies_cols = 'MovieID::Title::Genres'.split("::")
ratings_cols = 'UserID::MovieID::Rating::Timestamp'.split('::')

moviesDF = pd.read_csv(movies_path, sep='::', names=movies_cols)
ratingsDF = pd.read_csv(ratings_path, sep='::', names=ratings_cols)


In [3]:
# 构建users-items矩阵
# pandas pivot_table数据表透视: 行索引UserID,列索引MovieID,值Rating,填充值0
ratingPivotDF = pd.pivot_table(ratingsDF[['UserID','MovieID','Rating']],\
                                           columns=['MovieID'],index=['UserID'],\
                                           values='Rating',fill_value=0)
# 得到users字典以及movies字典,方便由id得到名称
users_map = dict(enumerate(list(ratingPivotDF.index)))
movies_map = dict(enumerate(list(ratingPivotDF.columns)))

# users-items矩阵:去除dataframe格式
ratings = ratingPivotDF.values.tolist()
print(ratings)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
import math
import numpy as np

# 用户之间相似度计算
def calCosineSimilarity(vec1,vec2):
    numerator = 0.0
    denominator1 = 0.0
    denominator2 = 0.0
    
    for (val1, val2) in zip(vec1, vec2):
        numerator += val1 * val2
        denominator1 += val1 ** 2
        denominator2 += val2 ** 2
    
    return numerator / math.sqrt(denominator1 * denominator2)

# 用户相似度矩阵:对称矩阵
users_similarity_matrix = np.zeros([len(ratings), len(ratings)], dtype=np.float32)
# 用户本身的相似度没有意义;同时因为对称矩阵,可以只计算上三角部分
for i in range(len(ratings)-1):#i,j:newID for users
    for j in range(i+1, len(ratings)):
        users_similarity_matrix[i,j] = calCosineSimilarity(ratings[i], ratings[j])
        users_similarity_matrix[j,i] = users_similarity_matrix[i,j]

In [None]:

# 筛选topK相似用户集合,超参数K设置为10
users_most_simi = dict()# key:userid, value: dict contailing the userID and the similarity
K = 10
for i in range(len(ratings)):# i: newID for users, or the index for users
    # 没有用到用户名称(原始用户ID)
    users_most_simi[i] = sorted(enumerate(list(ratings[i])), key=lambda x: x[1],reverse=True)[:K]

In [None]:
# 为每个用户推荐N个unseen电影,超参数N设置为10
N = 10
# 为unseen电影评分 unseen : ratings[i,j] = 0

user_rec_result = np.zeros([len(ratings), len(ratings[0])], dtype=np.float32)

# 遍历ratings,根据是否为0,判断seen or unseen
for i in range(len(ratings)):
    for j in range(len(ratings[0])):
        if ratings[i][j] == 0:# unseen
            val = 0
            for (u, sim) in users_most_simi[i]:
                val += sim * ratings[u][j] 
            user_rec_result[i,j] = val

# 筛选topN个推荐结果
recommend = dict()

for i in range(len(ratings)):
    recommend[i] = sorted(enumerate(list(user_rec_result[i])), key=lambda x:x[1],reverse=True)[:N]

# 查询userMap,moviesMap,找到对应的名称
recommend_list = list()

for key, value in recommend.items():
    user = users_map[key]
    for (movieID,name) in value:
        recommend_list.append([user, movies_map[name]])

recommendDF = pd.DataFrame(recommend_list,columns=['UserID','MovieID'])
recommendDF = pd.merge(recommendDF,moviesDF[['MovieID','Title']],on='MovieID',how='inner')
recommendDF.tail(10)

In [None]:
recommendDF.head(10)
print(recommendDF.head(10))