In [1]:
import pandas as pd
import numpy as np

In [2]:
# 读取数据
rating_df = pd.read_csv('ml-latest-small/ratings.csv')
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
# 查看基本信息
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
# 删除无用列
rating_df.drop(['timestamp'],axis=1,inplace=True)

In [5]:
print(rating_df)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]


In [6]:
len(rating_df['movieId'].unique().tolist())

9724

In [7]:
len(rating_df['userId'].unique().tolist())

610

In [27]:
# print(pd.DataFrame(
#     [
#         ['userid1', 3, 4, 5],
#         ['userid2', 5, 1, np.nan],
#         ['userid3', 3, 1, 4]
#     ]
#     , columns=['userid', 'movieid1', 'movieid2', 'movieid3']
# ))

In [114]:
# 使用pivot_table函数将数据转换为列联表
pivot_df = pd.pivot_table(rating_df, values='rating', index='userId', columns='movieId', aggfunc='first')

# 重命名列名
pivot_df.columns = ['movieid{}'.format(col) for col in pivot_df.columns]

# 重置索引并重命名索引列
pivot_df.reset_index(inplace=True)

In [9]:
pivot_df.to_csv('pivot_df.csv',index=False)

In [113]:
user_similar = pivot_df.T.corr()
user_similar

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,1.000000,1.000000,-0.099912,0.159951,-0.072169,-0.411503,-0.398105,-0.418265,-0.715184,-0.794552,...,-0.943794,-0.636918,-0.376609,-0.785318,-0.596206,-0.410681,-0.430070,-0.307009,-0.866802,-0.552199
1,1.000000,1.000000,,-1.000000,-1.000000,-0.866025,-0.943742,-1.000000,,-0.618903,...,-0.679423,-1.000000,-0.982252,,,-0.844866,-1.000000,-0.826883,-1.000000,-0.527826
2,-0.099912,,1.000000,-1.000000,,0.942809,,1.000000,,,...,0.999999,0.999996,0.248346,,-1.000000,0.294414,0.299925,0.285938,,0.014045
3,0.159951,-1.000000,-1.000000,1.000000,-0.310200,0.180060,0.503856,0.306331,1.000000,0.396753,...,0.134736,0.107267,0.028032,0.200680,0.026339,0.037101,0.037343,0.060300,0.192026,0.039122
4,-0.072169,-1.000000,,-0.310200,1.000000,0.120412,0.419246,0.251976,,0.262297,...,0.534522,0.276643,0.289624,0.397702,0.274980,0.223984,0.399014,0.264405,0.449481,0.410136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,-0.410681,-0.844866,0.294414,0.037101,0.223984,0.297533,0.284389,0.717111,0.783101,0.699936,...,0.999972,0.999905,0.999418,0.999983,0.999913,1.000000,0.999859,0.999560,0.999992,0.999765
606,-0.430070,-1.000000,0.299925,0.037343,0.399014,0.359862,0.534787,0.756147,1.000000,0.968036,...,0.999996,0.999940,0.999724,0.999967,0.999935,0.999859,1.000000,0.999796,0.999989,0.999882
607,-0.307009,-0.826883,0.285938,0.060300,0.264405,0.219133,0.266505,0.561963,0.706414,0.712799,...,0.999965,0.999875,0.999352,0.999888,0.999824,0.999560,0.999796,1.000000,0.999982,0.999493
608,-0.866802,-1.000000,,0.192026,0.449481,0.442510,0.749383,0.750508,,0.963123,...,1.000000,0.999973,0.999977,0.999992,0.999983,0.999992,0.999989,0.999982,1.000000,0.999982


In [115]:
# 定义一个函数来根据用户的相似度得分和其他用户的评分来推荐电影
def recommend_movies_by_similarity(N,user_similar, pivot_df, user_id, num_recommendations=10):
    # 获取当前用户的评分
    user_ratings = pivot_df.loc[user_id][1:].dropna()
    
    # 初始化推荐电影的字典，用于存储电影ID和加权分数
    recommended_movies = {}
    
    # 获取当前用户的相似用户及其相似度得分
    similar_users_scores = user_similar.loc[user_id].sort_values(ascending=False)[1:N+1]
    
    # 遍历每个相似用户及其相似度得分
    for similar_user, similarity_score in similar_users_scores.items():
        # 获取相似用户的评分
        similar_user_ratings = pivot_df.loc[similar_user][1:].dropna()
        score = 0
        # 对于相似用户评分的每部电影，累加加权分数（相似度得分 * 相似用户的评分）
        for movie, rating in similar_user_ratings.items():
            # 如果当前用户没有评过这部电影，则添加到字典中
            if movie not in recommended_movies:
                recommended_movies[movie] = 0.0
            # 累加加权分数
            score += similarity_score
            recommended_movies[movie] += similarity_score * rating 
        
    # 过滤掉当前用户已经评分的电影
    recommended_movies = {movie: score for movie, score in recommended_movies.items() if movie not in user_ratings}

    # 对推荐电影的分数进行降序排序
    sorted_recommendations = sorted(recommended_movies.items(), key=lambda x: x[1], reverse=True)
    
    # 获取前num_recommendations个推荐电影
    top_recommendations = sorted_recommendations[:num_recommendations]
    
    return top_recommendations

# 使用推荐函数为特定用户推荐电影
user_id = 3  # 请替换为实际的用户ID
num_recommendations = 5  # 您想要的推荐数量
recommended_list = recommend_movies_by_similarity(3,user_similar, pivot_df, user_id, num_recommendations)

# 打印推荐的电影列表
for movie, score in recommended_list:
    print(f"推荐电影: {movie} (分数: {score:.2f})")

推荐电影: movieid109487 (分数: 10.00)
推荐电影: movieid4993 (分数: 9.50)
推荐电影: movieid79132 (分数: 9.00)
推荐电影: movieid116797 (分数: 9.00)
推荐电影: movieid59315 (分数: 8.50)


In [83]:
similar_users_scores = user_similar.loc[0].sort_values(ascending=False)[1:3+1]
similar_users_scores

1    1.000000
3    0.159951
4   -0.072169
Name: 0, dtype: float64

In [99]:
user_ratings = pivot_df.loc[0][1:].dropna()
user_ratings

movieid1       4.0
movieid3       4.0
movieid6       4.0
movieid47      5.0
movieid50      5.0
              ... 
movieid3744    4.0
movieid3793    5.0
movieid3809    4.0
movieid4006    4.0
movieid5060    5.0
Name: 0, Length: 232, dtype: float64

In [110]:
# recommended_movies = {}
# for similar_user, similarity_score in similar_users_scores.items():
#     
#     similar_user_ratings = pivot_df.loc[similar_user][1:].dropna()
#     
#     for movie, rating in similar_user_ratings.items():
#         if movie not in recommended_movies:
#             recommended_movies[movie] = 0.0
#         recommended_movies[movie] += similarity_score * rating
#         
# recommended_movies = {movie: score for movie, score in recommended_movies.items() if movie not in user_ratings}
# 
# print(recommended_movies)

{'movieid318': 2.7834936490538906, 'movieid1704': 4.6599509736047064, 'movieid6874': 3.9999999999999996, 'movieid8798': 3.4999999999999996, 'movieid46970': 3.9999999999999996, 'movieid48516': 3.9999999999999996, 'movieid58559': 4.499999999999999, 'movieid60756': 4.999999999999999, 'movieid68157': 4.499999999999999, 'movieid71535': 2.9999999999999996, 'movieid74458': 3.9999999999999996, 'movieid77455': 2.9999999999999996, 'movieid79132': 3.9999999999999996, 'movieid80489': 4.499999999999999, 'movieid80906': 4.999999999999999, 'movieid86345': 3.9999999999999996, 'movieid89774': 4.999999999999999, 'movieid91529': 3.4999999999999996, 'movieid91658': 2.4999999999999996, 'movieid99114': 3.4999999999999996, 'movieid106782': 4.999999999999999, 'movieid109487': 2.9999999999999996, 'movieid112552': 3.9999999999999996, 'movieid114060': 1.9999999999999998, 'movieid115713': 3.4999999999999996, 'movieid122882': 4.999999999999999, 'movieid131724': 4.999999999999999, 'movieid21': 0.19117778621930914, 

In [130]:
similar_users_scores = user_similar.loc[1].sort_values(ascending=False)[1:3+1]
similar_users_scores

0      1.000000
447   -0.455029
413   -0.491061
Name: 1, dtype: float64

In [135]:
# 选择用户1的相似用户得分，并排除负数得分，然后取前3个正值得分进行排序
similar_users_scores = user_similar.loc[1].drop(user_similar.loc[1][user_similar.loc[1] <= 0].index).sort_values(ascending=False)[1:3+1].dropna()
similar_users_scores

# 现在 similar_users_scores 包含了用户1的前3个正值相似度得分

0    1.0
Name: 1, dtype: float64

In [151]:
# 定义一个函数来根据用户的相似度得分和其他用户的评分来推荐电影
def recommend_movies_by_similarity(N,user_similar, pivot_df, user_id, num_recommendations=10):
    # 获取当前用户的评分,第一行是userId，去空
    user_ratings = pivot_df.loc[user_id][1:].dropna()
    
    # 存储电影ID和加权分数
    recommended_movies = {}
    
    # 获取当前用户的相似用户及其相似度得分 求前N个相似的用户
    similar_users_scores = user_similar.loc[user_id].sort_values(ascending=False)[1:N+1]
    # similar_users_scores = user_similar.loc[user_id].drop(user_similar.loc[user_id][user_similar.loc[1] <= 0].index).dropna().sort_values(ascending=False)[1:N+1]
    
    # 初始化分母（所有相似度得分的总和）
    sum_similarities = 0
    
    # 遍历每个相似用户及其相似度得分
    for similar_user, similarity_score in similar_users_scores.items():
        sum_similarities += similarity_score  # 更新分母
        
        # 获取相似用户的评分 第一行是userId，去空
        similar_user_ratings = pivot_df.loc[similar_user][1:].dropna()
        
        # 对于相似用户评分的每部电影，累加加权分数（相似度得分 * 相似用户的评分）
        for movie, rating in similar_user_ratings.items():
            # 如果当前用户没有评过这部电影，则添加到字典中
            if movie not in recommended_movies:
                recommended_movies[movie] = 0.0
            # 累加加权分数
            recommended_movies[movie] += similarity_score * rating
            # print(recommended_movies[movie])
    
    # 计算每个电影的加权平均分数
    for movie, score in recommended_movies.items():
        # 如果分母不为0，则更新电影的推荐分数，否则设置为0
        recommended_movies[movie] = score / sum_similarities if sum_similarities else 0
    
    # 过滤
    recommended_movies = {movie: score for movie, score in recommended_movies.items() if movie not in user_ratings}

    # 降序
    sorted_recommendations = sorted(recommended_movies.items(), key=lambda x: x[1], reverse=True)

    top_recommendations = sorted_recommendations[:num_recommendations]
    
    return top_recommendations

# 使用推荐函数为特定用户推荐电影
user_id = 1
num_recommendations = 10
N = 3

recommended_list = recommend_movies_by_similarity(N,user_similar, pivot_df, user_id, num_recommendations)

# 打印推荐的电影列表
for movie, score in recommended_list:
    print(f"推荐电影: {movie} (分数: {score:.2f})")

推荐电影: movieid110 (分数: 3.33)
推荐电影: movieid593 (分数: 3.00)
推荐电影: movieid260 (分数: 1.67)
推荐电影: movieid608 (分数: 1.67)
推荐电影: movieid898 (分数: 1.67)
推荐电影: movieid904 (分数: 1.67)
推荐电影: movieid905 (分数: 1.67)
推荐电影: movieid908 (分数: 1.67)
推荐电影: movieid926 (分数: 1.67)
推荐电影: movieid935 (分数: 1.67)
