In [10]:
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import defaultdict

In [3]:
# Load the ratings dataset. The file has no header row and uses "::" as the
# field separator, which requires the Python parsing engine.
rating_column_spec = "UserID::MovieID::Rating::Timestamp"
df_rating = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    sep="::",
    header=None,
    names=rating_column_spec.split("::"),
    engine="python",
)

In [8]:
# Draw a 1% random sample of the ratings.
# random_state pins the draw so that Restart & Run All reproduces the same
# rows; without it every run yields different users and movies, and the
# hard-coded user ids inspected further down may not be in the sample.
df_rating_sample = df_rating.sample(frac=0.01, random_state=42)
df_rating_sample.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
716442,4289,2420,4,965275831
648639,3908,587,4,965931274
524890,3240,345,2,968349972
662071,3992,161,4,965595861
353300,2070,3863,5,974659721


In [9]:
# Collect the distinct user ids and movie ids that occur in the sample.
user, movie = (
    df_rating_sample[column].unique() for column in ("UserID", "MovieID")
)
print(user.shape, movie.shape)

(3671,) (2265,)


In [32]:
# Build a (user id, movie id) -> rating dictionary, in preparation for
# constructing the co-occurrence matrix below.
# defaultdict(int) yields 0 for a pair that has no rating; note that reading a
# missing key with [] also inserts that key.
rating_dict = defaultdict(int)

In [33]:
# Fill rating_dict with one (user id, movie id) -> rating entry per sampled row.
# The three columns are walked in lockstep as NumPy arrays instead of using
# DataFrame.iterrows(), which builds a full Series object for every row.
sample_user_ids = df_rating_sample["UserID"].to_numpy()
sample_movie_ids = df_rating_sample["MovieID"].to_numpy()
sample_ratings = df_rating_sample["Rating"].to_numpy()
for userid, movieid, rating in zip(sample_user_ids, sample_movie_ids, sample_ratings):
    rating_dict[(userid, movieid)] = rating
print(len(rating_dict))

10002


In [35]:
# Build the co-occurrence matrix as a list of rows: the row index follows the
# order of `user`, the column index follows the order of `movie`, and each cell
# holds the rating (0 when the pair has no rating).
# .get(key, 0) is used instead of rating_dict[key]: indexing a defaultdict
# inserts every missing pair, which would grow the dictionary by
# len(user) * len(movie) entries as a side effect of this read-only pass.
co_matrix = [
    [rating_dict.get((user_id, movie_id), 0) for movie_id in movie]
    for user_id in user
]
print(len(co_matrix), len(co_matrix[0]), co_matrix[0][0])

3671 2265 4


In [36]:
# Convert the nested list into a NumPy array; the last expression displays the
# array's element type.
np_matrix = np.array(co_matrix)
np_matrix.dtype

dtype('int64')

In [37]:
# Display the matrix dimensions (rows = users, columns = movies).
np_matrix.shape

(3671, 2265)

In [50]:
# Build the user cosine-similarity table:
# user id -> list of (other user id, similarity) pairs.
user_sim_dict = defaultdict(list)

In [51]:
# Cosine similarity between every pair of distinct users, computed with one
# matrix product instead of a Python double loop that recomputed both norms
# for each pair.
ratings_as_float = np_matrix.astype(np.float64)  # float input lets matmul use BLAS
row_norms = np.linalg.norm(ratings_as_float, axis=1)

# A user whose rating vector is all zeros has norm 0; dividing by inf gives a
# similarity of 0 for such rows instead of NaN from 0/0.
safe_norms = row_norms.copy()
safe_norms[safe_norms == 0] = np.inf

# dot(u_i, u_j) / (|u_i| * |u_j|) for all i, j at once; the result is symmetric.
sim_matrix = ratings_as_float @ ratings_as_float.T
sim_matrix /= np.outer(safe_norms, safe_norms)

# Assign (rather than append to) each user's list so that re-running this cell
# does not accumulate duplicate entries. The user itself is left out.
for i, sim_row in enumerate(sim_matrix):
    user_sim_dict[user[i]] = [
        (other_id, similiarity)
        for j, (other_id, similiarity) in enumerate(zip(user, sim_row))
        if j != i
    ]

In [53]:
# Sanity check: number of similarity entries stored for user 4289.
# NOTE(review): the id is hard-coded; if it is absent from the current sample,
# indexing the defaultdict silently inserts an empty list and this prints 0.
print(len(user_sim_dict[4289]))

3670


In [57]:
# `d` is a second name for the same dictionary object (no copy is made), so
# changes made through `d` are also visible through `user_sim_dict`.
d = user_sim_dict

In [58]:
# Rank every user's neighbours by similarity, highest first, and keep only the
# 30 closest ones. The dictionary is updated in place.
for uid in list(d.keys()):
    ranked_neighbours = sorted(d[uid], key=lambda pair: pair[1], reverse=True)
    d[uid] = ranked_neighbours[:30]

In [60]:
# Show the retained nearest neighbours of user 3992 as (user id, similarity) pairs.
# NOTE(review): hard-coded id; prints [] if the user is not in the current sample.
print(d[3992])

[(1670, 0.565685424949238), (2741, 0.44172610429938614), (2883, 0.4242640687119285), (413, 0.3999999999999999), (4080, 0.3999999999999999), (4101, 0.3618136134933163), (3847, 0.3394112549695428), (5390, 0.3373096170846271), (82, 0.3123475237772121), (3178, 0.29541957835039856), (5434, 0.29488391230979427), (4633, 0.26111648393354675), (3260, 0.25298221281347033), (4058, 0.2433321316961438), (2485, 0.23999999999999996), (3453, 0.2363515791475006), (3693, 0.2359071298478354), (4271, 0.2286002286003429), (2436, 0.21908902300206642), (916, 0.21828206253269966), (2073, 0.2167313476731508), (3576, 0.19339751136609123), (2608, 0.18973665961010275), (1727, 0.18740851426632724), (149, 0.1811643254631353), (5397, 0.1685854460847049), (1980, 0.15504050659568297), (3513, 0.15301841113520118), (4169, 0.14249409997581927), (482, 0.1271997456007632)]


In [65]:
# Build the userid -> row index and movieid -> column index lookups used by the
# recommendation step. The positions follow the order of `user` and `movie`.
# Plain dicts replace the original factory-less defaultdict(), which behaves
# exactly like a dict (KeyError on a missing key) but suggests otherwise.
user_dict = {user_id: row_idx for row_idx, user_id in enumerate(user)}
movie_dict = {movie_id: col_idx for col_idx, movie_id in enumerate(movie)}

In [None]:
# Pick the first 10 users as recommendation targets; they could also be chosen
# at random.
predict_user = user[:10]
print(predict_user)

![jupyter](./usercf.png)

In [None]:
# Recommend movies to each target user from the ratings of that user's most
# similar neighbours. The predicted score of a movie is the similarity-weighted
# average of the neighbours' ratings for it.
predict_rec = defaultdict(list)
for u in predict_user:
    s = d[u]  # list of (neighbour user id, similarity)

    # The weight total must start from zero for every user. The original code
    # only did `total_w += ...`, which raises NameError on a fresh kernel and
    # otherwise keeps accumulating across users.
    total_w = sum(w[1] for w in s)
    if total_w == 0:
        # No neighbours, or all similarities are zero: nothing to base a score on.
        continue

    neighbour_weights = np.array([w[1] for w in s])
    neighbour_rows = np_matrix[[user_dict[w[0]] for w in s]]
    # Weighted sum of the neighbours' rating rows, one score per movie column.
    predicted = neighbour_weights @ neighbour_rows / total_w

    own_ratings = np_matrix[user_dict[u]]
    l = []
    for m in movie:
        col = movie_dict[m]
        # Only recommend movies the user has not rated. 0 is the "no rating"
        # default; a genuine rating of 0 is possible but ignored here, so such
        # a movie is treated as not yet seen.
        if own_ratings[col] == 0:
            l.append((m, predicted[col]))
    predict_rec[u] = l
                
            
        

In [126]:
# `predict_rec_n` is a second name for the same dictionary object (no copy is
# made), so changes made through it are also visible through `predict_rec`.
predict_rec_n = predict_rec

In [127]:
# Keep the 50 highest-scoring recommendations for each user, best first.
# The dictionary is updated in place.
for uid in list(predict_rec_n.keys()):
    ranked_movies = sorted(predict_rec_n[uid], key=lambda pair: pair[1], reverse=True)
    predict_rec_n[uid] = ranked_movies[:50]

In [128]:
# Show the retained recommendations for user 3908 as (movie id, predicted score) pairs.
# NOTE(review): hard-coded id; prints [] if the user is not among the target users.
print(predict_rec_n[3908])

[(1641, 0.196272076271877), (2028, 0.19488083938246864), (2677, 0.15701766101750161), (1396, 0.15462030403785382), (1240, 0.15462030403785382), (1964, 0.1269873737207597), (1237, 0.1241333604218842), (2019, 0.1241333604218842), (1732, 0.1241333604218842), (306, 0.1241333604218842), (316, 0.12252429228261652), (3505, 0.12239730117121274), (3911, 0.11776324576312622), (3551, 0.11764841078138884), (2858, 0.10658712275666496), (50, 0.09808815521621263), (2915, 0.09808815521621263), (1958, 0.09749547211513011), (1258, 0.09691340439285069), (269, 0.09691340439285069), (2333, 0.09421059661050096), (1580, 0.0941517701317021), (2921, 0.0941517701317021), (3175, 0.0941517701317021), (2870, 0.0941517701317021), (3638, 0.0941517701317021), (2313, 0.0941517701317021), (1371, 0.0941517701317021), (1266, 0.09139770373589474), (480, 0.08586805069271454), (2347, 0.08586805069271454), (2058, 0.0819997991207191), (733, 0.0819997991207191), (589, 0.08108357149083004), (3770, 0.07847052417297011), (454, 0.