In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
df_rating= pd.read_csv("../data/ml-1m/ratings.dat", sep="::", header=None, engine="python", 
                     names="UserID::MovieID::Rating::Timestamp".split("::"))

In [3]:
df_rating_sample = df_rating.sample(frac = 0.01)
df_rating_sample.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
697883,4169,2015,4,971579525
473937,2909,2717,3,971973337
815556,4897,66,2,962716426
896985,5424,1580,3,960142938
628538,3807,2465,2,965967203


In [4]:
user = df_rating_sample["UserID"].unique()
movie = df_rating_sample["MovieID"].unique()
print(user.shape, movie.shape)

(3713,) (2305,)


In [5]:
rating_dict = defaultdict(int)
for index, row in df_rating_sample.iterrows():
    userid = row["UserID"]
    movieid = row["MovieID"]
    rating = row["Rating"]
    key = (movieid, userid)
    rating_dict[key] = rating
print(len(rating_dict))

10002


In [6]:
co_matrix = []
i = 0
j = 0
for i in range(len(movie)):
    l = []
    for j in range(len(user)):
        key = (movie[i], user[j])
        l.append(rating_dict[key])
    co_matrix.append(l)
print(len(co_matrix), len(co_matrix[0]), co_matrix[0][0])

2305 3713 4


In [7]:
np_matrix = np.array(co_matrix)
movie_sim_dict = defaultdict(list)
for i in range(np_matrix.shape[0]):
    for j in range(np_matrix.shape[0]):
        if j == i:
            continue
        a_norm = np.linalg.norm(np_matrix[i])
        b_norm = np.linalg.norm(np_matrix[j])
        similiarity = np.dot(np_matrix[i], np_matrix[j].T)/(a_norm * b_norm)
        movie_sim_dict[movie[i]].append((movie[j], similiarity))

In [14]:
print(movie_sim_dict[2015][1000:1110])

[(504, 0.0), (2345, 0.0), (2459, 0.0), (2307, 0.0), (3087, 0.0), (1061, 0.0), (3506, 0.0), (897, 0.0), (229, 0.0), (2975, 0.0), (1979, 0.15434872662825797), (2650, 0.0), (3051, 0.0), (640, 0.0), (2247, 0.0), (1655, 0.0), (1945, 0.0), (171, 0.0), (2333, 0.0), (569, 0.0), (3030, 0.0), (2871, 0.0), (1027, 0.0), (2320, 0.0), (1290, 0.0), (3060, 0.0), (354, 0.0), (1440, 0.0), (1457, 0.0), (42, 0.0), (2340, 0.0), (1582, 0.0), (2052, 0.0), (1218, 0.0), (2572, 0.0), (1731, 0.0), (2297, 0.0), (1212, 0.0), (2393, 0.0), (3812, 0.0), (2912, 0.0), (1921, 0.0), (147, 0.0), (2799, 0.0), (3450, 0.0), (1517, 0.0), (2024, 0.0), (3267, 0.0), (1089, 0.0), (1936, 0.0), (1961, 0.0), (1985, 0.0), (1175, 0.0), (1339, 0.0), (3372, 0.0), (3568, 0.0), (612, 0.0), (1263, 0.0), (3793, 0.0), (1336, 0.0), (2334, 0.0), (43, 0.0), (635, 0.0), (3801, 0.0), (2947, 0.0), (3240, 0.0), (3604, 0.0), (3668, 0.0), (3097, 0.0), (292, 0.0), (24, 0.0), (2323, 0.0), (3176, 0.0), (2783, 0.4), (2053, 0.16), (1941, 0.0), (2467, 0.0)

In [15]:
d = movie_sim_dict
for k, v in d.items():
    d[k] = sorted(v, key=lambda x : x[1], reverse=True)[:30]
print(d[2015])

[(3349, 0.4), (1436, 0.4), (2783, 0.4), (941, 0.4), (961, 0.4), (2074, 0.4), (3235, 0.4), (1671, 0.4), (3619, 0.4), (1844, 0.4), (1888, 0.3328201177351375), (2435, 0.3), (3537, 0.3), (3661, 0.3), (1546, 0.3), (479, 0.28284271247461906), (3739, 0.2603778219616477), (2622, 0.24), (2037, 0.24), (1537, 0.22086305214969307), (2447, 0.21213203435596426), (52, 0.21192517713040698), (2848, 0.21192517713040698), (3932, 0.2057983021710106), (1566, 0.2032002032003048), (1326, 0.19402850002906638), (1012, 0.17320508075688776), (1911, 0.17320508075688773), (552, 0.16803361008336115), (1466, 0.16724840200141816)]


In [16]:
user_dict = defaultdict()
movie_dict = defaultdict()
for i in range(len(user)):
    user_dict[user[i]] = i
for j in range(len(movie)):
    movie_dict[movie[j]] = j

In [17]:
predict_user = user[:10]
print(predict_user)

[4169 2909 4897 5424 3807 3445 1119  695 2929 1547]


![jupyter](./itemcf.png)

In [24]:
predict_rec = defaultdict(list)
for u in predict_user:
    l=[]
    #visit_m可以初始化为字典，因为python里字典查找比列表查找快
    visit_m = []
    unvisit_m = []
    for m in movie:
        if(np_matrix[user_dict[u]][movie_dict[m]] > 0):
            visit_m.append(m)
        else:
            unvisit_m.append(m)
    
    for uvm in unvisit_m:
        total_rating = 0
        for value in d[uvm]:
            #这里对整反馈电影进行查找
            if value[0] in visit_m:
                total_rating += value[1]*np_matrix[user_dict[u]][movie_dict[value[0]]]
        l.append((uvm, total_rating))
    predict_rec[u] = l

In [25]:
predict_rec_n = predict_rec

In [26]:
for k, v in predict_rec_n.items():
    predict_rec_n[k] = sorted(v, key=lambda x : x[1], reverse=True)[:20]

In [28]:
print(predict_rec_n[4169])

[(3920, 4.0), (2285, 4.0), (427, 2.910427500435996), (3596, 2.8284271247461903), (599, 2.82842712474619), (164, 2.6490647141300876), (2055, 2.558408596267325), (581, 2.4618298195866544), (2215, 2.1213203435596424), (2485, 2.1213203435596424), (1702, 2.0483662259967566), (2937, 2.0), (1009, 1.9845557534273355), (3194, 1.9694638556693236), (1566, 1.7235659238795935), (3349, 1.6), (1436, 1.6), (2783, 1.6), (941, 1.6), (961, 1.6)]
