## 基于协同过滤的推荐
### 基于内存
#### User-based
* 找历史行为相似的邻居
* 用户评分用其邻居对目标的评分加权求和得到
* 基于评分呈现推荐内容

In [77]:
import numpy as np
import pandas as pd
import csv



##### 建立用户画像和项目画像字典

In [85]:
# user ID -> list of (book_ID, score) pairs rated by that user
user_portrait = {}
# book ID -> list of (user_ID, score) pairs received by that book
book_portrait = {}

with open('book_score.csv',encoding='utf-8') as f:
    reader = csv.reader(f)
    # The first row is skipped; the rest are read as user ID, book ID, score.
    next(reader)
    for row in reader:
        user_ID, book_ID, book_score = int(row[0]), int(row[1]), int(row[2])
        # Record the (book, score) pair under the user ...
        user_portrait.setdefault(user_ID, []).append((book_ID, book_score))
        # ... and the (user, score) pair under the book.
        book_portrait.setdefault(book_ID, []).append((user_ID, book_score))

# Disabled snippet that prints the user-portrait dictionary
"""for key in user_portrait:
    print(key, ":", user_portrait[key]) 
print(user_portrait[1398478][0][0])"""   
# Print the book-portrait dictionary, one book per line
for key, book_ratings in book_portrait.items():
    print(key, ":", book_ratings)
    

1467022 : [(1398478, 0), (1218141, 5), (3526903, 0), (1479992, 5), (1478027, 4), (4271422, 4), (1083017, 5), (1501360, 5), (3366093, 5), (1048973, 5), (1865551, 5), (1014251, 5), (2166781, 0), (1345593, 4), (2843554, 5), (2803529, 0), (1321919, 5), (1012873, 4), (1498591, 0), (1162157, 0), (1265544, 4), (1585055, 0), (1753339, 3), (1219568, 0), (1234587, 5), (1458339, 4), (1169159, 5), (1867263, 4), (1641349, 3), (4414552, 3), (2418800, 5), (1077415, 0), (2812706, 5), (1228483, 0), (2420609, 5), (1289109, 3), (1436123, 4), (1341999, 4), (1009630, 5), (1904648, 5), (1944227, 4), (1282305, 0), (3472033, 0), (1380333, 0), (1035406, 5), (1513932, 5), (2515299, 5), (1001118, 5), (1635210, 5), (3742361, 5), (1134932, 5), (2201915, 0), (1205909, 5), (1207888, 5), (1129079, 4), (2256880, 3), (1031137, 4), (2311333, 5), (1129412, 4), (1039999, 0), (3175040, 5), (2873595, 5), (30523221, 0), (2163644, 5), (1000153, 5), (1050726, 5), (1223830, 0), (1069524, 5), (1331855, 0), (1764684, 5), (2139539

##### 计算用户相似性——寻找邻居
计算过程可优化（先排序）

In [82]:
#用余弦相似度计算两个用户评分的相似度
def average(userid, user_portrait):
    """Return the arithmetic mean of the scores given by ``userid``.

    ``user_portrait[userid]`` is read as a sequence of entries whose element
    at index 1 is the score. An empty sequence raises ``ZeroDivisionError``.
    """
    ratings = user_portrait[userid]
    total = 0
    for entry in ratings:
        total += entry[1]
    return total / len(ratings)
def sim(userid1, userid2, user_portrait):
    """Return the mean-centred cosine similarity between two users' ratings.

    Each user's scores are centred on that user's own mean. The numerator
    sums the products of centred scores over books both users rated; the
    denominator is the product of the Euclidean norms of each user's centred
    scores taken over all of that user's ratings.

    Args:
        userid1: key of the first user in ``user_portrait``.
        userid2: key of the second user in ``user_portrait``.
        user_portrait: mapping from user ID to a list of ``(book_id, score)``
            entries. Book IDs must be hashable.

    Returns:
        The similarity as a float. ``0.0`` is returned when either user has
        no ratings or has zero rating variance (for example a single rating),
        because the ratio is undefined there and would otherwise come out as
        NaN.
    """
    ratings1 = user_portrait[userid1]
    ratings2 = user_portrait[userid2]
    # Without ratings there is no mean to centre on and no evidence of
    # similarity.
    if not ratings1 or not ratings2:
        return 0.0

    mean1 = sum(entry[1] for entry in ratings1) / len(ratings1)
    mean2 = sum(entry[1] for entry in ratings2) / len(ratings2)
    centered1 = [(entry[0], entry[1] - mean1) for entry in ratings1]
    centered2 = [(entry[0], entry[1] - mean2) for entry in ratings2]

    # Denominator: product of the norms of the centred score vectors.
    norm1 = np.sqrt(sum(dev ** 2 for _, dev in centered1))
    norm2 = np.sqrt(sum(dev ** 2 for _, dev in centered2))
    denominator = norm1 * norm2
    # A user whose scores are all equal has zero norm; dividing would give
    # NaN, which then breaks any sorting done on the similarities.
    if denominator == 0:
        return 0.0

    # Numerator: index user 2's centred scores by book so matching is a dict
    # lookup instead of a nested scan. Deviations are summed per book so that
    # repeated entries for one book contribute as in a pairwise comparison.
    deviation_by_book = {}
    for book, dev in centered2:
        deviation_by_book[book] = deviation_by_book.get(book, 0) + dev

    numerator = 0
    for book, dev in centered1:
        if book in deviation_by_book:
            numerator += dev * deviation_by_book[book]
    return numerator / denominator

# Small hand-made portrait used to try out sim(): users 1 and 2 share books 1 and 2.
test = {
    1: [(1, 1), (2, 2), (3, 3)],
    2: [(1, 1), (2, 2), (4, 4)],
}
sim(1, 2, test)

0.43643578047198484

In [89]:
# Compute the top-k most similar users for a given user.
def top_k_similar(userid, user_portrait, k):
    """Return up to ``k`` users most similar to ``userid``, best first.

    Args:
        userid: the user whose neighbours are wanted.
        user_portrait: mapping from user ID to that user's rating entries;
            every other key is compared against ``userid`` with ``sim``.
        k: maximum number of neighbours to return (expected non-negative).

    Returns:
        A list of ``(user_id, similarity)`` pairs sorted by similarity from
        high to low. Only strictly positive similarities are kept, so the
        list may hold fewer than ``k`` items.
    """
    candidates = []
    for other in user_portrait:
        if other != userid:
            score = sim(userid, other, user_portrait)
            # Filter before sorting: `score > 0` is False for NaN as well as
            # for non-positive values, so NaN never reaches the sort, where
            # it would make comparisons inconsistent and scramble the order.
            if score > 0:
                candidates.append((other, score))
    # Sort from high to low by the second element (the similarity).
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:k]

# Small hand-made portrait used to try out top_k_similar() for user 1.
test2 = {
    1: [(1, 1), (2, 2), (3, 3)],
    2: [(2, 2), (4, 4)],
    3: [(1, 1), (2, 2), (5, 5)],
    4: [(1, 5), (2, 0), (6, 5)],
}
top_k_similar(1, test2, 2)

[(3, 0.40032038451271773)]

#### 基于相似邻居预测用户对某本未读过的书的评分

In [None]:
def pred(userid, bookid, user_portrait, k):
    """Predict the score ``userid`` would give ``bookid`` from similar users.

    The prediction is the user's own mean score plus the similarity-weighted
    average of how far each neighbour's score for ``bookid`` lies from that
    neighbour's own mean. Neighbours come from ``top_k_similar``.

    Args:
        userid: the user to predict for.
        bookid: the book whose score is predicted.
        user_portrait: mapping from user ID to a list of ``(book_id, score)``
            entries.
        k: number of neighbours requested from ``top_k_similar``.

    Returns:
        The predicted score. When none of the neighbours rated ``bookid``
        the user's own mean score is returned, because there is nothing to
        adjust it with and the weighted average would divide by zero.
    """
    weighted_deviation = 0
    weight_total = 0
    for neighbor, weight in top_k_similar(userid, user_portrait, k):
        for entry in user_portrait[neighbor]:
            if entry[0] == bookid:
                neighbor_mean = average(neighbor, user_portrait)
                weighted_deviation += weight * (entry[1] - neighbor_mean)
                weight_total += weight
    baseline = average(userid, user_portrait)
    # No neighbour rated this book: fall back to the user's own mean instead
    # of raising ZeroDivisionError.
    if weight_total == 0:
        return baseline
    return baseline + weighted_deviation / weight_total

In [None]:
# Predict the top-a scores for books the user has not read yet.
def top_a_pred(userid, user_portrait, a):
    """Return the ``a`` unread books with the highest predicted scores.

    Candidate books are the keys of the module-level ``book_portrait`` that
    do not appear among the user's own rating entries. Each candidate is
    scored with ``pred``; note that ``a`` is also passed to ``pred`` as the
    number of neighbours.

    Args:
        userid: the user to recommend for.
        user_portrait: mapping from user ID to a list of ``(book_id, score)``
            entries.
        a: number of recommendations to return.

    Returns:
        A list of ``(book_id, predicted_score)`` pairs sorted by predicted
        score from high to low, holding at most ``a`` items.
    """
    # Build the set of already-rated books once instead of rebuilding a list
    # for every candidate book.
    already_rated = {entry[0] for entry in user_portrait[userid]}
    pred_list = [
        (book, pred(userid, book, user_portrait, a))
        for book in book_portrait
        if book not in already_rated
    ]
    pred_list.sort(key=lambda pair: pair[1], reverse=True)
    # Slicing already copes with lists shorter than `a`.
    return pred_list[:a]