<h1>基于用户的协同过滤<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#1.-导入数据" data-toc-modified-id="1.-导入数据-1">1. 导入数据</a></span></li><li><span><a href="#2.-建立关系矩阵rating" data-toc-modified-id="2.-建立关系矩阵rating-2">2. 建立关系矩阵rating</a></span></li><li><span><a href="#3.计算用户相似度（余弦相似度）" data-toc-modified-id="3.计算用户相似度（余弦相似度）-3">3.计算用户相似度（余弦相似度）</a></span></li><li><span><a href="#4.-构建推荐指数函数" data-toc-modified-id="4.-构建推荐指数函数-4">4. 构建推荐指数函数</a></span></li><li><span><a href="#5.-构建预测函数" data-toc-modified-id="5.-构建预测函数-5">5. 构建预测函数</a></span></li><li><span><a href="#6.-构建最终的Topk推荐函数" data-toc-modified-id="6.-构建最终的Topk推荐函数-6">6. 构建最终的Topk推荐函数</a></span></li><li><span><a href="#7.-模型改进" data-toc-modified-id="7.-模型改进-7">7. 模型改进</a></span><ul class="toc-item"><li><span><a href="#练习：基于物品的协同过滤" data-toc-modified-id="练习：基于物品的协同过滤-7.1">练习：基于物品的协同过滤</a></span></li></ul></li></ul></div>

## 1. 导入数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the rating records from the CSV file and preview the first rows.
df = pd.read_csv('example.csv')
df.head()

Unnamed: 0,用户id,物品id,评分
0,0,0,1.0
1,0,2,2.0
2,0,5,1.0
3,1,2,4.0
4,1,3,2.0


In [3]:
df['用户id'].value_counts()

10    6
2     5
4     4
3     3
0     3
11    2
9     2
8     2
7     2
6     2
5     2
1     2
Name: 用户id, dtype: int64

In [4]:
# Distinct user ids present in the data (the column inspected here is 用户id, not 物品id).
np.unique(df['用户id'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

## 2. 建立关系矩阵rating

In [5]:
# Build the user x item rating table:
#   index   -> row labels (用户id), columns -> column labels (物品id),
#   values  -> cell contents (评分),
#   fill_value=0 -> user/item pairs without a rating are stored as 0.
dfpivot= df.pivot_table(index="用户id",columns="物品id",values="评分",fill_value=0)
dfpivot

物品id,0,1,2,3,4,5
用户id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0,2,0,0,1
1,0,0,4,2,0,0
2,3,5,0,4,4,3
3,0,4,1,0,3,0
4,0,0,2,5,4,3
5,5,0,0,0,2,0
6,0,4,3,0,0,0
7,0,0,0,4,0,2
8,5,0,4,0,0,0
9,0,2,3,0,0,0


In [6]:
# Extract the rating table as a plain NumPy array (rows = users, columns = items);
# the recommendation functions below read this notebook-level variable.
freq_matrix = dfpivot.values
freq_matrix

array([[1, 0, 2, 0, 0, 1],
       [0, 0, 4, 2, 0, 0],
       [3, 5, 0, 4, 4, 3],
       [0, 4, 1, 0, 3, 0],
       [0, 0, 2, 5, 4, 3],
       [5, 0, 0, 0, 2, 0],
       [0, 4, 3, 0, 0, 0],
       [0, 0, 0, 4, 0, 2],
       [5, 0, 4, 0, 0, 0],
       [0, 2, 3, 0, 0, 0],
       [4, 1, 5, 2, 2, 4],
       [0, 3, 0, 0, 5, 0]], dtype=int64)

## 3.计算用户相似度（余弦相似度）

In [7]:
# User-user similarity: every row of freq_matrix is one user, so the result is
# a users x users array (12 x 12 for this data set).
from sklearn.metrics.pairwise import cosine_similarity
# user_similar[a, b] is the cosine similarity between the rating rows of users a and b.
user_similar=cosine_similarity(freq_matrix)
# Wrap the array in a DataFrame for display.
pd.DataFrame(user_similar)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.730297,0.282843,0.160128,0.388889,0.379049,0.489898,0.182574,0.82885,0.679366,0.904534,0.0
1,0.730297,1.0,0.206559,0.175412,0.547723,0.0,0.536656,0.4,0.558744,0.744208,0.660578,0.0
2,0.282843,0.206559,1.0,0.724657,0.707107,0.493172,0.46188,0.568038,0.270501,0.320256,0.639602,0.693103
3,0.160128,0.175412,0.724657,1.0,0.373632,0.218507,0.745241,0.0,0.122513,0.598321,0.362103,0.908108
4,0.388889,0.547723,0.707107,0.373632,1.0,0.202159,0.163299,0.791155,0.17002,0.226455,0.670025,0.46676
5,0.379049,0.0,0.493172,0.218507,0.202159,1.0,0.0,0.0,0.725018,0.0,0.54858,0.318465
6,0.489898,0.536656,0.46188,0.745241,0.163299,0.0,1.0,0.0,0.374817,0.94299,0.467748,0.411597
7,0.182574,0.4,0.568038,0.0,0.791155,0.0,0.0,1.0,0.0,0.0,0.440386,0.0
8,0.82885,0.558744,0.270501,0.122513,0.17002,0.725018,0.374817,0.0,1.0,0.519778,0.768946,0.0
9,0.679366,0.744208,0.320256,0.598321,0.226455,0.0,0.94299,0.0,0.519778,1.0,0.58037,0.285391


## 4. 构建推荐指数函数

计算公式：$$ r_{xi} = \frac{\sum_{y \in N(x;i)} s_{xy} \cdot r_{yi}}{\sum_{y \in N(x;i)}  s_{xy}}$$



$$r_{xi}: 预测用户x对物品i的评分$$
$$r_{yi}: 用户y对物品i的实际评分$$
$$s_{xy}: 用户x和用户y之间的相似度$$ 
$$N(x;i): 与用户x最相似、且已对物品i评过分的用户集合$$

In [11]:
freq_matrix

array([[1, 0, 2, 0, 0, 1],
       [0, 0, 4, 2, 0, 0],
       [3, 5, 0, 4, 4, 3],
       [0, 4, 1, 0, 3, 0],
       [0, 0, 2, 5, 4, 3],
       [5, 0, 0, 0, 2, 0],
       [0, 4, 3, 0, 0, 0],
       [0, 0, 0, 4, 0, 2],
       [5, 0, 4, 0, 0, 0],
       [0, 2, 3, 0, 0, 0],
       [4, 1, 5, 2, 2, 4],
       [0, 3, 0, 0, 5, 0]], dtype=int64)

In [12]:
#构建一个基于用户的推荐
def Recommendation(uid,iid,similar,k=5,ratings=None):
    """Predict the score of user ``uid`` for item ``iid`` (user-based CF).

    The prediction is the similarity-weighted average of the ratings that the
    ``k`` most similar users gave to the item; neighbours whose rating for the
    item is 0 (treated as "not rated") are ignored.

    Parameters
    ----------
    uid : int
        Row position of the target user in the rating matrix.
    iid : int
        Column position of the target item in the rating matrix.
    similar : 2-D array-like
        User-user similarity matrix; ``similar[a, b]`` is the similarity
        between users ``a`` and ``b``.
    k : int, default 5
        Number of nearest neighbours to consider. ``k <= 0`` selects none.
    ratings : 2-D array-like, optional
        User x item rating matrix. When omitted, the notebook-level
        ``freq_matrix`` is used.

    Returns
    -------
    float
        The weighted average, or 0 when none of the selected neighbours rated
        the item.
    """
    if ratings is None:
        # Fall back to the notebook-level matrix so existing calls keep working.
        ratings = freq_matrix
    item_ratings = np.asarray(ratings)[:, iid]    # every user's rating of the item
    user_sims = np.asarray(similar)[uid, :]       # similarity of uid to every user

    # argsort is ascending. Remove the target user explicitly instead of
    # assuming it is the last entry: with a zeroed diagonal or a tie at the
    # maximum, slicing off the last element would drop a real neighbour.
    order = np.argsort(user_sims)
    order = order[order != uid]
    neighbours = order[-k:] if k > 0 else order[:0]

    score = 0.0     # numerator of the weighted average
    weight = 0.0    # denominator of the weighted average
    for j in neighbours:
        if item_ratings[j] != 0:
            score += user_sims[j] * item_ratings[j]
            weight += abs(user_sims[j])

    if weight == 0:
        return 0.0
    return score / weight

In [13]:
Recommendation(4,0,user_similar,k=3)

3.4865366677826084

## 5. 构建预测函数

In [14]:
#构建预测函数
def predict(similar,k=5):
    """Fill in a score for every user/item pair that has no rating yet.

    Parameters
    ----------
    similar : 2-D array-like
        User-user similarity matrix, forwarded to ``Recommendation``.
    k : int, default 5
        Neighbourhood size forwarded to ``Recommendation`` (the default
        matches that function's own default).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the notebook-level ``freq_matrix``.
        Cells whose rating is 0 receive the value returned by
        ``Recommendation``; cells that already hold a rating stay 0.
    """
    user_count, item_count = freq_matrix.shape    # users x items
    predic_matrix = np.zeros((user_count, item_count))
    # Visit only the unrated cells, in the same row-major order as a nested loop.
    for uid, iid in np.argwhere(freq_matrix == 0):
        predic_matrix[uid, iid] = Recommendation(uid, iid, similar, k)
    return predic_matrix

In [33]:
user_prediction_matrix = predict(user_similar)
pd.DataFrame(user_prediction_matrix)

Unnamed: 0,0,1,2,3,4,5
0,0.0,2.036292,0.0,2.0,2.0,0.0
1,3.162839,1.529766,0.0,0.0,2.9066,2.587323
2,0.0,0.0,2.576503,0.0,0.0,0.0
3,3.0,0.0,0.0,4.340195,0.0,3.0
4,3.486537,3.040221,0.0,0.0,0.0,0.0
5,0.0,2.91853,3.873223,2.946812,0.0,2.852536
6,2.465305,0.0,0.0,2.0,2.614384,2.465305
7,3.063172,3.253172,3.169252,0.0,3.510568,0.0
8,0.0,1.403327,0.0,2.0,2.0,2.443763
9,2.382123,0.0,0.0,2.0,2.507615,2.382123


## 6. 构建最终的Topk推荐函数

In [49]:
# recommendation_df = pd.DataFrame(user_prediction_matrix,columns=dfpivot.columns,index=dfpivot.index)
# recommendation_df = recommendation_df.stack().reset_index()
# recommendation_df

Unnamed: 0,用户id,物品id,0
0,0,0,0.000000
1,0,1,2.036292
2,0,2,0.000000
3,0,3,2.000000
4,0,4,2.000000
...,...,...,...
67,11,1,0.000000
68,11,2,1.722070
69,11,3,4.402427
70,11,4,0.000000


In [50]:
# recommendation_df.rename(columns={0:'推荐指数'},inplace=True)
# recommendation_df

Unnamed: 0,用户id,物品id,推荐指数
0,0,0,0.000000
1,0,1,2.036292
2,0,2,0.000000
3,0,3,2.000000
4,0,4,2.000000
...,...,...,...
67,11,1,0.000000
68,11,2,1.722070
69,11,3,4.402427
70,11,4,0.000000


In [53]:
# grouped = recommendation_df.groupby("用户id")
# # 得到分组后的前几个数据 (k就是传入get_topk的那个k参数)
# topk = grouped.apply(get_topk,k=3)

In [54]:
# topk #此时topk 具有多层索引 可以看到行索引 用户id 后面还有一列 用drop level 删除 1，3，4，6 那一列的索引

In [55]:
def get_topk(group,k):
    """Return the first ``k`` rows of ``group`` after ordering by "推荐指数", highest first."""
    ranked = group.sort_values(by="推荐指数", ascending=False)
    return ranked.iloc[:k]

In [56]:
def get_recommendation(user_prediction_matrix,k=5,index=None,columns=None):
    """Turn a user x item score matrix into a per-user top-``k`` table.

    Parameters
    ----------
    user_prediction_matrix : 2-D array-like
        Scores with one row per user and one column per item.
    k : int, default 5
        Maximum number of rows kept for each user.
    index, columns : array-like, optional
        User ids (row labels) and item ids (column labels) of the matrix.
        When omitted they are taken from the notebook-level ``dfpivot``.

    Returns
    -------
    pandas.DataFrame
        Columns "用户id", "物品id", "推荐指数" with a fresh 0..n-1 index. Rows
        are grouped by user in ascending id order and, within a user, ordered
        by score from high to low (equal scores keep item order). Scores of 0
        are ranked like any other value, so they show up whenever a user has
        fewer than ``k`` non-zero scores.
    """
    if index is None:
        index = dfpivot.index
    if columns is None:
        columns = dfpivot.columns

    # Label the axes explicitly so the long table always has the expected column names.
    scores = pd.DataFrame(
        user_prediction_matrix,
        index=pd.Index(index, name="用户id"),
        columns=pd.Index(columns, name="物品id"),
    )
    # Wide -> long: one row per (user, item) pair.
    long_df = scores.stack().reset_index().rename(columns={0: "推荐指数"})

    # Sort once and take the head of each group. This avoids groupby.apply and
    # the follow-up clean-up of the grouping column and the extra index level,
    # which depend on how apply treats grouping columns in a given pandas version.
    ranked = long_df.sort_values(["用户id", "推荐指数"], ascending=[True, False])
    topk = ranked.groupby("用户id", sort=False).head(k)
    return topk.reset_index(drop=True)

In [57]:
top3 = get_recommendation(user_prediction_matrix,k=3)
top3

Unnamed: 0,用户id,物品id,推荐指数
0,0,1,2.036292
1,0,3,2.0
2,0,4,2.0
3,1,0,3.162839
4,1,4,2.9066
5,1,5,2.587323
6,2,2,2.576503
7,2,0,0.0
8,2,1,0.0
9,3,3,4.340195


## 7. 模型改进

In [58]:
#构建一个基于用户的推荐
def Recommendation_mean(uid,iid,similar,k=10,ratings=None):
    """Predict a score with mean-centred user-based CF.

    Each neighbour contributes the deviation of its rating from its own
    average rating; the similarity-weighted average of those deviations is
    added to the target user's average rating. A rating of 0 is treated as
    "not rated" and is left out of every average.

    Parameters
    ----------
    uid : int
        Row position of the target user in the rating matrix.
    iid : int
        Column position of the target item in the rating matrix.
    similar : 2-D array-like
        User-user similarity matrix.
    k : int, default 10
        Number of nearest neighbours to consider. ``k <= 0`` selects none.
    ratings : 2-D array-like, optional
        User x item rating matrix. When omitted, the notebook-level
        ``freq_matrix`` is used.

    Returns
    -------
    float
        The predicted score, or 0 when none of the selected neighbours rated
        the item.
    """
    if ratings is None:
        # Fall back to the notebook-level matrix so existing calls keep working.
        ratings = freq_matrix
    ratings = np.asarray(ratings)

    def rated_mean(row):
        # Average over the non-zero entries; 0.0 when there are none, which
        # avoids a divide-by-zero for a user without any rating.
        rated = row[row != 0]
        return rated.sum() / rated.size if rated.size else 0.0

    item_ratings = ratings[:, iid]                # every user's rating of the item
    user_sims = np.asarray(similar)[uid, :]       # similarity of uid to every user

    # argsort is ascending. Remove the target user explicitly instead of
    # assuming it is the last entry (not true for a zeroed diagonal or ties).
    order = np.argsort(user_sims)
    order = order[order != uid]
    neighbours = order[-k:] if k > 0 else order[:0]

    score = 0.0
    weight = 0.0
    for j in neighbours:
        if item_ratings[j] != 0:
            score += user_sims[j] * (item_ratings[j] - rated_mean(ratings[j, :]))
            weight += abs(user_sims[j])

    if weight == 0:
        return 0.0
    return rated_mean(ratings[uid, :]) + score / weight

In [59]:
Recommendation_mean(4,0,user_similar,k=3)

3.5757660020086943

In [15]:
#构建预测函数
def predict_mean(similar,k=10):
    """Fill in a mean-centred score for every user/item pair without a rating.

    Parameters
    ----------
    similar : 2-D array-like
        Similarity matrix, forwarded to ``Recommendation_mean``.
    k : int, default 10
        Neighbourhood size forwarded to ``Recommendation_mean`` (the default
        matches that function's own default).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the notebook-level ``freq_matrix``.
        Cells whose rating is 0 receive the value returned by
        ``Recommendation_mean``; cells that already hold a rating stay 0.
    """
    user_count, item_count = freq_matrix.shape    # users x items
    predic_matrix = np.zeros((user_count, item_count))
    # Visit only the unrated cells, in the same row-major order as a nested loop.
    for uid, iid in np.argwhere(freq_matrix == 0):
        predic_matrix[uid, iid] = Recommendation_mean(uid, iid, similar, k)
    return predic_matrix

In [16]:
user_prediction_matrix_mean = predict_mean(user_similar)
top3_mean = get_recommendation(user_prediction_matrix_mean,k=3)
top3_mean

Unnamed: 0,用户id,物品id,推荐指数
0,0,3,1.006973
1,0,1,0.796579
2,0,4,0.780863
3,1,0,3.246395
4,1,5,2.83421
5,1,4,2.819569
6,2,2,3.504501
7,2,0,0.0
8,2,1,0.0
9,3,3,2.769286


```


```
### 练习：基于物品的协同过滤
- 1.读取数据
- 2.根据用户id和物品id构建rating矩阵
- 3.使用余弦相似度计算物品相似度矩阵
- 4.构建物品推荐函数（用减去均值的改进方法）
- 5.构建预测函数
- 6.构建最终Top5推荐函数

1.读取数据

In [None]:
import numpy as np
import pandas as pd

In [60]:
#读取数据文档
df = pd.read_csv('example.csv')
df.head()

Unnamed: 0,用户id,物品id,评分
0,0,0,1.0
1,0,2,2.0
2,0,5,1.0
3,1,2,4.0
4,1,3,2.0


2.根据用户id和物品id构建rating矩阵

In [61]:
df_pivot = df.pivot_table(values='评分',index='用户id',columns='物品id',fill_value=0)
df_pivot

物品id,0,1,2,3,4,5
用户id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0,2,0,0,1
1,0,0,4,2,0,0
2,3,5,0,4,4,3
3,0,4,1,0,3,0
4,0,0,2,5,4,3
5,5,0,0,0,2,0
6,0,4,3,0,0,0
7,0,0,0,4,0,2
8,5,0,4,0,0,0
9,0,2,3,0,0,0


3.使用余弦相似度计算物品相似度矩阵

In [62]:
from sklearn.metrics.pairwise import cosine_similarity

In [83]:
# Item-item cosine similarity: transposing the pivot table puts items on the rows.
pro_similar =cosine_similarity(df_pivot.T.values)
# Rebinds the notebook-level freq_matrix (read by the recommendation functions) to this section's pivot table.
freq_matrix = df_pivot.values
pd.DataFrame(pro_similar)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.258653,0.525657,0.284555,0.400036,0.477567
1,0.258653,1.0,0.349619,0.323845,0.676007,0.361071
2,0.525657,0.349619,1.0,0.378932,0.266357,0.4892
3,0.284555,0.323845,0.378932,1.0,0.57675,0.854042
4,0.400036,0.676007,0.266357,0.57675,1.0,0.595665
5,0.477567,0.361071,0.4892,0.854042,0.595665,1.0


In [81]:
user_id_similar = pro_similar[0,:] 
user_id_similar

array([1.        , 0.25865307, 0.52565748, 0.2845552 , 0.40003556,
       0.47756693])

In [74]:
pd.DataFrame(freq_matrix)

Unnamed: 0,0,1,2,3,4,5
0,1,0,2,0,0,1
1,0,0,4,2,0,0
2,3,5,0,4,4,3
3,0,4,1,0,3,0
4,0,0,2,5,4,3
5,5,0,0,0,2,0
6,0,4,3,0,0,0
7,0,0,0,4,0,2
8,5,0,4,0,0,0
9,0,2,3,0,0,0


4.构建物品推荐函数（用减去均值的改进方法）

In [None]:
def Recommendation_pro_mean(uid,iid,similar,k=10,ratings=None):
    """Predict a score with mean-centred item-based CF.

    For the ``k`` items most similar to ``iid`` that user ``uid`` has rated,
    take the deviation of the user's rating from that item's average rating,
    average the deviations weighted by item similarity, and add the result to
    the average rating of item ``iid``. A rating of 0 is treated as
    "not rated" and is left out of every average.

    Parameters
    ----------
    uid : int
        Row position of the target user in the rating matrix.
    iid : int
        Column position of the target item in the rating matrix.
    similar : 2-D array-like
        Item-item similarity matrix; ``similar[a, b]`` is the similarity
        between items ``a`` and ``b``.
    k : int, default 10
        Number of most similar items to consider. ``k <= 0`` selects none.
    ratings : 2-D array-like, optional
        User x item rating matrix. When omitted, the notebook-level
        ``freq_matrix`` is used.

    Returns
    -------
    float
        The predicted score, or 0 when the user rated none of the selected
        neighbouring items.
    """
    if ratings is None:
        ratings = freq_matrix
    ratings = np.asarray(ratings)

    def rated_mean(column):
        # Average over the non-zero entries; 0.0 when the item has no rating.
        rated = column[column != 0]
        return rated.sum() / rated.size if rated.size else 0.0

    user_ratings = ratings[uid, :]                # the user's rating of every item
    item_sims = np.asarray(similar)[iid, :]       # similarity of iid to every item

    # argsort is ascending; drop the target item itself, then keep the last k.
    order = np.argsort(item_sims)
    order = order[order != iid]
    neighbours = order[-k:] if k > 0 else order[:0]

    score = 0.0
    weight = 0.0
    for j in neighbours:
        if user_ratings[j] != 0:
            score += item_sims[j] * (user_ratings[j] - rated_mean(ratings[:, j]))
            weight += abs(item_sims[j])

    if weight == 0:
        return 0.0
    return rated_mean(ratings[:, iid]) + score / weight
    

In [77]:
user_id_action = freq_matrix[4,:]
user_id_action

array([0, 0, 2, 5, 4, 3], dtype=int64)

In [None]:
# Item-based recommendation score (mean-centred variant)
def Recommendation_mean(uid,iid,similar,k=10,ratings=None):
    """Predict a score with mean-centred item-based CF.

    For the ``k`` items most similar to ``iid`` that user ``uid`` has rated,
    take the deviation of the user's rating from that item's average rating,
    average the deviations weighted by item similarity, and add the result to
    the average rating of item ``iid``. A rating of 0 is treated as
    "not rated" and is left out of every average.

    NOTE(review): this cell reuses the name of the user-based
    ``Recommendation_mean`` defined earlier in the notebook, so running it
    replaces that function (``predict_mean`` will then call this item-based
    version). The unfinished stub above suggests ``Recommendation_pro_mean``
    was the intended name - confirm before renaming.

    Parameters
    ----------
    uid : int
        Row position of the target user in the rating matrix.
    iid : int
        Column position of the target item in the rating matrix.
    similar : 2-D array-like
        Item-item similarity matrix; ``similar[a, b]`` is the similarity
        between items ``a`` and ``b``.
    k : int, default 10
        Number of most similar items to consider. ``k <= 0`` selects none.
    ratings : 2-D array-like, optional
        User x item rating matrix. When omitted, the notebook-level
        ``freq_matrix`` is used.

    Returns
    -------
    float
        The predicted score, or 0 when the user rated none of the selected
        neighbouring items.
    """
    if ratings is None:
        ratings = freq_matrix
    ratings = np.asarray(ratings)

    def rated_mean(column):
        # Average over the non-zero entries; 0.0 when the item has no rating.
        rated = column[column != 0]
        return rated.sum() / rated.size if rated.size else 0.0

    user_ratings = ratings[uid, :]                # the user's rating of every item
    pro_id_similar = np.asarray(similar)[iid, :]  # similarity of iid to every item

    # argsort is ascending; drop the target item itself, then keep the last k.
    order = np.argsort(pro_id_similar)
    order = order[order != iid]
    pro_similar_index = order[-k:] if k > 0 else order[:0]

    score = 0.0
    weight = 0.0
    for j in pro_similar_index:
        # j is an item index, so look it up in the user's row (not in the
        # item's column, which is indexed by user).
        if user_ratings[j] != 0:
            score += pro_id_similar[j] * (user_ratings[j] - rated_mean(ratings[:, j]))
            weight += abs(pro_id_similar[j])

    if weight == 0:
        return 0.0
    # Baseline is the target item's own average rating.
    return rated_mean(ratings[:, iid]) + score / weight