In [1]:
import numpy as np
import pandas as pd

###### 导入数据

In [2]:
example = pd.read_csv('example.txt', header=None)
example.head()

Unnamed: 0,0,1,2
0,1,1,4
1,1,2,3
2,1,5,5
3,2,1,5
4,2,3,4


In [3]:
example.columns = ["用户ID", "商品ID", "评分"]
example.head()

Unnamed: 0,用户ID,商品ID,评分
0,1,1,4
1,1,2,3
2,1,5,5
3,2,1,5
4,2,3,4


In [4]:
# pivot_table 透视表
items = pd.pivot_table(example, index='商品ID', columns='用户ID', values='评分')
items

# NaN用0填充
items.fillna(0, inplace=True)
items

用户ID,1,2,3,4,5,6
商品ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,5.0,4.0,0.0,0.0,0.0
2,3.0,0.0,0.0,3.0,4.0,0.0
3,0.0,4.0,5.0,0.0,0.0,2.0
4,0.0,0.0,3.0,0.0,0.0,4.0
5,5.0,4.0,4.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,4.0,5.0


In [5]:
# 相似度
# 余弦相似度
def cos_similar(a, b):
    return np.dot(a.T,b) / ( np.sqrt( (a**2).sum() ) * np.sqrt( (b**2).sum() ) )

In [6]:
#  推荐
#  比如：给用户2推荐商品4，需要计算4行2列的得分

# 商品4
item4 = items.loc[4].values
item4

# 商品3
item3 = items.loc[3].values
item3

# 计算相似度
cos_similar(item3, item4)

0.6857275130999354

###### 计算商品4和其他商品的相似度

In [7]:
items

用户ID,1,2,3,4,5,6
商品ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,5.0,4.0,0.0,0.0,0.0
2,3.0,0.0,0.0,3.0,4.0,0.0
3,0.0,4.0,5.0,0.0,0.0,2.0
4,0.0,0.0,3.0,0.0,0.0,4.0
5,5.0,4.0,4.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,4.0,5.0


In [8]:
# 商品4
item4 = items.loc[4].values

# 保存所有物品相似度
similar_dict = { }

for i in items.index:
    if i == 4:
        continue
    
    # 其他商品
    item = items.loc[i].values
    
    # 计算物品相似度
    similar = cos_similar(item, item4)

    
    similar_dict[i] = similar

In [9]:
similar_dict

{1: 0.3178877656956105,
 2: 0.0,
 3: 0.6857275130999354,
 5: 0.3178877656956105,
 6: 0.4923659639173309}

In [10]:
# top2: 获取相似度排名前2的物品
top2_similar = pd.Series(similar_dict).sort_values(ascending=False)[ : 2]
top2_similar

3    0.685728
6    0.492366
dtype: float64

###### 计算得分

In [11]:
items

用户ID,1,2,3,4,5,6
商品ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,5.0,4.0,0.0,0.0,0.0
2,3.0,0.0,0.0,3.0,4.0,0.0
3,0.0,4.0,5.0,0.0,0.0,2.0
4,0.0,0.0,3.0,0.0,0.0,4.0
5,5.0,4.0,4.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,4.0,5.0


In [12]:
# 用户2
top2_score = items.loc[top2_similar.index, 2]
top2_score

3    4.0
6    0.0
Name: 2, dtype: float64

In [13]:
score = (top2_score * top2_similar).sum() / top2_similar.sum()
score

2.328261811061315

###### 把所有0的值计算出得分，并填充

In [15]:
items

用户ID,1,2,3,4,5,6
商品ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,5.0,4.0,0.0,0.0,0.0
2,3.0,0.0,0.0,3.0,4.0,0.0
3,0.0,4.0,5.0,0.0,0.0,2.0
4,0.0,0.0,3.0,0.0,0.0,4.0
5,5.0,4.0,4.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,4.0,5.0


In [16]:
# 封装: 
def recommend(items, user_id, item_id, topK=2):
    # item
    item = items.loc[item_id].values
    
    # 保存所有物品相似度
    similar_dict = { }
    
    # 遍历其他每隔商品
    for i in items.index:
        if i == item_id:
            continue
        # 其他商品
        item_other = items.loc[i].values
        # 计算物品相似度
        similar = cos_similar(item_other, item)
        # 保存相似度
        similar_dict[i] = similar
    
    # print(similar_dict)
    
    # topK_similar: 获取相似度排名前K的物品
    topK_similar = pd.Series(similar_dict).sort_values(ascending=False)[ : topK]
    # topK_score: 相似度前K的得分
    topK_score = items.loc[topK_similar.index, user_id]
    
    # 得分
    score = (topK_score * topK_similar).sum() / topK_similar.sum()

    return score

In [17]:
recommend(items, 2, 4)

2.328261811061315

###### 计算每一个空位置的得分

In [18]:
# 保存所有推荐的得分
recommends = [ ]

for item_id in items.index:   # 行    
    
    for  user_id in items.columns:  # 列
        
        # 不为0的位置 直接跳过，不计算
        if items.loc[item_id, user_id] != 0:
            continue
        
        # 计算用户user_id，商品item_id对应的得分
        score = recommend(items, user_id, item_id, topK=2)
        # 填充
        items.loc[item_id, user_id] = score
        
        # 把对应用户user_id,商品item_id 的推荐得分保存
        recommends.append( (item_id, user_id, score) )

In [19]:
recommends

df = pd.DataFrame(recommends, columns=["item_id", 'user_id', 'score'])
df

Unnamed: 0,item_id,user_id,score
0,1,4,0.0
1,1,5,0.0
2,1,6,0.891292
3,2,2,1.369584
4,2,3,1.661957
5,2,6,2.575211
6,3,1,4.464549
7,3,4,0.0
8,3,5,0.0
9,4,1,2.397071


In [20]:
# 排序: 降序
df2 = df.sort_values(by=['user_id', 'score'], ascending=False)
df2

Unnamed: 0,item_id,user_id,score
5,2,6,2.575211
15,5,6,1.440772
2,1,6,0.891292
1,1,5,0.0
8,3,5,0.0
12,4,5,0.0
14,5,5,0.0
0,1,4,0.0
7,3,4,0.0
11,4,4,0.0


In [21]:
# 把user_id当成行索引
df3 = df2.set_index(keys=['user_id'])
df3

Unnamed: 0_level_0,item_id,score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,2,2.575211
6,5,1.440772
6,1,0.891292
5,1,0.0
5,3,0.0
5,4,0.0
5,5,0.0
4,1,0.0
4,3,0.0
4,4,0.0


In [22]:
# 所有用户ID
items.columns

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64', name='用户ID')

In [23]:
df3.loc[1]

Unnamed: 0_level_0,item_id,score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,4.464549
1,6,2.80102
1,4,2.397071


In [24]:
# 推荐top2： 给用户推荐得分排名前2的商品

result = [ ]

for user_id in items.columns:
    # recommend_top2
    recommend_top2 = df3.loc[user_id][ : 2]
    
    result.append( (user_id, recommend_top2.item_id.tolist(), recommend_top2.score.tolist()) )
    

In [25]:
result

[(1, [3, 6], [4.464549494028136, 2.8010201685475584]),
 (2, [4, 6], [2.8319137601622186, 1.8898104530840865]),
 (3, [6, 2], [2.1798968360608266, 1.6619574443347203]),
 (4, [1, 3], [0.0, 0.0]),
 (5, [1, 3], [0.0, 0.0]),
 (6, [2, 5], [2.575210695719183, 1.4407720410086033])]