# 标签推荐

## 业务场景
1. 针对Delicious数据集，通过标签推荐方式实现推荐系统，并统计不同推荐方式的精确度和召回率
2. 使用4种方法：SimpleTagBased、NormTagBased、TagBased-TFIDF、TagBased-TFIDF++

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

## 数据集处理

In [2]:
# Load the tab-separated tagging records; later cells read the userID,
# bookmarkID and tagID columns of this frame.
df_data = pd.read_csv(
    './user_taggedbookmarks-timestamps.dat',
    delimiter='\t',
)
# Report how many rows are exact duplicates of an earlier row.
print('重复个数', df_data.duplicated().sum())

重复个数 0


In [3]:
# Store every tagging record as a nested dict:
#   {userID: {bookmarkID: [tagID, ...]}}
record = {
    user_id: {
        item_id: item_rows['tagID'].values.tolist()
        for item_id, item_rows in user_rows.groupby('bookmarkID')
    }
    for user_id, user_rows in df_data.groupby('userID')
}

In [4]:
# Split the records into a training set and a test set.
def train_test_split(record, ratio, seed=None):
    """Randomly partition each user's items into two nested dicts.

    Args:
        record: Mapping of the form ``{user: {item: tags}}``.
        ratio: Threshold for the first set. An item is placed in the first
            set when a uniform draw from [0, 1) is ``<= ratio``; otherwise
            it is placed in the second set.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the draws, so the split is reproducible and the global
            generator state is left untouched. When ``None`` (the default)
            the module-level ``random.random`` is used.

    Returns:
        A tuple ``(set_1, set_2)``. Both dicts contain every user key of
        ``record`` (possibly mapped to an empty dict). The ``tags`` values
        are shared with ``record``, not copied.
    """
    draw = random.random if seed is None else random.Random(seed).random

    set_1 = {}
    set_2 = {}
    for user, items in record.items():
        set_1[user] = {}
        set_2[user] = {}
        for item, tags in items.items():
            # One independent draw per (user, item) pair decides its side.
            target = set_1 if draw() <= ratio else set_2
            target[user][item] = tags
    return set_1, set_2

# Each (user, item) pair goes to train_set when its random draw is <= 0.75;
# the remaining pairs form test_set.
train_set, test_set = train_test_split(record, 0.75)

## 定义计算的相关数据结构

In [5]:
# Index mappings for user, item and tag ids.
def mapping_index(df, column):
    """Build index<->value lookup dicts for the distinct values of a column.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose distinct values are enumerated.

    Returns:
        A tuple ``(index_to_value, value_to_index)``. Indices are
        consecutive integers starting at 0, assigned in the order the
        values are returned by ``df[column].unique()``.
    """
    values = df[column].unique().tolist()
    index_to_value = dict(enumerate(values))
    value_to_index = {value: index for index, value in index_to_value.items()}
    return index_to_value, value_to_index

# Build the (index -> id, id -> index) lookup pair for each of the three id
# columns of the raw data, in the order user, item (bookmark), tag.
(
    (index_to_user, user_to_index),
    (index_to_item, item_to_index),
    (index_to_tag, tag_to_index),
) = [
    mapping_index(df_data, column)
    for column in ('userID', 'bookmarkID', 'tagID')
]

In [6]:
# Build dense pairwise count matrices for user/item/tag from train_set.
# Each axis has one slot per distinct id in the full data set, so ids that
# only occur in the test split still have a (zero) row/column.
user_item = np.zeros((len(index_to_user), len(index_to_item)))
user_tag = np.zeros((len(index_to_user), len(index_to_tag)))
item_tag = np.zeros((len(index_to_item), len(index_to_tag)))

for user_id, tagged_items in train_set.items():
    for item_id, tag_ids in tagged_items.items():
        user_row = user_to_index[user_id]
        item_row = item_to_index[item_id]
        # One count per (user, item) pair present in the training split.
        user_item[user_row, item_row] += 1
        for tag_id in tag_ids:
            tag_col = tag_to_index[tag_id]
            # One count per tag assignment, credited to both user and item.
            user_tag[user_row, tag_col] += 1
            item_tag[item_row, tag_col] += 1

## 推荐计算

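$\text{SimpleTagBased}: score(u,i) = \sum_{b} n_{u,b} \times n_{i,b}$

$\text{NormTagBased}: score(u,i) = \sum_{b} \frac{n_{u,b}}{n_{u}} \times \frac{n_{i,b}}{n_{i}}$

$\text{TagBased-TFIDF}: score(u,i) = \sum_{b} \frac{n_{u,b}}{\log(1+n_{b,u})} \times n_{i,b}$

$\text{TagBased-TFIDF++}: score(u,i) = \sum_{b} \frac{n_{u,b}}{\log(1+n_{b,u})} \times \frac{n_{i,b}}{\log(1+n_{i,u})}$

其中：$n_{u,b}$ 为用户 $u$ 使用标签 $b$ 的次数；$n_{i,b}$ 为物品 $i$ 被打上标签 $b$ 的次数；$n_{u}$ 为用户 $u$ 打标签的总次数；$n_{i}$ 为物品 $i$ 被打标签的总次数；$n_{b,u}$ 为使用过标签 $b$ 的不同用户数；$n_{i,u}$ 为给物品 $i$ 打过标签的不同用户数。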

In [7]:
class DataBase():
    """Shared state for the tag-based recommenders.

    Captures the module-level count matrices (``user_tag``, ``user_item``,
    ``item_tag``) and the id/index lookup dicts, then precomputes two
    per-item column vectors: ``n_i`` and ``n_i_u``.
    """

    def __init__(self):
        # Count matrices taken from the enclosing module.
        self.user_tag, self.user_item, self.item_tag = (
            user_tag, user_item, item_tag)

        # id -> index lookups.
        self.user_to_index, self.item_to_index, self.tag_to_index = (
            user_to_index, item_to_index, tag_to_index)

        # index -> id lookups.
        self.index_to_item, self.index_to_tag, self.index_to_user = (
            index_to_item, index_to_tag, index_to_user)

        # n_i: total number of tag assignments each item received
        # (row sums of item_tag), kept as an (n_items, 1) column.
        item_tag_totals = self.item_tag.sum(axis=1, keepdims=True)
        # Replace zero totals with inf so that dividing by them does not
        # raise a 0/0 warning.
        item_tag_totals[item_tag_totals == 0] = np.inf
        self.n_i = item_tag_totals

        # n_i_u: column sums of user_item as an (n_items, 1) column. This is
        # the number of distinct users who tagged item i, provided each
        # user_item cell holds at most 1.
        item_user_totals = self.user_item.sum(axis=0).reshape(-1, 1)
        # Same zero -> inf replacement as for n_i.
        item_user_totals[item_user_totals == 0] = np.inf
        self.n_i_u = item_user_totals

In [50]:
class TagBasedRecommend(DataBase):
    """Rank not-yet-tagged items for one user with tag-based scores.

    Usage: call ``fit(user_id)`` to cache that user's tag statistics, then
    ``predict(method, top)`` once per scoring method.
    """

    def fit(self, user_id):
        """Cache the per-user statistics that ``predict`` needs.

        Args:
            user_id: Raw user id; must be a key of ``self.user_to_index``.

        Raises:
            KeyError: If ``user_id`` is not in ``self.user_to_index``.
        """
        # Row index of this user in the count matrices.
        self.user_index = self.user_to_index[user_id]
        user_tag_counts = self.user_tag[self.user_index]

        # Indices of the tags this user has used at least once.
        self.tag_index = np.where(user_tag_counts > 0)[0]

        # Indices of the items this user has already tagged.
        self.item_index = np.where(self.user_item[self.user_index] > 0)[0]

        # n_u_b: how many times the user applied each of those tags.
        self.n_u_b = user_tag_counts[self.tag_index]

        # n_i_b: how many times each item received each of those tags,
        # one row per item and one column per tag in tag_index.
        self.n_i_b = self.item_tag[:, self.tag_index]

        # n_u: total number of tag assignments made by the user.
        self.n_u = user_tag_counts.sum()

        # n_b_u: number of distinct users who used each of those tags.
        self.n_b_u = (self.user_tag[:, self.tag_index] > 0).sum(axis=0)

    def predict(self, method='simple', top=5):
        """Return the ids of the best-scoring items the user has not tagged.

        Args:
            method: One of ``'simple'``, ``'norm'``, ``'tfidf'`` or
                ``'tfidf_plus'``.
            top: Maximum number of item ids to return.

        Returns:
            A list of item ids ordered by descending score. Items with equal
            scores keep ascending index order because the sort is stable.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method == 'simple':
            # SimpleTagBased: sum_b n_u_b * n_i_b
            user_weight = self.n_u_b
            item_weight = self.n_i_b
        elif method == 'norm':
            # NormTagBased: sum_b (n_u_b / n_u) * (n_i_b / n_i)
            user_weight = self.n_u_b / self.n_u
            item_weight = self.n_i_b / self.n_i
        elif method == 'tfidf':
            # TagBased-TFIDF: sum_b n_u_b / log(1 + n_b_u) * n_i_b
            user_weight = self.n_u_b / np.log(1 + self.n_b_u)
            item_weight = self.n_i_b
        elif method == 'tfidf_plus':
            # TagBased-TFIDF++: additionally divides n_i_b by log(1 + n_i_u)
            user_weight = self.n_u_b / np.log(1 + self.n_b_u)
            item_weight = self.n_i_b / np.log(1 + self.n_i_u)
        else:
            # Fail loudly instead of silently reusing a previous score.
            raise ValueError('unknown method: {!r}'.format(method))

        # One score per item index.
        self.recommend_score = np.dot(user_weight, item_weight.T)

        # Keep only items the user has not tagged yet. A set gives constant
        # time membership tests instead of scanning the index array per item.
        seen_items = set(self.item_index.tolist())
        self.recommend_item_score = {
            index: score
            for index, score in enumerate(self.recommend_score)
            if index not in seen_items
        }

        # Item indices ordered by descending score.
        ranked = sorted(
            self.recommend_item_score.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        self.recommend_item_index = [index for index, _ in ranked]

        # Convert the leading item indices back to raw item ids.
        self.recommend_item_id = [
            self.index_to_item[index]
            for index in self.recommend_item_index[:top]
        ]
        return self.recommend_item_id

## 推荐并计算精度和召回

In [54]:
# Evaluate every scoring method on the held-out test set.
methods = ['simple', 'norm', 'tfidf', 'tfidf_plus']
top = 20          # number of items requested per user and method
fav_count = 0     # held-out items of all evaluated users (recall denominator)
rec_count = 0     # recommendation slots per method (precision denominator)
# Hits per method, kept as a 1 x len(methods) row so it can be stacked below.
valid_count = np.zeros((1, len(methods)), dtype=int)

rec = TagBasedRecommend()
for num, user in enumerate(test_set):
    user_fav_items = set(test_set[user])
    fav_count += len(user_fav_items)
    rec.fit(user)
    for idx, method in enumerate(methods):
        recommend_list = rec.predict(method=method, top=top)
        valid_count[0][idx] += len(user_fav_items.intersection(recommend_list))
    # Count `top` slots per user; this was a hard-coded 20 that silently
    # disagreed with `top` whenever `top` was changed.
    rec_count += top
    if num % 100 == 0:
        print(num, end='...')
    # Stop after the first 101 users (num runs from 0 to 100).
    if num == 100:
        break

# NOTE(review): this rebinds df_data, replacing the raw tagging frame loaded
# earlier; cells that read the raw data cannot be re-run after this one
# without reloading it. The name is kept for compatibility.
df_data = pd.DataFrame(
    data=np.concatenate((valid_count / rec_count, valid_count / fav_count)),
    columns=methods,
    index=['precision', 'recall'],
)

# Render each ratio as a percentage string rounded to two decimals.
for column in df_data.columns:
    df_data[column] = df_data[column].apply(lambda x: '{}%'.format(round(x * 100, 2)))
df_data

0...100...

Unnamed: 0,simple,norm,tfidf,tfidf_plus
precision,0.64%,0.0%,0.59%,0.35%
recall,0.75%,0.0%,0.69%,0.41%
