# 基于用户标签的推荐

In [12]:
# 导入包
import random
import math
import time
import pandas as pd

## 一. 通用函数定义

In [3]:
# Decorator that reports how long each call to the wrapped function takes.
def timmer(func):
    """Wrap ``func`` so that every call prints its elapsed run time.

    :params: func, the callable to time
    :return: wrapper, a callable with the same signature and return value as
             ``func``; it prints ``Func <name>, run time: <seconds>`` after
             each successful call (nothing is printed if ``func`` raises)
    """
    # Local import keeps this block self-contained; the file's import cell
    # does not bring functools into scope.
    import functools

    # Preserve __name__/__doc__ of the wrapped function so decorated
    # functions remain identifiable when introspected.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, unlike the wall clock, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        stop_time = time.perf_counter()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper

### 1. 数据处理相关
Delicious-2k数据集
1. load data
2. split data

In [4]:
class Dataset():
    """Loads tab-separated (user, item, tag) records and splits them.

    ``self.data`` is a list of ``(user, item, tags)`` tuples with one entry
    per distinct (user, item) pair; ``tags`` is the sorted list of distinct
    tags that user attached to that item.
    """

    def __init__(self, fp):
        # fp: data file path
        self.data = self.loadData(fp)

    def loadData(self, fp):
        """Read the data file and group tags by (user, item).

        The first line is treated as a header and skipped. Only the first
        three tab-separated fields of each line are used.

        :params: fp, data file path
        :return: list of (user, item, tags) tuples
        :raises: ValueError if a non-blank line has fewer than three fields
        """
        new_data = {}
        # The context manager guarantees the file handle is closed.
        with open(fp) as f:
            next(f, None)  # skip the header line (no-op on an empty file)
            for line in f:
                fields = line.strip().split('\t')[:3]
                if fields == ['']:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                user, item, tag = fields
                new_data.setdefault(user, {}).setdefault(item, set()).add(tag)
        # Tags are sorted because set iteration order for strings can differ
        # between interpreter runs, which would make downstream results
        # (e.g. tie-breaking in rankings) irreproducible.
        return [(user, item, sorted(tags))
                for user, items in new_data.items()
                for item, tags in items.items()]

    def splitData(self, M, k, seed=1):
        '''
        :params: M, number of folds; results are meant to be averaged over the M folds
        :params: k, index of the fold used as the test set, k in [0, M)
        :params: seed, random seed; use the same value for every k
        :return: train, test, each a dict of the form {user: {item: tags}}
        '''
        # Each (user, item) record is assigned to train or test as a unit.
        train, test = [], []
        random.seed(seed)
        for user, item, tags in self.data:
            # Differs from the book: randint includes both endpoints, so the
            # upper bound is M-1 to obtain exactly M buckets.
            if random.randint(0, M-1) == k:
                test.append((user, item, tags))
            else:
                train.append((user, item, tags))

        # Convert a list of records into nested dicts: user -> {item: tags}
        def convert_dict(data):
            data_dict = {}
            for user, item, tags in data:
                if user not in data_dict:
                    data_dict[user] = {}
                data_dict[user][item] = tags
            return data_dict

        return convert_dict(train), convert_dict(test)

### 2. 评价指标
1. Precision（精确率）
2. Recall（召回率）
3. Coverage（覆盖率）
4. Diversity（多样性）
5. Popularity（平均流行度；数值越低，推荐结果越新颖）

In [5]:
class Metric():
    """Offline evaluation of a top-N recommender on a train/test split.

    Recommendations are computed once for every user in ``test`` when the
    object is created and are reused by all metric methods. Every metric
    returns 0.0 instead of raising when its denominator would be zero
    (no recommendations, no test items, or no training items).
    """

    def __init__(self, train, test, GetRecommendation):
        '''
        :params: train, training data, {user: {item: tags}}
        :params: test, held-out data keyed by user; each value holds that user's items
        :params: GetRecommendation, callable returning [(item, score), ...] for a user
        '''
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.recs = self.getRec()

    # Compute the recommendation list for every user in test
    def getRec(self):
        return {user: self.GetRecommendation(user) for user in self.test}

    # Number of recommended items that appear in the same user's test items
    def _count_hits(self):
        hit = 0
        for user in self.test:
            test_items = set(self.test[user])
            hit += sum(1 for item, _ in self.recs[user] if item in test_items)
        return hit

    # Precision: hits as a percentage of all recommended items
    def precision(self):
        total = sum(len(self.recs[user]) for user in self.test)
        if total == 0:
            return 0.0
        return round(self._count_hits() / total * 100, 2)

    # Recall: hits as a percentage of all test items
    def recall(self):
        total = sum(len(set(self.test[user])) for user in self.test)
        if total == 0:
            return 0.0
        return round(self._count_hits() / total * 100, 2)

    # Coverage: distinct recommended items as a percentage of training items
    def coverage(self):
        all_item = {item for user in self.train for item in self.train[user]}
        recom_item = {item for user in self.test for item, _ in self.recs[user]}
        if not all_item:
            return 0.0
        return round(len(recom_item) / len(all_item) * 100, 2)

    # Diversity: 1 minus the mean pairwise tag-cosine similarity of each
    # user's recommendation list, averaged over test users
    def diversity(self):
        # item_tags[item][tag]: how many training users gave the item this tag
        item_tags = {}
        for user in self.train:
            for item in self.train[user]:
                counts = item_tags.setdefault(item, {})
                for tag in self.train[user][item]:
                    counts[tag] = counts.get(tag, 0) + 1

        # Squared vector norms are computed once per item rather than once
        # per compared pair.
        sq_norm = {item: sum(c ** 2 for c in counts.values())
                   for item, counts in item_tags.items()}

        # Cosine similarity between the tag-count vectors of two items
        def CosineSim(u, v):
            denom = math.sqrt(sq_norm[u] * sq_norm[v])
            if denom == 0:
                # An item without tags has a zero vector; treat as dissimilar.
                return 0.0
            tags_v = item_tags[v]
            dot = sum(weight * tags_v[tag]
                      for tag, weight in item_tags[u].items() if tag in tags_v)
            return dot / denom

        div = []
        for user in self.test:
            rank = self.recs[user]
            sim, cnt = 0, 0
            for u, _ in rank:
                for v, _ in rank:
                    if u == v:
                        continue
                    sim += CosineSim(u, v)
                    cnt += 1
            # Lists with fewer than two distinct items have no pairs.
            sim = sim / cnt if cnt else 0
            div.append(1 - sim)
        if not div:
            return 0.0
        return sum(div) / len(div)

    # Popularity: mean log-popularity of recommended items
    def popularity(self):
        # Item popularity is the number of training users who tagged the item
        item_pop = {}
        for user in self.train:
            for item in self.train[user]:
                item_pop[item] = item_pop.get(item, 0) + 1

        num, pop = 0, 0
        for user in self.test:
            for item, _ in self.recs[user]:
                # The log keeps very popular (head) items from dominating
                # the average in a long-tailed distribution.
                pop += math.log(1 + item_pop[item])
                num += 1
        if num == 0:
            return 0.0
        return round(pop / num, 6)

    # Compute, print and return all metrics as a dict
    def eval(self):
        metric = {'Precision': self.precision(),
                  'Recall': self.recall(),
                  'Coverage': self.coverage(),
                  'Diversity': self.diversity(),
                  'Popularity': self.popularity()}
        print('Metric:', metric)
        return metric

## 二. 算法实现
1. SimpleTagBased
2. TagBasedTFIDF
3. TagBasedTFIDF++（代码中对应函数 TagBasedTFIDF_Improved）

In [6]:
# 1. Recommendation based on raw tag counts
def SimpleTagBased(train, N):
    '''
    Build a top-N recommender that scores an item by summing, over the
    user's tags, (times the user used the tag) * (times the tag was put on
    the item).

    :params: train, training data, {user: {item: tags}}
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, the recommendation function
    '''
    # user_tags[user][tag]: how often the user applied the tag
    # tag_items[tag][item]: how often the tag was applied to the item
    user_tags, tag_items = {}, {}
    for user, tagged in train.items():
        tag_counts = user_tags[user] = {}
        for item, tags in tagged.items():
            for tag in tags:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1

    def GetRecommendation(user):
        # Return the N best-scoring items the user has not seen in train.
        profile = user_tags.get(user)
        if profile is None:
            return []
        already_seen = set(train[user])
        scores = {}
        for tag, weight in profile.items():
            for item, count in tag_items[tag].items():
                if item not in already_seen:
                    scores[item] = scores.get(item, 0) + weight * count
        ranked = list(scores.items())
        # Stable sort: items with equal scores keep their insertion order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation

In [9]:
# 2. Improvement 1: penalise popular tags
def TagBasedTFIDF(train, N):
    '''
    Build a top-N recommender like the plain tag-count one, but with each
    tag's contribution divided by the number of distinct users who used
    that tag.

    :params: train, training data, {user: {item: tags}}
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, the recommendation function
    '''
    # user_tags[user][tag]: how often the user applied the tag
    # tag_items[tag][item]: how often the tag was applied to the item
    user_tags, tag_items = {}, {}
    # tag_users[tag]: distinct users who used the tag (reduced to a count below)
    tag_users = {}
    for user, tagged in train.items():
        tag_counts = user_tags[user] = {}
        for item, tags in tagged.items():
            for tag in tags:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1
                tag_users.setdefault(tag, set()).add(user)
    tag_pop = {tag: len(users) for tag, users in tag_users.items()}

    def GetRecommendation(user):
        # Return the N best-scoring items the user has not seen in train.
        profile = user_tags.get(user)
        if profile is None:
            return []
        already_seen = set(train[user])
        scores = {}
        for tag, weight in profile.items():
            for item, count in tag_items[tag].items():
                if item not in already_seen:
                    scores[item] = scores.get(item, 0) + weight * count / tag_pop[tag]
        ranked = list(scores.items())
        # Stable sort: items with equal scores keep their insertion order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation

In [10]:
# 3. Improvement 2: penalise popular items as well
def TagBasedTFIDF_Improved(train, N):
    '''
    Build a top-N recommender where each tag's contribution is divided both
    by the number of distinct users who used the tag and by the number of
    users who tagged the candidate item.

    :params: train, training data, {user: {item: tags}}
    :params: N, hyper-parameter, number of items to recommend
    :return: GetRecommendation, the recommendation function
    '''
    # user_tags[user][tag]: how often the user applied the tag
    # tag_items[tag][item]: how often the tag was applied to the item
    user_tags, tag_items = {}, {}
    # tag_users[tag]: distinct users who used the tag (reduced to a count below)
    # item_pop[item]: number of users who tagged the item
    tag_users, item_pop = {}, {}
    for user, tagged in train.items():
        tag_counts = user_tags[user] = {}
        for item, tags in tagged.items():
            item_pop[item] = item_pop.get(item, 0) + 1
            for tag in tags:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1
                per_item = tag_items.setdefault(tag, {})
                per_item[item] = per_item.get(item, 0) + 1
                tag_users.setdefault(tag, set()).add(user)
    tag_pop = {tag: len(users) for tag, users in tag_users.items()}

    def GetRecommendation(user):
        # Return the N best-scoring items the user has not seen in train.
        profile = user_tags.get(user)
        if profile is None:
            return []
        already_seen = set(train[user])
        scores = {}
        for tag, weight in profile.items():
            for item, count in tag_items[tag].items():
                if item not in already_seen:
                    scores[item] = scores.get(item, 0) + weight * count / tag_pop[tag] / item_pop[item]
        ranked = list(scores.items())
        # Stable sort: items with equal scores keep their insertion order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:N]

    return GetRecommendation

## 三. 实验
1. SimpleTagBased实验
2. TagBasedTFIDF实验
3. TagBasedTFIDF++实验

M=10, N=10

In [13]:
class Experiment():
    """Runs an M-fold evaluation of one of the tag-based recommenders."""

    def __init__(self, M, N, fp='../dataset/delicious-2k/user_taggedbookmarks.dat', rt='SimpleTagBased'):
        '''
        :params: M, number of experiments (folds) to run
        :params: N, number of items in each top-N recommendation
        :params: fp, path of the data file
        :params: rt, recommender type, a key of self.alg
        '''
        self.M, self.N = M, N
        self.fp, self.rt = fp, rt
        # Maps the recommender type name to its factory function.
        self.alg = {
            'SimpleTagBased': SimpleTagBased,
            'TagBasedTFIDF': TagBasedTFIDF,
            'TagBasedTFIDF_Improved': TagBasedTFIDF_Improved,
        }

    # A single experiment on one train/test split
    @timmer
    def worker(self, train, test):
        '''
        :params: train, training data set
        :params: test, test data set
        :return: dict with the value of each metric
        '''
        build_recommender = self.alg[self.rt]
        recommend = build_recommender(train, self.N)
        return Metric(train, test, recommend).eval()

    # Run all folds, print the average and return the per-fold results
    @timmer
    def run(self):
        names = ('Precision', 'Recall', 'Coverage', 'Diversity', 'Popularity')
        # Running totals per metric, accumulated fold by fold.
        totals = dict.fromkeys(names, 0)
        per_fold = []
        dataset = Dataset(self.fp)
        for fold in range(self.M):
            train, test = dataset.splitData(self.M, fold)
            print('-------------------------Experiment {}-------------------------'.format(fold))
            result = self.worker(train, test)
            per_fold.append(result)
            for name in names:
                totals[name] = totals[name] + result[name]
        averages = {name: totals[name] / self.M for name in names}
        print('Average Result (M={}, N={}): {}'.format(self.M, self.N, averages))
        return pd.DataFrame(per_fold)

In [14]:
# 1. SimpleTagBased experiment
# M is passed as the number of experiments, N as the top-N list size.
M, N = 10, 10
exp = Experiment(M, N, rt='SimpleTagBased')
# run() prints the metrics of each experiment and their average, and
# returns a DataFrame built from the per-experiment metric dicts.
df = exp.run()
print(df)
# Summary statistics of the metrics across experiments.
print(df.describe())

-------------------------Experiment 0-------------------------
Metric: {'Precision': 0.33, 'Recall': 0.54, 'Coverage': 3.32, 'Diversity': 0.7893190039044842, 'Popularity': 2.341847}
Func worker, run time: 23.15436601638794
-------------------------Experiment 1-------------------------
Metric: {'Precision': 0.36, 'Recall': 0.59, 'Coverage': 3.38, 'Diversity': 0.789234109883235, 'Popularity': 2.326798}
Func worker, run time: 23.364434003829956
-------------------------Experiment 2-------------------------
Metric: {'Precision': 0.36, 'Recall': 0.59, 'Coverage': 3.37, 'Diversity': 0.7932654461673471, 'Popularity': 2.328691}
Func worker, run time: 23.897894620895386
-------------------------Experiment 3-------------------------
Metric: {'Precision': 0.29, 'Recall': 0.48, 'Coverage': 3.34, 'Diversity': 0.7981049263828354, 'Popularity': 2.365525}
Func worker, run time: 23.42951273918152
-------------------------Experiment 4-------------------------
Metric: {'Precision': 0.34, 'Recall': 0.56, 

In [18]:
# 2. TagBasedTFIDF experiment
# M is passed as the number of experiments, N as the top-N list size.
M, N = 10, 10
exp_improved_1 = Experiment(M, N, rt='TagBasedTFIDF')
# run() prints the metrics of each experiment and their average, and
# returns a DataFrame built from the per-experiment metric dicts.
df = exp_improved_1.run()
print(df)
# Summary statistics of the metrics across experiments.
print(df.describe())

-------------------------Experiment 0-------------------------
Metric: {'Precision': 0.38, 'Recall': 0.62, 'Coverage': 16.84, 'Diversity': 0.8817793088581598, 'Popularity': 1.32412}
Func worker, run time: 26.699300289154053
-------------------------Experiment 1-------------------------
Metric: {'Precision': 0.39, 'Recall': 0.64, 'Coverage': 16.95, 'Diversity': 0.8826770775676372, 'Popularity': 1.316937}
Func worker, run time: 26.63674783706665
-------------------------Experiment 2-------------------------
Metric: {'Precision': 0.35, 'Recall': 0.58, 'Coverage': 16.94, 'Diversity': 0.8810835954925031, 'Popularity': 1.32842}
Func worker, run time: 26.720722436904907
-------------------------Experiment 3-------------------------
Metric: {'Precision': 0.3, 'Recall': 0.5, 'Coverage': 16.99, 'Diversity': 0.8852717719698695, 'Popularity': 1.324043}
Func worker, run time: 26.951189517974854
-------------------------Experiment 4-------------------------
Metric: {'Precision': 0.39, 'Recall': 0.65

In [19]:
# 3. TagBasedTFIDF++ experiment (implemented by TagBasedTFIDF_Improved)
# M is passed as the number of experiments, N as the top-N list size.
M, N = 10, 10
exp_improved_2 = Experiment(M, N, rt='TagBasedTFIDF_Improved')
# run() prints the metrics of each experiment and their average, and
# returns a DataFrame built from the per-experiment metric dicts.
df = exp_improved_2.run()
print(df)
# Summary statistics of the metrics across experiments.
print(df.describe())

-------------------------Experiment 0-------------------------
Metric: {'Precision': 0.14, 'Recall': 0.23, 'Coverage': 19.4, 'Diversity': 0.8598754674467468, 'Popularity': 0.78619}
Func worker, run time: 33.01668310165405
-------------------------Experiment 1-------------------------
Metric: {'Precision': 0.16, 'Recall': 0.27, 'Coverage': 19.36, 'Diversity': 0.8618280737703956, 'Popularity': 0.785819}
Func worker, run time: 33.07584190368652
-------------------------Experiment 2-------------------------
Metric: {'Precision': 0.18, 'Recall': 0.3, 'Coverage': 19.48, 'Diversity': 0.8612912974003869, 'Popularity': 0.787078}
Func worker, run time: 33.327146768569946
-------------------------Experiment 3-------------------------
Metric: {'Precision': 0.15, 'Recall': 0.24, 'Coverage': 19.32, 'Diversity': 0.8633547216465478, 'Popularity': 0.785996}
Func worker, run time: 34.323554039001465
-------------------------Experiment 4-------------------------
Metric: {'Precision': 0.21, 'Recall': 0.34

## 四. 实验结果
1. SimpleTagBased实验

    Running time: 404.8816478252411
    
    Average Result (M=10, N=10): {'Precision': 0.33799999999999997, 'Recall': 0.554, 'Coverage': 3.3579999999999997, 'Diversity': 0.7914683636087276, 'Popularity': 2.3399532}
     
2. TagBasedTFIDF实验
    
    Running time: 443.55260705947876
    
    Average Result (M=10, N=10): {'Precision': 0.352, 'Recall': 0.5799999999999998, 'Coverage': 16.952, 'Diversity': 0.8829977473089358, 'Popularity': 1.3244411}
     
3. TagBasedTFIDF_Improved实验
    
    Running time: 551.4401750564575
    
    Average Result (M=10, N=10): {'Precision': 0.16299999999999998, 'Recall': 0.268, 'Coverage': 19.410999999999998, 'Diversity': 0.8611795265352565, 'Popularity': 0.7858471}