In [6]:
# 使用TagBased算法对用户进行Top-N书签推荐
# 数据源地址：https://grouplens.org/datasets/hetrec-2011/
# 数据格式：userID bookmarkID tagID timestamp

In [7]:
import random
import operator
import math

In [29]:
class TagBased:
    """Tag-based Top-N bookmark recommender with a built-in evaluation run.

    Constructing an instance executes the whole pipeline in order: load the
    tagging file, split it into training and test sets, build co-occurrence
    statistics from the training set, and print precision/recall for several
    values of N.
    """

    def __init__(self,filename):
        """Run load -> split -> stats -> evaluation on ``filename``.

        Args:
            filename: path to a tab-separated file whose first line is a
                header and whose rows are ``userID bookmarkID tagID timestamp``.
        """
        self.filename = filename
        self.storeData()
        self.splitData(0.2)
        self.stats()
        self.testRecommend()

    def storeData(self):
        """Read the tagging file into ``self.records``.

        Resulting layout::

            records = {userID: {bookmarkID: [tagID, ...]}}

        Raises:
            ValueError: if a data row does not have exactly four
                tab-separated fields or an ID is not an integer.
        """
        self.records = {}
        line_number = 0
        # `with` guarantees the handle is closed even when a row fails to parse.
        with open(self.filename) as file:
            for line in file:
                line_number += 1
                if line_number == 1:  # first line is the header, skip it
                    continue
                userId, bookmarkId, tagId, _timestamp = line.split('\t')
                userId = int(userId)
                bookmarkId = int(bookmarkId)
                tagId = int(tagId)
                self.records.setdefault(userId, {}).setdefault(bookmarkId, []).append(tagId)
        # line_number counts every line read, header included.
        print("数据集大小为:%d" % line_number)
        print("共有%d人进行了标记" % len(self.records))
        print("数据集加载完成")

    def splitData(self,ratio,seed=100):
        """Split ``self.records`` into ``self.train_record`` / ``self.test_record``.

        Each (user, bookmark) pair goes to the test set with probability
        ``ratio`` and to the training set otherwise; all tags of a pair stay
        together.

        Args:
            ratio: probability of assigning a pair to the test set.
            seed: seed for the module-level ``random`` generator, so the
                split is reproducible.
        """
        random.seed(seed)
        self.train_record = {}
        self.test_record = {}
        for userId, bookmarks in self.records.items():
            for bookmarkId, tagIds in bookmarks.items():
                # Exactly one random draw per (user, bookmark) pair.
                if random.random() < ratio:
                    target = self.test_record
                else:
                    target = self.train_record
                # Copy the tag list so the split does not alias self.records.
                target.setdefault(userId, {})[bookmarkId] = list(tagIds)
        # Both counts are numbers of users present in each split.
        print("训练集样本数%d,测试集样本数%d" % (len(self.train_record),len(self.test_record)))

    def addToDict(self,Dict,index,content,value=1):
        """Add ``value`` to the nested counter ``Dict[index][content]``.

        Missing outer and inner keys are created on demand; ``Dict`` is
        modified in place.
        """
        bucket = Dict.setdefault(index, {})
        if content in bucket:
            bucket[content] += value
        else:
            bucket[content] = value

    def stats(self):
        """Build the four co-occurrence tables from the training set.

        Every (user, bookmark, tag) triple adds 1 to each of:

        * ``user_tags[user][tag]``  - tags the user has applied
        * ``user_books[user][book]`` - bookmarks the user has tagged
        * ``tag_books[tag][book]``  - bookmarks carrying the tag
        * ``tag_users[tag][user]``  - users who have applied the tag
        """
        self.user_tags = {}
        self.user_books = {}
        self.tag_books = {}
        self.tag_users = {}
        for userId, bookmarks in self.train_record.items():
            for bookmarkId, tagIds in bookmarks.items():
                for tagId in tagIds:
                    self.addToDict(self.user_tags,userId,tagId,1)
                    self.addToDict(self.user_books,userId,bookmarkId,1)
                    self.addToDict(self.tag_books,tagId,bookmarkId,1)
                    self.addToDict(self.tag_users,tagId,userId,1)
        print("数据统计完成")
        print("user_tags大小为%d,user_books大小为%d,tag_books大小为%d,tag_users大小为%d" % (len(self.user_tags),len(self.user_books),len(self.tag_books),len(self.tag_users)))

    def simpleTagBased(self,userId,N):
        """Top-N recommendation with the SimpleTagBased score.

        A bookmark's score is the sum, over the user's tags, of
        (times the user applied the tag) * (times the bookmark received
        the tag). Bookmarks the user already tagged are excluded.

        Args:
            userId: a user present in the training statistics.
            N: maximum number of recommendations to return.

        Returns:
            Up to N ``(bookmarkID, score)`` tuples, highest score first.

        Raises:
            KeyError: if ``userId`` has no training statistics.
        """
        recommend_books = {}
        tagged_books = self.user_books[userId]  # bookmarks the user already tagged
        for tagId,tagTimes in self.user_tags[userId].items():
            for bookId,bookTimes in self.tag_books[tagId].items():
                if bookId in tagged_books:  # never recommend what is already tagged
                    continue
                recommend_books[bookId] = recommend_books.get(bookId, 0) + tagTimes*bookTimes
        return sorted(recommend_books.items(),key=operator.itemgetter(1),reverse=True)[0:N]

    def normTagBased(self,userId,N):
        """Top-N recommendation with the normalised NormTagBased score.

        Like ``simpleTagBased`` but the user's tag count is divided by the
        number of distinct tags the user applied, and the bookmark's tag
        count by the number of distinct bookmarks carrying the tag.

        Args:
            userId: a user present in the training statistics.
            N: maximum number of recommendations to return.

        Returns:
            Up to N ``(bookmarkID, score)`` tuples, highest score first.

        Raises:
            KeyError: if ``userId`` has no training statistics.
        """
        recommend_books = {}
        tagged_books = self.user_books[userId]  # bookmarks the user already tagged
        tags_number = len(self.user_tags[userId])
        for tagId,tagTimes in self.user_tags[userId].items():
            books_number = len(self.tag_books[tagId])
            for bookId,bookTimes in self.tag_books[tagId].items():
                if bookId in tagged_books:  # never recommend what is already tagged
                    continue
                score = (tagTimes/tags_number)*(bookTimes/books_number)
                recommend_books[bookId] = recommend_books.get(bookId, 0) + score
        return sorted(recommend_books.items(),key=operator.itemgetter(1),reverse=True)[0:N]

    def tagBased_TFIDF(self,userId,N):
        """Top-N recommendation with the TagBased-TFIDF score.

        Popular tags would otherwise dominate the score and hide a user's
        individual interests, so, borrowing the TF-IDF idea, each tag's
        contribution is divided by ``log(1 + number of distinct users who
        applied that tag)``.

        Args:
            userId: a user present in the training statistics.
            N: maximum number of recommendations to return.

        Returns:
            Up to N ``(bookmarkID, score)`` tuples, highest score first.

        Raises:
            KeyError: if ``userId`` has no training statistics.
        """
        recommend_books = {}
        tagged_books = self.user_books[userId]  # bookmarks the user already tagged
        for tagId,tagTimes in self.user_tags[userId].items():
            # tag_users is keyed by tag ID; the penalty must use this tag's
            # user count, not an entry looked up by the user's own ID.
            users_number = len(self.tag_users[tagId])
            # users_number >= 1 for any tag in the stats, so the log is > 0.
            penalty = math.log(1+users_number)
            for bookId,bookTimes in self.tag_books[tagId].items():
                if bookId in tagged_books:  # never recommend what is already tagged
                    continue
                recommend_books[bookId] = recommend_books.get(bookId, 0) + tagTimes*bookTimes/penalty
        return sorted(recommend_books.items(),key=operator.itemgetter(1),reverse=True)[0:N]

    def precisionAndRecall(self,N,recommend=None):
        """Evaluate a recommender on the test set.

        Only users that appear in both the test and the training set are
        evaluated. The precision denominator grows by N per evaluated user,
        even if fewer than N bookmarks were actually recommended.

        Args:
            N: length of the recommendation list requested per user.
            recommend: optional callable ``(userId, N) -> [(bookmarkID, score)]``
                such as ``self.simpleTagBased`` or ``self.tagBased_TFIDF``;
                defaults to ``self.normTagBased``.

        Returns:
            ``(precision, recall)`` as floats in [0, 1]; each is 0.0 when its
            denominator is zero (e.g. no evaluable users).
        """
        if recommend is None:
            recommend = self.normTagBased
        hit = 0
        h_recall = 0
        h_precision = 0
        for userId,book_tags in self.test_record.items():
            if userId not in self.train_record:
                continue
            rank = recommend(userId,N)
            for bookId,_score in rank:
                if bookId in book_tags:
                    hit+=1
            h_recall+=len(book_tags)
            h_precision+=N
        print('一共命中 %d 个, 一共推荐 %d 个, 用户设置tag总数 %d 个' %(hit, h_precision, h_recall))
        # Guard against empty evaluations instead of raising ZeroDivisionError.
        precision = hit/float(h_precision) if h_precision else 0.0
        recall = hit/float(h_recall) if h_recall else 0.0
        return precision, recall

    def testRecommend(self):
        """Print a precision/recall table for a fixed set of N values."""
        print("推荐结果评估")
        print("%3s %10s %10s" % ('N',"精确率",'召回率'))
        for n in [5,10,20,40,60,80,100]:
            precision,recall=self.precisionAndRecall(n)
            print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))

In [30]:
# Entry point: constructing TagBased runs the full pipeline
# (load, split, stats, evaluation) and prints the results.
if __name__ == "__main__":
    stb = TagBased(
        "user_taggedbookmarks-timestamps.dat"
    )

数据集大小为:437594
共有1867人进行了标记
数据集加载完成
训练集样本数1860,测试集样本数1793
数据统计完成
user_tags大小为1860,user_books大小为1860,tag_books大小为36884,tag_users大小为36884
推荐结果评估
  N        精确率        召回率
一共命中 72 个, 一共推荐 8930 个, 用户设置tag总数 20861 个
  5      0.806%      0.345%
一共命中 103 个, 一共推荐 17860 个, 用户设置tag总数 20861 个
 10      0.577%      0.494%
一共命中 153 个, 一共推荐 35720 个, 用户设置tag总数 20861 个
 20      0.428%      0.733%
一共命中 214 个, 一共推荐 71440 个, 用户设置tag总数 20861 个
 40      0.300%      1.026%
一共命中 278 个, 一共推荐 107160 个, 用户设置tag总数 20861 个
 60      0.259%      1.333%
一共命中 338 个, 一共推荐 142880 个, 用户设置tag总数 20861 个
 80      0.237%      1.620%
一共命中 397 个, 一共推荐 178600 个, 用户设置tag总数 20861 个
100      0.222%      1.903%
