针对Delicious数据集，对SimpleTagBased算法进行改进（使用NormTagBased、TagBased-TFIDF算法）

In [58]:

import random
import math
import operator
import pandas as pd
import numpy as np

In [81]:
# NOTE(review): absolute, machine-specific Windows path -- it has to be edited
# before this notebook can run on any other machine.
file_path = r'E:\bi_course\L3\sat_course\课堂资料\delicious-2k\user_taggedbookmarks-timestamps.dat'
# Dict holding the tags each user put on each item, i.e. {userid: {item1: [tag1, tag2], ...}}
records = {}
# Training set and test set
train_data = dict()
test_data = dict()
# Per-user and per-tag statistics
user_tags = dict() # tags each user has applied
user_items = dict() # items each user has tagged
tag_items = dict() # items each tag has been applied to
tags_users = dict() # users who have used each tag (its only use in the visible code is commented out)

In [82]:
# Data loading
def load_data():
    """Read the tagging log at ``file_path`` into the module-level ``records``.

    The file is parsed as tab-separated text and must provide the columns
    ``userID``, ``bookmarkID`` and ``tagID``.  Every row contributes one tag,
    so afterwards ``records[userID][bookmarkID]`` is the list of tag ids that
    user attached to that bookmark, in file order.

    ``records`` is rebuilt from scratch on every call, so re-running the cell
    does not append each tag a second time.
    """
    print("开始数据加载...")
    df = pd.read_csv(file_path, sep='\t')
    # Clear only after the file was read successfully, so a failed read keeps
    # whatever was loaded before.
    records.clear()
    # Walk the three columns in lockstep; this avoids three label-based Series
    # lookups per row, which dominate the run time on a file of this size.
    for uid, iid, tag in zip(df['userID'], df['bookmarkID'], df['tagID']):
        records.setdefault(uid, {}).setdefault(iid, []).append(tag)

    print("数据集大小为 %d." % (len(df)))
    print("设置tag的人数 %d." % (len(records)))
    print("数据加载完成\n")

In [83]:
# Split the data set into a training set and a test set
def train_test_split(ratio, seed=100):
    """Randomly distribute every (user, item) entry of ``records`` over two splits.

    Args:
        ratio: probability with which a (user, item) entry goes to the test
            set; all other entries go to the training set.
        seed: seed passed to ``random.seed`` so the split is reproducible.

    Returns:
        The tuple ``(train_data, test_data)``; both are the module-level
        dicts, laid out as ``{user: {item: [tag, ...]}}``.
    """
    random.seed(seed)
    # Start from empty splits so that re-running the cell (possibly with a
    # different ratio or seed) does not merge into the previous split.
    train_data.clear()
    test_data.clear()
    for u, items in records.items():
        for i, tags in items.items():
            # One random draw per (user, item) entry decides its split.
            target = test_data if random.random() < ratio else train_data
            # Copy the tag list so the splits never alias ``records``.
            target.setdefault(u, {})[i] = list(tags)
    # This summary used to sit after the ``return`` and was never printed.
    # NOTE(review): len() counts the users present in each split, not the
    # number of (user, item) entries, although the message says "samples".
    print("训练集样本数 %d, 测试集样本数 %d" % (len(train_data),len(test_data)))
    return train_data, test_data


In [87]:
def recommend(user, N, methodName):
    """Return the Top-N items for ``user`` ranked by a tag-based score.

    For every tag the user has applied and every item carrying that tag
    (items the user already tagged are skipped), a contribution is added to
    the item's score.  With ``wut`` = how often the user used the tag and
    ``wti`` = how often the tag was put on the item, the contribution is:

    * ``'simpleTagBased'``: ``wut * wti``
    * ``'NormTagBased'``:   ``wut / (user's tag count total) * wti / (tag's item count total)``
    * ``'TFIDF_TagBased'``: ``wut * wti / log(1 + tag's item count total)``

    Args:
        user: key into the module-level ``user_tags`` / ``user_items``.
        N: maximum number of recommendations to return.
        methodName: one of the three scoring method names listed above.

    Returns:
        A list of at most ``N`` ``(item, score)`` tuples, highest score first.

    Raises:
        ValueError: if ``methodName`` is not one of the supported names
            (previously this silently produced an empty recommendation list).
        KeyError: if ``user`` has no entry in the statistics dicts.
    """
    if methodName not in ('simpleTagBased', 'NormTagBased', 'TFIDF_TagBased'):
        raise ValueError("unknown methodName: %r" % (methodName,))

    recommend_items = dict()
    tagged_items = user_items[user]
    # Total number of tag applications by this user; constant for the whole call.
    user_tag_total = sum(user_tags[user].values())

    for tag, wut in user_tags[user].items():
        # Per-tag totals are hoisted out of the item loop; the original
        # recomputed these sums once for every candidate item.
        if methodName == 'NormTagBased':
            tag_total = sum(tag_items[tag].values())
        elif methodName == 'TFIDF_TagBased':
            idf_denominator = np.log(1 + sum(tag_items[tag].values()))

        for item, wti in tag_items[tag].items():
            if item in tagged_items:
                continue
            # The arithmetic keeps the original left-to-right evaluation
            # order so the resulting scores are bit-for-bit unchanged.
            if methodName == 'simpleTagBased':
                score = wut * wti
            elif methodName == 'NormTagBased':
                score = wut / user_tag_total * wti / tag_total
            else:  # 'TFIDF_TagBased'
                score = wut * wti / idf_denominator
            recommend_items[item] = recommend_items.get(item, 0) + score

    return sorted(recommend_items.items(), key=operator.itemgetter(1), reverse=True)[0:N]

In [36]:

# Compute precision and recall on the test set
def precisionAndRecall(N, methodName):
    """Evaluate Top-N recommendations against the held-out test set.

    Only users that appear in both ``test_data`` and ``train_data`` are
    evaluated.  For each of them the Top-N list from ``recommend`` is compared
    with the items the user has in the test set.

    Args:
        N: length of the recommendation list requested per user.
        methodName: scoring method name forwarded to ``recommend``.

    Returns:
        ``(precision, recall)`` as floats in [0, 1].  Precision divides the
        hits by ``N`` per evaluated user; recall divides them by the number of
        test items of the evaluated users.  A ratio whose denominator is zero
        (no evaluable user, or ``N == 0``) is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    hit = 0
    h_recall = 0
    h_precision = 0
    for user, items in test_data.items():
        if user not in train_data:  # no training statistics for this user
            continue
        # Top-N recommendation list for this user
        rank = recommend(user, N, methodName)
        hit += sum(1 for item, _ in rank if item in items)
        h_recall += len(items)
        # The precision denominator grows by N even if fewer items came back.
        h_precision += N
    precision = hit / h_precision if h_precision else 0.0
    recall = hit / h_recall if h_recall else 0.0
    return precision, recall

In [44]:
# Evaluate the recommendation quality on the test set
def testRecommend(methodName):
    """Print one precision/recall row per Top-N cutoff for ``methodName``.

    The cutoffs are 5, 10, 20, 30, 40 and 50; both ratios returned by
    ``precisionAndRecall`` are printed as percentages with three decimals.
    """
    row_format = "%10s %10d %10.3f%% %10.3f%%"
    for top_n in (5, 10, 20, 30, 40, 50):
        precision_ratio, recall_ratio = precisionAndRecall(top_n, methodName)
        row = (methodName, top_n, precision_ratio * 100, recall_ratio * 100)
        print(row_format % row)

In [26]:
# Accumulate into a nested-dict matrix: mat[index][item] += value
def addValueToMat(mat, index, item, value=1):
    """Add ``value`` to the cell ``mat[index][item]`` of a dict-of-dicts.

    A missing row is created as an empty dict, and a missing cell is
    initialised to ``value`` itself.  ``mat`` is modified in place; nothing is
    returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value

In [97]:
# Build user_tags, tag_items and user_items from the training set
def initStat():
    """Recompute the co-occurrence counts used by ``recommend``.

    Walks every (user, item, tag) triple in ``train_data`` and fills the
    module-level dicts:

    * ``user_tags[user][tag]``  -- how many times the user applied the tag
    * ``tag_items[tag][item]``  -- how many times the tag was put on the item
    * ``user_items[user][item]`` -- how many tags the user put on the item

    The three dicts are emptied first, so calling this again (for example
    after a new split) does not double-count the previous run.
    """
    for stats in (user_tags, tag_items, user_items):
        stats.clear()

    # Iterate train_data directly; the original bound it to a local named
    # ``records`` that shadowed the module-level raw data dict.
    for u, items in train_data.items():
        for i, tags in items.items():
            for tag in tags:
                # user -> tag relation
                addValueToMat(user_tags, u, tag, 1)
                # tag -> item relation
                addValueToMat(tag_items, tag, i, 1)
                # user -> item relation
                addValueToMat(user_items, u, i, 1)

    print("user_tags, tag_items, user_items初始化完成.")
    print("user_tags大小 %d, tag_items大小 %d, user_items大小 %d" % (len(user_tags), len(tag_items), len(user_items)))

In [91]:
# Load the tagging log into the module-level `records` dict
load_data()

开始数据加载...
数据集大小为 437593.
设置tag的人数 1867.
数据加载完成



In [98]:
# Split into training and test sets (each entry goes to the test set with probability 0.2),
# then build the tag statistics from the training set
train_test_split(0.2)
initStat()

user_tags, tag_items, user_items初始化完成.
user_tags大小 1860, tag_items大小 36884, user_items大小 1860


In [45]:
# Print a header row, then precision/recall for the 'simpleTagBased' scorer at each Top-N cutoff
print("推荐结果评估")
print("%10s %10s %10s %10s" % ('TagBased Method','N',"精确率",'召回率'))

testRecommend(methodName = 'simpleTagBased')

print('done')

推荐结果评估
TagBased Method          N        精确率        召回率
simpleTagBased          5      0.829%      0.355%
simpleTagBased         10      0.633%      0.542%
simpleTagBased         20      0.512%      0.877%
simpleTagBased         30      0.429%      1.103%
simpleTagBased         40      0.381%      1.304%
simpleTagBased         50      0.345%      1.476%
done


In [47]:
# Print a header row, then precision/recall for the 'NormTagBased' scorer at each Top-N cutoff
print("推荐结果评估")
print("%10s %10s %10s %10s" % ('TagBased Method','N',"精确率",'召回率'))

testRecommend(methodName = 'NormTagBased')

print('done')

推荐结果评估
TagBased Method          N        精确率        召回率
NormTagBased          5      0.717%      0.307%
NormTagBased         10      0.526%      0.451%
NormTagBased         20      0.412%      0.705%
NormTagBased         30      0.340%      0.872%
NormTagBased         40      0.293%      1.002%
NormTagBased         50      0.269%      1.150%
done


In [99]:
# Print a header row, then precision/recall for the 'TFIDF_TagBased' scorer at each Top-N cutoff
print("推荐结果评估")
print("%10s %10s %10s %10s" % ('TagBased Method','N',"精确率",'召回率'))

testRecommend(methodName = 'TFIDF_TagBased')

print('done')

推荐结果评估
TagBased Method          N        精确率        召回率
TFIDF_TagBased          5      0.918%      0.393%
TFIDF_TagBased         10      0.722%      0.618%
TFIDF_TagBased         20      0.543%      0.930%
TFIDF_TagBased         30      0.446%      1.146%
TFIDF_TagBased         40      0.398%      1.361%
TFIDF_TagBased         50      0.357%      1.529%
done
