In [1]:
import pandas as pd

In [2]:
file_path = './user_taggedbookmarks-timestamps.dat'

#字典类型，保存了用户打标签的记录user对item的tag,即{userid:{item1:[tag1,tag2]},{item2}...}
records = {}

# 训练集，测试集
# dict() 函数用于创建一个字典
train_data = dict()
test_data = dict()

# 用户打过的标签
user_tags = dict()
# 用户打过标签的商品
user_items = dict()
# 打上某标签的商品
tag_items = dict()
# 某标签使用过的用户
tag_user = dict()


In [3]:
# 数据加载
# 原始数据结构 userID bookmarkID tagID timestamp
def load_data():
    print("开始加载数据....")
    df = pd.read_csv(file_path,sep='\t')
    for i in range(len(df)):
        uid = df['userID'][i]
        iid = df['bookmarkID'][i]
        tag = df['tagID'][i]
        # 键不存在时，设置默认值为{}
        records.setdefault(uid,{})
        records[uid].setdefault(iid,[])
        records[uid][iid].append(tag)
    print(f"数据集大小为{len(df)}")
    print(f"设置tag的人数{len(records)}")
    print("数据加载完成\n")

In [4]:
load_data()

开始加载数据....
数据集大小为437593
设置tag的人数1867
数据加载完成



In [5]:
import random

In [16]:
# 将数据拆分为训练集和测试集
def train_test_split(ratio,seed=100):
    random.seed(seed)
    for u in records.keys():
        for i in records[u].keys():
            # ratio 比例设置为测试集
            if random.random()<ratio:
                #print(random.random())
                test_data.setdefault(u,{})
                test_data[u].setdefault(i,[])
                for t in records[u][i]:
                    test_data[u][i].append(t)
            else:
                train_data.setdefault(u,{})
                train_data[u].setdefault(i,[])
                for t in records[u][i]:
                    train_data[u][i].append(t)
    print(f"训练机集样本数为{len(train_data)},测试机集样本数为{len(test_data)}")
                    
    

In [18]:
train_test_split(0.2)

训练机集样本数为1867,测试机集样本数为1832


In [22]:
# 设置矩阵mat[index,item]=1
def addValueToMat(mat,index,item,value=1):
    if index not in mat:
        mat.setdefault(index,{})
        mat[index].setdefault(item,value)
    else:
        if item not in mat[index]:
            mat[index][item] = value
        else:
            mat[index][item] += value

In [29]:
# 使用训练集初始化 user_tag tag_items user_items
def initStat():
    records = train_data
    for u,items in records.items():
        for i,tags in items.items():
            for tag in tags:
                #print(tag)
                # 用户和tag的关系
                addValueToMat(user_tags,u,tag,1)
                # tag 和item的关系
                addValueToMat(tag_items,tag,i,1)
                # 用户和item的关系
                addValueToMat(user_items,u,i,1)
                # 标签和用户的关系
                addValueToMat(tag_user,tag,u,1)
    print("user_tag,user_items,tag_items,tag_user初始化完成！")
    print(f"user_tag的大小{len(user_tags)},user_items的大小{len(user_items)},tag_items的大小{len(tag_items)},tag_user的大小{len(tag_user)}")
            
                

In [30]:
initStat()

user_tag,user_items,tag_items,tag_user初始化完成！
user_tag的大小1867,user_items的大小1867,tag_items的大小40159,tag_user的大小40159


In [31]:
import operator

In [32]:
#对用户user推荐top-N
def recommands(user,N):
    recommands_item = dict()
    # 对item进行打分，分数为所有的（用户对某标签使用的次数wut，乘以 商品被打上相同标签的次数 wti）之和
    tagged_items = user_items[user]
    for tag,wut in user_tags[user].items():
        for item,wti in tag_items[tag].items():
            if item in tagged_items:
                continue
            if item not in recommands_item:
                recommands_item[item] = wut * wti
            else:
                recommands_item[item] += wut * wti
                
    return sorted(recommands_item.items(),key=operator.itemgetter(1),reverse=True)[0:N]


In [35]:
#使用测试集，计算精确率和召回率
def precision_recall(N):
    hit = 0
    h_recall = 0
    h_precision = 0
    for user,items in test_data.items():
        if user not in train_data:
            continue
        # 获取top-N推荐列表
        rank = recommands(user,N)
        for item,rui in rank:
            if item in items:
                hit+=1
        h_recall = h_recall + len(items)
        h_precision = h_precision + N
    print(f"一共命中{hit}个，一共推荐{h_precision}个，用户设置tag总数为：{h_recall}")
    # 返回精确率和召回率
    return (hit/(h_precision*1.0)),(hit/(h_recall*1.0))

In [36]:
# 使用测试集，对推荐结果进行评估
def test_recommand():
    print("推荐结果进行评估..")
    print('%3s %10s %10s' % ('N','精确率','召回率'))
    for n in [5,10,20,40,60,80,100]:
        precision,recall = precision_recall(n)
        print('%3d %10.3f%% %10.3f%%' % (n,precision *100,recall * 100))
        

In [37]:
test_recommand()

推荐结果进行评估..
  N        精确率        召回率


TypeError: unsupported operand type(s) for +=: 'dict' and 'int'