In [18]:
import random
import math
import json
import os

In [25]:
class UserCFRec(object):
    
    def __init__(self,data_file):
        """初始化 data_file"""
        self.data_file = data_file
        self.data = self.load_data()
        self.train,self.test = self.train_test_split()
    
    def load_data(self):
        print("加载数据...")
        data = []
        for line in open(self.data_file):
            user_id,item_id,record,_ = line.split("::")
            data.append((user_id,item_id,int(record)))
        return data
    
    def train_test_split(self,k=2,seed=1,M=8):
        """
        @param:
            k
        return
            train : 用户物品交互矩阵 user-item : record
        """
        print("Train-Test-Spliting")
        train = {}
        test = {}
        random.seed(seed)
        for user,item,record in self.data:
            if random.randint(0,M)==k:
                test.setdefault(user,{})
                test[user][item] = record
            else:
                train.setdefault(user,{})
                train[user][item] = record
        return train,test
    
    def user_similarity_best(self,file_path='../../data/user_sim.json'):
        """
        使用余弦相似度来计算两个user之间的相似性
        @params:
        return user_x,user_y : similarity
        """
        if os.path.exists(file_path):
            print('用户相似度从文件中进行加载')
            user_sim = json.load(open(file_path,'r'))
        else:
            print('计算用户相似度...')
            item_users = {}
            for u,items in self.train.items():
                for item in items.keys():
                    item_users.setdefault(item,set())
                    if self.train[u][item] > 0:
                        item_users[item].add(u)
            # 领用倒排表计算
            prefrence = {}
            user_item_count = {}
            for (item,users) in item_users.items():
                for user_x in users:
                    user_item_count.setdefault(user_x,0)
                    user_item_count[user_x] += 1
                    prefrence.setdefault(user_x,{})
                    for user_y in  users:
                        if user_y != user_x:
                            prefrence[user_x].setdefault(user_y,0)
                            prefrence[user_x][user_y] += 1/math.log(len(users)+1) # 修正的余弦相似度，表示对热度物品的惩罚，热度越大，N越大
            user_sim = {}
            for user_x,users_y in prefrence.items():
                # 大量使用setdefault来初始化dict
                user_sim.setdefault(user_x,{})
                for user_y in users_y:
                    user_sim[user_x][user_y] = prefrence[user_x][user_y]/math.sqrt((user_item_count[user_x]*user_item_count[user_y]))
            json.dump(user_sim,open(file_path,'w'))
        return user_sim # (u_x,u_y: value)
    def recommend(self,user,k=8,n=40):
        """
        根据用户相似度矩阵计算推荐的物品
        train(user,item:record) + user_sim(user_x,user_y : similarity)
        """
        result = {}
        exerted = self.train[user].keys()
        for (user_y,sim) in sorted(self.user_similarity_best()[user].items(),key=lambda x:x[1],reverse=True)[:k]:
            if user_y in exerted:
                continue
            for item,record in self.train[user_y].items():
                result.setdefault(item,0)
                result[item] += sim*record
        return sorted(result.items(),key=lambda x:x[1],reverse=True)[:n]
    
    def precision(self,k=8,n=10):
        """
        @params:
            k=8: 相似用户的数量
            n=10: 推荐物品的总数
        return
            precision: 精确度
        """
        hit = 0
        total = 0
        for user,items in list(self.test.items())[:10]:
            rank = self.recommend(user,k,n)
            for u,r in rank:
                if u in items.keys():
                    hit += 1
            total += n
        return hit / total
            
        

In [26]:
data_file = '../../data/ch5/ml-1m/ratings.dat'
cf = UserCFRec(data_file)
print(cf.recommend('1',n=10))
print(cf.precision())

加载数据...
Train-Test-Spliting
用户相似度从文件中进行加载
[('3114', 1.8716690139191705), ('595', 1.7306063931992168), ('588', 1.6837121995761963), ('1', 1.5444884734317776), ('2355', 1.4944802460409934), ('2081', 1.4939663969090482), ('919', 1.4114297632580142), ('1022', 1.304900632086901), ('2687', 1.2590982941676885), ('594', 1.2190239611486573)]
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
用户相似度从文件中进行加载
0.06
