计算物品相似度（基于物品在用户历史行为中的共现）
- 建立用户物品倒排表 {user: items}
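- 构建物品的共现矩阵 {item_i : {item_j : 同时喜欢物品i和物品j的用户数}}
- 计算相似度 $w_{ij}=\frac{|N(i)\bigcap N(j)|}{|N(i)|}$：同时喜欢物品i和物品j的用户数除以喜欢物品i的用户数，表示物品j与物品i的相似程度；为了惩罚热门物品，下面的代码实际使用归一化形式 $w_{ij}=\frac{|N(i)\bigcap N(j)|}{\sqrt{|N(i)||N(j)|}}$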

推荐物品的计算
- 对候选物品i，找到与物品i最相似的k个物品，记为集合 $K$
- 找到用户u过去喜欢的物品集合 $L$
- 求两者的交集 $KL = K\bigcap L$
- 计算用户u与物品i的关联度 $P_{u,i}=\sum_{j\in KL}w_{i,j}\,P_{u,j}$，其中 $P_{u,j}$ 为用户u对物品j的评分
- 推荐P高的几个物品


In [2]:
import json
import random
import math
import os
import numpy as np

In [38]:
class ItemCFRec(object):
    """Item-based collaborative-filtering recommender.

    On construction the ratings file is loaded, randomly split into a
    training and a test set, and an item-item similarity table is either
    loaded from a JSON cache or computed from the training set.

    NOTE: the train/test split is re-drawn at random on every run, so some
    users/items of the test set may be missing from the training set, and a
    similarity table loaded from the cache file may have been computed from
    a different split than the current one.
    """

    def __init__(self,data_file='../../data/ch5/ml-1m/ratings.dat',ratio=0.9):
        """Load the data, split it and prepare the similarity table.

        @params:
            data_file: path of the rating file (user history); every line is
                       expected to look like ``user::item::rating::timestamp``
            ratio: fraction of the records that goes into the training set
        @return:
        """
        self.data_file = data_file
        self.ratio = ratio
        self.data = self.load_data()
        self.train_data,self.test_data = self.train_test_split()
        self.item_similariy_calculation()

    def load_data(self):
        """Read ``self.data_file`` into a list of records.

        @params:
        @return:
            data: list of ``[user_id, item_id, rating]`` where the ids are
                  strings and the rating is an int
        """
        print('load data ...')
        data = []
        # The context manager guarantees the handle is closed, even when a
        # malformed line raises.
        with open(self.data_file,'r') as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                user_id,item_id,record,_ = line.split("::")
                data.append([user_id,item_id,int(record)])
        return data

    def train_test_split(self):
        """Randomly split ``self.data`` into training and test sets.

        @params:
        @return:
            train_data,test_data: two dicts ``{user: {item: rating}}``; the
            first ``int(len(data) * ratio)`` shuffled records form the
            training set, the remainder the test set
        """
        print('train_test_split...')
        # Shuffle indices instead of the records themselves: permuting the
        # record list would coerce every field into one big string array.
        order = np.random.permutation(len(self.data))
        cut = int(len(self.data)*self.ratio)
        train_data = {}
        test_data = {}
        for position,index in enumerate(order):
            user,item,record = self.data[index]
            target = train_data if position < cut else test_data
            target.setdefault(user,{})[item] = int(record)
        return train_data,test_data

    def item_similariy_calculation(self,sim_file='../../data/item_sim.json'):
        """Load or compute the item-item similarity table.

        If ``sim_file`` exists its content is used as is; otherwise the
        similarity is computed from ``self.train_data`` as
        ``co_count(x, y) / sqrt(count(x) * count(y) + 1e-8)`` and written to
        ``sim_file``. The result is stored in ``self.item_similariy``.

        @params:
            sim_file: path of the JSON cache file
        @return:
        """
        if os.path.exists(sim_file):
            print('load similarity from file ...')
            # NOTE: the cached table is not checked against the current
            # random split; delete the file to force a recomputation.
            with open(sim_file,'r') as handle:
                self.item_similariy = json.load(handle)
            return

        print('calculate similarity ...')
        item_item_count = {}  # {item_x: {item_y: users who liked both}}
        item_user_count = {}  # {item: number of users who liked it}
        for user,items in self.train_data.items():
            for item_x,rating_x in items.items():
                item_user_count.setdefault(item_x,0)
                if rating_x <= 0.0:
                    continue
                item_user_count[item_x] += 1
                row = item_item_count.setdefault(item_x,{})
                for item_y,rating_y in items.items():
                    # Every co-rated item gets an entry (possibly 0),
                    # including item_x itself.
                    row.setdefault(item_y,0)
                    if item_x != item_y and rating_y > 0.0:
                        row[item_y] += 1

        # Normalise the co-occurrence counts; the small epsilon keeps the
        # denominator strictly positive.
        item_similarity = {}
        for item_x,row in item_item_count.items():
            item_similarity[item_x] = {
                item_y: count / math.sqrt(item_user_count[item_x]*item_user_count[item_y] + 0.00000001)
                for item_y,count in row.items()
            }
        with open(sim_file,'w') as handle:
            json.dump(item_similarity,handle)
        self.item_similariy = item_similarity

    def recommend(self,user,k=8,n=40):
        """Recommend items to ``user`` based on item similarity.

        For every item in the user's training history the ``k`` most similar
        items are taken; those the user has not rated yet are scored with
        ``similarity * rating`` summed over the history items.

        @params:
            user: user id as used in ``self.train_data``
            k: number of most similar items considered per history item
            n: maximum number of recommendations returned (default 40)
        @return:
            recommendation: list of ``(item, score)`` sorted by descending
            score; empty for a user without training history
        """
        result = {}
        u_items = self.train_data.get(user,{})  # the user's rated items
        for u_item,rating in u_items.items():
            neighbours = sorted(self.item_similariy.get(u_item,{}).items(),key=lambda x:x[1],reverse=True)[:k]
            for sim_item,sim_val in neighbours:
                if sim_item in u_items:
                    # Never recommend something the user already rated.
                    continue
                result[sim_item] = result.get(sim_item,0) + sim_val*rating
        return sorted(result.items(),key=lambda x:x[1],reverse=True)[:n]

    def precision(self,k=8,n=10):
        """Precision of the top-``n`` recommendations on the test set.

        Every test user contributes ``n`` to the denominator, even if fewer
        than ``n`` items could be recommended.

        @params:
            k: number of similar items considered per history item
            n: number of recommendations per user
        @return:
            precision: hits / (n * number of test users); 0.0 when there is
            nothing to evaluate
        """
        total = 0
        hit = 0
        for user,items in self.test_data.items():
            rank = self.recommend(user,k,n)
            hit += sum(1 for item,_ in rank if item in items)
            total += n
        if total == 0:
            # Empty test set (or n == 0): avoid a ZeroDivisionError.
            return 0.0
        return hit/total

In [36]:
# Build the recommender with its default data file and ratio (this loads the
# ratings, splits them and loads/computes the similarity table), then print
# the recommendations for user '1' using the default k and n.
cf = ItemCFRec()
print(cf.recommend('1'))

load data ...
train_test_split...
load similarity from file ...
[('1196', 15.787825933066065), ('318', 15.343585337609266), ('593', 15.037839581541544), ('364', 14.364707630229926), ('595', 14.207670517569975), ('2571', 13.929569642419104), ('296', 11.592571666025139), ('2858', 11.476533847058619), ('2096', 10.587207854647415), ('2087', 10.143471638906124), ('596', 10.106568988686695), ('1265', 9.864673610909001), ('1210', 9.427880333479925), ('2174', 9.407809817880448), ('1240', 9.220036791153465), ('2797', 9.015252354928943), ('1968', 8.57408966308178), ('1282', 8.270270476716853), ('2080', 7.639640918540172), ('1704', 6.944132695061446), ('1302', 6.655310248077166), ('356', 6.398895596117137), ('50', 6.2819552351351176), ('1198', 6.270582709688138), ('2396', 6.098466349139613), ('3448', 5.903651122619482), ('1225', 5.700910838159634), ('1688', 5.665792099085287), ('1028', 5.618362085960438), ('1784', 5.597979141504153), ('1682', 5.480621654143118), ('1032', 5.361349741754148), ('139

In [None]:
# Rebuild the recommender (a fresh random train/test split is drawn) and
# print the precision of its recommendations on the test set, using the
# default k and n of precision().
cf = ItemCFRec()
print(cf.precision())

load data ...
train_test_split...
load similarity from file ...
