In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# 导入数据 
data_path=r'./data/ua.base'

pd_data = pd.read_csv(data_path, sep="\t", header=None, names=["user_id", "content_id", "score", "ts"])
pd_data.head()

Unnamed: 0,user_id,content_id,score,ts
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [5]:
def cirRatMatrix(pd_data,userId,contentId,score):
    """
    计算评分矩阵
    - param: 
        pd_data: 原始数据
        userId: 用户列名
        contentId: 内容列名
        score: 评分列名
    - return: 
        pd_data: 新增了两个编码列的数据表
        rating: 评分矩阵
    """

    pd_data['user_factorize_id'],_ = pd.factorize(pd_data[userId])
    pd_data['content_factorize_id'],_ = pd.factorize(pd_data[contentId])
    
    userNo = pd_data['user_factorize_id'].max() + 1
    contentNo = pd_data['content_factorize_id'].max() + 1
    
    rating = np.zeros((contentNo,userNo))
    #查看矩阵ratings_df的第一维度是多少
    for _,row in pd_data.iterrows():
        #interrows（），对原始数据进行遍历
        rating[int(row['content_factorize_id']),int(row['user_factorize_id'])] = row[score]
    return pd_data,rating

In [6]:
def cos_dist(vec1,vec2):
    """
    - param: 
        vec1: 向量1
        vec2: 向量2
    - return: 
        dist: 两个向量的余弦相似度
    """
    
    if vec1.sum()==0 or vec2.sum()==0:
        dist = 0
    else:
        dist = float(np.dot(vec1,vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2)))
    return dist



In [7]:

def reGetUserId(pd_factorize_data,user_factorize_id):
    '''
    根据 user_factorize_id 返回对应的 user_id
    '''
    user_id = pd_factorize_data[pd_factorize_data['user_factorize_id'] == user_factorize_id]["user_id"].values[0]
    return user_id

def reGetContentId(pd_factorize_data,content_factorize_id):
    '''
    根据 content_factorize_id 返回对应的 content_id
    '''
    content_id = pd_factorize_data[pd_factorize_data['content_factorize_id'] == content_factorize_id]['content_id'].values[0]
    return content_id


In [8]:

def ItemCF(pd_factorize_data,rating):
    '''
    基于物品的协同过滤
    - param pd_factorize_data: 包含user_factorize的用户评分表
    - param rating: 用户对物品的评分矩阵
    - return 
        #real_sim_item_dict:  相似物品池，key为content_id
        #real_sim_item_socre_dict:  相似物品的相似度
        user_rec_dict: 推荐池，key为user_id
        user_rec_score_dict:  与内容对应的推荐分数， rec_score = 用户对content_id的评分 * 物品的相似度
    '''
    
    start_time = time.time()
    contentNum = pd_factorize_data['content_factorize_id'].max() + 1
    # 计算物品之间的相似度
    item_sim_matrix = np.zeros((contentNum,contentNum))
    idx = 0
    while idx < len(rating):
        idy = idx + 1
        while idy < len(rating):
            item_sim_matrix[idx,idy] = cos_dist(rating[idx],rating[idy])
            idy = idy +1
        idx = idx + 1
  
    # 返回相似物品
    #sim_item_dict = {}
    #sim_item_socre_dict = {}
    real_sim_item_dict = {}
    real_sim_item_socre_dict = {}
    sim_item_max_len = 20
    for i in range(contentNum):
        temp_series = pd.DataFrame(item_sim_matrix[i],columns=['item_factorize_'+str(i)])
        sim_item_index = temp_series.sort_values(by = 'item_factorize_'+str(i),ascending = False)[0:sim_item_max_len].index.tolist()
        sim_item_score = list(temp_series.iloc[sim_item_index].values.reshape(-1))
        for j in list(range(len(sim_item_score)))[::-1] : # 从后往前遍历
            if sim_item_score[j] == 0:
                sim_item_index.pop(j)
                sim_item_score.pop(j)
        #sim_item_dict[i] = sim_item_index
        #sim_item_socre_dict[i] = sim_item_score
        real_item_index = []
        for item_factorize_id in sim_item_index:
            real_item_index.append(reGetContentId(pd_factorize_data,item_factorize_id))
        real_sim_item_dict[reGetContentId(pd_factorize_data,i)] = real_item_index
        real_sim_item_socre_dict[reGetContentId(pd_factorize_data,i)] = sim_item_score
        
    
    # 基于ItemCF的思想进行推荐
    user_rec_dict = {}
    user_rec_score_dict = {}
    userNum = pd_factorize_data['user_factorize_id'].max() + 1
    for user_factorize_id in range(userNum):
        user_data = pd_factorize_data[pd_factorize_data['user_factorize_id'] == user_factorize_id]
        user_id = reGetUserId(user_data,user_factorize_id)
        user_list = []
        user_score_list = []
        for content in user_data['content_id'].values:
            item_sim = real_sim_item_dict.get(content)
            score = user_data[user_data['content_id']==content]['score'].values[0]
            item_sim_score = real_sim_item_socre_dict.get(content)
            for i in range(len(item_sim_score)):
                rec_score = score * item_sim_score[i]
                if rec_score > 0 and item_sim[i] not in user_list:
                    user_score_list.append(rec_score)
                    user_list.append(item_sim[i])
            
        if len(user_list) >0 :
            user_rec_dict[user_id] = user_list
            user_rec_score_dict[user_id] = user_score_list

    cost_time = time.time() - start_time
    print('ItemCF: deal with %d content,total cost %0.2f seconds' %(contentNum,cost_time))
    
    return user_rec_dict, user_rec_score_dict


In [9]:
pd_factorize_data,rating_matrix = cirRatMatrix(pd_data,"user_id","content_id","score")

In [10]:
pd_factorize_data.head()

Unnamed: 0,user_id,content_id,score,ts,user_factorize_id,content_factorize_id
0,1,1,5,874965758,0,0
1,1,2,3,876893171,0,1
2,1,3,4,878542960,0,2
3,1,4,3,876893119,0,3
4,1,5,3,889751712,0,4


In [11]:
user_num = pd_factorize_data['user_factorize_id'].max() + 1
content_mum = pd_factorize_data['content_factorize_id'].max() + 1
print('用户数：'+str(user_num)+'，内容数：'+str(content_mum))

用户数：943，内容数：1680


In [12]:
user_rec_dict,user_rec_score_dict = ItemCF(pd_factorize_data, rating_matrix)
# deal with 943 users,the total cost 491.25 seconds

ItemCF: deal with 1680 content,total cost 122.12 seconds


In [14]:
user_id = 1
rec_num = 10
rec_list = user_rec_dict.get(user_id)[0:rec_num]
rec_score_list = user_rec_score_dict.get(user_id)[0:rec_num]
print('we reccommend user '+str(user_id)+':'+str(rec_list))
print('the reccommend score:'+str(rec_score_list))

we reccommend user:1:[50, 181, 121, 174, 405, 237, 222, 100, 151, 117]
the reccommend score:[3.2588890579732945, 3.1847165958556216, 3.1768543182615887, 2.9639449878935293, 2.9563400113767773, 2.9323503118894862, 2.9195874790968546, 2.9020249984099804, 2.8934404992515472, 2.891637450760491]
