In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
import warnings
# Install a blanket 'ignore' filter: from this point on every warning category is silenced.
warnings.filterwarnings('ignore')

In [3]:
# Load the rating table: a tab-separated file without a header row,
# so the four column names are supplied explicitly.
data_path = r'./data/ua.base'

pd_data = pd.read_csv(
    data_path,
    sep="\t",
    header=None,
    names=["user_id", "content_id", "score", "ts"],
)
pd_data.head()

Unnamed: 0,user_id,content_id,score,ts
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [4]:
def cirRatMatrix(pd_data,userId,contentId,score):
    """
    Build the content-by-user rating matrix from a long-format rating table.

    The user and content ids are integer-encoded with ``pd.factorize`` (codes
    are assigned in order of first appearance, starting at 0) and the codes are
    stored on ``pd_data`` itself as the columns ``user_factorize_id`` and
    ``content_factorize_id``.

    - param:
        pd_data: DataFrame with one row per rating; it is modified in place
                 (the two code columns are added or overwritten)
        userId: name of the user-id column
        contentId: name of the content-id column
        score: name of the rating column
    - return:
        pd_data: the same DataFrame object, now carrying the two code columns
        rating: float ndarray of shape (n_contents, n_users) where
                rating[content_code, user_code] is the score; cells without a
                rating are 0. If a (content, user) pair occurs several times,
                the last row wins. Rows whose user or content id is missing
                (factorize code -1) are left out of the matrix. An empty table
                yields a (0, 0) matrix.
    """
    user_codes, user_uniques = pd.factorize(pd_data[userId])
    content_codes, content_uniques = pd.factorize(pd_data[contentId])

    pd_data['user_factorize_id'] = user_codes
    pd_data['content_factorize_id'] = content_codes

    # Size the matrix from the number of distinct ids so an empty table gives
    # a (0, 0) matrix instead of failing on ``NaN + 1``.
    rating = np.zeros((len(content_uniques), len(user_uniques)))

    cells = pd.DataFrame({
        'content': content_codes,
        'user': user_codes,
        'score': pd_data[score].to_numpy(),
    })
    # Code -1 marks a missing id; used as an index it would address the last
    # row/column of the matrix, so those rows are dropped.
    cells = cells[(cells['content'] >= 0) & (cells['user'] >= 0)]
    # NumPy does not guarantee which value survives when one cell is assigned
    # twice in a single fancy-index assignment; keep the last row explicitly.
    cells = cells.drop_duplicates(subset=['content', 'user'], keep='last')

    rating[cells['content'].to_numpy(), cells['user'].to_numpy()] = cells['score'].to_numpy(dtype=float)
    return pd_data,rating

In [5]:
def cos_dist(vec1,vec2):
    """
    Cosine similarity of two 1-D vectors (despite the name, this is a
    similarity, not a distance).

    - param:
        vec1: first vector
        vec2: second vector
    - return:
        dist: dot(vec1, vec2) / (|vec1| * |vec2|) as a float, or 0.0 when
              either vector has zero length (the cosine is undefined there)
    """
    # Guard on the norms, not on the element sums: a vector such as [1, -1]
    # sums to 0 but is not the zero vector and has a well-defined cosine.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)

In [6]:

def reGetAuthorId(pd_factorize_data,user_factorize_id):
    '''
    Map an encoded user id back to the original user_id.

    Returns the user_id of the first row whose 'user_factorize_id' equals
    the given code; indexing fails with IndexError when no row matches.
    '''
    is_match = pd_factorize_data['user_factorize_id'] == user_factorize_id
    matching_rows = pd_factorize_data[is_match]
    return matching_rows["user_id"].values[0]

def reGetContentId(pd_factorize_data,content_factorize_id):
    '''
    Map an encoded content id back to the original content_id.

    Returns the content_id of the first row whose 'content_factorize_id'
    equals the given code; indexing fails with IndexError when no row matches.
    '''
    is_match = pd_factorize_data['content_factorize_id'] == content_factorize_id
    matching_content_ids = pd_factorize_data.loc[is_match, 'content_id']
    return matching_content_ids.values[0]


In [9]:
def _pairwise_cosine_similarity(vectors):
    """Cosine similarity between every pair of rows; zero-norm rows and the diagonal get 0."""
    vectors = np.asarray(vectors, dtype=float)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        return np.zeros((0, 0))
    norms = np.linalg.norm(vectors, axis=1)
    norm_products = np.outer(norms, norms)
    dot_products = np.dot(vectors, vectors.T)
    # Divide only where both norms are non-zero; every other cell keeps the 0
    # the output buffer was initialised with.
    similarity = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )
    # A user must never be picked as their own neighbour.
    np.fill_diagonal(similarity, 0.0)
    return similarity


def UserCF(pd_factorize_data,rating):
    '''
    User-based collaborative filtering.

    For every user the (at most 5) most similar other users are selected by
    cosine similarity of their rating vectors, and the contents those
    neighbours rated but the user has not are recommended.

    - param:
        pd_factorize_data: DataFrame with the columns 'user_id', 'content_id',
                           'score' and 'user_factorize_id'
        rating: rating matrix of shape (n_contents, n_users), indexed by the
                encoded content / user ids
    - return:
        user_rec_dict: recommendation pool, key is the original user_id, value
                       is the list of recommended content_id values
        user_rec_score_dict: same keys; value is the list of recommendation
                             scores aligned with the lists in user_rec_dict
                             (score = user similarity * neighbour's rating)

    Users with no recommendation do not appear in either dict. An empty table
    returns two empty dicts.

    Notes:
        * The similarity matrix is symmetric, so a user's neighbours may have
          a lower or a higher encoded id than the user.
        * Neighbours are visited from most to least similar. When several
          neighbours propose the same content, the entry of the most similar
          one is kept.
        * Only neighbours with a strictly positive similarity are used.
    '''
    start_time = time.time()
    if len(pd_factorize_data) == 0:
        userNum = 0
    else:
        userNum = int(pd_factorize_data['user_factorize_id'].max()) + 1

    # step1: similarity between every pair of users (rows of rating.T)
    pairwise = _pairwise_cosine_similarity(np.asarray(rating).T)
    user_sim_matrix = np.zeros((userNum, userNum))
    # Copy only the overlapping block so a rating matrix whose user dimension
    # differs from userNum cannot cause an out-of-range access.
    overlap = min(userNum, pairwise.shape[0])
    user_sim_matrix[:overlap, :overlap] = pairwise[:overlap, :overlap]

    # step2: pick the most similar users for every user
    sim_user_max_len = 5  # maximum number of similar users kept per user
    sim_user_dict = {}
    for user in range(userNum):
        similarities = user_sim_matrix[user]
        # Stable sort on the negated values: descending similarity, ties
        # resolved by the lower encoded id.
        ranked = np.argsort(-similarities, kind='stable')[:sim_user_max_len]
        sim_user_dict[user] = [int(other) for other in ranked if similarities[other] > 0]

    # step3: recommend what the similar users rated.
    # Per-user lookups are built once instead of re-scanning the table for
    # every user and every neighbour.
    first_rows = pd_factorize_data.drop_duplicates(subset='user_factorize_id')
    user_id_of = dict(zip(first_rows['user_factorize_id'], first_rows['user_id']))
    rows_of = {code: frame for code, frame in pd_factorize_data.groupby('user_factorize_id')}
    no_rows = pd_factorize_data.iloc[0:0]

    user_rec_dict = {}
    user_rec_score_dict = {}
    for user, neighbours in sim_user_dict.items():
        print('now deal with the user:'+str(user))
        # contents the user has already rated
        seen_contents = set(rows_of.get(user, no_rows)['content_id'])
        rec_list = []
        rec_score_list = []
        already_recommended = set()
        for sim_user in neighbours:
            neighbour_rows = rows_of.get(sim_user, no_rows)
            contents = neighbour_rows['content_id'].tolist()
            # recommendation score = user similarity * neighbour's rating
            rec_scores = user_sim_matrix[user, sim_user] * neighbour_rows['score'].to_numpy(dtype=float)
            for pos in np.argsort(-rec_scores, kind='stable'):
                content_id = contents[pos]
                rec_score = float(rec_scores[pos])
                if not rec_score > 0:
                    continue
                if content_id in seen_contents or content_id in already_recommended:
                    continue
                rec_list.append(content_id)
                rec_score_list.append(rec_score)
                already_recommended.add(content_id)
        if len(rec_list) > 0:
            user_id = user_id_of[user]
            user_rec_dict[user_id] = rec_list
            user_rec_score_dict[user_id] = rec_score_list
    cost_time = time.time() - start_time

    print('deal with %d users,the total cost %0.2f seconds' %(userNum,cost_time))

    return user_rec_dict , user_rec_score_dict
    

In [10]:
# Encode the ids and build the content-by-user rating matrix.
pd_factorize_data, rating_matrix = cirRatMatrix(
    pd_data,
    userId="user_id",
    contentId="content_id",
    score="score",
)

In [11]:
# Preview the first rows of the table, including the two encoded-id columns.
pd_factorize_data.head()

Unnamed: 0,user_id,content_id,score,ts,user_factorize_id,content_factorize_id
0,1,1,5,874965758,0,0
1,1,2,3,876893171,0,1
2,1,3,4,878542960,0,2
3,1,4,3,876893119,0,3
4,1,5,3,889751712,0,4


In [12]:
# The encoded ids start at 0, so the largest code plus one is the number of distinct ids.
user_num = 1 + pd_factorize_data['user_factorize_id'].max()
content_mum = 1 + pd_factorize_data['content_factorize_id'].max()  # name kept as-is for later cells
print('用户数：', user_num, '，内容数：', content_mum, sep='')

用户数：943，内容数：1680


In [21]:
# Run the user-based collaborative filtering over the whole table.
# Summary line printed by a recorded run: deal with 943 users,the total cost 491.25 seconds
user_rec_dict, user_rec_score_dict = UserCF(
    pd_factorize_data=pd_factorize_data,
    rating=rating_matrix,
)

now deal with the user:0
now deal with the user:1
now deal with the user:2
now deal with the user:3
now deal with the user:4
now deal with the user:5
now deal with the user:6
now deal with the user:7
now deal with the user:8
now deal with the user:9
now deal with the user:10
now deal with the user:11
now deal with the user:12
now deal with the user:13
now deal with the user:14
now deal with the user:15
now deal with the user:16
now deal with the user:17
now deal with the user:18
now deal with the user:19
now deal with the user:20
now deal with the user:21
now deal with the user:22
now deal with the user:23
now deal with the user:24
now deal with the user:25
now deal with the user:26
now deal with the user:27
now deal with the user:28
now deal with the user:29
now deal with the user:30
now deal with the user:31
now deal with the user:32
now deal with the user:33
now deal with the user:34
now deal with the user:35
now deal with the user:36
now deal with the user:37
now deal with the user