In [26]:
import numpy as np
import pandas as pd
import random

minhash = True # 是否使用minhash优化
nfuncs = 1000 # 映射函数数量

In [27]:
# 读取训练集
data = pd.read_csv('datasets/train_set.csv')
data.drop('timestamp', axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99904 entries, 0 to 99903
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   99904 non-null  int64  
 1   movieId  99904 non-null  int64  
 2   rating   99904 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [28]:
# 如果是minhash模式，需要额外生成一个对rating进行01化的矩阵
binary_data = data.copy(deep=True)
if minhash:
    binary_data.loc[binary_data['rating']<2.6, 'rating'] = 0
    binary_data.loc[binary_data['rating']>2.9, 'rating'] = 1

# 生成utility matrix
matrix = data.pivot_table(index=['movieId'],columns=['userId'],values='rating').reset_index(drop=True)
matrix.fillna(0, inplace=True)
matrix.index = np.array(sorted(data['movieId'].unique()))

matrix

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# 生成binary utility matrix
if minhash:
    binary_matrix = binary_data.pivot_table(index=['movieId'],columns=['userId'],values='rating').reset_index(drop=True)
    binary_matrix.fillna(0, inplace=True)
    binary_matrix.index = np.array(sorted(binary_data['movieId'].unique()))

#binary_matrix

In [30]:
# 基于用户的协同过滤推荐
userid = 671 
K = 100 
n = 20 
    
def recommender(mode, minhash, *args):
    '''
    基于用户的协同过滤的推荐系统
    @params:
        mode: 为0时进行topN推荐 为1时直接预测评分
        minhash: 为True时进行minhash优化
        *args: 对应不同mode有不同参数
    '''    
    
    # 直接预测模式
    if mode == 1:
        userid, movieid, K = args
        
    # topN模式
    else:
        userid, K, n = args
    
    # minhash优化
    if minhash:
        # 根据随机生成的nfuncs个映射函数生成哈希签名矩阵
        users_num = len(binary_matrix.columns)
        movies_num = len(binary_matrix[1])

        sig_matrix = np.zeros((nfuncs, users_num))
        for i in range(nfuncs):
            func = list(range(1, movies_num+1))
            np.random.shuffle(func) # permutation π
            shuffled_matrix = binary_matrix.reindex(func)
            s = set(range(users_num))# 记录对于每个func，user是否找到第一个1的集合，当user找到了则从集合中弹出
            
            sig_i = np.zeros(users_num)
            for j in range(movies_num):
                row = np.array(shuffled_matrix.iloc[j])
                for r in range(users_num):
                    if row[r] and r in s:
                        s.remove(r)
                        sig_i[r] = j + 1
                if not s:
                    break
                        
            sig_matrix[i] = sig_i # 更新签名矩阵的第i行 
            
            
        sig_matrix = pd.DataFrame(sig_matrix)
        sig_matrix.columns = list(range(1, users_num+1))
        #print(sig_matrix)
        # 使用jaccard系数计算用户之间的相似度
        sim_dict = {i : np.sum(sig_matrix[userid] == sig_matrix[i]) / nfuncs for i in range(1, users_num+1)}
        sorted_sim = sorted(sim_dict.items(), key=lambda d:d[1], reverse=True)
    
    # 不使用minhash优化
    else:
        # 使用pearson系数计算用户之间的相似度
        users_num = len(matrix.iloc[0])
        sim_dict = {i : matrix[userid].corr(matrix[i], method='pearson') for i in range(1, users_num+1)}
        sorted_sim = sorted(sim_dict.items(), key=lambda d:d[1], reverse=True)

    # 取K个最相似的用户
    topK_id = [sorted_sim[i][0] for i in range(K)]
    topK_matrix = matrix[topK_id]
    
    # 直接预测评分模式
    if mode == 1:
        x = topK_matrix.loc[movieid]
        pred_i = np.mean(x[x!=0])
        return pred_i
        
    # topN推荐模式
    else:
        pred_dict = {}
        for i in range(len(matrix)):
            x = topK_matrix.iloc[i]
            if len(x[x!=0]) > 15:
                pred_i = np.mean(x[x!=0]) # 去掉里面的0项
                pred_dict[i] = 0 if np.isnan(pred_i) else pred_i
            else:
                pred_dict[i] = 0 # 不考虑
        
        # 取前n个电影推荐 
        sorted_pred = sorted(pred_dict.items(), key=lambda d:d[1], reverse=True)
        pred = sorted_pred[:n]
        print('As for User %d, the top %d recommendations are shown below:' % (userid, n))
        print('-----------------------------------------------------------')
        result = 0
        for i in range(n):
            id, score = pred[i]
            print('%6d | %.4f' % (matrix.index[id], score))
    

# 进行topK推荐
recommender(0, minhash, userid, K, n)

As for User 671, the top 20 recommendations are shown below:
-----------------------------------------------------------
   858 | 4.6053
   318 | 4.5278
109487 | 4.5250
   527 | 4.5000
  1704 | 4.2941
 58559 | 4.2857
  2571 | 4.2561
   356 | 4.2500
  2858 | 4.2500
  7153 | 4.2241
  1196 | 4.2097
 79132 | 4.1786
  4973 | 4.1667
  4993 | 4.1667
   260 | 4.1486
  2959 | 4.1111
   110 | 4.0938
  4995 | 4.0938
   296 | 4.0893
  5952 | 4.0484


In [23]:
print(recommender(0,False,547,K,n))

As for User 547, the top 20 recommendations are shown below:
-----------------------------------------------------------
   858 | 4.6149
  1252 | 4.5091
   912 | 4.4769
  1221 | 4.4667
   111 | 4.4636
   296 | 4.4625
  1945 | 4.4583
  1299 | 4.4464
    50 | 4.4348
  1217 | 4.4348
   969 | 4.4219
   318 | 4.4133
   750 | 4.4091
   905 | 4.4091
   904 | 4.4074
   926 | 4.4074
   913 | 4.4070
  1224 | 4.4062
   923 | 4.3966
  1219 | 4.3953
None


In [31]:
# 读取测试集
test = pd.read_csv('datasets/test_set.csv')
test.drop('timestamp', axis=1, inplace=True)
users, movies, ratings = test['userId'], test['movieId'], test['rating']
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   100 non-null    int64  
 1   movieId  100 non-null    int64  
 2   rating   100 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 2.5 KB


In [32]:
# 开始预测
preds = []
for i in range(len(test)):
    print('%d/%d...' % (i+1, len(test)))
    preds.append(recommender(1, minhash, users[i], movies[i], K))

SSE = np.sum(np.square(preds - ratings))

SSE

1/100...


KeyboardInterrupt: 