In [2]:
'''
recsys(content-based).py
By: YingjiaWang 
From: HUST
Date: 2020.12.23
'''
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

minhash = True # 是否使用minhash优化
nfuncs = 10 # 映射函数数量

# 读取movies数据
movies = pd.read_csv('datasets//movies.csv')
movies.shape

(9125, 3)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies.tail()


Unnamed: 0,movieId,title,genres
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy
9124,164979,"Women of '69, Unboxed",Documentary


In [4]:
# 生成一个0开始的连续下标和movieId的双向映射
index2Id = {k:v for k, v in enumerate(movies['movieId'])}
Id2index = {v:k for k, v in index2Id.items()}
index2Id[9124]

164979

In [5]:
genres = [' '.join(movies['genres'][i].split('|')) for i in range(len(movies))]

# 如果使用minhash生成01矩阵
if minhash:
    cnt = CountVectorizer(binary=True)
    matrix = cnt.fit_transform(genres).toarray().T # 此处转置获得(features_num * movies_num)大小的矩阵，便于后续优化
    matrix = pd.DataFrame(matrix)
    matrix.columns = list(index2Id.values())

# 不使用minhash生成tf-idf矩阵
else:
    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(genres).toarray()
    matrix = pd.DataFrame(matrix)

matrix.shape

(24, 9125)

In [6]:
matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
0,0,0,0,0,0,1,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0
1,1,1,0,0,0,0,0,1,0,1,...,0,1,0,0,0,1,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
7,0,0,0,1,0,0,0,0,0,0,...,1,0,1,1,0,1,0,0,0,0
8,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [7]:
if minhash:
    # 根据随机生成的nfuncs个映射函数生成哈希签名矩阵
    features_num = len(matrix)
    movies_num = len(matrix.columns)

    sig_matrix = np.zeros((nfuncs, movies_num))

    for i in range(nfuncs):
        func = list(range(1, features_num+1))
        random.shuffle(func) # Permutation π

        k = dict(zip(func, [np.array(matrix.loc[i]) for i in range(features_num)]))
        s = set(range(movies_num)) # 记录对于每个func，feature是否找到第一个1的集合，当feature找到了则从集合中弹出

        sig_i = np.zeros(movies_num)
        for j in range(1, features_num+1):
            row = k[j]
            for r in range(movies_num):
                if row[r] and r in s:
                    s.remove(r)
                    sig_i[r] = j
            if not s:
                break

        sig_matrix[i] = sig_i # 更新签名矩阵的第i行

    sig_matrix = pd.DataFrame(sig_matrix)
    print(sig_matrix)
    
else:
    cosine_matrix = cosine_similarity(matrix)
    print(cosine_matrix.shape)

   0     1     2     3     4     5     6     7     8     9     ...  9115  \
0   7.0   7.0   6.0   6.0  22.0  10.0   6.0   7.0  14.0  14.0  ...   2.0   
1   7.0  10.0   7.0   2.0   7.0   6.0   7.0  11.0  13.0  13.0  ...   2.0   
2   2.0  12.0   3.0   3.0   3.0   7.0   3.0  16.0   7.0   7.0  ...   8.0   
3   2.0   2.0  15.0  15.0  21.0   8.0  15.0  11.0   8.0   8.0  ...  17.0   
4   2.0  14.0   8.0   8.0  11.0  12.0   8.0  14.0  20.0  12.0  ...   9.0   
5   1.0   1.0   3.0   3.0  14.0  18.0   3.0   8.0  18.0  18.0  ...  15.0   
6   1.0   1.0   3.0   3.0  23.0   5.0   3.0   1.0   5.0   5.0  ...  10.0   
7   9.0   9.0  18.0  17.0  21.0   1.0  18.0   9.0   1.0   1.0  ...  16.0   
8   9.0   9.0   4.0   2.0  18.0   3.0   4.0   9.0   8.0   3.0  ...   2.0   
9   4.0   4.0  21.0   2.0  21.0   1.0  21.0   4.0  15.0   4.0  ...   2.0   

   9116  9117  9118  9119  9120  9121  9122  9123  9124  
0   1.0  19.0  19.0   6.0   6.0   1.0  24.0  22.0  24.0  
1   3.0   2.0   2.0  14.0   2.0   8.0   1.0   7

In [8]:
# 读取训练集
train = pd.read_csv('data/train_set.csv')
train.drop('timestamp', axis=1, inplace=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99904 entries, 0 to 99903
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   99904 non-null  int64  
 1   movieId  99904 non-null  int64  
 2   rating   99904 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [9]:
# 基于内容推荐
userid = 671
K = 20

def cal_score(rated_movies, rating, method, movieid):
    '''
    计算当前用户对index为movieid的电影的打分
    @params:
        rated_movies: 当前用户评价过的movie_id(numpy.array)
        rating: 当前用户评价过的电影评分(numpy.array)
        method: 距离度量方式(cosine或jaccard)
        movieid: 电影的index，与id不同，注意区分
    '''
    # cosine
    if method == 'cosine':   
        
        distances = cosine_matrix[movieid] # 从movieid出发的距离向量
        computed_dict = {} # 计算集合
        for i in range(len(rated_movies)):#对已经评过分的电影中找与movieID相似的
            rated_movie = rated_movies[i]
            cosine = distances[Id2index[rated_movie]]
            if cosine > 1e-6:
                computed_dict[i] = cosine

        # 计算集合不为空
        if len(computed_dict.keys()):
            score = 0
            sum_v0 = 0
            for k, v in computed_dict.items():
                score += rating[k] * v
                sum_v0 += v

            return score / sum_v0

        # 计算集合为空
        else:
            return np.mean(rating)
        
    # jaccard  
    elif method == 'jaccard':
        
        computed_dict = {} # 计算集合
        for i in range(len(rated_movies)):
            rated_movie = rated_movies[i]
            sim = np.sum(sig_matrix[movieid] == sig_matrix[Id2index[rated_movie]]) / nfuncs
            if sim > 1e-6:
                computed_dict[i] = sim

        # 计算集合不为空
        if len(computed_dict.keys()):
            score = 0
            sum_v0 = 0
            for k, v in computed_dict.items():
                score += rating[k] * v
                sum_v0 += v

            return score / sum_v0

        # 计算集合为空
        else:
            return np.mean(rating)     
        
    # error
    else:
        raise Exception("Only Cosine and Jaccard are accepted.")

def recommender(mode, minhash, *args):
    '''
    基于内容的推荐系统
    @params:
        mode: 为0时进行topN推荐 为1时直接预测评分
        minhash: 为True时进行minhash优化
        *args: 对应不同mode有不同参数
    '''
    
    # 直接预测模式
    if mode == 1:
        userid, movieid = args
    
    # topK模式
    else:
        userid, K = args
    
    # 获得当前用户的数据
    data = train[train['userId']==userid]
    rated_movies = np.array(data['movieId'])
    rating = np.array(data['rating'])
    
    # 使用minhash优化
    if minhash:
        # 直接预测模式
        if mode == 1:
            return cal_score(rated_movies, rating, "jaccard", Id2index[movieid])
        
        # topK模式
        else:
            scores_dict = {}
            movies_num = len(index2Id)
            for i in range(movies_num):
                if i % 200 == 0:
                    print('%d/%d...' % (i+1, movies_num))
                scores_dict[i] = cal_score(rated_movies, rating, "jaccard", i)

            scores_list = sorted(scores_dict.items(), key=lambda d:d[1], reverse=True)
            print('As for User %d, the top %d recommendations are shown below:' % (userid, K))
            print('-----------------------------------------------------------')
            for i in range(K):
                ind, score = scores_list[i]
                print('%6d | %70s | %.4f' % (index2Id[ind], movies['title'][ind], score))
            
    # 不使用minhash优化
    else:
        # 直接预测模式
        if mode == 1:
            return cal_score(rated_movies, rating, "cosine", Id2index[movieid])

        # topK模式
        else:
            scores_dict = {}
            movies_num = len(index2Id)
            for i in range(movies_num):
                #if i % 200 == 0:
                #    print('%d/%d...' % (i+1, movies_num))
                scores_dict[i] = cal_score(rated_movies, rating, "cosine", i)

            scores_list = sorted(scores_dict.items(), key=lambda d:d[1], reverse=True)
            print('As for User %d, the top %d recommendations are shown below:' % (userid, K))
            print('-----------------------------------------------------------')
            for i in range(K):
                ind, score = scores_list[i]
                print('%6d | %70s | %.4f' % (index2Id[ind], movies['title'][ind], score))

#recommender(0, minhash, userid, K)

In [10]:
# 读取测试集
test = pd.read_csv('data/test_set.csv')
test.drop('timestamp', axis=1, inplace=True)
users, movies, ratings = test['userId'], test['movieId'], test['rating']
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   100 non-null    int64  
 1   movieId  100 non-null    int64  
 2   rating   100 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 2.5 KB


In [11]:
# 开始预测
preds = []
for i in range(len(test)):
    if i % 10 == 0:
        print('%d/%d...' % (i+1, len(test)))
    preds.append(recommender(1, minhash, users[i], movies[i]))

SSE = np.sum(np.square(preds - ratings))

SSE

1/100...
11/100...
21/100...
31/100...
41/100...
51/100...
61/100...
71/100...
81/100...
91/100...


67.16953413803502