基于用户相似度做的协同推荐

导入MovieLens数据

In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

import random
from sklearn.model_selection import train_test_split

# Load the three MovieLens tables.  Every file is tab-separated and
# latin-1 encoded, and only the columns that are used later are read.
_CSV_OPTIONS = {'sep': '\t', 'encoding': 'latin-1'}

# Ratings: the timestamp column is not needed, so it is not loaded.
ratings = pd.read_csv(
    'ratings.csv',
    usecols=['user_id', 'movie_id', 'rating'],
    **_CSV_OPTIONS
)

# Users: id plus the demographic description columns.
users = pd.read_csv(
    'users.csv',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
    **_CSV_OPTIONS
)

# Movies: id, title and genre string.
movies = pd.read_csv(
    'movies.csv',
    usecols=['movie_id', 'title', 'genres'],
    **_CSV_OPTIONS
)

将
users[['gender', 'age_desc', 'occ_desc']].head()

  gender  age_desc              occ_desc
0      F  Under 18          K-12 student
1      M       56+         self-employed
2      M     25-34             scientist
3      M     45-49  executive/managerial
4      M     25-34                writer

提取成一个pd.Series，便于后续用TfidfVectorizer做数值化，

0             [u'F', u'Under 18', u'K-12 student']
1                 [u'M', u'56+', u'self-employed']
2                   [u'M', u'25-34', u'scientist']
3    [u'M', u'45-49', u'executive', u'managerial']
4                      [u'M', u'25-34', u'writer']

目前排除了zipcode，因为发现引入zipcode后相关度矩阵会变得非常庞大，后续再研究一下原因。

In [2]:
def f(x):
    """Flatten one user row into a plain list of labels.

    Every field except the last is kept as is; the last field is split
    on '/' so that a compound value such as 'executive/managerial'
    contributes one label per part.

    Args:
        x: Sequence of label strings; the last element must be a str.

    Returns:
        A list holding the leading fields followed by the split parts
        of the last field.
    """
    leading_fields = x[:-1]
    last_field_parts = x[-1].split('/')
    flattened = np.hstack([leading_fields, last_field_parts])
    return flattened.tolist()

# TODO: decide whether zipcode should be part of the profile; it is left
# out for now (the alternative column list would be
# gender, zipcode, age_desc, occ_desc).
profile_columns = ['gender', 'age_desc', 'occ_desc']
uu = users[profile_columns].values

# One flattened label list per user, stored as its string representation
# so that it can be passed to a text vectorizer.
user_labels = pd.Series(list(map(f, uu))).astype('str')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each user's label string into a TF-IDF vector over word unigrams
# and bigrams.
# min_df=1 keeps every term, exactly like the previous min_df=0 did, but
# it is also accepted by scikit-learn releases that validate constructor
# parameters (an integer min_df has to be >= 1 there).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_user_matrix = tf.fit_transform(user_labels)

# Rows are users, columns are the extracted n-gram features.
print(tfidf_user_matrix.shape)

(6040, 196)


由于TfidfVectorizer已经将向量标准化为长度1，所以可以直接用两个向量的点积作为cosine的值来判断向量之间的相似程度；因此可以用性能更好的linear_kernel来处理。

In [4]:
from sklearn.metrics.pairwise import linear_kernel

# Dot product of every user vector with every other user vector; entry
# [i, j] is the similarity score between user row i and user row j.
cosine_sim = linear_kernel(X=tfidf_user_matrix, Y=tfidf_user_matrix)

# Show the top-left 20 x 20 corner as a quick sanity check.
similarity_preview = cosine_sim[:20, :20]
print(similarity_preview)

[[1.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.06452105
  1.         0.        ]
 [0.         1.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         1.         0.         0.21496202 0.
  0.         0.23125072 0.19158282 0.         0.18982012 0.23125072
  0.         0.         0.2037839  0.         0.         0.
  0.         0.17331325]
 [0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.35977795 0.         0.40328017 0.         0.         0.
  0.         0.        ]
 [0.         0.         0.21496202 0.         1.         0.
  0.         0.25139085 0.20826819 0.         0.20635197 0.25139085
  0.         0.         0.22153189 0

准备索引，便于后续查找数据，

titles：根据索引得到电影名称
indices：根据电影名称得到索引
indices_movid：根据电影id得到电影的索引（电影id与索引的对应并不完全对齐）
indices_userid：根据用户id得到用户的索引

In [5]:
# Lookup tables used by the recommender.

# Row index of `movies` -> movie title.
titles = movies['title']

# Movie title -> row index of `movies`.
indices = pd.Series(data=movies.index, index=movies['title'])

# movie_id -> row index of `movies`.
indices_movid = pd.Series(data=movies.index, index=movies['movie_id'])

# user_id -> row index of `users`.
indices_userid = pd.Series(data=users.index, index=users['user_id'])

根据用户id，找出最相似的n个用户，取出他们各自打分最高的电影。
然后对这些高分电影计算相似用户给出的平均分，返回平均分最高的k部电影。

简单验证user_id为1的用户的推荐结果

In [6]:
def get_first_n(l, n=3):
    """Return up to ``n`` of the best-scoring ``(key, score)`` pairs in ``l``.

    ``l`` looks like::

        [(1, 1.0), (2, 1.0), (3, 0.8), (4, 0.75), (5, 0.6), (6, 0.4)]

    The pairs are ranked by score, highest first.  When at least ``n``
    pairs share the highest score, ``n`` of those tied pairs are picked
    at random, so that no candidate is favoured by its position.
    Otherwise the first ``n`` pairs of the ranking are returned.

    Args:
        l: Iterable of pairs whose second element is a comparable score.
        n: Maximum number of pairs to return.

    Returns:
        A list of pairs taken from ``l``; an empty list when ``l`` is empty.
    """
    ranked = sorted(l, key=lambda pair: pair[1], reverse=True)
    if not ranked:
        # Nothing to rank; indexing ranked[0] below would raise IndexError.
        return []

    # Read the score from the same position that is used for sorting.
    top_score = ranked[0][1]
    tied_at_top = [pair for pair in ranked if pair[1] == top_score]

    if len(tied_at_top) >= n:
        random.shuffle(tied_at_top)
        return tied_at_top[:n]

    return ranked[:n]

# Recommend movies from the favourites of the users whose profile vectors
# are most similar to the given user's.
def genre_recommend_by_user_similarity(user_id, n = 10, k = 10, ratings_data = None):
    """Recommend movie ids based on what similar users rated highest.

    Steps:
    1. pick the ``n`` users with the highest similarity score to ``user_id``
       (ties at the top score are broken randomly by ``get_first_n``)
    2. for each of them collect the movies that received that user's own
       maximum rating
    3. average the collected ratings per movie and return ``k`` movie ids
       chosen by ``get_first_n`` on those averages

    NOTE(review): the queried user is not filtered out of the candidate
    list, so their own favourites can take part in the result -- confirm
    that this is intended.

    Args:
        user_id: Id of the user to recommend for; must be a key of
            ``indices_userid``, otherwise the lookup raises ``KeyError``.
        n: Number of similar users to consider.
        k: Number of movie ids to return.
        ratings_data: Ratings frame with ``user_id``, ``movie_id`` and
            ``rating`` columns.  Defaults to the module-level ``ratings``;
            pass a training split to keep held-out ratings out of the
            recommendation.

    Returns:
        A list of at most ``k`` movie ids; empty when none of the similar
        users has any rating in ``ratings_data``.
    """
    if ratings_data is None:
        ratings_data = ratings

    idx = indices_userid[user_id]
    # (row position, similarity) pairs; get_first_n does the ranking.
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Map the selected row positions back to user ids.  ndarray.item()
    # replaces np.asscalar, which is no longer available in NumPy.
    similar_user_ids = [
        users.loc[users.index == position, 'user_id'].values.item()
        for position, _ in get_first_n(sim_scores, n=n)
    ]

    favourite_frames = []
    for similar_user_id in similar_user_ids:
        # Filter once per user instead of re-scanning the whole frame.
        user_ratings = ratings_data[ratings_data.user_id == similar_user_id]
        best_rating = user_ratings.rating.max()
        best_movie_ids = user_ratings.loc[user_ratings.rating == best_rating, 'movie_id']
        favourite_frames.append(
            user_ratings.loc[user_ratings.movie_id.isin(best_movie_ids.values),
                             ['user_id', 'movie_id', 'rating']])

    if not favourite_frames:
        return []

    favourites = pd.concat(favourite_frames)
    if favourites.empty:
        return []

    # Average rating of each favourite movie across the similar users.
    mean_rating = favourites.groupby('movie_id').rating.mean()

    top_movies = get_first_n(list(zip(mean_rating.index, mean_rating.values)), n=k)

    # Only the movie ids are returned, not the averages.
    return [movie_id for movie_id, _ in top_movies]

# Quick check: recommendations for user_id 1.  Ties are broken by random
# shuffling, so the printed ids can differ from run to run.
sample_recommendation = genre_recommend_by_user_similarity(user_id=1)
print(sample_recommendation)

[318, 2167, 1073, 899, 1408, 2628, 364, 3635, 1197, 2761]


将数据集按默认比例（验证集:训练集 = 1:3，即test_size=0.25）随机划分为训练集和验证集，后续用验证集来验证推荐效果。 具体见， https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [7]:
# Random train / validation split of the ratings.  test_size=0.25 is the
# value train_test_split falls back to when no size is given; it is
# spelled out here to make the 3:1 ratio visible.
ratings_train, ratings_test = train_test_split(ratings, test_size=0.25)

通过推荐列表的命中率验证效果，使用验证集来验证推荐结果，训练集用来生成推荐列表。 由于MovieLens里除了有评价过的电影还有相应的评分，为了贴合实际，认为， 只有命中且评分不低于该用户评分的80分位数（p80；有些人习惯打高分，其他人反之），才算命中

会执行很久，但可以像下列例子那样仅验证user_id为1到99的用户的推荐命中率

In [8]:
def hit_ratio_benchmark(ratings_train, ratings_test, rated_movie_limit=10, user_ids=None):
    """Measure how often recommended movies are well-rated test movies.

    For every user id:
    1. get a recommendation list from ``genre_recommend_by_user_similarity``
    2. count a hit for each recommended movie that
       a. the user rated in ``ratings_test``, and
       b. received a rating >= the 80th percentile of that user's
          ratings in ``ratings_test``

    The hit and recommendation counts are printed before returning.

    Args:
        ratings_train: Not used by this function; the recommender is
            called with the user id only.  Kept so existing calls work.
        ratings_test: Ratings frame with ``user_id``, ``movie_id`` and
            ``rating`` columns, used to validate the recommendations.
        rated_movie_limit: Not used; kept so existing calls work.
        user_ids: Iterable of user ids to evaluate.  Defaults to ids
            1..99, which keeps the run time short.

    Returns:
        hits / number of recommended movies, or 0.0 when nothing was
        recommended.
    """
    if user_ids is None:
        user_ids = np.arange(1, 100)

    hit, count = (0., 0.)

    for user_id in user_ids:
        recommend_list = genre_recommend_by_user_similarity(user_id)
        count += len(recommend_list)

        # The user's test ratings do not change per item: filter once.
        user_test = ratings_test.loc[ratings_test.user_id == user_id, ['movie_id', 'rating']]
        if user_test.empty:
            # No test ratings for this user, so nothing can be a hit.
            continue

        threshold = np.percentile(user_test.rating.values, 80)

        for item in recommend_list:
            item_ratings = user_test.loc[user_test.movie_id == item, 'rating']
            # max() gives a scalar even if a movie shows up more than once,
            # where comparing the raw array would raise.
            if not item_ratings.empty and item_ratings.max() >= threshold:
                hit += 1

    print(hit, count)
    if not count:
        # Avoid a ZeroDivisionError when no movie was recommended at all.
        return 0.0
    return hit / count
        
        
# Run the benchmark and report the hit ratio as a percentage.
hit_ratio = 100 * hit_ratio_benchmark(ratings_train, ratings_test)
print('hit ratio percentage: %.10f%%' % hit_ratio)

(19.0, 990.0)
hit ratio percentage: 1.9191919192%


NDCG原理见， http://sofasofa.io/forum_main_post.php?postid=1002561

In [9]:
def dcg_at_k(r, k, method=0):
    """Return the discounted cumulative gain (DCG) of the first k results.

    Relevance is positive real values.  Binary relevance works as well.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Discounted cumulative gain; 0.0 when there are no scores to use.

    Raises:
        ValueError: If ``method`` is neither 0 nor 1 and at least one
            score is considered.
    """
    # np.asarray(..., dtype=float) replaces np.asfarray, which is no
    # longer available in NumPy 2.0.
    scores = np.asarray(r, dtype=float)[:k]
    if not scores.size:
        return 0.

    if method == 0:
        # The first two positions are undiscounted (log2(2) == 1).
        return scores[0] + np.sum(scores[1:] / np.log2(np.arange(2, scores.size + 1)))
    if method == 1:
        # Position i (1-based) is discounted by log2(i + 1).
        return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))
    raise ValueError('method must be 0 or 1.')


def ndcg_at_k(r, k, method=0):
    """Return the normalized discounted cumulative gain (NDCG) at k.

    The DCG of ``r`` in its given order is divided by the DCG of the same
    scores sorted from highest to lowest.  Relevance is positive real
    values; binary relevance works as well.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Normalized discounted cumulative gain; 0.0 when the best
        possible ordering has a DCG of zero.
    """
    best_ordering = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(best_ordering, k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    # A zero ideal DCG would make the ratio undefined.
    return 0.

对所有验证集的用户的推荐列表，计算ndcg并计算他们的平均值，作为该推荐算法的ndcg分数
由于ndcg计算的是推荐列表的顺序的精确度，因此如果验证集中该用户没有给推荐的电影打分，就认为是打了0分。

In [10]:
def ndcg_benchmark(ratings_train, ratings_test, rated_movie_limit=10, user_ids=None):
    """Average the NDCG of the recommendation lists over a set of users.

    For every user id:
    1. get a recommendation list from ``genre_recommend_by_user_similarity``
    2. build the relevance list from the user's ratings in
       ``ratings_test``; a recommended movie the user did not rate there
       counts as 0
    3. score the list with ``ndcg_at_k`` over its full length

    Args:
        ratings_train: Not used by this function; the recommender is
            called with the user id only.  Kept so existing calls work.
        ratings_test: Ratings frame with ``user_id``, ``movie_id`` and
            ``rating`` columns, used as the relevance source.
        rated_movie_limit: Not used; kept so existing calls work.
        user_ids: Iterable of user ids to evaluate.  Defaults to ids
            1..99, which keeps the run time short.

    Returns:
        The mean NDCG over the evaluated users, or 0.0 when no user was
        evaluated.
    """
    if user_ids is None:
        user_ids = np.arange(1, 100)

    ndcg_total, user_count = (0.0, 0)

    for user_id in user_ids:
        recommend_list = genre_recommend_by_user_similarity(user_id)

        # The user's test ratings do not change per item: filter once.
        user_test = ratings_test.loc[ratings_test.user_id == user_id, ['movie_id', 'rating']]

        relevance = []
        for item in recommend_list:
            item_ratings = user_test.loc[user_test.movie_id == item, 'rating']
            # iloc[0] replaces np.asscalar, which is no longer available
            # in NumPy; unrated movies get relevance 0.
            relevance.append(0 if item_ratings.empty else item_ratings.iloc[0])

        ndcg_total += ndcg_at_k(relevance, len(relevance))
        user_count += 1

    if not user_count:
        # Avoid a ZeroDivisionError when user_ids is empty.
        return 0.0
    return ndcg_total / user_count

# Mean NDCG of the recommender over the benchmarked users.
mean_ndcg = ndcg_benchmark(ratings_train, ratings_test)
print(mean_ndcg)

0.14221969665559114


可以看到hit ratio和NDCG的值都偏低。原因包括该验证集并不是由新的推荐算法产生的，实际生产中更多会通过A/B测试的方式做验证。