## Recommed by Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read data

In [2]:
# read users file
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./data/ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

# reading items file
i_cols = ['movie id', 'movie title' ,'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('./data/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

# read ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('./data/ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

In [3]:
# user
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# item
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# rating
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# read the train and test data
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('./data/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('./data/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
print('ratings train set shape: {}'.format(ratings_train.shape))
print('ratings test set shape:  {}'.format(ratings_test.shape))

ratings train set shape: (90570, 4)
ratings test set shape:  (9430, 4)


### 根据 用户-用户相似度(user-based) 和 电影-电影相似度(item-based) 推荐电影，首先需要计算独立用户和电影的数量。

In [7]:
n_users = ratings['user_id'].unique().shape[0]
n_items = ratings['movie_id'].unique().shape[0]
print('n_users: {}'.format(n_users))
print('n_items: {}'.format(n_items))

n_users: 943
n_items: 1682


In [8]:
# 构建一个用户电影矩阵，该矩阵可用于计算用户与电影之间的相似性
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]
print(data_matrix.shape)

(943, 1682)


In [9]:
# 使用sklearn的pairwise_distance函数来计算余弦相似度
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')
print('similarity matrix of users: {}'.format(user_similarity.shape))
print('similarity matrix of items: {}'.format(item_similarity.shape))

similarity matrix of users: (943, 943)
similarity matrix of items: (1682, 1682)


In [10]:
def predict(ratings, similarity, type=None):
    """predict the rating of users
    Args:
        ratings: rating list of users in history.
        similarity: similarity matrix of users or items.
        type: recommeding type, chosen in "user-based" or "item-based", if type is None, use item-based method.
    """
    if type == 'user-based':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + \
            similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    else:
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [11]:
user_prediction = predict(data_matrix, user_similarity, type='user-based')

In [12]:
item_prediction = predict(data_matrix, item_similarity, type='item-based')

In [13]:
item_prediction.shape

(943, 1682)

In [14]:
print('<original rating matrix>:\n{}\n'.format(data_matrix[:10, :10]))
print('<predicted rating matrix (user-based)>:\n{}\n'.format(user_similarity[:10, :10]))
print('<predicted rating matrix (item-based)>:\n{}\n'.format(item_prediction[:10, :10]))

<original rating matrix>:
[[5. 3. 4. 3. 3. 5. 4. 1. 5. 3.]
 [4. 0. 0. 0. 0. 0. 0. 0. 0. 2.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [4. 3. 0. 0. 0. 0. 0. 0. 0. 0.]
 [4. 0. 0. 0. 0. 0. 2. 4. 4. 0.]
 [0. 0. 0. 5. 0. 0. 5. 5. 5. 4.]
 [0. 0. 0. 0. 0. 0. 3. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 4. 0. 0. 0.]
 [4. 0. 0. 4. 0. 0. 4. 0. 4. 0.]]

<predicted rating matrix (user-based)>:
[[0.         0.83306902 0.95254046 0.93564218 0.62152482 0.56976056
  0.5596332  0.68092789 0.92186163 0.62345619]
 [0.83306902 0.         0.88940868 0.82187881 0.92702104 0.75415674
  0.89267161 0.89665583 0.83895249 0.84013821]
 [0.95254046 0.88940868 0.         0.65584928 0.97875547 0.92758518
  0.93386336 0.91693997 0.93896    0.93484883]
 [0.93564218 0.82187881 0.65584928 0.         0.96819575 0.93195559
  0.90876955 0.81193969 0.89871644 0.93914077]
 [0.62152482 0.92702104 0.97875547 0.96819575 0.         0.76271353
  0.62639987 0.75107003 0.943153   0.79857299]
 [0.56976056 0.75415674 0.9

## 使用 Turicreate 库搭建简单流行的协同过滤模型

In [15]:
import turicreate as tc

  from ._conv import register_converters as _register_converters


In [16]:
train_data = tc.SFrame(ratings_train)
test_data = tc.SFrame(ratings_test)
print(train_data.shape)
print(test_data.shape)

(90570, 4)
(9430, 4)


In [17]:
popularity_model = \
    tc.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

In [18]:
# 按照流行度推荐topk个电影，每个用户推荐的都相同
popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5], k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1599   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1599   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1599   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
|    4    |   1599   |  5.0  |  1   |
|    4    |   1201   |  5.0  |  2   |
|    4    |   1189   |  5.0  |  3   |
|    4    |   1122   |  5.0  |  4   |
|    4    |   814    |  5.0  |  5   |
|    5    |   1599   |  5.0  |  1   |
|    5    |   1201   |  5.0  |  2   |
|    5    |   1189   |  5.0  |  3   |
|    5    | 

In [19]:
# training the model
item_sim_model = tc.item_similarity_recommender.create(
    train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')

In [20]:
# making recommendations 个性化推荐(item-based)
item_sim_recomm = item_sim_model.recommend(users=[1,2,3,4,5], k=5)
item_sim_recomm.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   423    | 0.9834008066708805 |  1   |
|    1    |   202    | 0.9431346880115625 |  2   |
|    1    |   655    | 0.8095529846107686 |  3   |
|    1    |   403    | 0.7722897438602593 |  4   |
|    1    |   568    | 0.7698631436770199 |  5   |
|    2    |    50    | 1.1256258487701416 |  1   |
|    2    |   181    | 1.0651773168490484 |  2   |
|    2    |   121    |  0.94162796323116  |  3   |
|    2    |    7     | 0.8380612421494263 |  4   |
|    2    |    9     | 0.831989913032605  |  5   |
|    3    |   313    | 0.6353766620159149 |  1   |
|    3    |   328    | 0.6032880300825293 |  2   |
|    3    |   315    | 0.5422587123784152 |  3   |
|    3    |   331    | 0.5355071858926252 |  4   |
|    3    |   332    | 0.5316696112806146 |  5   |
|    4    |    50    | 1.1311477082116264 |  1   |
|    4    |   288    | 1.048715