# 搭建推荐系统

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

## 1、导入数据

In [2]:
# --- Users table -----------------------------------------------------------
# Column names are supplied explicitly through `names=`; fields are '|'-separated.
# NOTE(review): 'aoccupation' looks like a typo for 'occupation'. It is kept
# unchanged because it is a column label other cells may reference.
u_cols = ['user_id', 'age', 'sex', 'aoccupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# --- Ratings table ---------------------------------------------------------
# One row per (user, movie) rating event; fields are tab-separated.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# --- Items (movies) table --------------------------------------------------
# Five descriptive columns followed by one 0/1 flag column per genre.
# NOTE(review): 'thiller' looks like a typo for 'thriller'. It is kept
# unchanged because it is a column label other cells may reference.
i_cols = [
    'movie_id', 'movies_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'action', 'adventure', 'animation', "children's",
    'comedy', 'crime', 'documentary', 'drama', 'fantasy',
    'film-noir', 'horror', 'musical', 'mystery', 'romance',
    'sci-fi', 'thiller', 'war', 'western',
]
items = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)


## 2、查看数据是否引入正确

In [3]:
# Sanity check for the users table: print its (rows, columns) shape, then
# display the first rows as the cell output.
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,aoccupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Sanity check for the ratings table: print its (rows, columns) shape, then
# display the first rows as the cell output.
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Sanity check for the items table: print its (rows, columns) shape, then
# display the first rows as the cell output.
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie_id,movies_title,release_date,video_release_date,IMDb_URL,unknown,action,adventure,animation,children's,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thiller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## 3、读入训练和测试数据

In [6]:
# The test split holds 10 ratings per user (every user has rated at least 20
# movies); all remaining rows form the training split:
# 100000 - 943 * 10 = 90570.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv(
    'ml-100k/ua.base',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings_test = pd.read_csv(
    'ml-100k/ua.test',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
# Show both shapes as the cell output.
(ratings_train.shape, ratings_test.shape)

((90570, 4), (9430, 4))

## 4、搭建协同过滤模型

In [7]:
# Similarity helpers from scikit-learn (pairwise_distances is kept available
# because other cells may rely on it).
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

# Rows/columns are addressed by (id - 1), so size the matrix from the largest
# id rather than from the number of distinct ids: the two agree when ids run
# 1..N without gaps, and the max-based size stays valid when they do not.
n_users = int(ratings.user_id.max())
n_items = int(ratings.movie_id.max())

# Dense user x movie rating matrix; 0 marks "no rating recorded".
# Filled with one vectorised indexed assignment instead of a Python-level loop
# over every rating row (assumes each (user, movie) pair occurs at most once).
# NOTE(review): this uses the full `ratings` table, not `ratings_train`; if the
# predictions are later scored against `ratings_test`, the test ratings are
# already baked into the matrix -- confirm this is intended.
data_matrix = np.zeros((n_users, n_items))
data_matrix[ratings.user_id.values - 1, ratings.movie_id.values - 1] = ratings.rating.values

# User-user cosine similarity: compares rows of the rating matrix.
user_similarity = cosine_similarity(data_matrix)
# Item-item cosine similarity: compares columns of the rating matrix.
item_similarity = cosine_similarity(data_matrix.T)

## 5、预测

In [11]:
def predict(ratings, similarity, type='user'):
    """Predict a score for every (user, item) cell by similarity-weighted averaging.

    Parameters
    ----------
    ratings : array-like, shape (n_users, n_items)
        Rating matrix.
    similarity : array-like
        Square similarity matrix: (n_users, n_users) when ``type='user'``,
        (n_items, n_items) when ``type='item'``.
    type : {'user', 'item'}, default 'user'
        'user' -- mean-centred formula: each user's mean rating plus the
        similarity-weighted average of the other users' deviations from
        their own means.
        'item' -- similarity-weighted average of the user's ratings over
        items.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        Predicted scores. Where the absolute similarities used for a cell sum
        to zero there is nothing to average, so the prediction falls back to
        the baseline (the user's mean for 'user', 0 for 'item') instead of
        the NaN that 0/0 would give.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.

    Notes
    -----
    The per-user mean is taken over every column of ``ratings``; if unrated
    entries are stored as 0 they are averaged in as well and pull the mean
    down.
    """
    if type not in ('user', 'item'):
        raise ValueError("type must be 'user' or 'item', got {!r}".format(type))

    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)
    abs_similarity = np.abs(similarity)

    if type == 'user':
        # keepdims gives an (n_users, 1) column that broadcasts across items.
        mean_user_rating = ratings.mean(axis=1, keepdims=True)
        weighted = similarity.dot(ratings - mean_user_rating)
        # Row u of `weighted` mixes neighbours with weights similarity[u, :].
        weight_totals = abs_similarity.sum(axis=1)[:, np.newaxis]
        offset = np.divide(weighted, weight_totals,
                           out=np.zeros_like(weighted),
                           where=weight_totals != 0)
        return mean_user_rating + offset

    weighted = ratings.dot(similarity)
    # Column j of `weighted` mixes items with weights similarity[:, j], so the
    # matching normaliser is the column sum (axis=0). For a symmetric matrix
    # this equals the row sum.
    weight_totals = abs_similarity.sum(axis=0)[np.newaxis, :]
    return np.divide(weighted, weight_totals,
                     out=np.zeros_like(weighted),
                     where=weight_totals != 0)

In [12]:
# Score the full user x movie matrix twice: once weighting by user-user
# similarity and once weighting by item-item similarity.
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [13]:
# Display the shape of the user-based prediction matrix as the cell output.
user_prediction.shape

(943, 1682)