In [1]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file
#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

#Reading items file:
i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

In [2]:
movies

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [52]:
type(ratings)

pandas.core.frame.DataFrame

In [4]:
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [4]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]
print(n_users,n_items)

data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]
    
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

943 1682


In [15]:
data_matrix.shape

(943, 1682)

In [5]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [11]:
user_prediction.shape

(943, 1682)

In [12]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

# 基于用户的推荐----查询每个用户的未看电影推荐列表

In [6]:
user_id = 3
topN = 30

#列出已经看过的电影
seen_movies=ratings.loc[(ratings.user_id== user_id) & (ratings.rating >=4)]
seen_movies=seen_movies.join(movies.set_index('movie_id'), on='movie_id', how='inner')
#加入action是为了在left join的时候，判断Nan
seen_movies_id = seen_movies[["movie_id","Action"]]
seen_movies=seen_movies[["movie_id","title","rating"]]
print("################### 用户已经看过的电影：########################")
print(seen_movies[:topN])

#找到推荐殷勤的预测打分
#rec_list = user_prediction[user_id-1].tolist()
rec_list = list(enumerate(user_prediction[user_id-1],start=1))
rec_movies=pd.DataFrame(rec_list, columns=['movie_id', 'rating'])

# 去除已经看过的电影
rec_movies=rec_movies.join(seen_movies_id.set_index('movie_id'), on='movie_id', how='left')
rec_movies=rec_movies[np.isnan(rec_movies["Action"])][["movie_id","rating"]]

#按评分降序，列出推荐的未看过的电影
rec_movies=rec_movies.join(movies.set_index('movie_id'), on='movie_id', how='inner')
rec_movies=rec_movies[["movie_id","title","rating"]]
rec_movies=rec_movies.sort_values('rating',ascending=False)
print("")
print("################### 系统推荐的电影：############################")
print(rec_movies[:topN])


################### 用户已经看过的电影：########################
       movie_id                                              title  rating
3840        331                                   Edge, The (1997)       4
6178        328                           Conspiracy Theory (1997)       5
9021        318                            Schindler's List (1993)       4
11227       348                          Desperate Measures (1998)       4
14381       327                                    Cop Land (1997)       4
24412       321                                      Mother (1996)       5
24716       260                               Event Horizon (1997)       4
29583       320  Paradise Lost: The Child Murders at Robin Hood...       5
33981       342                Man Who Knew Too Little, The (1997)       4
35195       329                          Desperate Measures (1998)       4
37188       181                          Return of the Jedi (1983)       4
48447       344                              

In [13]:
user_prediction.shape

(943, 1682)

In [14]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

# 基于电影的推荐

In [13]:
item_prediction.shape

(943, 1682)

In [38]:
item_prediction[1:10]

array([[0.10854432, 0.13295661, 0.12558851, ..., 0.13445801, 0.13657587,
        0.13711081],
       [0.08568497, 0.09169006, 0.08764343, ..., 0.08465892, 0.08976784,
        0.09084451],
       [0.05369279, 0.05960427, 0.05811366, ..., 0.05793681, 0.06167338,
        0.06228061],
       ...,
       [0.0906776 , 0.09324217, 0.11337969, ..., 0.13386342, 0.12840319,
        0.13013888],
       [0.04446489, 0.05055583, 0.05160457, ..., 0.05511418, 0.05545723,
        0.05553877],
       [0.36606029, 0.40207446, 0.42393615, ..., 0.4630846 , 0.45213368,
        0.45289842]])

In [14]:
user_id = 3
topN = 30

#列出已经看过的电影
seen_movies=ratings.loc[(ratings.user_id== user_id) & (ratings.rating >=4)]
seen_movies=seen_movies.join(movies.set_index('movie_id'), on='movie_id', how='inner')
#加入action是为了在left join的时候，判断Nan
seen_movies_id = seen_movies[["movie_id","Action"]]
seen_movies=seen_movies[["movie_id","title","rating"]]
print("################### 用户已经看过的电影：########################")
print(seen_movies[:topN])

#找到推荐殷勤的预测打分
#rec_list = user_prediction[user_id-1].tolist()
rec_list = list(enumerate(item_prediction[user_id-1],start=1))
rec_movies=pd.DataFrame(rec_list, columns=['movie_id', 'rating'])

# 去除已经看过的电影
rec_movies=rec_movies.join(seen_movies_id.set_index('movie_id'), on='movie_id', how='left')
rec_movies=rec_movies[np.isnan(rec_movies["Action"])][["movie_id","rating"]]

#按评分降序，列出推荐的未看过的电影
rec_movies=rec_movies.join(movies.set_index('movie_id'), on='movie_id', how='inner')
rec_movies=rec_movies[["movie_id","title","rating"]]
rec_movies=rec_movies.sort_values('rating',ascending=False)
print("")
print("################### 系统推荐的电影：############################")
print(rec_movies[:topN])

################### 用户已经看过的电影：########################
       movie_id                                              title  rating
3840        331                                   Edge, The (1997)       4
6178        328                           Conspiracy Theory (1997)       5
9021        318                            Schindler's List (1993)       4
11227       348                          Desperate Measures (1998)       4
14381       327                                    Cop Land (1997)       4
24412       321                                      Mother (1996)       5
24716       260                               Event Horizon (1997)       4
29583       320  Paradise Lost: The Child Murders at Robin Hood...       5
33981       342                Man Who Knew Too Little, The (1997)       4
35195       329                          Desperate Measures (1998)       4
37188       181                          Return of the Jedi (1983)       4
48447       344                              

In [36]:
movie_id_list=[331,318,1554,1209]
movie_id_list=pd.DataFrame(movie_id_list,columns=["movie_id"])

movies.join(movie_id_list.set_index('movie_id'), on='movie_id', how='inner')

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
317,318,Schindler's List (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Schindler's%2...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
330,331,"Edge, The (1997)",26-Sep-1997,,http://us.imdb.com/M/title-exact?Edge%2C+The+(...,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1208,1209,Mixed Nuts (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Mixed%20Nuts%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1553,1554,Safe Passage (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Safe%20Passag...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
