In [20]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

## Load data to pandas

In [21]:
movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')
rating = pd.read_table("ratings.dat", sep="::",
                           names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
user=pd.read_table('users.dat',sep='::',names=['user_id','gender','age','occupation','zip-code'],engine='python')
##sep:Delimiter to use, names:List of column names to use

In [22]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
user_id      1000209 non-null int64
movie_id     1000209 non-null int64
rating       1000209 non-null int64
timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [23]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [19]:
user.head()

Unnamed: 0,user_id,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [26]:
rating.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


## Convert rating records to user-movie utility matrix

use pandas.pivot_table

In [28]:
utility = pd.pivot_table(data=rating, 
                            values='rating', 
                            index='user_id', 
                            columns='movie_id', 
                            fill_value=0)

In [29]:
utility.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
utility.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6040 entries, 1 to 6040
Columns: 3706 entries, 1 to 3952
dtypes: int64(3706)
memory usage: 170.8 MB


we find that there are lots of zeros in the matrix so we convert to sparse matrix using scipy.sparse.lil_matrix

In [32]:
highest_user_id = rating.user_id.max()
highest_movie_id = rating.movie_id.max()
ratings_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))
ratings_mat

<6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [35]:
for _, row in rating.iterrows():
    # subtract 1 from id's due to match 0 indexing
    ratings_mat[row.user_id-1, row.movie_id-1] = row.rating

In [36]:
ratings_mat

<6040x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in LInked List format>

## Calculate item-item similarity matrix

In [37]:
utility_mat=ratings_mat

In [45]:
utility_mat.T

<3952x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 1000209 stored elements in LInked List format>

In [38]:
# Item-Item Similarity Matrix
item_sim_mat = cosine_similarity(utility_mat.T)


In [39]:
item_sim_mat

array([[1.        , 0.39034871, 0.26794263, ..., 0.09347942, 0.04282933,
        0.18269056],
       [0.39034871, 1.        , 0.24094645, ..., 0.08701306, 0.02606255,
        0.12218461],
       [0.26794263, 0.24094645, 1.        , ..., 0.0622576 , 0.01007255,
        0.097786  ],
       ...,
       [0.09347942, 0.08701306, 0.0622576 , ..., 1.        , 0.20280851,
        0.2346385 ],
       [0.04282933, 0.02606255, 0.01007255, ..., 0.20280851, 1.        ,
        0.19297221],
       [0.18269056, 0.12218461, 0.097786  , ..., 0.2346385 , 0.19297221,
        1.        ]])

## Calculate neighborhood

In [46]:
# sort,result from min to max
least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)

# Neighborhoods
neighborhood_size = 75
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [42]:
neighborhoods.shape

(3952, 75)

In [43]:
neighborhoods

array([[ 589,  149,  550, ..., 1264, 3113,    0],
       [1875, 1195, 2640, ...,  652, 3488,    1],
       [1887,  376,  316, ...,  586, 3449,    2],
       ...,
       [2319, 1457,  447, ..., 2768, 3948, 3949],
       [ 578, 3311,  643, ..., 3191, 3914, 3950],
       [3622, 3565, 3949, ..., 3892, 3896, 3951]], dtype=int64)

## Make rating prediction on a user


In [51]:
# Let's pick a lucky user
user_id = 100
items_rated_by_this_user=ratings_mat[user_id].nonzero()[1]
items_rated_by_this_user

array([  20,   70,  144,  164,  207,  259,  291,  328,  359,  367,  375,
        376,  379,  419,  465,  473,  479,  484,  545,  551,  588,  647,
        735,  779,  848, 1010, 1035, 1195, 1197, 1199, 1209, 1213, 1239,
       1274, 1286, 1319, 1355, 1370, 1371, 1372, 1373, 1374, 1375, 1384,
       1437, 1543, 1551, 1561, 1579, 1586, 1675, 1721, 1800, 1834, 1875,
       1953, 1999, 2000, 2005, 2093, 2272, 2352, 2401, 2403, 2404, 2405,
       2408, 2410, 2411, 2420, 2523, 2570, 2615, 2616, 2627, 2639, 2641,
       2698, 2722, 2734, 2762, 2793, 2825, 2871, 2915, 2946, 2947, 2948,
       2990, 3081, 3207, 3258, 3267, 3396, 3417, 3437, 3439, 3526, 3554,
       3696, 3754, 3792, 3840, 3935, 3945, 3947], dtype=int32)

In [64]:
n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]

start_time = time()
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    out[item_to_rate] = ratings_mat[user_id, relevant_items] * \
        item_sim_mat[item_to_rate, relevant_items] / \
        item_sim_mat[item_to_rate, relevant_items].sum()


pred_ratings = np.nan_to_num(out)
print(pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))

  # This is added back by InteractiveShellApp.init_path()


[4.82368197 4.88525636 5.         ... 5.         0.         4.74581838]
Execution time: 1.171328 seconds


In [65]:
pred_ratings.shape

(3952,)

## Get final recommendations for a user¶

In [67]:
# Recommend n movies
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))

# Find items that have been rated by user
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[-n:]


[867, 1877, 2161, 1842, 2804, 901, 101, 2097, 1712, 1222]