In [1]:
import numpy as np
import pandas as pd

In [2]:
movies_df = pd.read_csv("movies.csv",usecols=["movieId","title"],dtype={"movieId":"int32","title":"str"})

rating_df = pd.read_csv("ratings.csv", usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
df = pd.merge(movies_df,rating_df, on="movieId")

In [6]:
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [7]:
df.isnull().sum()

movieId    0
title      0
userId     0
rating     0
dtype: int64

In [8]:
df.groupby("title")["rating"].count()

title
'71 (2014)                                    1
'Hellboy': The Seeds of Creation (2004)       1
'Round Midnight (1986)                        2
'Salem's Lot (2004)                           1
'Til There Was You (1997)                     2
                                             ..
eXistenZ (1999)                              22
xXx (2002)                                   24
xXx: State of the Union (2005)                5
¡Three Amigos! (1986)                        26
À nous la liberté (Freedom for Us) (1931)     1
Name: rating, Length: 9719, dtype: int64

In [9]:
movie_rating_count = pd.DataFrame(data=df.groupby("title")["rating"].count())

In [10]:
movie_rating_count = movie_rating_count.rename(columns={"rating":"totalRatingCount"})

In [11]:
movie_rating_count.head()

Unnamed: 0_level_0,totalRatingCount
title,Unnamed: 1_level_1
'71 (2014),1
'Hellboy': The Seeds of Creation (2004),1
'Round Midnight (1986),2
'Salem's Lot (2004),1
'Til There Was You (1997),2


In [12]:
rating_with_totalRatingCount = df.merge(movie_rating_count, left_on="title", right_on="title", how="left")

In [13]:
rating_with_totalRatingCount.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [14]:
popularity_threshold = 50

rating_popular_movie = rating_with_totalRatingCount.query("totalRatingCount>=@popularity_threshold")
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [15]:
rating_popular_movie.shape

(41362, 5)

In [16]:
movie_features_df = rating_popular_movie.pivot_table(index="title", columns="userId", values="rating").fillna(0)

movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [17]:
from scipy.sparse import csr_matrix

movie_features_df_matrix = csr_matrix(movie_features_df.values)

In [18]:
from sklearn.neighbors import NearestNeighbors

model_KNN = NearestNeighbors(metric="cosine", algorithm="brute")

model_KNN.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [19]:
query_index = np.random.choice(movie_features_df.shape[0])

print(query_index)
distances, indices = model_KNN.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1,-1), n_neighbors = 6)

316


In [20]:
distances.flatten()

array([0.        , 0.36870807, 0.3971433 , 0.39811695, 0.4048547 ,
       0.40667826], dtype=float32)

In [21]:
for i in range(0,len(distances.flatten())):
    
    if i==0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))
        
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Princess Bride, The (1987):

1: Monty Python and the Holy Grail (1975), with distance of 0.368708074092865:
2: Star Wars: Episode V - The Empire Strikes Back (1980), with distance of 0.39714330434799194:
3: Ferris Bueller's Day Off (1986), with distance of 0.39811694622039795:
4: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), with distance of 0.4048547148704529:
5: Groundhog Day (1993), with distance of 0.4066782593727112:
