In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the movie catalogue, keeping only the id and title columns.
# The path is a raw string so its backslashes can never be interpreted as
# escape sequences ('\P', '\R' and '\M' are invalid escapes in a normal
# string literal). The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — adjust it to
# wherever movies.csv lives before running.
data_movies = pd.read_csv(
    r'E:\Practice\Recommendation System\Movie_Recommendation/movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

In [3]:
# Load the user ratings, keeping only the user id, movie id and rating.
# The path is a raw string so its backslashes can never be interpreted as
# escape sequences ('\P', '\R' and '\M' are invalid escapes in a normal
# string literal). The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — adjust it to
# wherever ratings.csv lives before running.
data_rating = pd.read_csv(
    r'E:\Practice\Recommendation System\Movie_Recommendation/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'movieId': 'int32', 'userId': 'int32', 'rating': 'float32'},
)

In [4]:
# Preview the first five movies to sanity-check the load.
data_movies.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
# Preview the first five ratings to sanity-check the load.
data_rating.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
# Attach each rating to its movie title through the shared movieId key
# (default inner join: only movies that have at least one rating survive).
data = data_movies.merge(data_rating, on='movieId')
data.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [7]:
# Discard rows without a title, then count how many ratings each title got.
movie_data = data.dropna(subset=['title'])
movie_count_data = (
    movie_data
    .groupby('title')['rating']
    .count()                      # non-null ratings per title
    .rename('totalRatingCount')
    .reset_index()                # columns: title, totalRatingCount
)
movie_count_data.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [8]:
# Display the per-title rating counts (pandas truncates the middle rows).
movie_count_data

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [9]:
# Bring the per-title rating count back onto every individual rating row.
rating_with_totalRatingCount = pd.merge(
    movie_count_data,
    data,
    how='left',
    on='title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,title,totalRatingCount,movieId,userId,rating
0,'71 (2014),1,117867,610,4.0
1,'Hellboy': The Seeds of Creation (2004),1,97757,332,4.0
2,'Round Midnight (1986),2,26564,332,3.5
3,'Round Midnight (1986),2,26564,377,3.5
4,'Salem's Lot (2004),1,27751,345,5.0


In [11]:
# Show floats with three decimals, then summarise how rating counts are
# distributed across titles (used to choose a popularity cut-off).
pd.set_option('display.float_format', '{:.3f}'.format)
print(movie_count_data['totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [12]:
# Keep only ratings of movies that received at least this many ratings.
popularity_threshold = 50
rating_popular_movie = rating_with_totalRatingCount.loc[
    rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
]

In [13]:
# Preview the ratings that passed the popularity filter.
rating_popular_movie.head(n=5)

Unnamed: 0,title,totalRatingCount,movieId,userId,rating
104,10 Things I Hate About You (1999),54,2572,12,5.0
105,10 Things I Hate About You (1999),54,2572,19,3.0
106,10 Things I Hate About You (1999),54,2572,68,4.5
107,10 Things I Hate About You (1999),54,2572,92,5.0
108,10 Things I Hate About You (1999),54,2572,104,5.0


In [15]:
# (rows, columns) of the filtered ratings table.
rating_popular_movie.shape

(41362, 5)

In [17]:
# Build the title x user rating matrix: one row per movie title, one column
# per user. pivot_table aggregates duplicate (title, user) pairs with its
# default aggregation, and missing ratings are filled with 0.
movie_features = pd.pivot_table(
    rating_popular_movie,
    index='title',
    columns='userId',
    values='rating',
).fillna(0)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [18]:
from scipy.sparse import csr_matrix


In [19]:
# Store the mostly-zero rating matrix in compressed sparse row form.
movie_features_df_matrix = csr_matrix(movie_features.to_numpy())

In [20]:
from sklearn.neighbors import NearestNeighbors

In [21]:
# Brute-force nearest-neighbour search over movie rows using cosine distance.
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [23]:
# (number of movie titles, number of users) in the pivoted rating matrix.
movie_features.shape

(450, 606)

In [26]:
# Pick a random movie row and look up its 6 nearest rows in the fitted model.
query_index = np.random.choice(len(movie_features))
print(query_index)
distances, indices = model_knn.kneighbors(
    movie_features.iloc[[query_index]].to_numpy(),  # single row, shape (1, n_users)
    n_neighbors=6,
)

115


In [31]:
# Print the query movie followed by a numbered list of its nearest neighbours.
# Flatten once instead of on every loop iteration.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

print('Recommendations for {0}:\n'.format(movie_features.index[query_index]))

# The query movie is normally returned as its own nearest neighbour, but when
# other movies have an identical rating vector it is not guaranteed to be in
# position 0. Filter it out by row index rather than by position so it is
# never listed as a recommendation for itself, and keep the same number of
# recommendations as before (all neighbours but one).
recommendations = [
    (position, distance)
    for position, distance in zip(neighbor_positions, neighbor_distances)
    if position != query_index
][:len(neighbor_positions) - 1]

for rank, (position, distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features.index[position], distance))

Recommendations for Dark Knight Rises, The (2012):

1: Dark Knight, The (2008), with distance of 0.33394569158554077:
2: Inception (2010), with distance of 0.3824954032897949:
3: Avengers, The (2012), with distance of 0.3876444101333618:
4: Interstellar (2014), with distance of 0.4171207547187805:
5: Django Unchained (2012), with distance of 0.4194602370262146:
