In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies_cleaned = movies.drop(columns = ['genres'])
ratings_cleaned = ratings.drop(columns = ['timestamp'])

In [6]:
movies_cleaned

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [7]:
ratings_cleaned

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [8]:
merged_data = movies_cleaned.merge(ratings_cleaned,on = 'movieId')

In [9]:
merged_data.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [10]:
no_of_rating =  merged_data.groupby('title')['rating'].count()
dataset_df = pd.DataFrame(no_of_rating)
dataset_df.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'71 (2014),1
'Hellboy': The Seeds of Creation (2004),1
'Round Midnight (1986),2
'Salem's Lot (2004),1
'Til There Was You (1997),2


In [11]:
# As both will have ratings so rename it
dataset_df.rename(columns={'rating':'no_of_rating'},inplace = True)

In [12]:
updated_df = merged_data.merge(dataset_df,on ='title')
updated_df.head()

Unnamed: 0,movieId,title,userId,rating,no_of_rating
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [13]:
updated_df.sort_values('no_of_rating',ascending = False).head()
# So the highest votes given to a movie was 329 only

Unnamed: 0,movieId,title,userId,rating,no_of_rating
10097,356,Forrest Gump (1994),141,4.0,329
10225,356,Forrest Gump (1994),387,4.0,329
10243,356,Forrest Gump (1994),426,5.0,329
10242,356,Forrest Gump (1994),425,5.0,329
10241,356,Forrest Gump (1994),423,5.0,329


In [14]:
# So, let us take a thresold value of no_of_rating
thresold_value = 50
thresold_based_df = updated_df.query('no_of_rating >= @thresold_value')
thresold_based_df.head()
# So, only those movies are considered whose no_of_rating > 50.

Unnamed: 0,movieId,title,userId,rating,no_of_rating
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [15]:
thresold_based_df.shape

(41362, 5)

In [16]:
movie_pt = thresold_based_df.pivot_table(index = 'title',columns='userId',values='rating').fillna(0)
movie_pt.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [19]:
# Converting Data to Array Matrix
from scipy.sparse import csr_matrix
movie_pt_matrix = csr_matrix(movie_pt.values)
movie_pt_matrix

<450x606 sparse matrix of type '<class 'numpy.float64'>'
	with 41360 stored elements in Compressed Sparse Row format>

In [20]:
# The Nearest Neighbors algorithm used here is a Unsupervised Algo, it it not a Supervised Classifer
from sklearn.neighbors import NearestNeighbors
knn_model = NearestNeighbors(metric='cosine',algorithm='brute')
knn_model.fit(movie_pt_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [21]:
movie_pt_matrix.shape

(450, 606)

In [40]:
fetch_index = np.random.choice(movie_pt.shape[0])
fetch_index

384

In [41]:
distance, indices = knn_model.kneighbors(movie_pt.iloc[fetch_index,:].values.reshape(1,-1),n_neighbors = 10)
# use kneighbors to get nearest movie near the fetch index                   
# You must reshape it to get the string value.
# Choose how many nearest movies you want to get by defining it in n_neighbors

In [45]:
for i in range(0,len(distance.flatten())):
    if i == 0:
        print("Recommendation for {}".format(movie_pt.index[fetch_index]))
    else:
        print('{index}:{movie}, with distance {distance}'.format(index = i,movie = movie_pt.index[indices.flatten()[i]],
                                                                distance = distance.flatten()[i]))

Recommendation for Star Wars: Episode IV - A New Hope (1977)
1:Star Wars: Episode V - The Empire Strikes Back (1980), with distance 0.16759264477662794
2:Star Wars: Episode VI - Return of the Jedi (1983), with distance 0.20936143443836897
3:Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), with distance 0.291165883452314
4:Matrix, The (1999), with distance 0.3365532537559339
5:Indiana Jones and the Last Crusade (1989), with distance 0.3582813138541151
6:Back to the Future (1985), with distance 0.37719351004093404
7:Star Wars: Episode I - The Phantom Menace (1999), with distance 0.39562333225899726
8:Terminator, The (1984), with distance 0.4030132965528984
9:Godfather, The (1972), with distance 0.4046831966011223
