In [1]:
import pandas as pd
import numpy as np

In [8]:
# read movies.csv and ratings.csv files and storing data into data frames.
moviesDf = pd.read_csv('C:\\Users\\OM\\Documents\\movies.csv',usecols=['movieId','title'],
                        dtype={'movieId': 'int32', 'title': 'str'})
ratingDf = pd.read_csv('C:\\Users\\OM\\Documents\\ratings.csv',usecols=['userId', 'movieId', 'rating'], 
                      dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [27]:
# Prints first five rows of data frame.
moviesDf.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [28]:
ratingDf.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
# Merge ratings dataframe and movies dataframe based on movieId field.
mergedDF = pd.merge(ratingDf,moviesDf,on='movieId')
mergedDF.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [12]:
# Creates new data frame by removing missing values and grouping title and rating fields.
combinedMovieRating = mergedDF.dropna(axis = 0, subset = ['title'])
movieRatingCount = (combinedMovieRating.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movieRatingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [13]:
# Merge two data frames by title field.
totalRatingCountDf = combinedMovieRating.merge(movieRatingCount, 
                                 left_on = 'title', right_on = 'title', how = 'left')
totalRatingCountDf.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [15]:
# Print statistics of totalRatingCount field. 
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(movieRatingCount['totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [17]:
# Filters the totalRatingCount field based on the threshold value of popularity.
popularity_threshold = 50
filteredMoviesDf = totalRatingCountDf.query('totalRatingCount >= @popularity_threshold')
filteredMoviesDf.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [19]:
# Returns the size of the data frame.
filteredMoviesDf.shape

(41362, 5)

In [20]:
# Creates a table from data frame and assigns 0 where data is NA.
pivotedMoviesTable = filteredMoviesDf.pivot_table(index='title',columns='userId',values='rating').fillna(0)
pivotedMoviesTable.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [21]:
# Creates a csr_matrix from a pivoted table and prepares a model.
from scipy.sparse import csr_matrix

moviesCSRMatrix = csr_matrix(pivotedMoviesTable.values)

from sklearn.neighbors import NearestNeighbors


knnModel = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knnModel.fit(moviesCSRMatrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [22]:
pivotedMoviesTable.shape

(450, 606)

In [25]:
# Collect one random data from a pivoted table and use it to 
# test in our trained model to return the distances and 
# indices of first 6 data from the final result.
# We set the value of K = 6 in our KNN model. (n_neighbors = 6) 
queryIndex = np.random.choice(pivotedMoviesTable.shape[0])
print(queryIndex)
distances, indices = knnModel.kneighbors(pivotedMoviesTable.iloc[queryIndex,:].
                                         values.reshape(1, -1), n_neighbors = 6)

430


In [26]:
# Print the movie title and the corresponding distance from 
# query data from the final result.  
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(pivotedMoviesTable.
                                                  index[queryIndex]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, pivotedMoviesTable.
                          index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Waterworld (1995):

1: Outbreak (1995), with distance of 0.3990267515182495:
2: True Lies (1994), with distance of 0.41963857412338257:
3: Stargate (1994), with distance of 0.43197691440582275:
4: Braveheart (1995), with distance of 0.4486045837402344:
5: Batman Forever (1995), with distance of 0.4569993019104004:
