# **CA05 – kNN based Movie Recommender Engine**

**What question are we trying to answer?**

Given a movie dataset and a query movie, which 5 movies in the dataset are most similar to it?

In [None]:
#Importing necessary libraries:
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

In [None]:
# Load the movie recommendation dataset directly from the hosted CSV file,
# then display the resulting DataFrame
movie_df = pd.read_csv(
    filepath_or_buffer='https://github.com/ArinB/MSBA-CA-Data/raw/main/CA05/movies_recommendation_data.csv'
)
movie_df

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0
5,98,21,6.8,0,1,0,0,1,0,1,0
6,31,Gifted,7.6,0,1,0,0,0,0,0,0
7,3,Travelling Salesman,5.9,0,1,0,0,0,1,0,0
8,51,Avatar,7.9,0,0,0,0,0,0,0,0
9,47,The Karate Kid,7.2,0,1,0,0,0,0,0,0


### Exploratory Analysis and Data Preparation

In [None]:
# Inspect the column names exactly as loaded from the CSV; several of them
# (e.g. 'Movie ID', 'IMDB Rating') contain spaces
movie_df.columns

Index(['Movie ID', 'Movie Name', 'IMDB Rating', 'Biography', 'Drama',
       'Thriller', 'Comedy', 'Crime', 'Mystery', 'History', 'Label'],
      dtype='object')

In [None]:
# Normalize the headers by swapping every space for an underscore so the
# columns can also be reached via attribute access (e.g. movie_df.IMDB_Rating),
# then display the updated names
movie_df.columns = [col_name.replace(' ', '_') for col_name in movie_df.columns]

movie_df.columns

Index(['Movie_ID', 'Movie_Name', 'IMDB_Rating', 'Biography', 'Drama',
       'Thriller', 'Comedy', 'Crime', 'Mystery', 'History', 'Label'],
      dtype='object')

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
movie_df.shape

(30, 11)

In [None]:
# Data type of each column; the rating and genre columns must be numeric
# because they are used later as the kNN feature matrix
movie_df.dtypes

Movie ID         int64
Movie Name      object
IMDB Rating    float64
Biography        int64
Drama            int64
Thriller         int64
Comedy           int64
Crime            int64
Mystery          int64
History          int64
Label            int64
dtype: object

In [None]:
# Summary statistics for IMDB_Rating only: it is the sole continuous column,
# so describe() is not informative for the ID, name, or 0/1 genre columns
movie_df['IMDB_Rating'].describe()

count    30.000000
mean      7.696667
std       0.666169
min       5.900000
25%       7.300000
50%       7.750000
75%       8.175000
max       8.800000
Name: IMDB_Rating, dtype: float64

In [None]:
# Count missing values per column (a count of 0 means the column is complete)
movie_df.isna().sum()

Movie_ID       0
Movie_Name     0
IMDB_Rating    0
Biography      0
Drama          0
Thriller       0
Comedy         0
Crime          0
Mystery        0
History        0
Label          0
dtype: int64

In [None]:
# Drop the 'Label' column, which is not one of the features used by the kNN model.
# errors='ignore' keeps this cell idempotent: re-running it after 'Label' has
# already been removed is a no-op instead of raising a KeyError.
movie_df = movie_df.drop(columns='Label', errors='ignore')
movie_df.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0


### Building a Recommender System

In [None]:
# Feature matrix 'x': the IMDB rating plus the seven 0/1 genre indicator columns
x = movie_df.loc[:, [
    'IMDB_Rating',
    'Biography',
    'Drama',
    'Thriller',
    'Comedy',
    'Crime',
    'Mystery',
    'History',
]]

# Target column 'y': the title of each movie
y = movie_df.loc[:, 'Movie_Name']

In [None]:
# Build a 5-nearest-neighbour search model over the feature matrix 'x'.
# NearestNeighbors is an unsupervised estimator: its fit() ignores any 'y'
# argument, so only the features are passed. Movie names are looked up later
# from the row positions that kneighbors() returns.
knn_model = NearestNeighbors(n_neighbors=5).fit(x)

**Information about the movie 'The Post':**

IMDB Rating = 7.2, Biography = Yes, Drama = Yes, Thriller = No, Comedy = No, Crime = No, Mystery = No, History = Yes

In [None]:
# Feature vector for the query movie 'The Post'; the values follow the same
# order as the feature columns used to build 'x'
The_Post = [
    7.2,  # IMDB_Rating
    1,    # Biography
    1,    # Drama
    0,    # Thriller
    0,    # Comedy
    0,    # Crime
    0,    # Mystery
    1,    # History
]

In [None]:
# Find the 5 movies closest to 'The Post'. The query is wrapped in a one-row
# DataFrame that reuses the columns of 'x', so it carries the same feature names
# (in the same order) the model was fitted with. Passing a bare list to a model
# fitted on a DataFrame makes scikit-learn warn about missing feature names and
# leaves the column order implicit.
the_post_query = pd.DataFrame([The_Post], columns=x.columns)

# kneighbors() returns two arrays with one row per query: the distances to the
# nearest neighbours and their row positions in the fitted data
distances, indices = knn_model.kneighbors(the_post_query)
print(indices)

[[28 27 29 16  2]]




In [None]:
# 'indices' holds one row per query; there is a single query here, so take row 0
recommended_indices = indices[0]
# Look up the titles at those row positions in the Movie_Name column
recommended_movies = movie_df['Movie_Name'].iloc[recommended_indices].values

In [None]:
# Show the positional row indices of the recommended movies
# (the first row of the 'indices' array returned by kneighbors)
print(recommended_indices)

[28 27 29 16  2]


### Recommendations

In [None]:
# Print a heading followed by each recommended title on its own line
heading = "Top 5 Recommendations After Watching 'The Post':\n"
print(heading)
for movie_title in recommended_movies:
    print(movie_title)

Top 5 Recommendations After Watching 'The Post':

12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind
