Data Source and Contents

In [36]:
#Import Necessary Libraries and Functions
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [37]:
#Read the movie recommendation CSV hosted on GitHub into a dataframe
df=pd.read_csv(
    "https://github.com/ArinB/MSBA-CA-Data/raw/main/CA05/movies_recommendation_data.csv"
)
#Preview the first five rows to check that the file loaded as expected
df.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [38]:
#Check every column's dtype and non-null count so we know which columns are numeric and can be fed to the model
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB


In [39]:
#Remove the columns the recommender does not use:
#  - Label: the movies are not being classified into a category
#  - Movie ID: an identifier that should not affect which movies the model treats as similar
df=df.drop(columns=["Label", "Movie ID"])

In [40]:
#Display the dataframe to confirm that the Label and Movie ID columns have been removed
df

Unnamed: 0,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,The Imitation Game,8.0,1,1,1,0,0,0,0
1,Ex Machina,7.7,0,1,0,0,0,1,0
2,A Beautiful Mind,8.2,1,1,0,0,0,0,0
3,Good Will Hunting,8.3,0,1,0,0,0,0,0
4,Forrest Gump,8.8,0,1,0,0,0,0,0
5,21,6.8,0,1,0,0,1,0,1
6,Gifted,7.6,0,1,0,0,0,0,0
7,Travelling Salesman,5.9,0,1,0,0,0,1,0
8,Avatar,7.9,0,0,0,0,0,0,0
9,The Karate Kid,7.2,0,1,0,0,0,0,0


In [41]:
#Split the dataframe into the movie titles and the numerical feature columns

#Movie Name is the only nominal (text) column, so keep it on its own for looking titles up by row position
df_title=df.loc[:, 'Movie Name']

#Every remaining column is numeric (IMDB Rating plus the 0/1 genre flags) and forms the model input
df_movie_info=df.drop(columns=['Movie Name'])

In [42]:
#Show the descriptive statistics (count, mean, std, min, quartiles, max) for each numerical feature column
df_movie_info.describe()

Unnamed: 0,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1
std,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129
min,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
#Count the missing values in each feature column; all zeros means no imputation is needed before fitting
df_movie_info.isna().sum()

Unnamed: 0,0
IMDB Rating,0
Biography,0
Drama,0
Thriller,0
Comedy,0
Crime,0
Mystery,0
History,0


Building the Recommender

In [44]:
#Build the KNN model; n_neighbors=5 is the number of neighbors returned by default for each query
knn_model=NearestNeighbors(n_neighbors=5)
#Fit the model on the numerical feature columns only (the titles are held separately in df_title)
#NOTE(review): IMDB Rating spans roughly 5.9-8.8 while the genre flags are 0/1, and no scaling is applied - confirm this relative weighting is intended
knn_model.fit(df_movie_info)

In [45]:
#Feature values for the test movie ('The Post'), listed in the same column order as df_movie_info
the_post_info=[
    7.2,  #IMDB Rating
    1,    #Biography
    1,    #Drama
    0,    #Thriller
    0,    #Comedy
    0,    #Crime
    0,    #Mystery
    1,    #History
]

In [46]:
#kneighbors returns two arrays with one row per query point: the distances to that point's
#nearest neighbors, and the row positions of those neighbors in the data the model was fitted on.
#The model was fitted on a DataFrame, so the test movie is wrapped in a one-row DataFrame with the
#same column names. Passing a bare list makes scikit-learn warn that X has no valid feature names,
#and building the frame also fails immediately if the number of values does not match the fitted columns.
the_post_query=pd.DataFrame([the_post_info], columns=df_movie_info.columns)
distance, movie_index= knn_model.kneighbors(the_post_query)



In [47]:
#Print the names of the nearest neighbors: movie_index[0] holds their row positions for the single query,
#and df_title maps each position back to a movie title. The query movie itself is skipped if it shows up.
print("\nTop 5 similar movies to 'The Post':")
for index in movie_index[0]:
    title = df_title.iloc[index]
    if title == "The Post":
        continue
    print(f"{title}")



Top 5 similar movies to 'The Post':
12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind
