Data Source and Contents

In [1]:
#Import Necessary Libraries and Functions
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.neighbors import NearestNeighbors
import pickle

In [2]:
# Report the installed versions of the core libraries so they can be pinned in requirements.txt

for label, module in (
    ("Pandas Version:", pd),
    ("Numpy Version:", np),
    ("Scikit-Learn:", sk),
):
    print(label, module.__version__)


Pandas Version: 2.2.2
Numpy Version: 2.0.2
Scikit-Learn: 1.6.1


In [3]:
# Read the movie dataset from disk into a dataframe and preview the first rows
file_path = "movies_recommendation_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [4]:
# Inspect every column's dtype and non-null count to confirm the data is in a
# form the model can consume (numeric features, no missing entries)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB


In [5]:
# Remove the columns the recommender does not need:
#   - "Label": the movies are not being classified into a category
#   - "Movie ID": an identifier that should not influence the neighbour search
df = df.drop(columns=["Label", "Movie ID"])

In [6]:
# Preview the frame after the column drop; head() keeps the rendered output
# bounded instead of dumping every row into the notebook
df.head(10)

Unnamed: 0,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,The Imitation Game,8.0,1,1,1,0,0,0,0
1,Ex Machina,7.7,0,1,0,0,0,1,0
2,A Beautiful Mind,8.2,1,1,0,0,0,0,0
3,Good Will Hunting,8.3,0,1,0,0,0,0,0
4,Forrest Gump,8.8,0,1,0,0,0,0,0
5,21,6.8,0,1,0,0,1,0,1
6,Gifted,7.6,0,1,0,0,0,0,0
7,Travelling Salesman,5.9,0,1,0,0,0,1,0
8,Avatar,7.9,0,0,0,0,0,0,0
9,The Karate Kid,7.2,0,1,0,0,0,0,0


In [7]:
# Split the dataframe into the numeric feature matrix and the title lookup

# Everything except the movie title is a numerical representation
df_movie_info = df.drop(columns=['Movie Name'])

# The movie title is the only nominal variable; keep it aside for displaying results
df_title = df.loc[:, 'Movie Name']

In [8]:
# Show the column dtypes and non-null counts of the numerical features
# (info() reports the schema; it does not compute descriptive statistics)
df_movie_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   IMDB Rating  30 non-null     float64
 1   Biography    30 non-null     int64  
 2   Drama        30 non-null     int64  
 3   Thriller     30 non-null     int64  
 4   Comedy       30 non-null     int64  
 5   Crime        30 non-null     int64  
 6   Mystery      30 non-null     int64  
 7   History      30 non-null     int64  
dtypes: float64(1), int64(7)
memory usage: 2.0 KB


In [9]:
# Convert each genre indicator column to the pandas categorical dtype
genre_columns = ['Biography', 'Drama', 'Thriller', 'Comedy', 'Crime', 'Mystery', 'History']
for genre in genre_columns:
    df_movie_info[genre] = pd.Categorical(df_movie_info[genre])

In [10]:
# Re-check the schema to confirm the genre columns now have the category dtype
df_movie_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   IMDB Rating  30 non-null     float64 
 1   Biography    30 non-null     category
 2   Drama        30 non-null     category
 3   Thriller     30 non-null     category
 4   Comedy       30 non-null     category
 5   Crime        30 non-null     category
 6   Mystery      30 non-null     category
 7   History      30 non-null     category
dtypes: category(7), float64(1)
memory usage: 1.4 KB


In [11]:
# Count the missing values in each feature column
df_movie_info.isna().sum()

Unnamed: 0,0
IMDB Rating,0
Biography,0
Drama,0
Thriller,0
Comedy,0
Crime,0
Mystery,0
History,0


Building the Recommender

In [12]:
# Number of similar movies the recommender returns for each query
N_NEIGHBORS = 5

# Create the unsupervised nearest-neighbours estimator
knn_model = NearestNeighbors(n_neighbors=N_NEIGHBORS)
# Fit it on the numerical movie features
fitted_knn_model = knn_model.fit(X=df_movie_info)

In [17]:
# Save the fitted model and the title lookup as serialized persistent objects.
# The files are opened with context managers so the handles are flushed and
# closed even if pickling raises.
ml_model = 'movie_recommender.sav'
with open(ml_model, 'wb') as model_file:
    pickle.dump(fitted_knn_model, model_file)

title_df = 'movie_title.sav'
with open(title_df, 'wb') as title_file:
    pickle.dump(df_title, title_file)

In [14]:
#=========================== In-code TESTING SECTION ===========================
### TEST THE ABOVE CODE FOR PREDICTION WITH HARD-CODED TEST DATA
# ========== Testing Data Record for one hypothetical movie ==============
# Title = The Post
# IMDB Rating = 7.2
# Biography = 1
# Drama = 1
# Thriller = 0
# Comedy = 0
# Crime = 0
# Mystery = 0
# History = 1

In [21]:
# Describe one hypothetical movie, listing the features in the same column order
# as the training frame (IMDB Rating followed by the seven genre flags)

Movie_Title = 'The Post'
IMDB_Rating = 7.2
Biography = 1
Drama = 1
Thriller = 0
Comedy = 0
Crime = 0
Mystery = 0
History = 1

features = [IMDB_Rating, Biography, Drama, Thriller, Comedy, Crime, Mystery, History]
features_array = np.array(features)
# kneighbors expects a 2-D input: one row per query sample
single_sample = features_array.reshape(1, -1)


# Load the persisted model and title lookup from disk.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
# Context managers make sure the file handles are closed after loading.
with open('movie_recommender.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open('movie_title.sav', 'rb') as title_file:
    df_title_2 = pickle.load(title_file)

# If the model was fitted on a DataFrame it remembers the column names; querying with
# a matching DataFrame avoids scikit-learn's feature-name mismatch warning.
feature_names = getattr(loaded_model, 'feature_names_in_', None)
if feature_names is None:
    query_sample = single_sample
else:
    query_sample = pd.DataFrame(single_sample, columns=feature_names)

# kneighbors returns the distances to the nearest neighbours and their row indexes.
# Query the model that was loaded from disk (not the in-memory one) so this cell
# actually verifies the persisted artefact.
distance, movie_index = loaded_model.kneighbors(query_sample)

# Print the neighbours' names by looking up the returned indexes in the title series,
# skipping the query movie itself if it is part of the dataset
print(f"\nTop 5 similar movies to {Movie_Title}:")
for index in movie_index[0]:
    title = df_title_2.iloc[index]
    if title != Movie_Title:
        print(f"{title}")


Top 5 similar movies to The Post:
12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind


