In [None]:
#Building a movie recommendation engine using python

In [None]:
#Import the libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Load the data
from google.colab import files
uploaded=files.upload()

Saving movie_sample_dataset.csv to movie_sample_dataset.csv


In [None]:
#Store the data
df=pd.read_csv('movie_sample_dataset.csv')
#Show the first 3 rows of data
df.head(3)

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,language,country,budget,imdb_score,actors,movie_facebook_likes
0,Color,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf of Wall Street,2013,English,USA,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,Color,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,English,USA,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,color,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,English,USA,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000


In [None]:
#Get a count of the number of rows/movies in the data set and the number of columns
df.shape

(99, 13)

In [None]:
#Create a list of important columns  for the recommendation engine
columns=['director_name','actors','genres','movie_title']

In [None]:
#Adding new column movie_id to dataset
df['Movie_ID'] = range(0, 0+len(df))


In [None]:
#Show the data
df[columns].head(3)

Unnamed: 0,director_name,actors,genres,movie_title
0,Martin Scorsese,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",Biography|Comedy|Crime|Drama,The Wolf of Wall Street
1,Shane Black,"Robert Downey Jr.,Jon Favreau,Don Cheadle",Action|Adventure|Sci-Fi,Iron Man 3
2,Quentin Tarantino,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",Crime|Drama|Mystery|Thriller|Western,The Hateful Eight


In [None]:
#Check for any missing values in the important columns
a=df[columns].isnull().values.any()
print(a)
df[columns].isnull().sum()

True


director_name    11
actors            0
genres            1
movie_title       0
dtype: int64

In [None]:
#Cleaning the data containing null values
DataDummies = pd.get_dummies(columns)
DataDummies

Unnamed: 0,actors,director_name,genres,movie_title
0,0,1,0,0
1,1,0,0,0
2,0,0,1,0
3,0,0,0,1


In [None]:
#Checking the cleaning is successful or not
print(DataDummies.isnull().sum())
b=DataDummies.isnull().values.any()
print(b)

actors           0
director_name    0
genres           0
movie_title      0
dtype: int64
False


In [None]:
#Create a function to combine the values of the important columns into a single string
def get_important_features(data):
  important_features=[]
  for i in range(0,data.shape[0]):
    important_features.append(str(data['actors'][i]) +' '+ str(data['director_name'][i]) +' '+ str(data['genres'][i]) +' '+str(data['movie_title'][i]))

  return important_features


In [None]:
#Create a column to hold combined strings
df['important_features'] = get_important_features(df)

#Show the data
df.head(20)

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,language,country,budget,imdb_score,actors,movie_facebook_likes,Movie_ID,important_features
0,Color,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf of Wall Street,2013,English,USA,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000,0,"Leonardo DiCaprio,Matthew McConaughey,Jon Favr..."
1,Color,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,English,USA,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000,1,"Robert Downey Jr.,Jon Favreau,Don Cheadle Shan..."
2,color,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,English,USA,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000,2,"Craig Stark,Jennifer Jason Leigh,Zoë Bell Quen..."
3,Color,Kenneth Lonergan,186,46495.0,Drama,Margaret,2011,English,usa,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0,3,"Matt Damon,Kieran Culkin,John Gallagher Jr. Ke..."
4,Color,Peter Jackson,186,258355354.0,Adventure|Fantasy,The Hobbit: The Desolation of Smaug,2013,English,USA,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000,4,"Aidan Turner,Adam Brown,James Nesbitt Peter Ja..."
5,,,183,330249062.0,Action|Adventure|Sci-Fi,Batman v Superman: Dawn of Justice,202,English,USA,250000000.0,6.9,"Henry Cavill,Lauren Cohan,Alan D. Purwin",197000,5,"Henry Cavill,Lauren Cohan,Alan D. Purwin nan A..."
6,Color,Peter Jackson,-50,303001229.0,Adventure|Fantasy,The Hobbit: An Unexpected Journey,2012,English,USA,180000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",166000,6,"Aidan Turner,Adam Brown,James Nesbitt Peter Ja..."
7,Color,Edward Hall,180,,Drama|Romance,Restless,2012,English,UK,,7.2,"Rufus Sewell,Hayley Atwell,Charlotte Rampling",434,7,"Rufus Sewell,Hayley Atwell,Charlotte Rampling ..."
8,Color,Joss Whedon,173,623279547.0,Action|Adventure|Sci-Fi,The Avengers,2012,English,USA,220000000.0,8.1,"Chris Hemsworth,Robert Downey Jr.,Scarlett Joh...",123000,8,"Chris Hemsworth,Robert Downey Jr.,Scarlett Joh..."
9,Color,Joss Whedon,173,623279547.0,Action|Adventure|Sci-Fi,The Avengers,2012,English,USA,220000000.0,8.1,"Chris Hemsworth,Robert Downey Jr.,Scarlett Joh...",123000,9,"Chris Hemsworth,Robert Downey Jr.,Scarlett Joh..."


In [None]:
#Convert the text to a matrix of token counts
cm= CountVectorizer().fit_transform(df['important_features'])

In [None]:
#Get the cosine similarity matrix from the count matrix
cs=cosine_similarity(cm)
#Print the cosine similarity matrix
print(cs)

[[1.         0.12524486 0.17647059 ... 0.13453456 0.070014   0.12964074]
 [0.12524486 1.         0.         ... 0.         0.0745356  0.        ]
 [0.17647059 0.         1.         ... 0.13453456 0.070014   0.06482037]
 ...
 [0.13453456 0.         0.13453456 ... 1.         0.08006408 0.14824986]
 [0.070014   0.0745356  0.070014   ... 0.08006408 1.         0.07715167]
 [0.12964074 0.         0.06482037 ... 0.14824986 0.07715167 1.        ]]


In [None]:
#Get the shape of the cosine similarity matrix
cs.shape

(99, 99)

In [None]:
#Get the title of the movie that the user likes
title=input(print('Enter movie name:'))

#Find the movies id
movie_id=df[df.movie_title==title]['Movie_ID'].values[0]

Enter movie name:
The Avengers


In [None]:
#Create a list of enumerations for the similarity scores
#We get a list of tuples as[(movie_id,similarity score),(...)]
scores= list(enumerate(cs[movie_id]))

In [None]:
#Sort the list
#Here, x corresponds to scores and position 1 is the similarity score, Reverse=True sorts list in descending order
sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)
sorted_scores= sorted_scores[1:]

In [None]:
#Print the sorted scores
print(sorted_scores)

[(9, 0.9999999999999998), (60, 0.8767140075192089), (41, 0.6454972243679026), (88, 0.5009794328681195), (1, 0.46666666666666656), (66, 0.33333333333333326), (80, 0.33333333333333326), (58, 0.32274861218395134), (25, 0.3131121455425747), (29, 0.28644594961577313), (49, 0.2760262237369417), (28, 0.2666666666666666), (38, 0.2666666666666666), (59, 0.2666666666666666), (5, 0.2581988897471611), (17, 0.2581988897471611), (51, 0.2581988897471611), (84, 0.2581988897471611), (36, 0.2369395511036369), (37, 0.23094010767585027), (44, 0.23094010767585027), (71, 0.223606797749979), (13, 0.21483446221182984), (19, 0.21081851067789195), (34, 0.20701966780270625), (93, 0.20701966780270625), (4, 0.18257418583505536), (89, 0.18257418583505536), (40, 0.14907119849998599), (52, 0.14907119849998599), (53, 0.14907119849998599), (77, 0.14907119849998599), (24, 0.14322297480788657), (83, 0.14322297480788657), (6, 0.1333333333333333), (18, 0.1333333333333333), (30, 0.1333333333333333), (31, 0.1333333333333333)

In [None]:
#Create a loop to print the first 7 similar movies
j=0
print("The 7 most recommended movies to", title,'are:\n')
for item in sorted_scores:
  movie_title=df[df.Movie_ID==item[0]]['movie_title'].values[0]
  print(j+1, movie_title)
  j=j+1
  if j>7:
    break

The 7 most recommended movies to The Avengers are:

1 The Avengers
2 Avengers: Age of Ultron
3 Captain America: Civil War
4 Captain America: The Winter Soldier
5 Iron Man 3
6 The Judge
7 The Wolverine
8 The Amazing Spider-Man 2
