In [15]:
# Calculate mean of vote average column 
C = metadata['vote_average'].mean() 
print(C)

5.618207215133889


In [56]:
# Import Pandas 
import pandas as pd
# Load Movies Metadata

url= "https://github.com/jiyeon1997/movie-recommender-python/raw/master/movies_metadata.csv"
metadata = pd.read_csv(url, low_memory=True)
# Print the first three rows 
#metadata.head(3)
metadata[['title','vote_average','vote_count','overview']].head(20)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,title,vote_average,vote_count,overview
0,Toy Story,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,6.9,2413.0,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,6.5,92.0,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,6.1,34.0,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,5.7,173.0,Just when George Banks has recovered from his ...
5,Heat,7.7,1886.0,"Obsessive master thief, Neil McCauley leads a ..."
6,Sabrina,6.2,141.0,An ugly duckling having undergone a remarkable...
7,Tom and Huck,5.4,45.0,"A mischievous young boy, Tom Sawyer, witnesses..."
8,Sudden Death,5.5,174.0,International action superstar Jean Claude Van...
9,GoldenEye,6.6,1194.0,James Bond must unmask the mysterious head of ...


In [57]:
# Calculate the minimum number of votes required to be in the chart, m 
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [58]:
# Filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m] 

print(q_movies.shape)
metadata.shape

(4555, 24)


(45466, 24)

In [59]:
# Function that computes the weighted rating of each movie 
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula 
    return (v/(v+m) * R) + (m/(m+v) * C)
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [60]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)
#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [61]:
#Print plot overviews of the first 5 movies. 
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [62]:
metadata.shape
q_movies.shape

(4555, 25)

In [63]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'

tfidf = TfidfVectorizer(stop_words='english')
#Replace NaN with an empty string 
#metadata['overview'] = metadata['overview'].fillna('')
q_movies['overview'] = q_movies['overview'].fillna('')
#Construct the required TF-IDF matrix by fitting and transforming the data 
tfidf_matrix = tfidf.fit_transform(q_movies['overview'])
#tfidf_matrix = tfidf.fit_transform(metadata['overview'])
#Output the shape of tfidf_matrix 
tfidf_matrix.shape

(4555, 19694)

In [64]:
#Array mapping from feature integer indices to feature name. 
tfidf.get_feature_names()[5000:5010]
#tfidf.get_feature_names_out()[5000:5010]

['did',
 'didn',
 'dido',
 'die',
 'died',
 'diego',
 'dies',
 'diesel',
 'diet',
 'dietary']

In [65]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)
cosine_sim[1]

(4555, 4555)


array([0.00522362, 1.        , 0.01249039, ..., 0.        , 0.01420965,
       0.01535064])

In [66]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(q_movies.index, index=q_movies['title']).drop_duplicates()
indices[:10]

title
The Shawshank Redemption         314
The Godfather                    834
Dilwale Dulhania Le Jayenge    10309
The Dark Knight                12481
Fight Club                      2843
Pulp Fiction                     292
Schindler's List                 522
Whiplash                       23673
Spirited Away                   5481
Life Is Beautiful               2211
dtype: int64

In [67]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return q_movies['title'].iloc[movie_indices]

In [68]:
get_recommendations('The Godfather')

5345          K-19: The Widowmaker
1189                      Das Boot
2965       The World Is Not Enough
5743                       Solaris
4238     Atlantis: The Lost Empire
40024            Deepwater Horizon
461           Hot Shots! Part Deux
897          2001: A Space Odyssey
7101          The Butterfly Effect
10384                   Flightplan
Name: title, dtype: object

In [69]:
# Load keywords and credits
keywordsURL = 'https://github.com/waelbeso/ML/raw/main/keywords.csv'
creditsURL = 'https://github.com/waelbeso/ML/raw/main/credits_4555.csv'
credits = pd.read_csv(creditsURL) 
keywords = pd.read_csv(keywordsURL)
# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])
# Convert IDs to int. Required for merging 
keywords['id'] = keywords['id'].astype('int') 
credits['id'] = credits['id'].astype('int') 
metadata['id'] = metadata['id'].astype('int')
# Merge keywords and credits into your main metadata dataframe 
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')
# Print the first two movies of your newly merged metadata 
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [70]:
# Parse the stringified features into their corresponding python objects 
from ast import literal_eval
features = ['cast', 'crew', 'keywords', 'genres'] 
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [71]:
print (metadata.shape)
import numpy as np
# Import Numpy import numpy as np
def get_director(x): 
    for i in x:
        if i['job'] == 'Director': 
            return i['name']
    return np.nan

(2012, 27)


In [72]:
import numpy as np
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3: 
            names = names[:3]
        return names
    #Return empty list in case of missing/malformed data
    return []
# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)
# Print the new features of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...",John Lasseter,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...",Joe Johnston,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...",Howard Deutch,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."


In [73]:
print (metadata.shape)

# Function to convert all strings to lower case and strip names of spaces

def clean_data(x):
    if isinstance(x, list):
        return [ str.lower(str(x).replace(" ", "")) for i in x ]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        return ''

# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)
    
print (metadata.shape)

(2012, 28)
(2012, 28)


In [74]:

#This function makes use of the property of the cosine similarity funciton that
#the order and types of inputs don't matter, what matters is the similarity
#between different soups of words
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
'''
def create_soup(x):
    return' '.join( str(x['keywords'] ))  + ' ' + ' '.join( str( x['cast'] )) + ' ' + ' '.join(str(x['director'])) + ' ' + ' '.join(str(x['genres']))
'''
metadata['soup'] = metadata.apply(create_soup, axis=1)
print(metadata[['soup']].head())
#metadata[['title', 'soup', 'cast', 'director', 'keywords', 'genres']].head()
print(metadata[['soup']].shape)

                                                soup
0  [{'id':931,'name':'jealousy'},{'id':4290,'name...
1  [{'id':10090,'name':'boardgame'},{'id':10941,'...
2  [{'id':1495,'name':'fishing'},{'id':12392,'nam...
3  [{'id':818,'name':'basedonnovel'},{'id':10131,...
4  [{'id':1009,'name':'baby'},{'id':1599,'name':'...
(2012, 1)


In [75]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

#count = CountVectorizer(stop_words='english') 
count_matrix = count.fit_transform(metadata['soup'])

In [76]:
metadata['soup'].shape

(2012,)

In [77]:
count_matrix = count.fit_transform(metadata['soup']) 
count_matrix.shape

(2012, 125992)

In [78]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [79]:
get_recommendations('Toy Story', cosine_sim2)

7234                       Dawn of the Dead
4883                           Gosford Park
2646                         The Iron Giant
11921                 Live Free or Die Hard
362                                The Mask
23472    Mission: Impossible - Rogue Nation
6920                             Stagecoach
1299              A Nightmare on Elm Street
21171                              The Heat
1848                       Chariots of Fire
Name: title, dtype: object