In [1]:
import pandas as pd
import numpy as np  
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def get_recommendation(movies_df, film_name, metric):
    film_id = movies_df[movies_df['original_title'] == film_name].index[0]
    metric[film_id, film_id] = -1
    rec_ids = np.argsort(metric[film_id, :])[::-1][:10] 
    return list(movies_df.loc[rec_ids, 'original_title'])

def get_country_name(pc):
    pc = ast.literal_eval(pc)
    if not isinstance(pc, list):
        return ''
    return '' if len(pc) == 0 else pc[0]['name']

def extract_cast(cast):
    max_actors = 6
    credit_list = ast.literal_eval(cast)[:max_actors]
    return ' '.join([''.join(act['name'].lower().split(' ')) for act in credit_list])

def extract_director(crew):
    for cm in ast.literal_eval(crew):
        if cm['job'] == 'Director':
            return ''.join(cm['name'].lower().split(' '))
    return ''

In [35]:
movies = pd.read_csv('../data/movies_metadata.csv')
# print(movies.shape)
# print(movies.isna().sum())

# Selecting a small dataset
movies.dropna(subset=['overview', 'vote_average', 'production_countries', 'release_date'], inplace=True)

# filtering by languages
languages = ['en', 'es', 'fr', 'it', 'de']
movies = movies[movies['original_language'].isin(languages)]

# get recent movies
from_year = 1970
movies[movies['release_date'].str[:4].astype(float) > from_year]

# filtering by vote (to estimate wiht a reduced dataset)
movies['production_countries'] = movies['production_countries'].apply(get_country_name)
mean_vote = movies['vote_average'].mean()
most_voted = \
    (movies['vote_count'] >= 25) & \
    (movies['vote_average'] >= mean_vote - 0.5)
movies = movies[most_voted].reset_index(drop=True)
movies.shape

  movies = pd.read_csv('../data/movies_metadata.csv')


(11010, 24)

In [36]:
# getting credits
m_credits = pd.read_csv('../data/credits.csv')

# keping credits for the selected films
movies['id'] = movies['id'].astype('int64')
m_credits = pd.merge(m_credits, movies, on='id', how='inner')

# we need to get the actors name from the cast and the director name from the crew
m_credits['cast'] = m_credits['cast'].apply(extract_cast)
m_credits = m_credits[m_credits['cast'] != ''].reset_index(drop=True)
m_credits['director'] = m_credits['crew'].apply(extract_director)
m_credits = m_credits[m_credits['director'] != ''].reset_index(drop=True)

# adding the director to the cast to have the complete cast
director_factor = 2
m_credits['whole_cast'] = m_credits['cast'] + director_factor*(' ' + m_credits['director'])

In [7]:
# Getting genre matrix

# formating genre
m_credits['genres'] = m_credits['genres'].apply(lambda genre: ' '.join([g['name'] for g in ast.literal_eval(genre)]).lower())

# tokenizing genre
gen_stop_words = ['the', 'movie', 'production', 'productions', 'film']
cv1 = CountVectorizer(stop_words=gen_stop_words)
x_gen = cv1.fit_transform(m_credits['genres'])

# computing cosine similarity based on genre (this is slow but only needs to be computed once)
gen_metric = cosine_similarity(x_gen.toarray())

In [9]:
film = 'Toy Story'
get_recommendation(m_credits, film, gen_metric)

['The Great Mouse Detective',
 'Over the Hedge',
 "Mickey's Once Upon a Christmas",
 'Frankenweenie',
 'Madly Madagascar',
 "Dug's Special Mission",
 "Surf's Up",
 'Barbie as The Princess & the Pauper',
 'The Lion King 1½',
 'Hotel Transylvania 2']

In [10]:
# Getting overview matrix

# tokenizing overview
overview_stop_words = ['the', 'movie', 'production', 'productions', 'film', 'is', 'and', 'or']
cv2 = CountVectorizer(stop_words='english', token_pattern=r'\b[^\d\W]+\b')
x_ov = cv2.fit_transform(movies['overview']).toarray()
tokens = cv2.get_feature_names_out()

# computing cosine similarity based on overview (this is slow but only needs to be computed once)
ov_metric = cosine_similarity(x_ov)

In [11]:
film = 'Toy Story'
get_recommendation(m_credits, film, ov_metric)

['Toy Story 2',
 'Toy Story 3',
 'The 40 Year Old Virgin',
 'Four Rooms',
 '2010',
 'While You Were Sleeping',
 'The Brady Bunch Movie',
 "Harry Potter and the Philosopher's Stone",
 'Singles',
 'Bridge to Terabithia']

In [12]:
# Getting casting matrix

cv3 = CountVectorizer(stop_words='english', token_pattern=r'\b[^\d\W]+\b')
x_cast = cv3.fit_transform(m_credits['whole_cast']).toarray()

# computing cosine similarity based on overview (this is slow but only needs to be computed once)
cast_metric = cosine_similarity(x_cast)

In [15]:
film = 'Inglourious Basterds'
get_recommendation(m_credits, film, cast_metric)

['Death Proof',
 'The Hateful Eight',
 'Reservoir Dogs',
 "My Best Friend's Birthday",
 'Kill Bill: Vol. 2',
 'Pulp Fiction',
 'Django Unchained',
 'Jackie Brown',
 'Quel maledetto treno blindato',
 'Kill Bill: Vol. 1']

In [5]:
movies = pd.read_csv('../data/movies_metadata.csv')

  movies = pd.read_csv('../data/movies_metadata.csv')


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [18]:
languages = ['en', 'es', 'fr', 'it', 'de']
movies = movies[movies['original_language'].isin(languages)]

from_year = 1970
movies[movies['release_date'].str[:4].astype(float) > from_year]

(38310, 24)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45458,False,,0,"[{'id': 27, 'name': 'Horror'}]",,289923,tt0252966,en,The Burkittsville 7,A film archivist revisits the story of Rustin ...,...,2000-10-03,0.0,30.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Do you know what happened 50 years before ""The...",The Burkittsville 7,False,7.0,1.0
45459,False,,0,"[{'id': 878, 'name': 'Science Fiction'}]",,222848,tt0112613,en,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,...,1995-01-01,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Caged Heat 3000,False,3.5,1.0
45460,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",...,1991-05-13,0.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Robin Hood,False,5.7,26.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0


In [23]:
movies['release_date'].isna().sum()

73