A more advanced recommender based on Credits, Genre, and Keywords

This recommender system is based on the following metadata: the three top-billed actors, the director, the related genres, and the movie plot keywords.

In [None]:
import pandas as pd

# Load the main movies metadata table.
# NOTE(review): low_memory=False presumably avoids chunked parsing and the
# mixed-dtype inference that comes with it — confirm against the pandas docs.
metadata = pd.read_csv("../data/external/movies_metadata.csv", low_memory=False)

In [None]:
# Load the credits table (merged into `metadata` on 'id' further down)
credits = pd.read_csv("../data/external/credits.csv")

# Preview the first 10 rows
credits.head(10)

In [None]:
# Load the keywords table (merged into `metadata` on 'id' further down)
keywords = pd.read_csv("../data/external/keywords.csv")

# Preview the first 10 rows
keywords.head(10)

In [None]:
# Remove rows with bad IDs.
# Coerce the IDs to numbers so that malformed values become NaN and can be
# dropped by content, instead of hard-coding row labels that only match one
# particular version of the CSV (and raise KeyError once they are gone).
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')
metadata = metadata.dropna(subset=['id'])

# Convert IDs to int. Required for merging: the key must have the same dtype
# in all three tables.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into main metadata dataframe
# (default join type, so only IDs present in every table are kept)
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Preview the merged result
metadata.head(2)

From these features, we need to extract the three most important actors, the director, and the keywords associated with each movie.

But first, we need to convert the "stringified" lists into a form that is usable later.

In [None]:
from ast import literal_eval

# Columns whose values are stored as text and must be parsed into Python
# objects (the cells below index them as lists of dicts with a 'name' key)
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # literal_eval parses Python literals only; it does not execute arbitrary code
    metadata[feature] = metadata[feature].apply(literal_eval)


In [None]:
# Sanity check: look at the parsed features of one movie
movie_idx = 15
print(metadata['title'][movie_idx])

# The sample_* names are deliberate: binding the list to `keywords` would
# overwrite the keywords DataFrame loaded earlier and break a re-run of the
# merge cell.
sample_actors = [i['name'] for i in metadata['cast'][movie_idx]]
sample_keywords = [i['name'] for i in metadata['keywords'][movie_idx]]
sample_genres = [i['name'] for i in metadata['genres'][movie_idx]]

# Last expression of the cell, so the notebook displays it
sample_genres



In [None]:
import numpy as np

# Helper that extracts the director's name from a movie's crew list
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        x: A list of crew dicts; each entry is expected to carry 'job' and
            'name' keys.

    Returns:
        The director's name, or np.nan when `x` is not a list or contains
        no director entry.
    """
    # Missing / malformed crew data (e.g. NaN) cannot be iterated; treat it as
    # "no director", mirroring how get_list handles non-list input.
    if not isinstance(x, list):
        return np.nan

    for member in x:
        # .get() skips entries without a 'job' key instead of raising KeyError
        if member.get('job') == 'Director':
            return member['name']
    return np.nan


In [None]:
# Function to return the names of the first `limit` elements, or of the entire
# list when it is shorter than that.
# List -> cast, keywords, or genres
def get_list(x, limit=3):
    """Return the 'name' of the leading entries of a list-valued feature.

    Args:
        x: A list of dicts, each expected to have a 'name' key.
        limit: Maximum number of names to return (non-negative int), or None
            for no limit. Defaults to 3.

    Returns:
        A list with at most `limit` names, in their original order. An empty
        list is returned when `x` is not a list (missing / malformed data).
    """
    if not isinstance(x, list):
        return []

    # Slice before extracting names: slicing already copes with short lists,
    # and entries beyond the cut-off are never touched.
    return [item['name'] for item in x[:limit]]


In [None]:
# Define new director, cast, genres, and keywords features that are in suitable format.
# 'director' is a new column derived from the crew list (NaN when no director is found).
metadata['director'] = metadata['crew'].apply(get_director)

# Replace each list-of-dicts column with a plain list of at most three names
features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

# Print the new features for the first 5 films
metadata[['title', 'director', 'cast', 'keywords', 'genres']].head(5)

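Next, we need to clean up the names and keywords.
This involves converting everything to lowercase and stripping the spaces inside each name or keyword, so that a multi-word name becomes a single token (e.g. "Tom Hanks" becomes "tomhanks"). This ensures that the vectorizer does its job properly: otherwise two different people who merely share a first name, such as "Tom Hanks" and "Tom Cruise", would appear to have the token "tom" in common.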

In [None]:
def clean_data(x):
    """Normalise a name, or a list of names, for the vectorizer.

    Spaces are removed and the text is lower-cased, so that each name
    collapses into a single token.

    Args:
        x: A string, a list of strings, or anything else (e.g. NaN).

    Returns:
        The cleaned string, a list of cleaned strings, or '' when `x` is
        neither a string nor a list.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [entry.replace(" ", "").lower() for entry in x]
    # Missing values (such as a NaN director) become an empty string
    return ''


In [None]:
# Normalise every text feature with clean_data (lower-case, spaces removed);
# a missing director ends up as an empty string
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

# Print the new features for the first 5 films
metadata[['title', 'director', 'cast', 'keywords', 'genres']].head(5)

In [None]:
# Build the "soup": one space-separated string holding all the features of a
# movie. This string is what eventually gets fed to the vectorizer.
def create_soup(x):
    """Concatenate keywords, cast, director and genres into a single string.

    Args:
        x: A row-like mapping with list-valued 'keywords', 'cast' and
            'genres' entries and a string-valued 'director' entry.

    Returns:
        The four parts joined by single spaces, in that order.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)


In [None]:
# Build the soup for every movie; axis=1 hands each row to create_soup
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [None]:
# Preview the soup of the first 3 movies
metadata[['soup']].head(3)

The next step is the same as what was done for the plot-description-based vectorizer. The only difference is that we use CountVectorizer() instead of TF-IDF: it doesn't make intuitive sense to down-weight an actor's or director's presence just because he or she has acted in or directed more movies.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count token occurrences in each movie's soup, ignoring English stop words;
# the counts are stored as float32
count = CountVectorizer(stop_words='english', dtype=np.float32)
count_matrix = count.fit_transform(metadata['soup'])

# One row per movie; presumably one column per distinct token — confirm
count_matrix.shape

In [None]:
# Compute the cosine similarity matrix based on count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every movie against every other movie.
# NOTE(review): with default arguments this presumably returns a dense
# (n_movies x n_movies) array, which can need several GB of memory — confirm.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Reset index of the main dataframe and construct reverse mapping
# (movie title -> row position in `metadata`).
# NOTE: nothing here enforces unique titles; a title that occurs more than
# once maps to several positions.
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [None]:
# Define function that takes in a movie title and outputs the most similar movies
def get_recommendation(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to `title`.

    Args:
        title: Movie title to look up in the module-level `indices` mapping.
        cosine_sim: Pairwise similarity matrix whose rows and columns follow
            the row positions of `metadata`. Defaults to the module-level
            matrix as it was when this function was defined.

    Returns:
        A pandas Series with up to 10 titles, most similar first. The queried
        movie itself is never part of the result.

    Raises:
        KeyError: If `title` is not present in `indices`.
    """
    # Get index of the movie that matches the title
    idx = indices[title]
    # Titles are not unique: a duplicated title makes the lookup return a
    # Series of positions instead of a scalar. Use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity to the input movie. The input
    # movie is excluded explicitly: it is not guaranteed to sort first when
    # another movie has an identical soup (similarity tie).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, highest first)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the movie indices of the 10 most similar movies
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Return the top 10 most similar movie titles
    return metadata['title'].iloc[movie_indices]


In [None]:
# Example query: movies most similar to 'The Dark Knight Rises'
get_recommendation('The Dark Knight Rises', cosine_sim)

In [None]:
# Example query: movies most similar to 'The Godfather'
get_recommendation('The Godfather', cosine_sim)

In [None]:
# Example query: movies most similar to 'Toy Story'
get_recommendation('Toy Story', cosine_sim)