# Metadata-based Recommender System

In [1]:
from ast import literal_eval

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Raw inputs: credits (cast / crew), keywords, and the main movie metadata table.
cred_df = pd.read_csv('credits.csv')
key_df = pd.read_csv('keywords.csv')

# 'popularity' is parsed as text (presumably to avoid mixed-type inference -- confirm);
# only the columns this notebook works with are kept.
df = pd.read_csv('movies_metadata.csv', dtype={'popularity': str})[
    ['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count', 'overview', 'id']
]

In [3]:
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [5]:
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,overview,id
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,Just when George Banks has recovered from his ...,11862


## Joining the three datasets

In [6]:
# The 'id' column of df must be an integer to join with the other two tables,
# but a direct cast fails because some ids are malformed (bad data):
#df['id'] = df['id'].astype('int')

In [7]:
# df['id'] is not clean, so ids are converted one value at a time
def clean_ids(x):
    """Convert a raw id value to ``int``; return ``np.nan`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw value from the 'id' column (typically a string).

    Returns
    -------
    int or float
        The integer id, or ``np.nan`` for values ``int()`` rejects.
    """
    try:
        return int(x)
    # Only conversion failures mean "bad id"; anything else (e.g. KeyboardInterrupt)
    # must propagate instead of being silently turned into NaN.
    except (ValueError, TypeError, OverflowError):
        return np.nan

# Clean the ids of df; assign() builds a new frame instead of writing into a column selection
df = df.assign(id=df['id'].apply(clean_ids))

# Keep only rows with a usable id. The explicit copy makes the result an independent
# frame, so later column assignments do not trigger chained-assignment warnings.
df = df[df['id'].notnull()].copy()

In [8]:
# Cast the join key to a plain integer in each of the three tables
cred_df['id'] = cred_df['id'].astype(int)
key_df['id'] = key_df['id'].astype(int)
df['id'] = df['id'].astype(int)

# Attach credits first, then keywords, to the main metadata dataframe (joined on 'id').
# NOTE(review): ids repeated in credits.csv / keywords.csv would multiply rows here --
# confirm whether the inputs need de-duplicating before the merge.
df = df.merge(cred_df, on='id').merge(key_df, on='id')

In [9]:
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,overview,id,cast,crew,keywords
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


## Transforming

In [10]:
# convert release date to pandas datetime format; unparseable or missing dates become NaT
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# extract the year with the datetime accessor (NaT rows yield NaN).
# The previous `x != np.nan` guard was always True, so it never filtered anything.
df['year'] = df['release_date'].dt.year

# function to convert 'year' into integers; missing / unparseable years become 0
def convert_int(x):
    """Convert ``x`` to ``int``, returning 0 when the conversion is not possible.

    Parameters
    ----------
    x : object
        A year value (number or numeric string), or a missing marker such as NaN / 'NaT'.

    Returns
    -------
    int
        The integer value, or 0 for values ``int()`` rejects.
    """
    try:
        return int(x)
    # Catch conversion failures only, so unrelated errors and interrupts still surface.
    except (ValueError, TypeError, OverflowError):
        return 0
    
# Cast every year to int (0 marks an unknown year), then discard the raw date column
df = df.assign(year=df['year'].apply(convert_int)).drop(columns='release_date')

In [11]:
# Convert the stringified list of genre dicts into a plain list of genre names.
# (`literal_eval` is imported with the other imports at the top of the notebook.)
df['genres'] = (
    df['genres']
    .fillna('[]')          # missing genres -> empty-list literal
    .apply(literal_eval)   # string -> Python object (list of dicts)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

In [12]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,overview,id,cast,crew,keywords,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",1995


In [13]:
# Parse the stringified cast / crew / keywords columns into lists of dicts
features = ['cast', 'crew', 'keywords']
df = df.assign(**{name: df[name].apply(literal_eval) for name in features})

In [14]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [15]:
# Pull the director's name out of a movie's crew list
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    `x` is a list of crew dicts, each carrying at least the 'job' and 'name' keys.
    Returns ``np.nan`` when no crew member is a director.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    # The generator is lazy, so scanning stops at the first match.
    return next(director_names, np.nan)

# One director name (or NaN) per movie
df['director'] = df['crew'].map(get_director)
df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [16]:
# Extract only the leading names (three by default) from a list of dicts
def generate_list(x, top_n=3):
    """Return the 'name' of the first `top_n` entries of `x`.

    Parameters
    ----------
    x : list of dict or any
        Parsed cast / keywords list whose entries carry a 'name' key.
        Anything that is not a list (e.g. NaN) is treated as "no data".
    top_n : int or None, default 3
        Maximum number of names to keep (non-negative); ``None`` keeps all of them.

    Returns
    -------
    list of str
        At most `top_n` names in their original order; ``[]`` for non-list input.
    """
    if not isinstance(x, list):
        return []
    # Slice first so only the entries that are kept need a 'name' key.
    return [ele['name'] for ele in x[:top_n]]

# Keep at most three cast members, keywords and genres per movie
df = df.assign(
    cast=df['cast'].apply(generate_list),
    keywords=df['keywords'].apply(generate_list),
    genres=df['genres'].apply(lambda names: names[:3]),
)

In [17]:
# The raw crew lists are not needed once the director has been extracted
df = df.drop(columns='crew')
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,overview,id,cast,keywords,year,director
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",862,"[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]",1995,John Lasseter
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...",1995,Joe Johnston
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[fishing, best friend, duringcreditsstinger]",1995,Howard Deutch
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",31357,"[Whitney Houston, Angela Bassett, Loretta Devine]","[based on novel, interracial relationship, sin...",1995,Forest Whitaker
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,Just when George Banks has recovered from his ...,11862,"[Steve Martin, Diane Keaton, Martin Short]","[baby, midlife crisis, confidence]",1995,Charles Shyer


In [18]:
# Remove spaces inside names so each person becomes one unique token,
# e.g. "Ryan Gosling" -> "ryangosling" and "Ryan Reynolds" -> "ryanreynolds"
def sanitize(x):
    """Lower-case `x` and strip the spaces out of it.

    A list (cast, genres, keywords) is processed element by element, a string
    (director) is processed directly, and anything else yields ``''``.
    """
    def squash(text):
        # drop spaces, then lower-case
        return text.replace(" ", "").lower()

    if isinstance(x, list):
        return [squash(item) for item in x]
    if isinstance(x, str):
        return squash(x)
    return ''
        
# Normalise every text feature that goes into the soup
for feature in ('cast', 'director', 'genres', 'keywords'):
    df[feature] = df[feature].map(sanitize)

## Vectorizing

Using CountVectorizer instead of TF-IDF because influential actors and directors appear in many movies, and TF-IDF would down-weight them for exactly that reason.

Also using cosine similarity instead of the faster linear kernel (dot product) because the CountVectorizer vectors are not normalized to unit length, so a plain dot product would not equal the cosine similarity.

In [19]:
# Build the metadata "soup": one space-separated string per movie for the vectorizer
def create_soup(x):
    """Concatenate a row's keywords, cast, director and genres into one string.

    `x` is a row (or any mapping) with list-valued 'keywords', 'cast' and 'genres'
    and a string-valued 'director'. The four parts are separated by single spaces.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

# Build one soup string per movie (row-wise apply)
df['soup'] = df.apply(create_soup, axis='columns')

In [20]:
df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [21]:
# randomly sampling to overcome memory issues later on
# frac=0.5 keeps half of the rows; random_state=123 fixes the draw so reruns select the same rows.
# reset_index() renumbers the sampled rows from 0 (the old row labels are kept in an 'index' column).
df_slice = df.sample(frac=0.5, random_state=123).reset_index()

In [22]:
#Define a new CountVectorizer object and create vectors for the soup
# stop_words='english' applies the vectorizer's built-in English stop-word list
count = CountVectorizer(stop_words='english')
# learn the vocabulary from the sampled soups and encode each soup as a row of token counts
count_matrix = count.fit_transform(df_slice['soup'])
# (number of sampled movies, vocabulary size)
count_matrix.shape

(23314, 48414)

In [23]:
# Pairwise cosine similarity between all soup vectors; with a single argument
# the matrix is compared against itself, giving one row and one column per movie.
cosine_sim2 = cosine_similarity(count_matrix)

In [24]:
# create reverse index of movie titles (title -> row position in df_slice)
# drop=True keeps this cell idempotent: a plain reset_index() inserts a 'level_0' column
# on the first run and raises ValueError on a re-run because that column already exists.
df_slice = df_slice.reset_index(drop=True)
indices = pd.Series(df_slice.index, index=df_slice['title'])

In [25]:
# The recommender: look a title up and return its most similar movies
def movie_recommender(title, cosine_sim=None, movies=None, indices=None, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in `indices`.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the notebook-level ``cosine_sim2``.
    movies : pandas.DataFrame, optional
        Frame with a 'title' column whose row positions match `cosine_sim`.
        Defaults to the notebook-level ``df_slice``.
    indices : pandas.Series, optional
        Maps title -> row position. Defaults to the notebook-level ``indices``.
    top_n : int, default 10
        Maximum number of recommendations to return (non-negative).

    Returns
    -------
    pandas.Series
        Titles of up to `top_n` most similar movies, best match first. The queried
        movie itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Resolve the notebook-level objects at call time rather than at definition time,
    # so re-running the sampling / similarity cells cannot leave stale defaults behind.
    if cosine_sim is None:
        cosine_sim = cosine_sim2
    if movies is None:
        movies = df_slice
    if indices is None:
        # the parameter shadows the notebook variable of the same name
        indices = globals()['indices']

    if title not in indices.index:
        raise KeyError(f"{title!r} is not in the movie index")

    # get the index of the title; several rows can share a title, in which case
    # the lookup returns a Series and the first matching row is used
    ind = indices[title]
    if isinstance(ind, pd.Series):
        ind = ind.iloc[0]
    ind = int(ind)

    # pairwise scores with all movies
    scores = np.asarray(cosine_sim[ind], dtype=float)

    # Descending order; the stable sort keeps tied scores in row order
    order = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly: it is not guaranteed to be ranked first
    # when another movie has an identical vector (similarity 1.0).
    order = order[order != ind][:top_n]

    # return the movie titles
    return movies['title'].iloc[order]

In [26]:
movie_recommender('Stuart Little')

6329                            Stuart Little 2
10748               A Journey Through Fairyland
6291                    Daishizen no Majū: Bagi
10631        VeggieTales: Josh and the Big Wall
1955                  Atlantis: The Lost Empire
17913            The Year Without a Santa Claus
21942                             Impy's Island
12917                       The Tale of the Fox
5837                               A.R.C.H.I.E.
13550    Homeward Bound: The Incredible Journey
Name: title, dtype: object

In [27]:
movie_recommender('Invincible Shaolin')

22735             Masked Avengers
6324            The Flying Dagger
11878     The One-Armed Swordsman
2846                     Redeemer
7491               Shaolin Temple
8859                     Headshot
19214              Raging Phoenix
20279               Hand of Death
20444                  Death Duel
455      Last Hurrah for Chivalry
Name: title, dtype: object

In [28]:
movie_recommender('The Exorcist')

7270               Darkness Falls
17234                 Getting Out
18093         Aashiq Banaya Aapne
10203    When the Lights Went Out
10840           The Last Exorcism
21910              The Invitation
9996                          III
4753           The Least of These
8456                 The Cabining
9667                    Last Exit
Name: title, dtype: object

### More scope for experiments:
1. With number of keywords, genres, and cast <br>
2. Make definite sub-genres as a sub-genre might only belong to one movie <br>
3. More weight to the director, perhaps, by repeating the name ‘n’ times <br>
4. Other metadata like production companies, countries, languages <br>