In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval

# Load the movie metadata table that every later cell works on.
# NOTE(review): low_memory=False presumably avoids chunked, mixed-dtype
# column inference while parsing — confirm against the pandas docs.
metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)

# Using Overview Cosine Similarity

In [2]:
# Turn each movie overview into a TF-IDF vector, ignoring English stop words.
# NOTE(review): `vecotrizer` is a typo of "vectorizer"; it is left as-is here
# because it is a module-level name other cells may reference.
vecotrizer = TfidfVectorizer(stop_words='english')
# Missing overviews become empty documents so fit_transform gets only strings.
corpus = metadata['overview'].fillna('')
tfidf = vecotrizer.fit_transform(corpus)
# Displayed as (number of movies, vocabulary size).
tfidf.shape

(45466, 75827)

In [3]:
# Pairwise dot products between all TF-IDF rows. TfidfVectorizer is used with
# its default row normalisation above, so (assuming that default is L2 — see
# the scikit-learn docs) these dot products are cosine similarities.
# NOTE(review): the result is an n x n matrix; with the 45466 rows shown above
# a dense float64 result would need roughly 16.5 GB — confirm this fits in RAM.
cos_sim = linear_kernel(tfidf, tfidf)

In [12]:
# Lookup Series: index = movie title, value = that row's index label in `metadata`.
# NOTE(review): this is a snapshot of `metadata` at the time the cell runs; it
# must be rebuilt whenever `metadata` is reassigned (e.g. after a merge),
# otherwise the stored labels no longer match the frame's rows.
movie_indices = pd.Series(metadata.index, metadata['title'])
def get_recommendataions(title, sim_matrix, num_recomm, exclude_self=False):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``metadata['title']``. When several rows
        share the same title, the first matching row is used.
    sim_matrix : array-like of shape (len(metadata), len(metadata))
        Pairwise similarity scores. Row ``i`` must describe row ``i`` (by
        position) of the current ``metadata`` frame.
    num_recomm : int
        Number of titles to return.
    exclude_self : bool, default False
        When True, the queried row is removed from the ranking. The default
        keeps the historical behaviour, where the queried movie itself
        usually comes first because its self-similarity is the maximum.

    Returns
    -------
    pandas.Series
        Titles ordered by decreasing similarity; rows with equal scores keep
        their original row order.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``metadata['title']``.
    ValueError
        If ``sim_matrix`` and ``metadata`` do not have the same number of rows.
    """
    # A similarity matrix computed from an older version of `metadata` would
    # silently rank the wrong movies, so fail loudly instead.
    if len(sim_matrix) != len(metadata):
        raise ValueError(
            'sim_matrix has {0} rows but metadata has {1}; recompute the '
            'similarity matrix for the current metadata'.format(
                len(sim_matrix), len(metadata)))

    # Resolve the row by *position* in the current frame rather than through
    # the module-level `movie_indices` snapshot: the snapshot goes stale when
    # `metadata` is rebuilt, and it yields a Series (not a scalar) for
    # duplicated titles, which used to break the ranking below.
    matches = np.flatnonzero((metadata['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    position = int(matches[0])

    scores = np.asarray(sim_matrix[position], dtype=float).ravel()
    # Stable sort on the negated scores: descending order, ties stay in row
    # order (same ordering as a stable sorted(..., reverse=True)).
    ranking = np.argsort(-scores, kind='stable')
    if exclude_self:
        ranking = ranking[ranking != position]

    return metadata.iloc[ranking[:num_recomm]]['title']

In [5]:
# Ten closest movies by overview text; the queried title itself is listed
# first in the output below.
get_recommendataions('The Godfather', cos_sim, 10)

834                         The Godfather
1178               The Godfather: Part II
44030    The Godfather Trilogy: 1972-1990
1914              The Godfather: Part III
23126                          Blood Ties
11297                    Household Saints
34717                   Start Liquidation
10821                            Election
38030            A Mother Should Be Loved
17729                   Short Sharp Shock
Name: title, dtype: object

# Content-based Recommendation

In [2]:
# Add more content
# Keep only rows whose id is a plain decimal number and store the id as int so
# it is a clean join key. Filtering rows explicitly (instead of assigning a
# filtered column back, which re-aligns on the index and leaves NaN/float ids)
# actually removes the malformed rows; going through astype(str) keeps the
# check from failing on ids that are not strings.
numeric_id = metadata['id'].astype(str).str.isdecimal()
metadata = metadata[numeric_id].astype({'id': int})
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

# Inner joins on 'id': only movies present in all three tables survive.
# NOTE(review): the merged frame has more rows than the original metadata,
# which suggests repeated ids in credits/keywords multiply rows — consider
# de-duplicating on 'id' if repeated movies in the results are unwanted.
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Transform credits and keywords
# These columns hold Python-literal strings; parse them into real lists of
# dicts. Not idempotent: re-running on already-parsed lists will fail.
for feature in ['cast', 'crew', 'keywords', 'genres']:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [3]:
# Utility methods for feature engineering
def get_director(crew):
    """Return the name of the first crew member whose job is 'Director'.

    ``crew`` is an iterable of dicts with at least 'job' and 'name' keys.
    ``np.nan`` is returned when nobody in the crew is a director.
    """
    directors = [member for member in crew if member['job'] == 'Director']
    if not directors:
        return np.nan
    return directors[0]['name']

def get_list(data, field_name, limit=3):
    """Collect ``field_name`` from each dict in ``data`` and keep the first few.

    Parameters
    ----------
    data : iterable of dict
        Records to read from; every record must contain ``field_name``.
    field_name : str
        Key to extract from each record.
    limit : int or None, default 3
        Maximum number of values to return (``None`` returns all of them).
        The default matches the previously hard-coded cut-off of three.

    Returns
    -------
    list
        At most ``limit`` extracted values, in their original order.
    """
    values = [entry[field_name] for entry in data]
    return values[:limit]

def clean_str(data):
    """Lower-case ``data`` and strip its spaces; missing values become ''.

    A value counts as missing when it is ``None`` or any float NaN. NaN is
    detected by value (``x != x``) rather than by identity with ``np.nan``,
    because a NaN that went through pandas/NumPy is not guaranteed to be the
    very same object as ``np.nan``.
    """
    if data is None or (isinstance(data, float) and data != data):
        return ''
    return data.replace(' ', '').lower()

def clean_str_list(data):
    """Return a new list with every string lower-cased and its spaces removed."""
    cleaned = []
    for text in data:
        cleaned.append(text.replace(' ', '').lower())
    return cleaned

In [4]:
# Director: pull the director's name out of the crew list, then normalise it
# (lower-case, no spaces; missing director -> '').
metadata['director'] = metadata['crew'].apply(get_director).apply(clean_str)

# cast / keywords / genres: reduce each list of dicts to the leading 'name'
# values that get_list keeps (three by default), then normalise every name.
for feature in ('cast', 'keywords', 'genres'):
    metadata[feature] = (
        metadata[feature]
        .apply(get_list, field_name='name')
        .apply(clean_str_list)
    )

metadata[['original_title', 'director', 'cast', 'keywords', 'genres']]

Unnamed: 0,original_title,director,cast,keywords,genres
0,Toy Story,johnlasseter,"[tomhanks, timallen, donrickles]","[jealousy, toy, boy]","[animation, comedy, family]"
1,Jumanji,joejohnston,"[robinwilliams, jonathanhyde, kirstendunst]","[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,Grumpier Old Men,howarddeutch,"[waltermatthau, jacklemmon, ann-margret]","[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"
3,Waiting to Exhale,forestwhitaker,"[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]"
4,Father of the Bride Part II,charlesshyer,"[stevemartin, dianekeaton, martinshort]","[baby, midlifecrisis, confidence]",[comedy]
...,...,...,...,...,...
46623,رگ خواب,hamidnematollah,"[leilahatami, kouroshtahami, elhamkorda]",[tragiclove],"[drama, family]"
46624,Siglo ng Pagluluwal,lavdiaz,"[angelaquino, perrydizon, hazelorencio]","[artist, play, pinoy]",[drama]
46625,Betrayal,markl.lester,"[erikaeleniak, adambaldwin, juliedupage]",[],"[action, drama, thriller]"
46626,Satana likuyushchiy,yakovprotazanov,"[iwanmosschuchin, nathalielissenko, pavelpavlov]",[],[]


In [42]:
def create_soup(data):
    """Build the text "soup" for one movie row.

    The keywords, cast and genres lists are each joined with spaces and
    followed by one space; the director string is appended after one more
    space. ``data`` is any mapping (e.g. a DataFrame row) with those keys.
    """
    chunks = [
        ' '.join(data[feature]) + ' '
        for feature in ('keywords', 'cast', 'genres')
    ]
    return ''.join(chunks) + ' ' + data['director']

In [43]:
# Build one "soup" string per movie (row-wise apply), then count token
# occurrences with English stop words removed.
metadata['soup'] = metadata.apply(create_soup, axis=1)
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(metadata['soup'])
# Displayed as (number of movies, vocabulary size).
count_matrix.shape

(46628, 73881)

In [44]:
# Pairwise cosine similarity between the soup count vectors.
# NOTE: this rebinds `cos_sim`, replacing the overview-based matrix computed
# earlier; the matrix is n x n over the merged metadata rows.
cos_sim = cosine_similarity(count_matrix, count_matrix)

In [47]:
# Ten closest movies by keywords/cast/genres/director; the queried title
# itself is listed first in the output below.
get_recommendataions('The Godfather', cos_sim, 10)

841                       The Godfather
1934            The Godfather: Part III
1199             The Godfather: Part II
15609                   The Rain People
18940                         Last Exit
34488                              Rege
35802            Manuscripts Don't Burn
35803            Manuscripts Don't Burn
8001     The Night of the Following Day
18261                 The Son of No One
Name: title, dtype: object