In [1]:
import pandas as pd

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from nltk.stem.snowball import SnowballStemmer

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd. read_csv('resources/movies_metadata.csv')
md.shape, md.columns

((45466, 24),
 Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
        'imdb_id', 'original_language', 'original_title', 'overview',
        'popularity', 'poster_path', 'production_companies',
        'production_countries', 'release_date', 'revenue', 'runtime',
        'spoken_languages', 'status', 'tagline', 'title', 'video',
        'vote_average', 'vote_count'],
       dtype='object'))

In [3]:
md.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
md.drop([19730, 29503, 35587], inplace=True)
md['id'] = md['id'].astype('int')

In [5]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [6]:
links_small = pd.read_csv('resources/links_small.csv')
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
links_small.shape

(9125, 3)

In [8]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [9]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 24)

#### Recommendations based on the description of a movie

In [10]:
smd[['tagline', 'overview']] = smd[['tagline', 'overview']].fillna('')
smd['description'] = smd['overview'] + smd['tagline']

In [11]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape

(9099, 268124)

In [12]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(9099, 9099)

In [13]:
smd = smd.reset_index()
indices = pd.Series(smd.index, index=smd.title)

In [14]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
    movie_indices = [i[0] for i in sim_scores]
    result = pd.DataFrame()
    result['title'] = smd['title'].iloc[movie_indices]
    result['cos_sim'] = [i[1] for i in sim_scores]
    return result

In [15]:
get_recommendations('The Godfather')

Unnamed: 0,title,cos_sim
973,The Godfather: Part II,0.22006
8387,The Family,0.100294
3509,Made,0.067618
4196,Johnny Dangerously,0.065622
29,Shanghai Triad,0.056142
5667,Fury,0.056028
2412,American Movie,0.055023
1582,The Godfather: Part III,0.050235
4221,8 Women,0.047508
2159,Summer of Sam,0.045952


In [16]:
get_recommendations('The Dark Knight')

Unnamed: 0,title,cos_sim
7931,The Dark Knight Rises,0.171374
132,Batman Forever,0.122444
1113,Batman Returns,0.10089
8227,"Batman: The Dark Knight Returns, Part 2",0.084762
7565,Batman: Under the Red Hood,0.084197
524,Batman,0.081623
7901,Batman: Year One,0.077807
2579,Batman: Mask of the Phantasm,0.069629
2696,JFK,0.061759
8165,"Batman: The Dark Knight Returns, Part 1",0.060949


#### Recommendations based on the movie metadata

In [17]:
credits = pd.read_csv('resources/credits.csv')
keywords = pd.read_csv('resources/keywords.csv')

In [18]:
credits['id'] = credits['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')

In [19]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [20]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 27)

In [21]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

In [22]:
stemmer = SnowballStemmer('english')

def little_trick(char_sequence):
    return str.lower(char_sequence.replace(" ", "_"))

def get_director(crew):
    return next(([little_trick(person['name'])] * 2 
                 for person in crew if person['job'] == 'Director'), [])

def get_main_actors(cast):
    return [little_trick(i['name']) 
            for i in cast[:3]] if isinstance(cast, list) else []

def process_keywords(raw_keywords):
    return [little_trick(stemmer.stem(i['name'])) 
            for i in raw_keywords] if isinstance(raw_keywords, list) else []

In [23]:
smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(get_main_actors)
smd['keywords'] = smd['keywords'].apply(process_keywords)

In [24]:
smd[['title', 'director', 'cast', 'keywords']].head()

Unnamed: 0,title,director,cast,keywords
0,Toy Story,"[john_lasseter, john_lasseter]","[tom_hanks, tim_allen, don_rickles]","[jealousi, toy, boy, friendship, friend, rival..."
1,Jumanji,"[joe_johnston, joe_johnston]","[robin_williams, jonathan_hyde, kirsten_dunst]","[board_gam, disappear, based_on_children's_boo..."
2,Grumpier Old Men,"[howard_deutch, howard_deutch]","[walter_matthau, jack_lemmon, ann-margret]","[fish, best_friend, duringcreditssting, old_men]"
3,Waiting to Exhale,"[forest_whitaker, forest_whitaker]","[whitney_houston, angela_bassett, loretta_devine]","[based_on_novel, interracial_relationship, sin..."
4,Father of the Bride Part II,"[charles_shyer, charles_shyer]","[steve_martin, diane_keaton, martin_short]","[babi, midlife_crisi, confid, age, daughter, m..."


In [25]:
smd['words_soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['words_soup'] = smd['words_soup'].apply(lambda x: ' '.join(x))

In [26]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2), stop_words='english')
count_matrix = count.fit_transform(smd['words_soup'])
count_matrix.shape

(9219, 119953)

In [27]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim.shape

(9219, 9219)

In [28]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [29]:
get_recommendations('The Dark Knight')

Unnamed: 0,title,cos_sim
8031,The Dark Knight Rises,0.423972
6218,Batman Begins,0.403064
6623,The Prestige,0.233304
7659,Batman: Under the Red Hood,0.222188
8927,Kidnapping Mr. Heineken,0.212708
1134,Batman Returns,0.200132
5943,Thursday,0.193329
1260,Batman & Robin,0.174201
2085,Following,0.174201
2448,Nighthawks,0.160623


In [30]:
get_recommendations('Avatar')

Unnamed: 0,title,cos_sim
8401,Star Trek Into Darkness,0.242091
974,Aliens,0.238904
8724,Jupiter Ascending,0.203653
3216,Dungeons & Dragons,0.197386
3060,Sinbad and the Eye of the Tiger,0.193971
4966,Hercules in New York,0.193971
1668,Return from Witch Mountain,0.19245
4017,Hawk the Slayer,0.19245
1011,The Terminator,0.19213
7265,Dragonball Evolution,0.191663


#### Recomendations for a certain user

In [31]:
ratings = pd.read_csv('resources/ratings_small.csv')
ratings = ratings[ratings.movieId.isin(smd.id)]
ratings.shape

(32131, 4)

In [32]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
10,1,1371,2.5,1260759135
11,1,1405,1.0,1260759203
13,1,2105,4.0,1260759139
15,1,2193,2.0,1260759198
16,1,2294,2.0,1260759108


In [33]:
def get_ext_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores[:len(sim_scores) - 1], 
                        key=lambda x: x[1], 
                        reverse=True)[1:11]
    movie_indices = [i[0] for i in sim_scores]
    
    result = pd.DataFrame()
    result['recommended_movie'] = smd['title'].iloc[movie_indices]
    result['cos_sim'] = [i[1] for i in sim_scores]
    result['root_movie'] = title
    return result

In [34]:
def get_all_recommendations(user_id):
    def get_title(movie_id):
        return smd[smd['id'] == movie_id]['title'].item()
    
    user_info = ratings[(ratings.userId == user_id) & (ratings.rating > 3.0)]
    user_info['movie_title'] = user_info['movieId'].apply(get_title)
    
    dfs = [get_ext_recommendations(x) for x in user_info['movie_title']]
    recommended_movies = pd.concat(dfs)
    
    current_user_ratings = pd.Series(user_info['rating'].values, index=user_info.movie_title)
    recommended_movies['root_movie_rating'] = recommended_movies['root_movie'].apply(
        lambda title: current_user_ratings[title])
    
    return recommended_movies

In [35]:
movies = get_all_recommendations(user_id=3)
movies.sort_values(by='cos_sim', ascending=False).head(10)

Unnamed: 0,recommended_movie,cos_sim,root_movie,root_movie_rating
1748,Frenzy,0.427394,Rope,3.5
1148,Jerry Maguire,0.42339,Say Anything...,4.0
762,Spellbound,0.377075,Rope,3.5
6318,Elizabethtown,0.368514,Say Anything...,4.0
7424,Surrogates,0.364979,Terminator 3: Rise of the Machines,4.5
1751,The Wrong Man,0.361678,Rope,3.5
1011,The Terminator,0.355907,Terminator 3: Rise of the Machines,4.5
2633,Singles,0.344265,Say Anything...,4.0
1777,Murder!,0.339581,Rope,3.5
8918,Every Thing Will Be Fine,0.337963,The Million Dollar Hotel,5.0


In [36]:
movies['score'] = movies['cos_sim'] * movies['root_movie_rating']
movies.sort_values(by='score', ascending=False).head(10)

Unnamed: 0,recommended_movie,cos_sim,root_movie,root_movie_rating,score
1148,Jerry Maguire,0.42339,Say Anything...,4.0,1.693561
8918,Every Thing Will Be Fine,0.337963,The Million Dollar Hotel,5.0,1.689816
4530,The State of Things,0.337963,The Million Dollar Hotel,5.0,1.689816
4481,The American Friend,0.335013,The Million Dollar Hotel,5.0,1.675063
7424,Surrogates,0.364979,Terminator 3: Rise of the Machines,4.5,1.642407
1011,The Terminator,0.355907,Terminator 3: Rise of the Machines,4.5,1.601583
2826,The End of Violence,0.31334,The Million Dollar Hotel,5.0,1.566699
1748,Frenzy,0.427394,Rope,3.5,1.495878
6318,Elizabethtown,0.368514,Say Anything...,4.0,1.474055
8854,Terminator Genisys,0.326164,Terminator 3: Rise of the Machines,4.5,1.467738
