In [47]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise as pair
from scipy import spatial
import warnings; warnings.simplefilter('ignore')

In [75]:
# Load the two raw inputs: the cast/crew credits and the per-movie metadata.
credits = pd.read_csv('dataset/credits.csv')
movie_data = pd.read_csv('dataset/movies_metadata.csv')

In [76]:
# Preview the first rows of the raw metadata to see which columns hold stringified dicts/lists.
movie_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [77]:
# Preview the credits table (cast and crew payloads plus the movie id).
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [78]:
# Count missing values per column in credits.
credits.isnull().sum()

cast    0
crew    0
id      0
dtype: int64

In [79]:
# Count missing values per column in the movie metadata.
movie_data.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [80]:
# Convert each stringified genre payload into a list of genre-name strings.
parsed_genres = movie_data['genres'].fillna('[]').apply(literal_eval)
movie_data['genres'] = parsed_genres.apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)
# Distinct genre names in order of first appearance (dict keys keep insertion order).
genre_list = list(dict.fromkeys(
    name for names in movie_data['genres'] for name in names
))
print(len(genre_list))

32


In [81]:
def find_director(crew):
    """Return the name of the first crew member whose job is 'Director'.

    ``crew`` is iterated entry by entry; each entry is indexed by 'job' and,
    for the matching entry, by 'name'. When no entry has the job 'Director'
    (including an empty ``crew``), ``np.nan`` is returned.
    """
    director_names = (member['name'] for member in crew if member['job'] == 'Director')
    return next(director_names, np.nan)

In [82]:
# Clean the ids, flatten the collection/cast/crew payloads and attach credits to each movie.

# Rows whose id is not numeric cannot be joined to credits. Detect them by value
# rather than by hard-coded row labels, so the cell keeps working when the index
# changes (e.g. a refreshed CSV, or a re-run after the merge below has reset it).
# NOTE(review): assumes the previously hard-coded rows 19730, 29503 and 35587 are
# exactly the rows with a non-numeric id - confirm against the dataset.
numeric_id = pd.to_numeric(movie_data['id'], errors='coerce')
movie_data = movie_data[numeric_id.notna()].copy()
movie_data['id'] = movie_data['id'].astype('int')

# Parse the collection payload once and derive both columns from it.
# 0 / NaN mark movies that do not belong to any collection.
collection = movie_data['belongs_to_collection'].fillna('[]').apply(literal_eval)
movie_data['collection'] = collection.apply(lambda c: c['id'] if isinstance(c, dict) else 0)
# np.nan instead of np.NAN: the upper-case alias was removed in NumPy 2.0.
movie_data['collection_name'] = collection.apply(lambda c: c['name'] if isinstance(c, dict) else np.nan)

credits['id'] = credits['id'].astype('int')
# NOTE(review): an inner merge multiplies rows when an id occurs more than once in
# either frame - verify whether duplicate ids should be dropped before merging.
movie_data = movie_data.merge(credits, on='id')

# Keep the first three cast names, normalised to lower case without spaces so that
# each person becomes a single token (e.g. "Tom Hanks" -> "tomhanks").
movie_data['cast'] = (
    movie_data['cast']
    .apply(literal_eval)
    .apply(lambda members: [m['name'] for m in members] if isinstance(members, list) else [])
    .apply(lambda names: [name.replace(" ", "").lower() for name in names[:3]])
)

movie_data['crew'] = movie_data['crew'].apply(literal_eval)
# A missing director becomes an empty string. Casting NaN with astype('str') produced
# the literal token "nan", which every director-less movie then shared as if it were
# a common director.
movie_data['director'] = (
    movie_data['crew']
    .apply(find_director)
    .fillna('')
    .apply(lambda name: name.replace(" ", "").lower())
)

In [83]:
# Row/column count after the merge with credits and the derived columns.
movie_data.shape

(45538, 29)

In [84]:
# Spot-check one row to confirm genres, cast and director were flattened as intended.
movie_data.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,collection,collection_name,cast,crew,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,10194,Toy Story Collection,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",johnlasseter


In [85]:
# Keep only the columns the recommender uses; every other column is discarded.
# The surviving columns stay in their existing order.
keep_columns = ['genres','id','collection','cast','director',"title","collection_name"]
movie_data = movie_data.loc[:, movie_data.columns.isin(keep_columns)].copy()

In [86]:
# Preview the trimmed frame that feeds the similarity computation.
movie_data.head()

Unnamed: 0,genres,id,title,collection,collection_name,cast,director
0,"[Animation, Comedy, Family]",862,Toy Story,10194,Toy Story Collection,"[tomhanks, timallen, donrickles]",johnlasseter
1,"[Adventure, Fantasy, Family]",8844,Jumanji,0,,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston
2,"[Romance, Comedy]",15602,Grumpier Old Men,119050,Grumpy Old Men Collection,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch
3,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,0,,"[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker
4,[Comedy],11862,Father of the Bride Part II,96871,Father of the Bride Collection,"[stevemartin, dianekeaton, martinshort]",charlesshyer


In [87]:
# Persist the trimmed metadata to processed_metadata.csv (path is relative to the working directory).
movie_data.to_csv('processed_metadata.csv')

In [33]:
# Build one space-separated token string per movie from cast, director and genres,
# then turn it into unigram/bigram counts.

# The director token is repeated so it weighs more than a single cast member or genre.
DIRECTOR_WEIGHT = 3
# Keep movie_data['director'] as a plain string instead of overwriting it with a
# list: overwriting made this cell fail on a second run (nested lists cannot be joined).
director_tokens = movie_data['director'].apply(lambda name: [name] * DIRECTOR_WEIGHT)
movie_data['meta_combined'] = (
    movie_data['cast'] + director_tokens + movie_data['genres']
).apply(' '.join)

# min_df=1 keeps every term, exactly as min_df=0 did, but is accepted by scikit-learn
# releases that validate parameters (an integer min_df must be >= 1).
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = vectorizer.fit_transform(movie_data['meta_combined'])

In [34]:
# Cosine similarity between every pair of rows of count_matrix (one row per movie).
# NOTE(review): the result is a dense square matrix with one row and column per
# movie - confirm the memory footprint is acceptable for the full dataset.
similarity = pair.cosine_similarity(count_matrix)
# Zero the diagonal in place - presumably so a movie is never ranked as its own nearest neighbour.
np.fill_diagonal(similarity,0)

In [35]:
# Sanity check: the matrix should be square, with one row/column per movie.
similarity.shape

(45538, 45538)

In [36]:
# Boost the similarity between movies that belong to the same collection.
# NOTE: the scaling is applied in place, so running this cell twice compounds the boost.
COLLECTION_BOOST = 1.2

# GroupBy.indices maps each collection id to the *positional* row numbers of its
# members, which is what the NumPy similarity matrix is indexed by. The previous
# version used index labels as positions, which is only correct while the frame
# has a default 0..n-1 index, and rescanned the whole frame for every row.
for collection_id, members in movie_data.groupby('collection').indices.items():
    if collection_id == 0:
        # 0 is the placeholder for movies that are not part of any collection.
        continue
    # Scale the whole members-by-members block once; each (i, j) pair is boosted
    # exactly one time, and the zeroed diagonal stays zero.
    similarity[np.ix_(members, members)] *= COLLECTION_BOOST

In [39]:
# Label both axes of the similarity matrix with movie ids so lookups yield ids rather than positions.
similarity_df = pd.DataFrame(similarity,index=movie_data['id'],columns=movie_data['id'])

In [40]:
# For every movie (row), collect the ids of the 10 columns with the highest similarity,
# ordered from most to least similar.
top_10_list = [
    scores.nlargest(10).index.tolist()
    for _, scores in similarity_df.iterrows()
]

In [41]:
# Sanity check: there should be one recommendation list per movie row.
len(top_10_list)

45538

In [43]:
# One row per movie id; each column holds one recommended movie id, in the order produced by nlargest.
top_10_df = pd.DataFrame(top_10_list,index=movie_data['id'])

In [44]:
# Preview the recommendation table.
top_10_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
862,863,13925,13927,13926,13928,920,9487,13934,49013,213121
8844,9354,13466,2023,256347,15139,10249,1771,331,7978,671
15602,13596,27472,11522,10393,2617,15143,25269,41579,2122,218931
31357,13969,9715,26479,25269,33644,801,63115,214216,19509,175541
11862,11846,8849,11215,47874,10879,19076,10385,333348,52856,8388


In [45]:
# Check the column dtypes of the recommendation table before it is exported.
top_10_df.dtypes

0    int64
1    int64
2    int64
3    int64
4    int64
5    int64
6    int64
7    int64
8    int64
9    int64
dtype: object

In [46]:
# Export the per-movie recommendation lists to top_10_list.csv (path is relative to the working directory).
top_10_df.to_csv('top_10_list.csv')