In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('movie_metadata.csv')

In [3]:
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [4]:
df.shape

(5043, 28)

In [5]:
df.columns[df.dtypes == 'O']

Index(['color', 'director_name', 'actor_2_name', 'genres', 'actor_1_name',
       'movie_title', 'actor_3_name', 'plot_keywords', 'movie_imdb_link',
       'language', 'country', 'content_rating'],
      dtype='object')

In [6]:
df.columns[df.dtypes != 'O']

Index(['num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross',
       'num_voted_users', 'cast_total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'budget', 'title_year',
       'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio',
       'movie_facebook_likes'],
      dtype='object')

In [7]:
#Keeping only required columns for recommendation purpose
keep_cols = ['director_name', 'genres', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'movie_title', 'plot_keywords']

In [8]:
df = df.loc[:, keep_cols]

In [9]:
df.head()

Unnamed: 0,director_name,genres,actor_1_name,actor_2_name,actor_3_name,movie_title,plot_keywords
0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Joel David Moore,Wes Studi,Avatar,avatar|future|marine|native|paraplegic
1,Gore Verbinski,Action|Adventure|Fantasy,Johnny Depp,Orlando Bloom,Jack Davenport,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...
2,Sam Mendes,Action|Adventure|Thriller,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Spectre,bomb|espionage|sequel|spy|terrorist
3,Christopher Nolan,Action|Thriller,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...
4,Doug Walker,Documentary,Doug Walker,Rob Walker,,Star Wars: Episode VII - The Force Awakens ...,


In [10]:
df.shape

(5043, 7)

In [11]:
#checking missing Values
df.isnull().sum(axis = 0).sort_values(ascending = False)

plot_keywords    153
director_name    104
actor_3_name      23
actor_2_name      13
actor_1_name       7
movie_title        0
genres             0
dtype: int64

In [12]:
type(df['plot_keywords'][0])

str

In [13]:
#filling missing values with 'unknown' string
for col in ['plot_keywords', 'director_name', 'actor_3_name', 'actor_2_name', 'actor_1_name']:
    df[col] = df[col].replace(np.nan, 'unknown')

In [14]:
#combining actors into cast column
def join_actors(x):
    return ''.join(x['actor_1_name']) + ',' + ''.join(x['actor_2_name']) + ',' +  ''.join(x['actor_3_name'])

In [15]:
col = ['actor_1_name', 'actor_2_name', 'actor_3_name']
df['cast'] = df[col].apply(join_actors, axis = 1)

In [16]:
df['cast'] = df['cast'].apply(lambda x : [str.lower(word.replace(' ', ''))for word in x.split(',')])

In [17]:
df.drop(['actor_1_name', 'actor_2_name', 'actor_3_name'], axis = 1, inplace = True)

In [18]:
df.head()

Unnamed: 0,director_name,genres,movie_title,plot_keywords,cast
0,James Cameron,Action|Adventure|Fantasy|Sci-Fi,Avatar,avatar|future|marine|native|paraplegic,"[cchpounder, joeldavidmoore, wesstudi]"
1,Gore Verbinski,Action|Adventure|Fantasy,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,"[johnnydepp, orlandobloom, jackdavenport]"
2,Sam Mendes,Action|Adventure|Thriller,Spectre,bomb|espionage|sequel|spy|terrorist,"[christophwaltz, rorykinnear, stephaniesigman]"
3,Christopher Nolan,Action|Thriller,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,"[tomhardy, christianbale, josephgordon-levitt]"
4,Doug Walker,Documentary,Star Wars: Episode VII - The Force Awakens ...,unknown,"[dougwalker, robwalker, unknown]"


In [19]:
df['cast'][0]

['cchpounder', 'joeldavidmoore', 'wesstudi']

In [20]:
for col in ['genres', 'plot_keywords']:
    df[col] = df[col].apply(lambda x : [str.lower(word.replace(' ', '')) for word in x.split('|')])

In [21]:
df.head()

Unnamed: 0,director_name,genres,movie_title,plot_keywords,cast
0,James Cameron,"[action, adventure, fantasy, sci-fi]",Avatar,"[avatar, future, marine, native, paraplegic]","[cchpounder, joeldavidmoore, wesstudi]"
1,Gore Verbinski,"[action, adventure, fantasy]",Pirates of the Caribbean: At World's End,"[goddess, marriageceremony, marriageproposal, ...","[johnnydepp, orlandobloom, jackdavenport]"
2,Sam Mendes,"[action, adventure, thriller]",Spectre,"[bomb, espionage, sequel, spy, terrorist]","[christophwaltz, rorykinnear, stephaniesigman]"
3,Christopher Nolan,"[action, thriller]",The Dark Knight Rises,"[deception, imprisonment, lawlessness, policeo...","[tomhardy, christianbale, josephgordon-levitt]"
4,Doug Walker,[documentary],Star Wars: Episode VII - The Force Awakens ...,[unknown],"[dougwalker, robwalker, unknown]"


In [22]:
df['movie_title'] = df['movie_title'].str[:-1]
df['movie_title'] = df['movie_title'].str.lower()

In [23]:
df['director_name'] = df['director_name'].apply(lambda x : [str.lower(x.replace(' ', ''))])

In [24]:
#Selection of first 3 plot keywords and genres
def genre_keywords(x):
    if isinstance(x, list):
        names = [i for i in x]
        if len(names) > 3:
            return names[:3]
        return names
    return []

In [25]:
for col in ['genres', 'plot_keywords']:
    df[col] = df[col].apply(genre_keywords)

In [26]:
df.head()

Unnamed: 0,director_name,genres,movie_title,plot_keywords,cast
0,[jamescameron],"[action, adventure, fantasy]",avatar,"[avatar, future, marine]","[cchpounder, joeldavidmoore, wesstudi]"
1,[goreverbinski],"[action, adventure, fantasy]",pirates of the caribbean: at world's end,"[goddess, marriageceremony, marriageproposal]","[johnnydepp, orlandobloom, jackdavenport]"
2,[sammendes],"[action, adventure, thriller]",spectre,"[bomb, espionage, sequel]","[christophwaltz, rorykinnear, stephaniesigman]"
3,[christophernolan],"[action, thriller]",the dark knight rises,"[deception, imprisonment, lawlessness]","[tomhardy, christianbale, josephgordon-levitt]"
4,[dougwalker],[documentary],star wars: episode vii - the force awakens ...,[unknown],"[dougwalker, robwalker, unknown]"


In [27]:
#Joining the features into one feature - soup
def soup(x):
    return ' '.join(x['director_name']) + ' ' + ' '.join(x['genres']) + ' ' + ' '.join(x['plot_keywords']) + ' ' + ' '.join(x['cast'])

In [28]:
type(df['director_name'][0])

list

In [29]:
type(df['genres'][0])

list

In [30]:
df['soup'] = df.apply(soup, axis = 1)

In [31]:
df.head()

Unnamed: 0,director_name,genres,movie_title,plot_keywords,cast,soup
0,[jamescameron],"[action, adventure, fantasy]",avatar,"[avatar, future, marine]","[cchpounder, joeldavidmoore, wesstudi]",jamescameron action adventure fantasy avatar f...
1,[goreverbinski],"[action, adventure, fantasy]",pirates of the caribbean: at world's end,"[goddess, marriageceremony, marriageproposal]","[johnnydepp, orlandobloom, jackdavenport]",goreverbinski action adventure fantasy goddess...
2,[sammendes],"[action, adventure, thriller]",spectre,"[bomb, espionage, sequel]","[christophwaltz, rorykinnear, stephaniesigman]",sammendes action adventure thriller bomb espio...
3,[christophernolan],"[action, thriller]",the dark knight rises,"[deception, imprisonment, lawlessness]","[tomhardy, christianbale, josephgordon-levitt]",christophernolan action thriller deception imp...
4,[dougwalker],[documentary],star wars: episode vii - the force awakens ...,[unknown],"[dougwalker, robwalker, unknown]",dougwalker documentary unknown dougwalker robw...


In [32]:
df['soup'][0]

'jamescameron action adventure fantasy avatar future marine cchpounder joeldavidmoore wesstudi'

In [33]:
df['soup'][1]

'goreverbinski action adventure fantasy goddess marriageceremony marriageproposal johnnydepp orlandobloom jackdavenport'

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
countvect = CountVectorizer(stop_words = 'english')
countmatrix = countvect.fit_transform(df['soup'])

In [36]:
countmatrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
countmatrix.shape

(5043, 14187)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
cosine_sim = cosine_similarity(countmatrix, countmatrix)

In [40]:
cosine_sim[0]

array([1. , 0.3, 0.2, ..., 0. , 0. , 0. ])

In [41]:
cosine_sim.shape

(5043, 5043)

In [42]:
indices = pd.Series(df.index, index = df['movie_title'])

In [43]:
indices.shape

(5043,)

In [44]:
indices.head()

movie_title
avatar                                                    0
pirates of the caribbean: at world's end                  1
spectre                                                   2
the dark knight rises                                     3
star wars: episode vii - the force awakens                4
dtype: int64

In [45]:
df['movie_title'][0]

'avatar'

In [46]:
print(indices['avatar'])

0


In [47]:
def get_recommendations(title, cosine_sim):
    
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    sim_scores = sorted(sim_scores, key = lambda x : x[1], reverse = True)
    
    movie_indices = [score[0] for score in sim_scores]
    
    return df['movie_title'].iloc[movie_indices[:10]]

In [48]:
get_recommendations('spectre', cosine_sim)

2                         spectre
30                        skyfall
3493                      skyfall
286                 casino royale
365     die hard with a vengeance
862            executive decision
2944                casino royale
12              quantum of solace
299             the expendables 2
347        a good day to die hard
Name: movie_title, dtype: object