## DataCollection-2017

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import warnings
warnings.filterwarnings("ignore")

In [57]:
# Dataset URL: https://www.kaggle.com/rounakbanik/the-movies-dataset
# Load the credits table; its columns are `cast`, `crew` and `id`.
credits_df = pd.read_csv('credits.csv')

In [58]:
credits_df.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


In [63]:
credits_df.shape

(45476, 3)

In [59]:
# Load the movie metadata table (title, genres, release_date, vote statistics, ...).
meta_df = pd.read_csv('movies_metadata.csv')

In [60]:
meta_df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [61]:
meta_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [64]:
meta_df.shape

(45466, 24)

In [62]:
meta_df['release_date'].head()

0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object

In [8]:
# Convert the 'release_date' strings to datetime values.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
meta_df['release_date'] = pd.to_datetime(meta_df['release_date'], errors='coerce')
meta_df['release_date'].head()

0   1995-10-30
1   1995-12-15
2   1995-12-22
3   1995-12-22
4   1995-02-10
Name: release_date, dtype: datetime64[ns]

In [9]:
# Derive the release year from the parsed date.
# The values come out as floats (e.g. 2017.0) — presumably because rows without a valid date yield NaN.
meta_df['year'] = meta_df['release_date'].dt.year

In [10]:
meta_df['year'].value_counts().sort_index()

1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: year, Length: 135, dtype: int64

## We will extract the records for movies released in 2017 only, as we already have the movies up to the year 2016 in the DataCollection-2016 file.
## This dataset does not have enough data for the movies from 2018, 2019 and 2020, so we will deal with those later.

In [11]:
# Keep only the 2017 releases and the columns needed downstream.
# .copy() makes new_meta_df an independent frame, so the later
# `new_meta_df['id'] = ...` assignment does not write to a slice of meta_df
# (the resulting SettingWithCopyWarning is hidden by warnings.filterwarnings("ignore")).
new_meta_df = meta_df.loc[
    meta_df['year'] == 2017,
    ['genres', 'id', 'title', 'year', 'vote_average', 'vote_count'],
].copy()

In [12]:
new_meta_df.head(3)

Unnamed: 0,genres,id,title,year,vote_average,vote_count
26560,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,6.6,2814.0
26561,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,0.0,0.0
26565,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,0.0,0.0


In [13]:
new_meta_df.shape

(532, 6)

In [14]:
credits_df['id'].dtype

dtype('int64')

In [15]:
new_meta_df['id'].dtype

dtype('O')

In [16]:
# Cast the metadata `id` (object dtype) to integer so it can be merged with credits_df['id'].
# NOTE(review): the recorded run produced int32 here while credits_df['id'] is int64 —
# consider astype('int64') so both merge keys share one dtype; TODO confirm.
new_meta_df['id'] = new_meta_df['id'].astype(int)
new_meta_df['id'].dtype

dtype('int32')

In [17]:
# Attach cast/crew to each 2017 movie; only ids present in both tables are kept.
data_df = new_meta_df.merge(credits_df, how='inner', on='id')

In [18]:
data_df.head(3)

Unnamed: 0,genres,id,title,year,vote_average,vote_count,cast,crew
0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",166426,Pirates of the Caribbean: Dead Men Tell No Tales,2017.0,6.6,2814.0,"[{'cast_id': 1, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4c9cc3a36847f8236a65', 'de..."
1,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",141052,Justice League,2017.0,0.0,0.0,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '55ef66dbc3a3686f1700a52d', 'de..."
2,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",284053,Thor: Ragnarok,2017.0,0.0,0.0,"[{'cast_id': 0, 'character': 'Thor Odinson', '...","[{'credit_id': '56a93fa4c3a36872db001e7a', 'de..."


In [19]:
print(type(data_df['genres'][0]))
print(type(data_df['crew'][0]))
print(type(data_df['cast'][0]))

<class 'str'>
<class 'str'>
<class 'str'>


In [20]:
# Parse the stringified Python literals in these columns into real lists of dicts
for literal_col in ('genres', 'cast', 'crew'):
    data_df[literal_col] = data_df[literal_col].map(ast.literal_eval)

In [21]:
print(type(data_df['genres'][0]))
print(type(data_df['crew'][0]))
print(type(data_df['cast'][0]))

<class 'list'>
<class 'list'>
<class 'list'>


In [22]:
# `genres` is currently a list of {'id', 'name'} dicts per movie; below it is flattened
# into a single space-separated string of genre names.
data_df['genres'][0]

[{'id': 12, 'name': 'Adventure'},
 {'id': 28, 'name': 'Action'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 35, 'name': 'Comedy'}]

In [23]:
def make_genresList(x):
    """Flatten a movie's genre records into one space-separated string.

    Parameters
    ----------
    x : iterable of dict
        Genre records; only each record's ``'name'`` value is used.

    Returns
    -------
    str or float
        The genre names joined by single spaces, in input order, with
        ``'Science Fiction'`` written as ``'Sci-Fi'``. ``np.nan`` when there
        are no names to join.
    """
    names = []
    for genre in x:
        name = genre.get('name')
        if name is None:
            # A record without a name would make str.join raise; skip it.
            continue
        # 'Science Fiction' contains a space, so in a space-separated string it
        # would read as two genres; use the one-word form instead.
        names.append('Sci-Fi' if name == 'Science Fiction' else name)
    if not names:
        # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return " ".join(names)

In [24]:
# Flatten each movie's genre records into one space-separated string
data_df['genres_list'] = data_df['genres'].map(make_genresList)

In [25]:
data_df['genres_list']

0      Adventure Action Fantasy Comedy
1      Action Adventure Fantasy Sci-Fi
2      Action Adventure Fantasy Sci-Fi
3       Action Adventure Comedy Sci-Fi
4             Fantasy Action Adventure
                    ...               
526                     Romance Comedy
527         Crime Comedy Action Family
528    Family Animation Romance Comedy
529               Crime Drama Thriller
530                                NaN
Name: genres_list, Length: 531, dtype: object

In [26]:
# We will be extracting actor1 name, actor2 name and actor3 name from the cast feature,
# and the director name from the crew feature
def get_actor1(x):
    """Return the name of the first cast member.

    Parameters
    ----------
    x : iterable of dict
        Cast records; only each record's ``'name'`` value is used.

    Returns
    -------
    str or float
        The first record's ``'name'`` (``None`` if that record has no name),
        or ``np.nan`` when ``x`` is empty.
    """
    names = [member.get('name') for member in x]
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    return names[0] if names else np.nan

In [27]:
# First cast member of each movie
data_df['actor_1_name'] = data_df['cast'].map(get_actor1)

In [28]:
def get_actor2(x):
    """Return the name of the second cast member.

    Parameters
    ----------
    x : iterable of dict
        Cast records; only each record's ``'name'`` value is used.

    Returns
    -------
    str or float
        The second record's ``'name'`` (``None`` if that record has no name),
        or ``np.nan`` when ``x`` has fewer than two records.
    """
    names = [member.get('name') for member in x]
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    return names[1] if len(names) > 1 else np.nan

In [29]:
# Second cast member of each movie
data_df['actor_2_name'] = data_df['cast'].map(get_actor2)

In [30]:
def get_actor3(x):
    """Return the name of the third cast member.

    Parameters
    ----------
    x : iterable of dict
        Cast records; only each record's ``'name'`` value is used.

    Returns
    -------
    str or float
        The third record's ``'name'`` (``None`` if that record has no name),
        or ``np.nan`` when ``x`` has fewer than three records.
    """
    names = [member.get('name') for member in x]
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    return names[2] if len(names) > 2 else np.nan

In [31]:
# Third cast member of each movie
data_df['actor_3_name'] = data_df['cast'].map(get_actor3)

In [32]:
def get_directors(x):
    """Return the names of all crew members whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew records; each record's ``'job'`` and ``'name'`` values are used.

    Returns
    -------
    str or float
        Director names joined by single spaces, in input order, or ``np.nan``
        when no record has ``'job' == 'Director'`` and a name.
    """
    # NOTE(review): a later cell strips every space from director_name, so
    # co-directors fuse into one token (e.g. 'joachimrønningespensandberg').
    directors = [
        member.get('name')
        for member in x
        # A director record without a name would make str.join raise; skip it.
        if member.get('job') == 'Director' and member.get('name') is not None
    ]
    if not directors:
        # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return " ".join(directors)

In [33]:
# Director name(s) of each movie, taken from the crew records
data_df['director_name'] = data_df['crew'].map(get_directors)

In [34]:
data_df['director_name']

0      Joachim Rønning Espen Sandberg
1                         Zack Snyder
2                       Taika Waititi
3                          James Gunn
4                       Sean McNamara
                    ...              
526                  Hannaleena Hauru
527             Jonathan A. Rosenbaum
528          Beth David Esteban Bravo
529                      Ravi Udyawar
530                     Daisy Asquith
Name: director_name, Length: 531, dtype: object

In [35]:
data_df.columns

Index(['genres', 'id', 'title', 'year', 'vote_average', 'vote_count', 'cast',
       'crew', 'genres_list', 'actor_1_name', 'actor_2_name', 'actor_3_name',
       'director_name'],
      dtype='object')

In [36]:
# Will be using these features for recommendation
feature_cols = [
    'title',
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'genres_list',
    'vote_average',
    'vote_count',
]
data_df = data_df.loc[:, feature_cols]

In [37]:
data_df.head(3)

Unnamed: 0,title,director_name,actor_1_name,actor_2_name,actor_3_name,genres_list,vote_average,vote_count
0,Pirates of the Caribbean: Dead Men Tell No Tales,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,6.6,2814.0
1,Justice League,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,0.0,0.0
2,Thor: Ragnarok,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,0.0,0.0


In [38]:
# Remove every space from the person-name columns so each full name becomes a single token
for name_col in ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name'):
    data_df[name_col] = data_df[name_col].str.replace(' ', '')

In [39]:
# Lower-case the title and the person-name columns
for text_col in ('title', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name'):
    data_df[text_col] = data_df[text_col].str.lower()

In [40]:
data_df.head(3)

Unnamed: 0,title,director_name,actor_1_name,actor_2_name,actor_3_name,genres_list,vote_average,vote_count
0,pirates of the caribbean: dead men tell no tales,joachimrønningespensandberg,johnnydepp,javierbardem,geoffreyrush,Adventure Action Fantasy Comedy,6.6,2814.0
1,justice league,zacksnyder,benaffleck,henrycavill,galgadot,Action Adventure Fantasy Sci-Fi,0.0,0.0
2,thor: ragnarok,taikawaititi,chrishemsworth,tomhiddleston,cateblanchett,Action Adventure Fantasy Sci-Fi,0.0,0.0


In [41]:
data_df.isnull().sum()

title             0
director_name     4
actor_1_name     22
actor_2_name     55
actor_3_name     70
genres_list       7
vote_average      0
vote_count        0
dtype: int64

In [42]:
# Drop every row that is missing any of the selected features (director, actors or genres).
data_df = data_df.dropna(how='any')

In [43]:
data_df.isnull().sum()

title            0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres_list      0
vote_average     0
vote_count       0
dtype: int64

In [44]:
# Rename the columns to the names used in the till-2016 dataset
data_df = data_df.rename(columns={'genres_list': 'genres', 'title': 'movie_title'})

In [45]:
data_df.head()

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count
0,pirates of the caribbean: dead men tell no tales,joachimrønningespensandberg,johnnydepp,javierbardem,geoffreyrush,Adventure Action Fantasy Comedy,6.6,2814.0
1,justice league,zacksnyder,benaffleck,henrycavill,galgadot,Action Adventure Fantasy Sci-Fi,0.0,0.0
2,thor: ragnarok,taikawaititi,chrishemsworth,tomhiddleston,cateblanchett,Action Adventure Fantasy Sci-Fi,0.0,0.0
3,guardians of the galaxy vol. 2,jamesgunn,chrispratt,zoesaldana,davebautista,Action Adventure Comedy Sci-Fi,7.6,4858.0
4,the king's daughter,seanmcnamara,piercebrosnan,williamhurt,benjaminwalker,Fantasy Action Adventure,0.0,4.0


In [46]:
# This feature will be used later during modelling
# Chain the text columns left to right with a single space between each
comb_source_cols = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name', 'genres']
comb_series = data_df[comb_source_cols[0]]
for next_col in comb_source_cols[1:]:
    comb_series = comb_series + ' ' + data_df[next_col]
data_df['comb'] = comb_series

In [47]:
data_df.head(3)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count,comb
0,pirates of the caribbean: dead men tell no tales,joachimrønningespensandberg,johnnydepp,javierbardem,geoffreyrush,Adventure Action Fantasy Comedy,6.6,2814.0,johnnydepp javierbardem geoffreyrush joachimrø...
1,justice league,zacksnyder,benaffleck,henrycavill,galgadot,Action Adventure Fantasy Sci-Fi,0.0,0.0,benaffleck henrycavill galgadot zacksnyder Act...
2,thor: ragnarok,taikawaititi,chrishemsworth,tomhiddleston,cateblanchett,Action Adventure Fantasy Sci-Fi,0.0,0.0,chrishemsworth tomhiddleston cateblanchett tai...


In [48]:
# Import the movie data up to 2016, saved earlier by the DataCollection-2016 notebook.
# It has the same nine columns as data_df (movie_title ... comb).
old_df = pd.read_csv('Data-till2016.csv')

In [49]:
old_df.head(2)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count,comb
0,avatar,jamescameron,cchpounder,joeldavidmoore,wesstudi,Action Adventure Fantasy Sci-Fi,7.9,3054.0,cchpounder joeldavidmoore wesstudi jamescamero...
1,pirates of the caribbean: at world's end,goreverbinski,johnnydepp,orlandobloom,jackdavenport,Action Adventure Fantasy,7.1,1238.0,johnnydepp orlandobloom jackdavenport goreverb...


In [50]:
# Appending both the Datasets
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# (old_df first, then data_df), keeping each frame's own index labels.
new_df = pd.concat([old_df, data_df])

In [51]:
old_df.shape, new_df.shape, data_df.shape

((5022, 9), (5480, 9), (458, 9))

In [52]:
# For titles that occur more than once keep only the last occurrence
# (the 2017 rows were appended after the till-2016 rows)
new_df = new_df.drop_duplicates(subset='movie_title', keep='last')

In [53]:
 new_df.shape

(5343, 9)

In [54]:
# Save the combined dataset; index=False leaves the row index out of the file.
new_df.to_csv('Data-till2017.csv', index=False)