## FinalPreProcessing: This notebook combines all the movie data collected up to the year 2020 and applies some pre-processing to it.



In [1]:
import pandas as pd
import numpy as np

In [2]:
# The 2018-2020 dataset lacks several features (genres, vote count, and the average vote
# given to a movie) that were present in the earlier datasets we created.
# These are important features that are required during modelling,
# so this gap is handled first.

# Import the dataset for the years 2018 - 2020.
data18_to_20 = pd.read_csv('Data-2018-to-20.csv')

In [3]:
# Preview the first rows of the raw 2018-2020 data.
data18_to_20.head()

Unnamed: 0,Title,Cast and crew
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...
2,Stratton,"Simon West (director); Duncan Falconer, Warren..."
3,Sweet Country,"Warwick Thornton (director); David Tranter, St..."
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...


In [4]:
# (rows, columns) of the raw 2018-2020 data.
data18_to_20.shape

(730, 2)

In [None]:
# I will use the TMDb API to fetch, for each movie, its genres, the number of votes it received,
# and its average vote rating from the TMDb website.

In [5]:
import json
import os

import requests
from tmdbv3api import TMDb, Movie

# Configuration object for the TMDb client; the key stored on it is also used when
# building the raw details URLs further down (tmdb.api_key).
tmdb = TMDb()
# Take the key from the TMDB_API_KEY environment variable when it is set, so the real
# credential never has to be saved inside the notebook. The original placeholder is kept
# as the fallback, so editing the literal by hand still works as before.
tmdb.api_key = os.environ.get('TMDB_API_KEY', 'api_key')

In [6]:
# Shared state for the TMDb extraction helpers defined below.
# tmdb_movie is the client object whose .search(title) is used to look movies up.
tmdb_movie = Movie()
# Titles for which the TMDb search returned no result. Every helper appends to this list,
# so one title can appear more than once.
movie_names = []
# Details payloads already fetched, keyed by title. The three getters below share this
# cache so that each title costs one search + one details request instead of three.
_details_cache = {}


def _fetch_movie_details(x):
    """Return the TMDb details payload for the first search hit of title ``x``.

    Parameters
    ----------
    x : str
        Movie title passed to ``tmdb_movie.search``.

    Returns
    -------
    dict or None
        Decoded JSON of the ``/3/movie/{id}`` response, or None when the search
        returned no results. Both outcomes are cached per title.
    """
    if x in _details_cache:
        return _details_cache[x]
    result = tmdb_movie.search(x)
    try:
        movie_id = result[0].id
    except IndexError:
        # Empty search result: remember the miss so the title is not searched again.
        details = None
    else:
        response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
        details = response.json()
    _details_cache[x] = details
    return details


# Extracting the genres
def get_genre(x):
    """Return the genres of movie ``x`` as one space-separated string.

    Returns np.nan when the title is not found on TMDb (the title is then also
    appended to ``movie_names``) or when the payload lists no genres. NaN is used
    for every "missing" outcome so pandas ``isnull()`` / ``dropna()`` handle it.
    """
    details = _fetch_movie_details(x)
    if details is None:
        movie_names.append(x)
        return np.nan
    # .get() tolerates payloads without a 'genres' key (e.g. an API error response).
    genres = [genre['name'] for genre in details.get('genres') or []]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return " ".join(genres) if genres else np.nan


# Extracting the average vote rating given to a movie out of 10
def get_vote_average(x):
    """Return the average vote rating of movie ``x``.

    Returns 0.0 when the payload has no (or a zero) ``vote_average``, and np.nan
    when the title is not found on TMDb (the title is then also appended to
    ``movie_names``).
    """
    details = _fetch_movie_details(x)
    if details is None:
        movie_names.append(x)
        return np.nan
    return details.get('vote_average') or 0.0


# Extracting the number of votes given to a movie
def get_vote_count(x):
    """Return the number of votes given to movie ``x``.

    Returns np.nan when the title is not found on TMDb (the title is then also
    appended to ``movie_names``) or when the payload has no (or a zero)
    ``vote_count``.
    """
    details = _fetch_movie_details(x)
    if details is None:
        movie_names.append(x)
        return np.nan
    return details.get('vote_count') or np.nan

In [7]:
# Look up the genres of every title on TMDb (network calls - this cell is slow).
data18_to_20['genres'] = data18_to_20['Title'].map(lambda x: get_genre(str(x)))

In [8]:
# Look up the vote count of every title on TMDb (network calls - this cell is slow).
data18_to_20['vote_count'] = data18_to_20['Title'].map(lambda x: get_vote_count(str(x)))

In [9]:
# Look up the average vote of every title on TMDb (network calls - this cell is slow).
data18_to_20['vote_average'] = data18_to_20['Title'].map(lambda x: get_vote_average(str(x)))

In [10]:
# Inspect the frame with the three TMDb-derived columns added.
data18_to_20.head()

Unnamed: 0,Title,Cast and crew,genres,vote_count,vote_average
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Mystery Horror Thriller,1858.0,6.2
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama,53.0,5.4
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller,151.0,4.9
3,Sweet Country,"Warwick Thornton (director); David Tranter, St...",Drama History Western,98.0,6.7
4,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller,3172.0,6.3


In [11]:
# Count missing values per column; titles that TMDb could not resolve show up here.
data18_to_20.isnull().sum()

Title             0
Cast and crew     0
genres            7
vote_count       36
vote_average      1
dtype: int64

In [12]:
# Drop every row that contains at least one null value, i.e. rows for which no genres,
# vote count or vote average could be obtained.
data18_to_20.dropna(inplace=True, axis=0)

In [13]:
# Verify that no null values remain after the drop.
data18_to_20.isnull().sum()

Title            0
Cast and crew    0
genres           0
vote_count       0
vote_average     0
dtype: int64

In [14]:
# (rows, columns) after removing the rows that had nulls.
data18_to_20.shape

(694, 5)

In [15]:
# Preview the first three rows of the cleaned frame.
data18_to_20.head(3)

Unnamed: 0,Title,Cast and crew,genres,vote_count,vote_average
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Mystery Horror Thriller,1858.0,6.2
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama,53.0,5.4
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller,151.0,4.9


### Extracting director name(s), actor name(s) from cast and crew

In [16]:
def get_director(x):
    """Return the director name(s) from a 'Cast and crew' string.

    The director credit is everything before the first " (director" marker, which
    covers "(director)", "(directors)" and "(director/screenplay)" as well as other
    combined credits such as "(director/producer)" that previously fell through and
    returned the whole credits string.

    Parameters
    ----------
    x : str
        Credits text, e.g. "Adam Robitel (director); Leigh Whannell (screenplay); ...".

    Returns
    -------
    str
        The text preceding the director marker, or ``x`` unchanged when the string
        contains no such marker.
    """
    # partition() yields the full string as its first element when the marker is absent.
    return x.partition(" (director")[0]
    
# Parse the director credit out of every 'Cast and crew' entry.
data18_to_20['director_name'] = data18_to_20['Cast and crew'].map(get_director)

In [17]:
def get_actor1(x):
    """Return the first cast name from a 'Cast and crew' string.

    The cast list is the text after the last "screenplay); " marker (the whole
    string when the marker is absent); names in it are separated by ", ".
    """
    cast_section = x.rpartition("screenplay); ")[2]
    first_name = cast_section.partition(", ")[0]
    return first_name

# First listed cast member of every movie.
data18_to_20['actor_1_name'] = data18_to_20['Cast and crew'].map(get_actor1)

In [18]:
def get_actor2(x):
    """Return the second cast name from a 'Cast and crew' string, or NaN.

    The cast list is the text after the last "screenplay); " marker (the whole
    string when the marker is absent), split on ", ".

    Parameters
    ----------
    x : str
        Credits text.

    Returns
    -------
    str or float
        The second name, or ``np.nan`` when fewer than two names are listed.
    """
    # Parse the cast list once instead of re-splitting for the length check and the lookup.
    cast = x.split("screenplay); ")[-1].split(", ")
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return cast[1] if len(cast) >= 2 else np.nan

# Second listed cast member of every movie (NaN when there is none).
data18_to_20['actor_2_name'] = data18_to_20['Cast and crew'].map(get_actor2)

In [19]:
def get_actor3(x):
    """Return the third cast name from a 'Cast and crew' string, or NaN.

    The cast list is the text after the last "screenplay); " marker (the whole
    string when the marker is absent), split on ", ".

    Parameters
    ----------
    x : str
        Credits text.

    Returns
    -------
    str or float
        The third name, or ``np.nan`` when fewer than three names are listed.
    """
    # Parse the cast list once instead of re-splitting for the length check and the lookup.
    cast = x.split("screenplay); ")[-1].split(", ")
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return cast[2] if len(cast) >= 3 else np.nan
    
# Third listed cast member of every movie (NaN when there is none).
data18_to_20['actor_3_name'] = data18_to_20['Cast and crew'].map(get_actor3)

In [20]:
# Preview the parsed director and actor columns.
data18_to_20.head(3)

Unnamed: 0,Title,Cast and crew,genres,vote_count,vote_average,director_name,actor_1_name,actor_2_name,actor_3_name
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Mystery Horror Thriller,1858.0,6.2,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,Lauren Wolkstein (director); Christopher Radcl...,Thriller Drama,53.0,5.4,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,Stratton,"Simon West (director); Duncan Falconer, Warren...",Action Thriller,151.0,4.9,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan


In [21]:
# Rename the 'Title' column to 'movie_title'.
data18_to_20 = data18_to_20.rename(columns={'Title': 'movie_title'})

In [22]:
# Extract the required features. The explicit .copy() makes new_df18_to_20 an independent
# frame, so the column assignments performed on it afterwards cannot raise
# SettingWithCopyWarning or write through to data18_to_20.
new_df18_to_20 = data18_to_20.loc[:, ['movie_title', 'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres', 'vote_average', 'vote_count']].copy()

In [23]:
# Preview the reduced feature frame.
new_df18_to_20.head()

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count
0,Insidious: The Last Key,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell,Mystery Horror Thriller,6.2,1858.0
1,The Strange Ones,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus,Thriller Drama,5.4,53.0
2,Stratton,Simon West,Dominic Cooper,Austin Stowell,Gemma Chan,Action Thriller,4.9,151.0
3,Sweet Country,Warwick Thornton,Bryan Brown,Sam Neill,,Drama History Western,6.7,98.0
4,The Commuter,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson,Action Thriller,6.3,3172.0


In [24]:
# Null counts: actor_2_name / actor_3_name are NaN where fewer than two / three cast names were parsed.
new_df18_to_20.isnull().sum()

movie_title       0
director_name     0
actor_1_name      0
actor_2_name     12
actor_3_name     62
genres            0
vote_average      0
vote_count        0
dtype: int64

In [25]:
# Label missing second/third actors as 'unknown' instead of leaving them null.
for actor_col in ('actor_2_name', 'actor_3_name'):
    new_df18_to_20[actor_col] = new_df18_to_20[actor_col].replace(np.nan, 'unknown')

# Normalise the titles to lower case.
new_df18_to_20['movie_title'] = new_df18_to_20['movie_title'].str.lower()

In [26]:
# Confirm that the 'unknown' fill left no nulls behind.
new_df18_to_20.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
vote_average     0
vote_count       0
dtype: int64

In [27]:
# Collapse every person name into a single token by removing its spaces.
for name_col in ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name'):
    new_df18_to_20[name_col] = new_df18_to_20[name_col].str.replace(' ', '')

In [30]:
# Lower-case the title and all person-name columns.
for text_col in ('movie_title', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name'):
    new_df18_to_20[text_col] = new_df18_to_20[text_col].str.lower()

In [31]:
# 'comb' is one space-separated string per movie: the three actors, the director, then the genres.
new_df18_to_20['comb'] = (
    new_df18_to_20['actor_1_name']
    + ' ' + new_df18_to_20['actor_2_name']
    + ' ' + new_df18_to_20['actor_3_name']
    + ' ' + new_df18_to_20['director_name']
    + ' ' + new_df18_to_20['genres']
)

In [32]:
# Preview the normalised columns together with the new 'comb' feature.
new_df18_to_20.head(3)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count,comb
0,insidious: the last key,adamrobitel,linshaye,angussampson,leighwhannell,Mystery Horror Thriller,6.2,1858.0,linshaye angussampson leighwhannell adamrobite...
1,the strange ones,laurenwolkstein,alexpettyfer,jamesfreedson-jackson,emilyalthaus,Thriller Drama,5.4,53.0,alexpettyfer jamesfreedson-jackson emilyalthau...
2,stratton,simonwest,dominiccooper,austinstowell,gemmachan,Action Thriller,4.9,151.0,dominiccooper austinstowell gemmachan simonwes...


In [33]:
# Import the data collected up to 2017.
old_df = pd.read_csv('Data-till2017.csv')

In [34]:
# Preview the older data to compare its columns with the 2018-2020 frame.
old_df.head(3)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count,comb
0,avatar,jamescameron,cchpounder,joeldavidmoore,wesstudi,Action Adventure Fantasy Sci-Fi,7.9,3054.0,cchpounder joeldavidmoore wesstudi jamescamero...
1,pirates of the caribbean: at world's end,goreverbinski,johnnydepp,orlandobloom,jackdavenport,Action Adventure Fantasy,7.1,1238.0,johnnydepp orlandobloom jackdavenport goreverb...
2,spectre,sammendes,christophwaltz,rorykinnear,stephaniesigman,Action Adventure Thriller,6.8,994.0,christophwaltz rorykinnear stephaniesigman sam...


## Combining all the data till 2020

In [35]:
# Stack the 2018-2020 rows under the older data with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
final_data = pd.concat([old_df, new_df18_to_20], ignore_index=True)

In [37]:
# (rows, columns) of the combined dataset.
final_data.shape

(6037, 9)

In [38]:
# Preview the first rows of the combined dataset.
final_data.head()

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,vote_average,vote_count,comb
0,avatar,jamescameron,cchpounder,joeldavidmoore,wesstudi,Action Adventure Fantasy Sci-Fi,7.9,3054.0,cchpounder joeldavidmoore wesstudi jamescamero...
1,pirates of the caribbean: at world's end,goreverbinski,johnnydepp,orlandobloom,jackdavenport,Action Adventure Fantasy,7.1,1238.0,johnnydepp orlandobloom jackdavenport goreverb...
2,spectre,sammendes,christophwaltz,rorykinnear,stephaniesigman,Action Adventure Thriller,6.8,994.0,christophwaltz rorykinnear stephaniesigman sam...
3,the dark knight rises,christophernolan,tomhardy,christianbale,josephgordon-levitt,Action Thriller,8.5,2701.0,tomhardy christianbale josephgordon-levitt chr...
4,john carter,andrewstanton,darylsabara,samanthamorton,pollywalker,Action Adventure Sci-Fi,6.6,738.0,darylsabara samanthamorton pollywalker andrews...


In [39]:
# Final null check on the combined dataset.
final_data.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
vote_average     0
vote_count       0
comb             0
dtype: int64

In [40]:
# Persist the combined dataset; index=False keeps the row index out of the CSV.
final_data.to_csv('Data-till2020.csv', index=False)