# Phase 1

## Preprocessing



In [1]:
import pandas as pd

from dateutil.parser import parse

from tmdbv3api import TMDb
from tmdbv3api import Movie
import keys  # TMDb API key file

import warnings
warnings.filterwarnings("ignore")

##### Reading the train datasets

In [2]:
# Load the two training CSVs: the base movies table and the director table
movies_base, movies_director = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/1/train/movies-revenue.csv',
        'datasets/1/train/movie-director.csv',
    )
)

##### Checking nulls

In [3]:
# Number of missing values in each column of the base movies table
movies_base.isna().sum()

name             0
release_date     0
genre           14
MPAA_rating     47
revenue          0
dtype: int64

##### Joining directors dataset into the movies (main) dataset

In [4]:
# Left join on the movie title: every row of movies_base is kept, and
# 'director' stays NaN wherever movies_director has no matching 'name'.
df = movies_base.merge(movies_director, how='left', on='name')

# Uncomment to preview the merged frame
#df.head(n=10)

##### Filling the null directors

In [None]:
# Missing values per column after the merge (shows how many directors are absent)
df.isna().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().transpose()

In [None]:
# Column dtypes of the merged frame
df.dtypes

In [5]:
# Formatting the release_date to match the TMDb date format ('YYYY-MM-DD')

# The parser does not do well with old releases (presumably because the raw
# dates carry ambiguous two-digit years): they can come back dated in the
# future. Any year later than the current one is therefore moved a century
# earlier. The cutoff is read from the clock instead of being hard-coded, so
# genuine recent releases are not pushed back 100 years as time passes.
current_year = pd.Timestamp.today().year


def _to_iso_release_date(raw_date):
    """Parse one raw release-date string and return it as a 'YYYY-MM-DD' string."""
    # parse() returns a datetime.datetime; .date() drops the time component
    parsed = parse(raw_date).date()
    if parsed.year > current_year:
        parsed = parsed.replace(year=parsed.year - 100)
    return str(parsed)


# Rewrite the whole column in one pass rather than row by row via iterrows()
df['release_date'] = df['release_date'].map(_to_iso_release_date)

df.head(n=10)

Unnamed: 0,name,release_date,genre,MPAA_rating,revenue,director
0,Recess: School's Out,2001-02-16,Comedy,G,"$54,656,124",
1,D2: The Mighty Ducks,1994-03-25,Comedy,PG,"$94,226,333",
2,Home on the Range,2004-04-02,Comedy,PG,"$67,910,166",Will Finn
3,Young Black Stallion,2003-12-25,Adventure,G,"$9,254,344",
4,What's Love Got to Do With It,1993-06-09,Drama,R,"$79,618,610",
5,Lady and the Tramp,1955-06-22,Drama,G,"$1,236,035,515",Hamilton Luske
6,Corky Romano,2001-10-12,Comedy,PG-13,"$35,705,805",
7,The Waterboy,1998-11-06,Comedy,PG-13,"$288,691,833",
8,Captain America: The Winter Soldier,2014-04-04,Action,PG-13,"$268,013,076",
9,Pirates of the Caribbean: Dead Man’…,2006-07-07,Adventure,PG-13,"$544,817,142",


##### Initializing TMDb API

In [6]:
# TMDb is used to fill in the directors missing from the original dataset.

# Client library: https://github.com/AnthonyBloomer/tmdbv3api
# API reference:  https://developers.themoviedb.org/3/getting-started/introduction

# Configuration object of the API library; the settings below are attached to it
tmdb = TMDb()
# The key comes from the local `keys` module, so it is not written in the notebook
tmdb.api_key = keys.tmdb_key
tmdb.language = 'en'
# NOTE(review): presumably enables the library's debug output - confirm against tmdbv3api
tmdb.debug = True


##### Using the TMDb API to fill missing directors via the movie title and its release date
The director's name is filled in as plain text and is not encoded at this stage; the director's popularity score is intended to replace it.

In [None]:
# Creating a Movie instance to search by the movie details
movie = Movie()

# Only the rows that still lack a director need an API lookup
missing_director = df['director'].isnull()

for i, mov in df[missing_director].iterrows():
    # Search by the movie title
    for res in movie.search(mov['name']):
        try:
            # Confirming the search result by the release year (first 4 characters)
            if res['release_date'][:4] != mov['release_date'][:4]:
                continue

            # Extracting the first crew member credited as director
            crew = movie.credits(res.id)['crew']
            director = next(
                (member['name'] for member in crew if member['job'] == 'Director'),
                None,
            )
            if director is not None:
                # Editing the value at the original dataframe
                df.at[i, 'director'] = director

            # The first result whose year matches ends the search for this movie,
            # whether or not a director was found in its credits
            break
        except Exception as error:
            # Best effort: report the problem and try the next search result.
            # Exception (not BaseException) is caught so that KeyboardInterrupt
            # can still stop this long-running, network-bound loop.
            print('An exception occurred: {}'.format(error) + " " + mov['name'])

In [None]:
# Missing values per column after the TMDb lookup
df.isna().sum()

##### Cleaning up the revenue to extract the float value

In [5]:
# Strip the thousands separators and the '$' sign, then cast to float.
# regex=False makes both patterns literal: as a regular expression '$' is the
# end-of-string anchor, and the default of `regex` differs between pandas versions.
df['revenue'] = (
    df['revenue']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype('float')
)
df.head()

Unnamed: 0,name,release_date,genre,MPAA_rating,revenue,director
0,Recess: School's Out,2001-02-16,Comedy,G,54656124.0,
1,D2: The Mighty Ducks,1994-03-25,Comedy,PG,94226333.0,
2,Home on the Range,2004-04-02,Comedy,PG,67910166.0,Will Finn
3,Young Black Stallion,2003-12-25,Adventure,G,9254344.0,
4,What's Love Got to Do With It,1993-06-09,Drama,R,79618610.0,


##### Encoding the MPAA Rating and Genre to indicator variables

In [6]:
# One-hot encode MPAA_rating and genre: each source column is replaced by
# indicator columns carrying the 'rating_is' / 'genre_is' prefix.
df = pd.get_dummies(
    df,
    columns=["MPAA_rating", "genre"],
    prefix=["rating_is", "genre_is"],
)
df.head()

Unnamed: 0,name,release_date,revenue,director,rating_is_G,rating_is_Not Rated,rating_is_PG,rating_is_PG-13,rating_is_R,genre_is_Action,...,genre_is_Black Comedy,genre_is_Comedy,genre_is_Concert/Performance,genre_is_Documentary,genre_is_Drama,genre_is_Horror,genre_is_Musical,genre_is_Romantic Comedy,genre_is_Thriller/Suspense,genre_is_Western
0,Recess: School's Out,2001-02-16,54656124.0,,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,D2: The Mighty Ducks,1994-03-25,94226333.0,,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Home on the Range,2004-04-02,67910166.0,Will Finn,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Young Black Stallion,2003-12-25,9254344.0,,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,What's Love Got to Do With It,1993-06-09,79618610.0,,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
def date_to_float(dt):
    """Convert a single date to a fractional-year float.

    The fraction is an approximation: every month counts as 30 days and the
    year as 365 days, i.e. ``year + ((month - 1) * 30 + day) / 365``.

    Parameters
    ----------
    dt : str or date-like
        A value whose ``str()`` starts with ``YYYY-MM-DD``: an ISO date string,
        a ``datetime.date``, or a ``datetime.datetime`` / pandas ``Timestamp``.
        A trailing time part (after a space or ``T``) is ignored.

    Returns
    -------
    float
        The year plus the approximate fraction of the year elapsed.

    Raises
    ------
    IndexError, ValueError
        If the text does not contain three numeric, dash-separated fields.
    """
    # str() of a datetime/Timestamp appends ' HH:MM:SS' (ISO strings use 'T');
    # keep only the date part so the day field stays numeric
    date_part = str(dt).strip().split(' ')[0].split('T')[0]
    # Splitting the date
    date_split = date_part.split('-')
    # Calculating the months and days
    calc = (((float(date_split[1]) - 1) * 30) + float(date_split[2])) / 365
    # Adding calc to the years
    return float(date_split[0]) + calc

##### Splitting the date into day and month columns, then converting it to a scalar via `date_to_float()`

In [None]:
# Parse the column once into datetimes so the .dt accessor is available.
# The column is addressed by its label 'release_date' (a string), not by a variable.
release_dates = pd.to_datetime(df['release_date'])

df['day'] = release_dates.dt.day
df['month'] = release_dates.dt.month
# date_to_float() converts one 'YYYY-MM-DD' value at a time, so it is applied
# element-wise to the ISO-formatted dates instead of to the Series as a whole
df['release_date'] = release_dates.dt.strftime('%Y-%m-%d').map(date_to_float)