# Phase 1

## Preprocessing



In [None]:
import pandas as pd

from dateutil.parser import parse

# Revenue (Money) Preprocessing
from re import sub
from decimal import Decimal

from tmdbv3api import TMDb
from tmdbv3api import Movie
import keys  # TMDb API key file

import warnings
warnings.filterwarnings("ignore")



##### Reading the train datasets

In [None]:
# Load the two training tables: the movie/revenue table first, then the
# movie-to-director table.
movies_base, movies_director = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/1/train/movies-revenue.csv',
        'datasets/1/train/movie-director.csv',
    )
)

##### Checking nulls

In [None]:
# Number of missing values in each column of the base movies table
movies_base.isna().sum()

##### Joining directors dataset into the movies (main) dataset

In [None]:
# Left join on the movie title: every row of movies_base is kept, and movies
# without a match in movies_director get a null director.
df = movies_base.merge(movies_director, how='left', on='name')

# Uncomment to preview the merged frame
# df.head(n=10)

##### Filling the null directors

In [None]:
# Number of missing values per column after the merge
df.isna().sum()

In [None]:
# Summary statistics, transposed so that each described column is a row
df.describe().transpose()

In [None]:
# Inspect the dtype pandas assigned to each column of the merged frame
df.dtypes

In [None]:
# Formatting the release_date to match the TMDb date format (YYYY-MM-DD)

# dateutil resolves a two-digit year to a year within roughly 50 years of the
# present, so older releases can be parsed into the future. Any date after the
# current year is therefore moved back one century. The cutoff is read from
# the clock instead of being hard-coded, so it does not go stale.
current_year = pd.Timestamp.now().year


def _to_tmdb_date(raw_date):
    """Return raw_date as a 'YYYY-MM-DD' string.

    Missing values are returned unchanged so they stay detectable as nulls.
    """
    if pd.isnull(raw_date):
        return raw_date

    # Parse the string into a datetime and drop the time component
    release_date = parse(raw_date).date()

    # Correct dates that were resolved into the future
    if release_date.year > current_year:
        release_date = release_date.replace(year=release_date.year - 100)

    return str(release_date)


# Rewrite the whole column at once instead of assigning row by row
df['release_date'] = df['release_date'].apply(_to_tmdb_date)

df.head(n=10)

##### Initializing TMDb API

In [None]:
# Using the TMDb to fill out the missing director from the original dataset

# Client library and API documentation:
# https://github.com/AnthonyBloomer/tmdbv3api
# https://developers.themoviedb.org/3/getting-started/introduction

# Creating a base class instance from the api library
tmdb = TMDb()
# The API key is read from the local `keys` module imported at the top of the file
tmdb.api_key = keys.tmdb_key
# Ask for English results
tmdb.language = 'en'
# NOTE(review): presumably turns on the library's debug output - confirm
# against the tmdbv3api documentation
tmdb.debug = True


##### Using the TMDb API to fill missing directors via the movie title and its release date
Because the director name is a raw string that has not been encoded yet, it will be replaced by the director's popularity score.

In [None]:
# Creating a Movie instance to search by the movie details
movie = Movie()

# Only rows that still lack a director need a TMDb lookup. Iterating over this
# filtered selection also avoids walking the frame that is being written to.
missing_director = df[df['director'].isnull()]

for i, mov in missing_director.iterrows():
    # Release year of this row; computed once instead of once per search result
    release_year = str(mov['release_date'])[:4]

    search = movie.search(mov['name'])  # Search by the movie title
    for res in search:
        try:
            # Confirming the search result by the release date year
            if res['release_date'][:4] != release_year:
                continue

            # Extracting the first credited director from the movie credits
            for member in movie.credits(res.id)['crew']:
                if member['job'] == 'Director':
                    # Editing the value at the original dataframe
                    df.at[i, 'director'] = member['name']
                    break

            # Stop at the first result whose year matches, whether or not a
            # director was found in its credits
            break
        except Exception as error:
            # Exception (not BaseException) so Ctrl-C / kernel interrupt can
            # still stop this long-running loop. A failing result is reported
            # and the next search result is tried.
            print(f"An exception occurred: {error} {mov['name']}")

In [None]:
# Re-count missing values per column after the TMDb director fill
df.isna().sum()

##### Cleaning up the revenue to extract the float value

In [None]:
# Strip the currency formatting ('$' and thousands separators) in one pass and
# convert the column to float.
# regex=True is passed explicitly with a character class: a bare '$' is the
# regex end-of-string anchor, and the default of `regex` differs between
# pandas versions, so relying on the default is fragile.
df['revenue'] = df['revenue'].str.replace(r'[$,]', '', regex=True).astype('float')
df.head(n=10)