# Phase 2

## Preprocessing

In [None]:
import warnings

import pandas as pd
import numpy as np
from dateutil.parser import parse
from tmdbv3api import Movie
from tmdbv3api import TMDb

import keys  # TMDb API key file

# NOTE(review): this suppresses every Python warning for the rest of the
# session, so problems reported only as warnings will not be shown below.
warnings.filterwarnings("ignore")

##### Reading the train datasets

In [None]:
# reading csv files
# movies_base is the main training table; movies_director is joined onto it
# by the 'name' column in the merge cell below.
movies_base = pd.read_csv('datasets/2/train/movies-revenue-classification.csv')
movies_director = pd.read_csv('datasets/2/train/movie-director.csv')

##### Checking nulls

In [None]:
# Number of missing values in each column of the base movies table
movies_base.isnull().sum()

##### Joining directors dataset into the movies (main) dataset

In [None]:
# Attach the director table to the movies table, matching rows on 'name'.
# A left join keeps the keys of movies_base; movies without a matching
# director row get a null in the columns coming from movies_director.
df = movies_base.merge(movies_director, how='left', on='name')

# Preview the joined frame
df.head()

##### Filling the null directors

In [None]:
# Number of missing values per column after joining the director table
df.isnull().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

In [None]:
# Data type of every column in the merged frame
df.dtypes

In [None]:
# Formatting the release_date to match the TMDb date format


def _to_release_date(raw_date):
    """Turn one raw release-date string into a plain `datetime.date`.

    The string is parsed with `dateutil.parser.parse` and the time part is
    dropped. The parser doesn't do well with dates prior to the 80s, so any
    parsed year later than 2023 is moved back by one century.
    """
    parsed = parse(raw_date).date()
    if parsed.year > 2023:
        parsed = parsed.replace(year=parsed.year - 100)
    return parsed


# Convert every row, then store the column as pandas datetimes
corrected_dates = df['release_date'].apply(_to_release_date)
df['release_date'] = pd.to_datetime(corrected_dates)
df.head(n=10)

##### Initializing TMDb API

In [None]:
# Using the TMDb to fill out the missing director from the original dataset

# https://github.com/AnthonyBloomer/tmdbv3api
# https://developers.themoviedb.org/3/getting-started/introduction

# Creating a base class instance from the api library
tmdb = TMDb()
# The API key is read from the local `keys` module imported at the top
tmdb.api_key = keys.tmdb_key
tmdb.language = 'en'
# NOTE(review): presumably enables the library's debug output — confirm, and
# consider switching it off for a final run
tmdb.debug = True


##### Using the TMDb API to fill missing directors via the movie title and its release date
Since the director name is still not encoded, the director's popularity score will replace it.

In [None]:
# Creating a Movie instance to search by the movie details
movie = Movie()

# Popularity scores are collected in a separate float Series aligned with df.
# Rows without a confirmed TMDb match stay NaN instead of keeping the original
# director *name*, so the 'director' column ends up purely numeric and can be
# median-filled and scaled later on.
director_popularity = pd.Series(np.nan, index=df.index, dtype='float64')

for i, mov in df.iterrows():
    # The first four characters of the stringified date are the release year
    release_year = str(mov['release_date'])[:4]

    search = movie.search(mov['name'])  # Search by the movie title
    for res in search:
        try:
            # Confirming the search results by the release date year
            if res['release_date'][:4] != release_year:
                continue

            # Extracting the director from the movie credits
            for member in movie.credits(res.id)['crew']:
                if member['job'] == 'Director':
                    director_popularity.at[i] = member['popularity']
                    break
            # Only the first result with a matching year is considered
            break
        except Exception as error:
            # Report the problem and try the next search result. `Exception`
            # (not `BaseException`) so that interrupting the kernel still works.
            print('An exception occurred: {}'.format(error) + " " + mov['name'])

# Replace the director names with the collected popularity scores
df['director'] = director_popularity

In [None]:
# Missing values per column after the TMDb director lookup
df.isnull().sum()

In [None]:
# Preview the first rows after the TMDb director lookup
df.head()

##### Encoding the MPAA Rating and Genre to indicator variables

In [None]:
# Director holds a numeric popularity score: fill its nulls with the median.
# Any entry that is not numeric (e.g. a director name that was never replaced
# by a score) is coerced to NaN first, so the median is well defined and the
# column ends up fully numeric.
director_score = pd.to_numeric(df['director'], errors='coerce')
df['director'] = director_score.fillna(value=director_score.median())

# Genre and MPAA rating are categorical: fill their nulls with the mode
df['genre'] = df['genre'].fillna(value=df['genre'].mode()[0])
df['MPAA_rating'] = df['MPAA_rating'].fillna(value=df['MPAA_rating'].mode()[0])

# Remaining missing values per column
df.isnull().sum()

In [None]:
# One-hot encode MPAA_rating and genre in a single pass.
# The source columns are removed and the indicator columns are appended at the
# end: first the "rating_is_*" group, then the "genre_is_*" group.
df = pd.get_dummies(
    df,
    columns=["MPAA_rating", "genre"],
    prefix=["rating_is", "genre_is"],
)
df.head()

##### Encoding the release date to a scalar

In [None]:
def date_to_float(dt):
    """Encode a date as a fractional year.

    The position inside the year is approximated by treating every month as
    30 days long and every year as 365 days long.

    Args:
        dt: object exposing `year`, `month` and `day` attributes.

    Returns:
        `dt.year` plus the approximate fraction of the year that has elapsed.
    """
    approx_day_of_year = (dt.month - 1) * 30 + dt.day
    return dt.year + approx_day_of_year / 365

##### Splitting date to days and months and then converting it to a scalar via `date_to_float()`

In [None]:
# Derive the calendar parts while release_date is still a datetime column;
# 'season' is filled with the calendar quarter.
for new_column, date_part in (('month', 'month'),
                              ('day', 'day'),
                              ('season', 'quarter')):
    df[new_column] = getattr(df['release_date'].dt, date_part)

# Finally replace the datetime itself with its fractional-year encoding
df['release_date'] = df['release_date'].apply(date_to_float)
df.head()

##### Adding the budget and runtime columns from TMDb API

In [None]:
# Works on the in-memory `df` built by the cells above. The frame is NOT
# reloaded from 'preprocessed.csv' here: that file is only written at the end
# of the preprocessing, so reading it at this point fails on a fresh run or
# silently swaps in stale data.

# Both columns start empty; rows without a confirmed TMDb match stay NaN
df['budget'] = np.nan
df['runtime'] = np.nan

movie = Movie()

# extract the budget and runtime from the TMDb API
for i, mov in df.iterrows():
    # The first four characters of the stringified release date are the year
    release_year = str(mov['release_date'])[:4]

    search = movie.search(mov['name'])  # Search by the movie title
    for res in search:
        try:
            # Confirming the search results by the release date year
            if res['release_date'][:4] == release_year:
                # Fetch the details once and copy both values into the frame
                detail = movie.details(res.id)
                df.at[i, 'budget'] = detail['budget']
                df.at[i, 'runtime'] = detail['runtime']
                break
        except Exception as error:
            # Report the problem and try the next search result. `Exception`
            # (not `BaseException`) so that interrupting the kernel still works.
            print('An exception occurred: {}'.format(error) + " " + mov['name'])

df.head()

In [None]:
# Zeros are placeholders for unknown values in these columns (they are
# replaced just like NaN), so mark them as missing *before* computing the
# median. Otherwise the placeholder zeros pull the median down, and if they
# are the majority the median itself is 0 and nothing gets filled.
for column in ('budget', 'runtime'):
    known_values = df[column].replace(0, np.nan)
    df[column] = known_values.fillna(value=known_values.median())

##### Adjusting for inflation

In [None]:
import cpi # https://pypi.org/project/cpi/

# Update the CPI data
cpi.update()

# Adjust both monetary columns for inflation with cpi.inflate, passing the
# integer part of release_date (the release year) as the year of the amount.
for money_column in ('revenue', 'budget'):
    df[money_column] = df.apply(
        lambda row, col=money_column: cpi.inflate(row[col],
                                                  int(row.release_date)),
        axis=1,
    )

In [None]:
# Preview the first rows after the inflation adjustment
df.head()

##### Feature Engineering

In [None]:
# Build a one-row-per-movie lookup table from the voice-actor listing
df_animation = (
    pd.read_csv('datasets/2/train/movie-voice-actors.csv')
    # Keep only the first row for each movie name
    .drop_duplicates(subset=['name'], keep='first')
    # The actor / character details are not needed
    .drop(['voice_actor', 'character'], axis=1)
    # Every movie listed in this file gets the flag set to 1
    .assign(is_animation=1)
)

df_animation.head()

##### Merging the animation dataset with the main dataset

In [None]:
# Left-join the animation lookup onto the main frame by movie name
df = df.merge(df_animation, how='left', on='name')

# Movies absent from the lookup have a null flag after the join; set it to 0
animation_flag = df['is_animation']
df['is_animation'] = animation_flag.fillna(value=0)

df.head()

##### Encoding the movie success level to a scalar

In [None]:
# Encoding the movie success level to a scalar.
# The mapping is applied to the in-memory `df`; the frame is NOT reloaded from
# 'preprocessed.csv' here, because that would throw away the budget, runtime,
# inflation and is_animation work done above (the file is only written in the
# next step).
success_level_codes = {'S': 5,
                       'A': 4,
                       'B': 3,
                       'C': 2,
                       'D': 1}
df['MovieSuccessLevel'] = df['MovieSuccessLevel'].map(success_level_codes)

##### Saving the preprocessed dataset

In [None]:
# Write the preprocessed frame to disk; index=False leaves the row index out
df.to_csv('datasets/2/train/preprocessed.csv', index=False)

##### Separating the features and the target

In [None]:
# Target: the encoded success level
target_column = 'MovieSuccessLevel'
Y = df[target_column]

# Features: every remaining column except the movie title
X = df.drop(columns=['name', target_column])

##### Splitting the data into train and test sets, scaling the features, and training the classifiers

In [None]:
from sklearn import metrics
from sklearn import svm
from sklearn import tree

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Data Splits (fixed seed, 80 % train / 20 % test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=20, test_size=0.2, shuffle=True)

# Feature Scaling: the scaler is fitted on the training split only and then
# applied unchanged to the test split
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ------------------------------------- #

# Hyper Parameters

C = 0.00001  # SVM regularization parameter
m_degree = 7  # degree of the polynomial kernel


def fit_and_report(label, model):
    """Fit `model` on the training split and print its test-set accuracy.

    Args:
        label: text printed after "Accuracy " to identify the model.
        model: unfitted scikit-learn style classifier; it is fitted in place.

    Returns:
        The predictions of the fitted model on `X_test`.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    print("Accuracy " + label + ":", metrics.accuracy_score(Y_test, predictions), '\n')
    return predictions


# ------------------------------------- #

# Poly Model
poly_model = svm.SVC(kernel='poly', degree=m_degree, C=C)
p = fit_and_report("Poly", poly_model)

# Linear Model (seeded so repeated runs give the same result)
linear_svc = svm.LinearSVC(C=C, random_state=0)
p = fit_and_report("linear", linear_svc)

# Rbf Model
rbf_svc = svm.SVC(kernel='rbf', C=C)
p = fit_and_report("rbf", rbf_svc)

# Linear Kernel Model
linear_kernel_svc = svm.SVC(kernel='linear', C=C)
p = fit_and_report("Linear kernel", linear_kernel_svc)

# Logistic Regression Model
logistic_regression_model = LogisticRegression(random_state=0)
p = fit_and_report("Logistic Regression", logistic_regression_model)

# Decision Tree Model (seeded: equally good splits are otherwise chosen at
# random, which can change the tree between runs)
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)
p = fit_and_report("Decision Tree", clf)
