# Phase 2

## Preprocessing

In [1]:
import warnings

import pandas as pd
from dateutil.parser import parse
from tmdbv3api import Movie
from tmdbv3api import TMDb

import keys  # TMDb API key file

warnings.filterwarnings("ignore")

##### Reading the train datasets

In [2]:
# Load the two training tables: the base movie table and the
# movie -> director table (the two are joined on 'name' further down).
movies_base = pd.read_csv('datasets/2/train/movies-revenue-classification.csv')
movies_director = pd.read_csv('datasets/2/train/movie-director.csv')

##### Checking nulls

In [3]:
# Count of missing values per column in the base movie table
movies_base.isnull().sum()

name                  0
release_date          0
genre                14
MPAA_rating          47
MovieSuccessLevel     0
dtype: int64

##### Joining directors dataset into the movies (main) dataset

In [4]:
# Left join on the movie title: every row of movies_base is kept and the
# director is attached wherever movies_director has a matching 'name'.
df = movies_base.merge(movies_director, how='left', on='name')

# Preview of the joined frame
df.head()

Unnamed: 0,name,release_date,genre,MPAA_rating,MovieSuccessLevel,director
0,Recess: School's Out,16-Feb-01,Comedy,G,C,
1,D2: The Mighty Ducks,25-Mar-94,Comedy,PG,B,
2,Home on the Range,2-Apr-04,Comedy,PG,C,Will Finn
3,Young Black Stallion,25-Dec-03,Adventure,G,D,
4,What's Love Got to Do With It,9-Jun-93,Drama,R,B,


##### Filling the null directors

In [5]:
# Missing values per column after the directors join
df.isnull().sum()

name                   0
release_date           0
genre                 14
MPAA_rating           47
MovieSuccessLevel      0
director             428
dtype: int64

In [6]:
# Summary statistics, transposed so that each dataframe column becomes a row
df.describe().T

Unnamed: 0,count,unique,top,freq
name,463,460,Cinderella,2
release_date,463,446,3-Mar-95,2
genre,449,12,Comedy,146
MPAA_rating,416,5,PG,156
MovieSuccessLevel,463,5,C,164
director,35,22,Ron Clements,6


In [7]:
# Column dtypes before any conversion
df.dtypes

name                 object
release_date         object
genre                object
MPAA_rating          object
MovieSuccessLevel    object
director             object
dtype: object

In [8]:
# Formatting the release_date to match the TMDb date format

def _to_release_date(raw):
    """Parse one raw release_date value and return it as a datetime.date."""
    # parse() yields a datetime.datetime; only the calendar date is kept
    parsed = parse(raw).date()

    # The parser can place the two-digit years of older movies in the future,
    # so any year later than 2023 is moved back by one century.
    if parsed.year > 2023:
        parsed = parsed.replace(year=parsed.year - 100)
    return parsed


# Convert every value, then let pandas turn the column into datetimes
df['release_date'] = pd.to_datetime(df['release_date'].apply(_to_release_date))
df.head(n=10)

Unnamed: 0,name,release_date,genre,MPAA_rating,MovieSuccessLevel,director
0,Recess: School's Out,2001-02-16,Comedy,G,C,
1,D2: The Mighty Ducks,1994-03-25,Comedy,PG,B,
2,Home on the Range,2004-04-02,Comedy,PG,C,Will Finn
3,Young Black Stallion,2003-12-25,Adventure,G,D,
4,What's Love Got to Do With It,1993-06-09,Drama,R,B,
5,Lady and the Tramp,1955-06-22,Drama,G,S,Hamilton Luske
6,Corky Romano,2001-10-12,Comedy,PG-13,C,
7,The Waterboy,1998-11-06,Comedy,PG-13,A,
8,Captain America: The Winter Soldier,2014-04-04,Action,PG-13,A,
9,Pirates of the Caribbean: Dead Man’…,2006-07-07,Adventure,PG-13,S,


##### Initializing TMDb API

In [9]:
# Configure the TMDb client that is used to fill out the directors missing
# from the original dataset.

# Library: https://github.com/AnthonyBloomer/tmdbv3api
# API docs: https://developers.themoviedb.org/3/getting-started/introduction

# Creating a base class instance from the api library
tmdb = TMDb()
# The API key comes from the local `keys` module imported at the top
tmdb.api_key = keys.tmdb_key
tmdb.language = 'en'
# NOTE(review): debug mode is left switched on — confirm it is wanted in the final notebook
tmdb.debug = True


##### Using the TMDb API to fill missing directors via the movie title and its release date
The director names are never encoded as features; instead, the `director` column is overwritten with the director's TMDb popularity score.

In [10]:
# Creating a Movie instance to search by the movie details
movie = Movie()

# For every dataset row: search TMDb by title, accept the first result whose
# release year matches the row, and store the popularity of that movie's
# first crew member with the 'Director' job in the 'director' column.
for i, mov in df.iterrows():
    # Release year of the dataset row (first four characters of the date);
    # computed once per row instead of once per search result.
    mov_year = str(mov['release_date'])[:4]

    search = movie.search(mov['name'])  # Search by the movie title
    for res in search:
        try:
            # Confirming the search results by the release date year
            if res['release_date'][:4] == mov_year:
                # Extracting the director from the movie credits
                for member in movie.credits(res.id)['crew']:
                    if member['job'] == 'Director':
                        # Editing the value at the original dataframe
                        df.at[i, 'director'] = member['popularity']
                        break
                break
        except Exception as error:
            # Best effort: report the problem and try the next search result.
            # Exception (not BaseException) so that Ctrl-C / kernel interrupt
            # can still stop this long-running loop.
            print('An exception occurred: {}'.format(error) + " " + mov['name'])

# Rows without a TMDb match may still hold the director *name* from the
# original dataset. Coerce those leftovers to NaN so the column is purely
# numeric and can be imputed like the other missing scores.
df['director'] = pd.to_numeric(df['director'], errors='coerce')

In [11]:
# Missing values per column after the TMDb director lookup
df.isnull().sum()

name                  0
release_date          0
genre                14
MPAA_rating          47
MovieSuccessLevel     0
director             26
dtype: int64

In [12]:
# Preview: the 'director' column now holds popularity scores
df.head()

Unnamed: 0,name,release_date,genre,MPAA_rating,MovieSuccessLevel,director
0,Recess: School's Out,2001-02-16,Comedy,G,C,0.694
1,D2: The Mighty Ducks,1994-03-25,Comedy,PG,B,1.822
2,Home on the Range,2004-04-02,Comedy,PG,C,2.787
3,Young Black Stallion,2003-12-25,Adventure,G,D,4.423
4,What's Love Got to Do With It,1993-06-09,Drama,R,B,0.636


##### Encoding the MPAA Rating and Genre to indicator variables

In [13]:
# Impute the remaining nulls: the median for the director score, and the
# most frequent value (mode) for genre and MPAA rating.
director_median = df['director'].median()
most_common_genre = df['genre'].mode()[0]
most_common_rating = df['MPAA_rating'].mode()[0]

df['director'] = df['director'].fillna(director_median)
df['genre'] = df['genre'].fillna(most_common_genre)
df['MPAA_rating'] = df['MPAA_rating'].fillna(most_common_rating)

# Confirm that no nulls are left
df.isnull().sum()

name                 0
release_date         0
genre                0
MPAA_rating          0
MovieSuccessLevel    0
director             0
dtype: int64

In [14]:
# One-hot encode MPAA_rating and genre into indicator columns.
# Both columns are encoded in a single call; the rating indicators come
# first and the genre indicators after them.
df = pd.get_dummies(
    df,
    columns=["MPAA_rating", "genre"],
    prefix=["rating_is", "genre_is"],
)
df.head()

Unnamed: 0,name,release_date,MovieSuccessLevel,director,rating_is_G,rating_is_Not Rated,rating_is_PG,rating_is_PG-13,rating_is_R,genre_is_Action,...,genre_is_Black Comedy,genre_is_Comedy,genre_is_Concert/Performance,genre_is_Documentary,genre_is_Drama,genre_is_Horror,genre_is_Musical,genre_is_Romantic Comedy,genre_is_Thriller/Suspense,genre_is_Western
0,Recess: School's Out,2001-02-16,C,0.694,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,D2: The Mighty Ducks,1994-03-25,B,1.822,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Home on the Range,2004-04-02,C,2.787,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Young Black Stallion,2003-12-25,D,4.423,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,What's Love Got to Do With It,1993-06-09,B,0.636,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


##### Encoding the release date to a scalar

In [15]:
def date_to_float(dt):
    """Collapse a date into a single fractional-year number.

    Every month is counted as 30 days and the year as 365 days, so the
    fractional part is an approximation rather than an exact day-of-year ratio.

    :param dt: object exposing ``year``, ``month`` and ``day`` attributes
    :return: ``dt.year`` plus the approximate fraction of the year elapsed
    """
    # Days elapsed in the year, with every earlier month counted as 30 days
    approx_day_of_year = (dt.month - 1) * 30 + dt.day
    return dt.year + approx_day_of_year / 365

##### Splitting date to days and months and then converting it to a scalar via `date_to_float()`

In [16]:
# Derive the calendar features from the datetime column before it is
# replaced by its fractional-year encoding.
released = df['release_date']
calendar = released.dt

df['month'] = calendar.month
df['day'] = calendar.day
df['season'] = calendar.quarter  # 'season' holds the calendar quarter (1-4)
df['release_date'] = released.apply(date_to_float)
df.head()

Unnamed: 0,name,release_date,MovieSuccessLevel,director,rating_is_G,rating_is_Not Rated,rating_is_PG,rating_is_PG-13,rating_is_R,genre_is_Action,...,genre_is_Documentary,genre_is_Drama,genre_is_Horror,genre_is_Musical,genre_is_Romantic Comedy,genre_is_Thriller/Suspense,genre_is_Western,month,day,season
0,Recess: School's Out,2001.126027,C,0.694,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2,16,1
1,D2: The Mighty Ducks,1994.232877,B,1.822,0,0,1,0,0,0,...,0,0,0,0,0,0,0,3,25,1
2,Home on the Range,2004.252055,C,2.787,0,0,1,0,0,0,...,0,0,0,0,0,0,0,4,2,2
3,Young Black Stallion,2003.972603,D,4.423,1,0,0,0,0,0,...,0,0,0,0,0,0,0,12,25,4
4,What's Love Got to Do With It,1993.435616,B,0.636,0,0,0,0,1,0,...,0,1,0,0,0,0,0,6,9,2


##### Feature Engineering

In [18]:
# Build a "movie name -> is_animation" lookup from the voice-actors table:
# keep one row per movie name, flag it with is_animation = 1, and drop the
# voice actor / character columns that are not needed.
df_animation = (
    pd.read_csv('datasets/2/train/movie-voice-actors.csv')
    .drop_duplicates(subset=['name'], keep='first')
    .assign(is_animation=1)
    .drop(['voice_actor', 'character'], axis=1)
)

df_animation.head()

Unnamed: 0,name,is_animation
0,The Aristocats,1
1,The Return of Jafar,1
2,Aladdin,1
3,The Hunchback of Notre Dame,1
4,The Little Mermaid,1


##### Merging the animation dataset with the main dataset

In [19]:
# Left join the animation flag onto the main dataset by movie name
df = df.merge(df_animation, how='left', on='name')

# Movies with no match in the animation table get is_animation = 0
df['is_animation'] = df['is_animation'].fillna(0)

df.head()

Unnamed: 0,name,release_date,MovieSuccessLevel,director,rating_is_G,rating_is_Not Rated,rating_is_PG,rating_is_PG-13,rating_is_R,genre_is_Action,...,genre_is_Drama,genre_is_Horror,genre_is_Musical,genre_is_Romantic Comedy,genre_is_Thriller/Suspense,genre_is_Western,month,day,season,is_animation
0,Recess: School's Out,2001.126027,C,0.694,1,0,0,0,0,0,...,0,0,0,0,0,0,2,16,1,0.0
1,D2: The Mighty Ducks,1994.232877,B,1.822,0,0,1,0,0,0,...,0,0,0,0,0,0,3,25,1,0.0
2,Home on the Range,2004.252055,C,2.787,0,0,1,0,0,0,...,0,0,0,0,0,0,4,2,2,1.0
3,Young Black Stallion,2003.972603,D,4.423,1,0,0,0,0,0,...,0,0,0,0,0,0,12,25,4,0.0
4,What's Love Got to Do With It,1993.435616,B,0.636,0,0,0,0,1,0,...,1,0,0,0,0,0,6,9,2,0.0


##### Encoding the movie success level to a scalar

In [2]:
# Encoding the movie success level to a scalar
df = pd.read_csv('datasets/2/train/preprocessed.csv')

# Letter grade -> ordinal code (S maps to the largest code, D to the smallest)
success_level_codes = {'S': 5, 'A': 4, 'B': 3, 'C': 2, 'D': 1}

# The save cell further down writes the encoded frame back to this same CSV,
# so on a re-run the column is already numeric. Mapping it a second time would
# turn every label into NaN (no dictionary key matches an integer), and those
# NaNs would then be saved. Only map while the column still holds letters.
if not pd.api.types.is_numeric_dtype(df['MovieSuccessLevel']):
    df['MovieSuccessLevel'] = df['MovieSuccessLevel'].map(success_level_codes)

##### Saving the preprocessed dataset

In [3]:
# Write the frame (without the index column) to the preprocessed CSV.
# NOTE: this is the same path the encoding cell reads from, so the file is overwritten.
df.to_csv('datasets/2/train/preprocessed.csv', index=False)

##### Separating the features from the target

In [4]:
# Target: the encoded success level.
# Features: every other column except the movie title.
Y = df['MovieSuccessLevel']
X = df.drop(columns=['name', 'MovieSuccessLevel'])

##### Splitting the data into train and test sets, scaling the features, and training the classifiers

In [6]:
from sklearn import metrics
from sklearn import svm
from sklearn import tree

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Data Splits: 80% train / 20% test, shuffled with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=20, test_size=0.2, shuffle=True)

# Feature Scaling: the scaler is fitted on the training split only and then
# applied to the test split, so no test statistics leak into training.
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ------------------------------------- #

# Hyper Parameters

C = 0.00001  # SVM regularization parameter
m_degree = 7  # degree of the polynomial kernel

# NOTE(review): with this C, the four SVM variants reported exactly the same
# accuracy in the recorded run — presumably they all collapse to predicting a
# single class. Confirm, and consider tuning C (e.g. with a grid search).

# ------------------------------------- #

# Models (unfitted). random_state is pinned wherever the estimator accepts it
# so that the reported accuracies are reproducible between runs.
poly_model = svm.SVC(kernel='poly', degree=m_degree, C=C)
linear_svc = svm.LinearSVC(C=C, random_state=0)
rbf_svc = svm.SVC(kernel='rbf', C=C)
linear_kernel_svc = svm.SVC(kernel='linear', C=C)
logistic_regression_model = LogisticRegression(random_state=0)
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)

# ------------------------------------- #

# Fit each model on the training split and report its test accuracy.
# The estimators are fitted in place, so the variables above hold the
# trained models after this loop.
for label, model in [
    ("Poly", poly_model),
    ("linear", linear_svc),
    ("rbf", rbf_svc),
    ("Linear kernel", linear_kernel_svc),
    ("Logistic Regression", logistic_regression_model),
    ("Decision Tree", clf),
]:
    model.fit(X_train, Y_train)
    p = model.predict(X_test)
    print("Accuracy " + label + ":", metrics.accuracy_score(Y_test, p), '\n')


Accuracy Poly: 0.40860215053763443 

Accuracy linear: 0.40860215053763443 

Accuracy rbf: 0.40860215053763443 

Accuracy Linear kernel: 0.40860215053763443 

Accuracy Logistic Regression: 0.3548387096774194 

Accuracy Decision Tree: 0.43010752688172044 

