# Movie Success Level Classification

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from datetime import datetime as dt

# preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import make_column_transformer

# models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import sklearn.metrics


### Configurations

In [None]:
# Notebook-wide display defaults: wide figures drawn on the dark theme.
plt.rcParams.update({"figure.figsize": (20, 6)})
plt.style.use("dark_background")
# Separator appended after each printed block of model results.
ENDDEL = "".join(["\n\t", "---" * 15, "\n"])
# Show floats with thousands separators and no decimals in pandas output.
pd.set_option("display.float_format", "{:,.0f}".format)


### Data Loading

In [None]:
def load_data(train):
    """Load the director, voice-actor and success-level CSVs and merge them.

    Parameters
    ----------
    train : bool
        Truthy to read from ``Training_data/``, falsy to read from
        ``Testing_data/``.

    Returns
    -------
    pandas.DataFrame
        Outer join of the three tables on ``Movie_Title``; a movie that is
        missing from one of the files is kept, with NaN in the columns that
        file would have provided. ``Release_Date`` holds parsed datetimes.
    """
    directory = "Training_data/" if train else "Testing_data/"
    base = directory + "Classification_data/"

    # header=0 discards each file's own header row in favour of `names`.
    directors = pd.read_csv(base + "movie-director.csv", header=0, names=["Movie_Title", "Director"])
    actors = pd.read_csv(base + "movie-voice-actors.csv", header=0, names=["Character", "Actor", "Movie_Title"])
    success_lvl = pd.read_csv(
        base + "movie-success-level.csv",
        header=0,
        names=["Movie_Title", "Release_Date", "Genre", "MPAA_Rating", "Success_Level"],
    )

    success_lvl["Release_Date"] = pd.to_datetime(success_lvl["Release_Date"], format="%d-%b-%y")

    # Fix incorrect year parsing: a two-digit year can be resolved to a date
    # in the future, which is impossible for a released movie, so such dates
    # are moved back one century. The current year is read once rather than
    # once per row; missing dates (NaT) fail the comparison and pass through.
    current_year = dt.today().year
    success_lvl["Release_Date"] = success_lvl["Release_Date"].apply(
        lambda d: d.replace(year=d.year - 100) if d.year > current_year else d
    )

    cast_and_success = pd.merge(actors, success_lvl, on="Movie_Title", how="outer")
    return pd.merge(directors, cast_and_success, on="Movie_Title", how="outer")


# Load the merged training tables and preview the first rows.
original_data = load_data(True)
original_data.head()


### Data Exploration

In [None]:
# Summary statistics for every column except the release date.
original_data.drop(["Release_Date"], axis=1).describe()


In [None]:
# Summary of the release dates, treated as numeric values.
# NOTE(review): `datetime_is_numeric` appears to exist only in pandas 1.1-1.5
# and to have been removed in pandas 2.0 - confirm the pinned pandas version.
original_data.Release_Date.describe(datetime_is_numeric=True)


In [None]:
# Number of rows per success level (class balance).
original_data.Success_Level.value_counts()


In [None]:
# Number of missing values in each column.
original_data.isna().sum()


### Data Cleaning

In [None]:
def can_fill_na(x, data=None):
    """Tell whether every missing value in column ``x`` could be recovered.

    A missing value counts as recoverable when another row with the same
    ``Movie_Title`` holds a non-missing value for ``x``.

    Parameters
    ----------
    x : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``original_data``.

    Returns
    -------
    bool
        True when each title that has a missing ``x`` also appears in a row
        where ``x`` is present (trivially True when nothing is missing).
    """
    if data is None:
        data = original_data

    titles_with_value = set(data.loc[data[x].notna(), "Movie_Title"])
    titles_missing_value = set(data.loc[data[x].isna(), "Movie_Title"])
    # Same condition as the former len(a) - len(b) == len(a - b) check,
    # stated directly: every title with a gap also occurs with a value.
    return titles_missing_value <= titles_with_value


In [None]:
# For each column: does every title with a missing value also appear in a
# row where that value is present (so the gap could be filled from it)?
print(can_fill_na("Success_Level"))
print(can_fill_na("Director"))
print(can_fill_na("Genre"))
print(can_fill_na("MPAA_Rating"))


In [None]:
# Rows without a Success_Level label are removed.
original_data.dropna(subset=["Success_Level"], inplace=True)


In [None]:
# Character and Actor are treated as noise features and removed.
original_data.drop(["Character", "Actor"], axis=1, inplace=True)


In [None]:
# Display one randomly sampled row per movie title (nothing is assigned).
original_data.set_index("Movie_Title").groupby("Movie_Title").sample(1)


In [None]:
# Display a random half of each title's rows (nothing is assigned).
original_data.set_index("Movie_Title").groupby("Movie_Title").sample(frac=0.5)


In [None]:
# drop_duplicates() returns a new frame, so the result has to be kept for the
# de-duplication to take effect. With Character/Actor removed, a movie that
# was merged with several voice-actor rows is otherwise left as several
# identical rows, which can put the same sample in both the fitting and the
# validation part of a cross-validation split.
original_data.drop_duplicates(inplace=True)
original_data


In [None]:
# Count the missing values that remain in each column.
original_data.isna().sum()


In [None]:
# Fill the gaps through the frame itself. Calling fillna(inplace=True) on a
# column accessor is chained assignment: recent pandas versions warn about it
# and, under copy-on-write, it no longer updates the parent frame.
original_data.fillna(
    {"Director": "Other", "Genre": "Other", "MPAA_Rating": "Not Rated"},
    inplace=True,
)


### Feature Engineering

In [None]:
# Show the column labels currently present in the frame.
original_data.columns


In [None]:
def generate_features(data, recent_threshold=60, current_year=None):
    """Encode the target and derive release-date features, in place.

    ``Success_Level`` is mapped to integers, ``Release_Date`` is split into
    day/month/year columns and then dropped, and a boolean ``Recent_Movie``
    column is added.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Success_Level`` column and a datetime ``Release_Date``
        column. It is modified in place.
    recent_threshold : int, optional
        A movie is flagged as recent when it was released fewer than this
        many years before ``current_year``. Defaults to 60.
    current_year : int, optional
        Reference year for ``Recent_Movie``. Defaults to the year in which
        the function is called; pass a fixed value to get the same features
        regardless of when the code is run.

    Returns
    -------
    None
    """
    if current_year is None:
        current_year = dt.today().year

    # Integer codes for the grades; any other label becomes NaN.
    data["Success_Level"] = data["Success_Level"].map({"S": 0, "A": 1, "B": 2, "C": 3, "D": 4})

    release = data["Release_Date"]
    data["Release_Day"] = release.dt.day
    data["Release_Month"] = release.dt.month
    data["Release_Year"] = release.dt.year

    # Age in years compared against the threshold; a missing year gives False.
    data["Recent_Movie"] = (current_year - data["Release_Year"]) < recent_threshold

    data.drop(["Release_Date"], axis=1, inplace=True)


In [None]:
# Add the engineered columns to the training frame (modifies it in place).
generate_features(original_data)


In [None]:
# Preview the frame after feature engineering.
original_data.head()


In [None]:
# Column groups used by the plots below. Success_Level is listed with the
# categoricals so that its distribution is plotted alongside them.
categorical_features = ["Success_Level", "Director", "Genre", "MPAA_Rating", "Recent_Movie"]
numerical_features = ["Release_Day", "Release_Month", "Release_Year"]


### Visualization

#### Distributions

##### Categoricals

In [None]:
# Histogram of every categorical column except the last one, laid out
# row by row on a 2x2 grid.
_, ax = plt.subplots(2, 2, figsize=(20, 10))
for position, feature in enumerate(categorical_features[:-1]):
    row, col = divmod(position, 2)
    sb.histplot(original_data[feature], ax=ax[row, col])


##### Numericals

In [None]:
# Histogram of each numerical column, laid out row by row on a 2x2 grid.
_, ax = plt.subplots(2, 2, figsize=(20, 10))
for position, feature in enumerate(numerical_features):
    row, col = divmod(position, 2)
    sb.histplot(original_data[feature], ax=ax[row, col])


#### Using single features to predict success level

In [None]:
# Each cell below plots one feature along x with every point at y=0 and
# colours the points by Success_Level, to see whether that feature alone
# separates the classes.
sb.scatterplot(x=original_data.Release_Day, y=np.zeros(original_data.shape[0]), hue=original_data.Success_Level)


In [None]:
# Release month vs. success level.
sb.scatterplot(x=original_data.Release_Month, y=np.zeros(original_data.shape[0]), hue=original_data.Success_Level)


In [None]:
# Release year vs. success level.
sb.scatterplot(x=original_data.Release_Year, y=np.zeros(original_data.shape[0]), hue=original_data.Success_Level)


In [None]:
# Genre vs. success level.
sb.scatterplot(x=original_data.Genre, y=np.zeros(original_data.shape[0]), hue=original_data.Success_Level)


In [None]:
# MPAA rating vs. success level.
sb.scatterplot(x=original_data.MPAA_Rating, y=np.zeros(original_data.shape[0]), hue=original_data.Success_Level)


In [None]:
# Director vs. success level.
sb.scatterplot(x=original_data.Director, y=np.zeros(original_data.shape[0]), hue=original_data.Success_Level)


##### Conclusion: No single feature obviously separates the success levels on its own

#### Using pairs of features to predict success level

In [None]:
# Each cell below plots one pair of features against each other and colours
# the points by Success_Level.
sb.stripplot(x=original_data.Release_Year, y=original_data.MPAA_Rating, hue=original_data.Success_Level)


In [None]:
# Director vs. MPAA rating.
sb.stripplot(x=original_data.Director, y=original_data.MPAA_Rating, hue=original_data.Success_Level)


In [None]:
# Genre vs. MPAA rating.
sb.stripplot(x=original_data.Genre, y=original_data.MPAA_Rating, hue=original_data.Success_Level)


In [None]:
# Genre vs. director.
sb.stripplot(x=original_data.Genre, y=original_data.Director, hue=original_data.Success_Level)


### Data Preprocessing

In [None]:
# Preview the frame that goes into preprocessing.
original_data.head()


In [None]:
# Split features and target.
# Movie_Title is only an identifier and Success_Level is the label, so
# neither belongs in the feature frame.
x_train = original_data.drop(["Movie_Title", "Success_Level"], axis=1)
y_train = original_data["Success_Level"]

x_train.head()


In [None]:
# One-hot encode the categorical columns and min-max scale the date parts.
# Only the columns named here are passed on; with the default `remainder`
# setting every other column is dropped.
column_transformer = make_column_transformer(
    (
        # handle_unknown="ignore": a category that was not seen while fitting
        # (for example a director who only occurs in the test set) is encoded
        # as all zeros instead of raising an error in transform().
        OneHotEncoder(drop="first", handle_unknown="ignore"),
        ["Genre", "Director", "MPAA_Rating", "Recent_Movie"],
    ),
    (MinMaxScaler(), ["Release_Day", "Release_Month", "Release_Year"]),
    n_jobs=-1,
    # A threshold of 0 makes the transformer return a dense array.
    sparse_threshold=0,
    verbose_feature_names_out=False,
)


In [None]:
# Fit the encoder and scaler on the training features and transform them.
# NOTE(review): the transformer is fitted on all training rows before the
# cross-validation below, so each validation fold has already contributed to
# the fitted encoding/scaling - consider a Pipeline if that matters.
prep_x_train = column_transformer.fit_transform(x_train)


In [None]:
# (rows, encoded feature columns)
prep_x_train.shape


### Basic Models

In [None]:
# Baseline: 10-fold cross-validation score of each classifier with (mostly)
# default hyper-parameters. Each block prints the per-fold scores, their
# mean and the separator.

# Naive Bayes
nb_model = GaussianNB()
nb_cv = cross_val_score(nb_model, prep_x_train, y_train, cv=10)
print("Naive Bayes CV", nb_cv, nb_cv.mean(), sep="\n", end=ENDDEL)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_cv = cross_val_score(lr_model, prep_x_train, y_train, cv=10)
print("Logistic Regression CV", lr_cv, lr_cv.mean(), sep="\n", end=ENDDEL)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=7)
dt_cv = cross_val_score(dt_model, prep_x_train, y_train, cv=10)
print("Decision Tree CV", dt_cv, dt_cv.mean(), sep="\n", end=ENDDEL)

# Support Vector Machine
# probability=True enables predict_proba; the probability estimates are
# produced with randomly shuffled data, so the estimator is seeded with the
# same value as the other stochastic models to keep results reproducible.
svc_model = SVC(probability=True, random_state=7)
svc_cv = cross_val_score(svc_model, prep_x_train, y_train, cv=10)
print("Support Vector Machine CV", svc_cv, svc_cv.mean(), sep="\n", end=ENDDEL)

# K Nearest Neighbour
knn_model = KNeighborsClassifier()
knn_cv = cross_val_score(knn_model, prep_x_train, y_train, cv=10)
print("K Nearest Neighbour CV", knn_cv, knn_cv.mean(), sep="\n", end=ENDDEL)


### Ensemble Models

In [None]:
# 10-fold cross-validation scores for the ensemble models; each block prints
# the per-fold scores, their mean and the separator.

# Gradient Boost
xgb_model = XGBClassifier(random_state=7)
xgb_cv = cross_val_score(xgb_model, prep_x_train, y_train, cv=10)
print("Gradient Boost CV", xgb_cv, xgb_cv.mean(), sep="\n", end=ENDDEL)

# Random forest
rf_model = RandomForestClassifier(random_state=7)
rf_cv = cross_val_score(rf_model, prep_x_train, y_train, cv=10)
print("Random forest CV", rf_cv, rf_cv.mean(), sep="\n", end=ENDDEL)

# Voting classifier
# Combines all seven models defined so far; soft voting works from the
# models' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[
        ("nb_model", nb_model),
        ("lr_model", lr_model),
        ("dt_model", dt_model),
        ("svc_model", svc_model),
        ("knn_model", knn_model),
        ("rf_model", rf_model),
        ("xgb_model", xgb_model),
    ],
    voting="soft",
)

vc_cv = cross_val_score(voting_clf, prep_x_train, y_train, cv=10)
print("Voting Classifier CV", vc_cv, vc_cv.mean(), sep="\n", end=ENDDEL)


### Grid Search

##### Logistic Regression

In [None]:
# Logistic regression: 10-fold grid search over the iteration cap, the
# penalty type and the regularisation strength C, using the liblinear solver.
lr_gs = GridSearchCV(
    estimator=lr_model,
    param_grid={
        "max_iter": [2000, 4000, 6000],
        "penalty": ["l1", "l2"],
        "C": np.logspace(-4, 4, 20),
        "solver": ["liblinear"],
    },
    cv=10,
    verbose=1,
    n_jobs=-1,
)
lr_gs.fit(prep_x_train, y_train)
print("best score:", lr_gs.best_score_)
print("best parameters:", lr_gs.best_params_)


##### KNN

In [None]:
# K nearest neighbours: 10-fold grid search over the neighbour count, the
# vote weighting, the search algorithm and the Minkowski power p.
knn_gs = GridSearchCV(
    estimator=knn_model,
    param_grid={
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree"],
        "p": [1, 2],
    },
    cv=10,
    verbose=1,
    n_jobs=-1,
)
knn_gs.fit(prep_x_train, y_train)
print("best score:", knn_gs.best_score_)
print("best parameters:", knn_gs.best_params_)


##### SVC

In [None]:
# Support vector machine: 10-fold grid search with one sub-grid per kernel,
# so that each kernel is only combined with the parameters listed for it.
svc_gs = GridSearchCV(
    estimator=svc_model,
    param_grid=[
        {
            "kernel": ["rbf"],
            "gamma": [0.1, 0.5, 1, 5, 10],
            "C": [0.1, 1, 10, 100],
        },
        {
            "kernel": ["linear"],
            "C": [0.1, 1, 10, 100],
        },
        {
            "kernel": ["poly"],
            "degree": [2, 3],
            "C": [0.1, 1, 10, 100],
        },
    ],
    cv=10,
    verbose=1,
    n_jobs=-1,
)
svc_gs.fit(prep_x_train, y_train)
print("best score:", svc_gs.best_score_)
print("best parameters:", svc_gs.best_params_)


##### Random Forest

In [None]:
# Random forest: 10-fold grid search over forest size, split criterion,
# tree depth, features per split and the leaf/split minimum sample counts.
rf_gs = GridSearchCV(
    estimator=rf_model,
    param_grid={
        "n_estimators": [400, 500, 550],
        "criterion": ["gini", "entropy"],
        "bootstrap": [True],
        "max_depth": [15, 20],
        "max_features": ["sqrt", 10],
        "min_samples_leaf": [2, 3],
        "min_samples_split": [2, 3],
    },
    cv=10,
    verbose=True,
    n_jobs=-1,
)
rf_gs.fit(prep_x_train, y_train)
print("best score:", rf_gs.best_score_)
print("best parameters:", rf_gs.best_params_)


##### XGB

In [None]:
# XGBoost: 5-fold grid search (the other searches use 10 folds). Several
# parameters are pinned to a single value and only the rest are varied.
xgb_gs = GridSearchCV(
    estimator=xgb_model,
    param_grid={
        "n_estimators": [450, 500],
        "colsample_bytree": [0.75, 0.8],
        "max_depth": [None],
        "reg_alpha": [1],
        "reg_lambda": [2, 5, 10],
        "subsample": [0.55, 0.6],
        "learning_rate": [0.5],
        "gamma": [0.5, 1],
        "min_child_weight": [0.01],
        "sampling_method": ["uniform"],
    },
    cv=5,
    verbose=True,
    n_jobs=-1,
)
xgb_gs.fit(prep_x_train, y_train)
print("best score:", xgb_gs.best_score_)
print("best parameters:", xgb_gs.best_params_)


### Models Testing

#### Test Data Preprocessing 

In [None]:
# Load the merged tables from the Testing_data/ directory.
test_data = load_data(False)


In [None]:
# Preview the raw test frame.
test_data.head()


In [None]:
# Same noise features that are removed from the training frame.
test_data.drop(["Actor", "Character"], axis=1, inplace=True)
# Mirror the training-set cleaning: the outer merges in load_data can leave
# rows without a Success_Level, and a row without a label cannot be scored.
test_data.dropna(subset=["Success_Level"], inplace=True)


In [None]:
# Fill the gaps through the frame itself. Calling fillna(inplace=True) on a
# column accessor is chained assignment: recent pandas versions warn about it
# and, under copy-on-write, it no longer updates the parent frame.
test_data.fillna(
    {"Director": "Other", "Genre": "Other", "MPAA_Rating": "Not Rated"},
    inplace=True,
)


In [None]:
# Count the missing values left in each column of the test frame.
# NOTE(review): no drop_duplicates() is applied to the test frame, so a movie
# merged with several voice-actor rows may be counted several times in the
# test score - confirm this is intended.
test_data.isna().sum()


In [None]:
# Add the engineered columns to the test frame (modifies it in place).
generate_features(test_data)


In [None]:
# Preview the test frame after feature engineering.
test_data.head()


In [None]:
# Reuse the transformer fitted on the training features (transform only, no
# refit) so the test columns are encoded and scaled the same way.
x_test = column_transformer.transform(test_data)
y_test = test_data["Success_Level"]


In [None]:
# (rows, encoded feature columns)
x_test.shape


In [None]:
# Fitted grid searches to evaluate, keyed by the label printed with each score.
models = dict(
    xgb_gs=xgb_gs,
    rf_gs=rf_gs,
    svc_gs=svc_gs,
    knn_gs=knn_gs,
    lr_gs=lr_gs,
)


def test_model(model_name, model):
    """Print ``model_name`` and the model's score on ``x_test``/``y_test``."""
    test_score = model.score(x_test, y_test)
    print(model_name, test_score, end=ENDDEL)


for label, searcher in models.items():
    test_model(label, searcher)
