# Using Neural Network to Predict Oscar Nominations

## Download and preprocess dataset
- download csv file
- convert csv into pandas DataFrame

In [1]:
# Download the dataset and save as a DataFrame
!gdown 1aG31LxOye3ZLvSme_SODJdfgaftUG-3D

Downloading...
From: https://drive.google.com/uc?id=1aG31LxOye3ZLvSme_SODJdfgaftUG-3D
To: /content/merged_all_cleaned.csv
  0% 0.00/26.4M [00:00<?, ?B/s] 91% 24.1M/26.4M [00:00<00:00, 237MB/s]100% 26.4M/26.4M [00:00<00:00, 242MB/s]


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [3]:
# dataset_df = pd.read_csv('/content/merged_all_cleaned.csv')
dataset_df = pd.read_csv('./merged_all_cleaned.csv')
dataset_df

Unnamed: 0,primaryTitle,imdb_id,releaseYear,releaseMonth01,releaseMonth02,releaseMonth03,releaseMonth04,releaseMonth05,releaseMonth06,releaseMonth07,...,oscars_nominee_VISUAL EFFECTS,oscars_winner_VISUAL EFFECTS,oscars_nominee_ANIMATED FEATURE FILM,oscars_winner_ANIMATED FEATURE FILM,oscars_nominee_WRITING (Adapted Screenplay),oscars_winner_WRITING (Adapted Screenplay),oscars_nominee_PRODUCTION DESIGN,oscars_winner_PRODUCTION DESIGN,oscars_nominee_DOCUMENTARY SHORT FILM,oscars_winner_DOCUMENTARY SHORT FILM
0,toy story,114709,1995,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,jumanji,113497,1995,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,grumpier old men,113228,1995,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,waiting to exhale,114885,1995,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,father of the bride part ii,113041,1995,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44822,caged heat 3000,112613,1995,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44823,robin hood,102797,1991,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44824,century of birthing,2028550,2011,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44825,betrayal,303758,2003,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Separate dataset into features and labels
- Extract identifiers (imdb, primaryTitle)
- Extract features (X)
- Extract labels, columns containing "oscars_nominee" (y)

In [4]:
# Identifiers: "imdb_id" and "primaryTitle"
identifiers_df = dataset_df[['imdb_id', 'primaryTitle', 'releaseYear']]

identifiers_df

Unnamed: 0,imdb_id,primaryTitle,releaseYear
0,114709,toy story,1995
1,113497,jumanji,1995
2,113228,grumpier old men,1995
3,114885,waiting to exhale,1995
4,113041,father of the bride part ii,1995
...,...,...,...
44822,112613,caged heat 3000,1995
44823,102797,robin hood,1991
44824,2028550,century of birthing,2011
44825,303758,betrayal,2003


In [5]:
# Features: drop "primaryTitle" and columns containing "oscars_nominee" or "oscars_winner"
# "releaseYear" is kept and used for splitting training/validation/testing sets
X_df = dataset_df.drop(dataset_df.filter(like='oscars_nominee').columns, axis=1)
X_df = X_df.drop(X_df.filter(like='oscars_winner').columns, axis=1)
X_df = X_df.drop(['imdb_id', 'primaryTitle'], axis=1)

X_df

Unnamed: 0,releaseYear,releaseMonth01,releaseMonth02,releaseMonth03,releaseMonth04,releaseMonth05,releaseMonth06,releaseMonth07,releaseMonth08,releaseMonth09,...,sag_nominee_SUPPORTING ROLE - FEMALE,sag_nominee_LEADING ROLE - MALE,sag_nominee_MALE SUPPORT IN A MOTION PICTURE,sag_nominee_SUPPORTING ROLE - MALE,sag_winner_CAST IN A MOTION PICTURE,sag_winner_LEADING ROLE - FEMALE,sag_winner_SUPPORTING ROLE - FEMALE,sag_winner_LEADING ROLE - MALE,sag_winner_MALE SUPPORT IN A MOTION PICTURE,sag_winner_SUPPORTING ROLE - MALE
0,1995,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1995,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1995,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1995,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1995,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44822,1995,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44823,1991,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44824,2011,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44825,2003,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Target Variables: filter columns containing "oscar_nominees"
# and remove "oscars_nominee_" from column names
y_df = dataset_df.filter(like='oscars_nominee')
y_df.columns = y_df.columns.str.replace('oscars_nominee_', '')

# Remove columns which are all zeroes
y_df = y_df.loc[:, (y_df != 0).any(axis=0)].astype(int)

# Sum up each column
print(y_df.sum())

# # Keep only the first 3 columns
# y_df = y_df.iloc[:, :2]

y_df

CINEMATOGRAPHY                  277
DIRECTING                       397
FILM EDITING                    380
ACTOR IN A SUPPORTING ROLE      367
ACTRESS IN A SUPPORTING ROLE    348
COSTUME DESIGN                  224
SOUND                           206
BEST PICTURE                    294
ACTOR IN A LEADING ROLE         195
ACTRESS IN A LEADING ROLE       175
VISUAL EFFECTS                  122
ANIMATED FEATURE FILM            58
PRODUCTION DESIGN                29
dtype: int64


Unnamed: 0,CINEMATOGRAPHY,DIRECTING,FILM EDITING,ACTOR IN A SUPPORTING ROLE,ACTRESS IN A SUPPORTING ROLE,COSTUME DESIGN,SOUND,BEST PICTURE,ACTOR IN A LEADING ROLE,ACTRESS IN A LEADING ROLE,VISUAL EFFECTS,ANIMATED FEATURE FILM,PRODUCTION DESIGN
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44822,0,0,0,0,0,0,0,0,0,0,0,0,0
44823,0,0,0,0,0,0,0,0,0,0,0,0,0
44824,0,0,0,0,0,0,0,0,0,0,0,0,0
44825,0,0,0,0,0,0,0,0,0,0,0,0,0


## Separate dataset into training, validation, and testing sets
- Training data: 1927 to 2010 (84 years)
- Validation data: 2011 to 2012 (2 years)
- Testing data: 2013 to 2017 (5 years)

In [7]:
train_mask = (X_df['releaseYear'] >= 1927) & (X_df['releaseYear'] <= 2010)
val_mask = (X_df['releaseYear'] >= 2011) & (X_df['releaseYear'] <= 2012)
test_mask = (X_df['releaseYear'] >= 2013) & (X_df['releaseYear'] <= 2017)

# Apply masks on identifiers, features, and target variables
identifiers_df_train = identifiers_df[train_mask]
identifiers_df_val = identifiers_df[val_mask]
identifiers_df_test = identifiers_df[test_mask]

X_df_train = X_df[train_mask]
X_df_val = X_df[val_mask]
X_df_test = X_df[test_mask]

y_df_train = y_df[train_mask]
y_df_val = y_df[val_mask]
y_df_test = y_df[test_mask]

# Check shape of DataFrame
print("Identifiers:")
print("------------")
print("Training Set  ", identifiers_df_train.shape)
print("Validation Set", identifiers_df_val.shape)
print("Testing Set   ", identifiers_df_test.shape)

print("\nFeatures:")
print("---------")
print("Training Set  ", X_df_train.shape)
print("Validation Set", X_df_val.shape)
print("Testing Set   ", X_df_test.shape)

print("\nLabels:")
print("-------")
print("Training Set  ", y_df_train.shape)
print("Validation Set", y_df_val.shape)
print("Testing Set   ", y_df_test.shape)

Identifiers:
------------
Training Set   (33514, 3)
Validation Set (3395, 3)
Testing Set    (7912, 3)

Features:
---------
Training Set   (33514, 115)
Validation Set (3395, 115)
Testing Set    (7912, 115)

Labels:
-------
Training Set   (33514, 13)
Validation Set (3395, 13)
Testing Set    (7912, 13)


## Feature scaling
- Create a transformer to scale the inputs
  - Standardize continuous features by removing the mean and scaling to unit variance
  - Binary features scales to [-1, 1] so that the data is also  centred around 0



In [8]:
# Determine which features are binary and which are continuous
drop_features = ['releaseYear']
binary_features = []
continuous_features = []

# Iterate through each column in X_df
for column in X_df.columns:
    unique_values = X_df[column].unique()

    # Check if the number of unique values is at most 2, indicating a binary feature
    if len(unique_values) == 1:
        drop_features.append(column)
    elif len(unique_values) <= 2:
        binary_features.append(column)
    elif column != 'releaseYear':
        continuous_features.append(column)

print(f"Dropping {len(drop_features)} features:")
print(drop_features)

print("\nBinary Features:")
print(binary_features)

print("\nContinuous Features:")
print(continuous_features)

Dropping 6 features:
['releaseYear', 'genreTalk-Show', 'genreGame-Show', 'genreShort', 'sag_nominee_MALE SUPPORT IN A MOTION PICTURE ', 'sag_winner_MALE SUPPORT IN A MOTION PICTURE ']

Binary Features:
['releaseMonth01', 'releaseMonth02', 'releaseMonth03', 'releaseMonth04', 'releaseMonth05', 'releaseMonth06', 'releaseMonth07', 'releaseMonth08', 'releaseMonth09', 'releaseMonth10', 'releaseMonth11', 'releaseMonth12', 'isAdult', 'languageEnglish', 'languageOther', 'productionCompanyWarnerBros', 'productionCompanyMetroGoldwynMayerMGM', 'productionCompanyParamountPictures', 'productionCompanyTwentiethCenturyFoxFilmCorporation', 'productionCompanyUniversalPictures', 'productionCompanyColumbiaPicturesCorporation', 'productionCompanyCanal', 'productionCompanyColumbiaPictures', 'productionCompanyRKORadioPictures', 'productionCompanyNewLineCinema', 'productionCompanyOther', 'productionCountryUnitedStatesofAmerica', 'productionCountryUnitedKingdom', 'productionCountryFrance', 'productionCountryGe

In [9]:
# Apply StandardScaler to continuous features
# Remove the mean and scale to unit variance
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_features', 'drop', drop_features),
        ('continuous_features', StandardScaler(), continuous_features),
        ('binary_features', MinMaxScaler(feature_range=(-1, 1)), binary_features)
        ],
    remainder='drop',
    verbose_feature_names_out=False,
)

## Train neural network model
- Create a pipeline to incorporate the transofmer
- Wrap a multi-layer perceptron classicier in a multi-target classifier

In [39]:
# Parameters to test out
hidden_layers = [(33, 33, 33), (50, 50), (100,)]
activations = ['logistic', 'relu']
alphas = [0.001, 0.01, 0.1]
max_iters = [2000]
solver = 'lbfgs'
random_state = 42

# hidden_layers = [(100,)]
# activations = ['relu']
# alphas = [0.001]
# max_iters = [1000]

In [40]:
top_count_dict = {
    '2010': 10,
    '2011': 9,
    '2012': 9,
    '2013': 9,
    '2014': 8,
    '2015': 7,
    '2016': 9,
    '2017': 9,
    '2018': 8,
    '2019': 9,
    '2020': 8,
    '>2020': 10,
}

# Define function to predict classes based on a threshold by year
def predict_by_year(model, X_df, y_df, identifiers_df, top_count=5):
    '''
    predict_by_year(model, X_df, y_df, identifiers_df, top_count=5)
    Predicts classes based on a threshold by year

    Parameters
    ----------
    model : Pipeline
        The model pipeline to use for prediction
    X_df : DataFrame
        The features DataFrame
    y_df : DataFrame
        The labels DataFrame
    identifiers_df : DataFrame
        The identifiers DataFrame
    top_count : int
        The number of top predictions to return

    Returns
    -------
    y_pred : DataFrame
        The predicted labels
    result_df : DataFrame
        The result DataFrame
    y_df_with_identifier : DataFrame
        The labels DataFrame with identifier
    '''
    # Preprocess the data
    X_df_transformed = pd.DataFrame(model.named_steps['preprocessor'].transform(X_df))

    # Extract class names
    class_names = y_df.columns.to_numpy()

    # Create a result table where each row is a category
    result_df = pd.DataFrame(class_names, columns=['category'])
    result_columns = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'TP', 'FP', 'FN', 'TN']
    result_df[result_columns] = 0

    # Create a new y_test with identifier
    y_df_with_identifier = identifiers_df.copy()
    unique_years = y_df_with_identifier['releaseYear'].unique()


    # Create a dataframe for prediction
    y_pred = pd.DataFrame(index=y_df.index)

    # Iterate through each category
    for i, category in enumerate(class_names):

        # Find probability of class 1.0
        y_pred_proba = model.named_steps['multi_target_classifier'].estimators_[i].predict_proba(X_df_transformed)[:, 1]

        # Append the probabilities to the identity dataframes
        y_df_with_identifier[category] = y_pred_proba

        # Create a new column for predictions
        y_df_with_identifier[f'{category}_prediction'] = 0

        # Iterate through each year
        for year in unique_years:
            # Change top_count if category is best_picture and year is 2010 or later
            if category == 'best_picture' and year >= 2010:
                if year > 2020:
                    top_count = top_count_dict['>2020']
                else:
                    top_count = top_count_dict[str(year)]

            # Extract probabilities and imdb_id for the current year
            year_mask = y_df_with_identifier['releaseYear'] == year
            y_pred_proba_year_df = y_df_with_identifier[year_mask]

            # Find the imdb of the top probabilities
            top_imdb_ids = y_pred_proba_year_df.sort_values(by=category, ascending=False).head(top_count)['imdb_id'].to_numpy()

            # Set prediction to 1 if imdb matches
            y_df_with_identifier.loc[y_df_with_identifier['imdb_id'].isin(top_imdb_ids), f'{category}_prediction'] = 1

            y_pred[category] = y_df_with_identifier[f'{category}_prediction']

        y_true = y_df.iloc[:, i]
        y_pred_category = y_pred[category]

        # Calculate metrics
        labels = [1.0, 0.0]

        try:
            accuracy_value = accuracy_score(y_true, y_pred_category)
            balanced_accuracy_value = balanced_accuracy_score(y_true, y_pred_category)
            precision_value = precision_score(y_true, y_pred_category, labels=labels, zero_division=0)
            recall_value = recall_score(y_true, y_pred_category, labels=labels, zero_division=0)
            f1_score_value = f1_score(y_true, y_pred_category, labels=labels, zero_division=0)
            roc_auc_value = roc_auc_score(y_true, y_pred_category, labels=labels)
            matrix = confusion_matrix(y_true, y_pred_category, labels=labels)
        except ValueError:
            continue

        # Update the result_df with the calculated metrics
        result_df.loc[result_df['category'] == category, 'accuracy'] = accuracy_value
        result_df.loc[result_df['category'] == category, 'balanced_accuracy'] = balanced_accuracy_value
        result_df.loc[result_df['category'] == category, 'precision'] = precision_value
        result_df.loc[result_df['category'] == category, 'recall'] = recall_value
        result_df.loc[result_df['category'] == category, 'f1_score'] = f1_score_value
        result_df.loc[result_df['category'] == category, 'roc_auc'] = roc_auc_value
        result_df.loc[result_df['category'] == category, 'TP'] = matrix[0][0]
        result_df.loc[result_df['category'] == category, 'FP'] = matrix[0][1]
        result_df.loc[result_df['category'] == category, 'FN'] = matrix[1][0]
        result_df.loc[result_df['category'] == category, 'TN'] = matrix[1][1]

    return y_pred, result_df, y_df_with_identifier

In [41]:
# Train the model using all permutations of hyperparameters and find the best model

# Set up the parameter grid to search without GridSearchCV
best_model = None
best_parameters = {'hidden_layer_sizes': None, 'activation': None, 'alpha': None}
best_metrics = {'accuracy': 0, 'balanced_accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0, 'roc_auc': 0}
training_results_df = pd.DataFrame(columns=['hidden_layer_sizes', 'activation', 'alpha', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

# Create a list of all permutations of hyperparameters
permutations = [(hidden_layer_size, activation, alpha, max_iter) for hidden_layer_size in hidden_layers for activation in activations for alpha in alphas for max_iter in max_iters]

# Iterate through each combination of hyperparameters using tqdm
for hidden_layer_size, activation, alpha, max_iter in tqdm(permutations):
    # Create a multi-layer perceptron classifier
    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_layer_size,
        activation=activation,
        solver=solver,
        alpha=alpha,
        max_iter=max_iter,
        random_state=random_state,
        verbose=True,
        )

    # Wrap MLP classifier in a multi-target classifier object
    multi_target_classifier = MultiOutputClassifier(classifier)

    # Create model pipeline
    model = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('multi_target_classifier', multi_target_classifier)
            ]
        )

    # Fit the model
    model.fit(X_df_train, y_df_train)

    # Predict classes
    y_pred, result_df, _ = predict_by_year(model, X_df_val, y_df_val, identifiers_df_val)

    # Extract metrics from result_df
    accuracy_value = result_df['accuracy'].mean()
    balanced_accuracy_value = result_df['balanced_accuracy'].mean()
    precision_value = result_df['precision'].mean()
    recall_value = result_df['recall'].mean()
    f1_score_value = result_df['f1_score'].mean()
    roc_auc_value = result_df['roc_auc'].mean()

    # Save training results to DataFrame
    new_row = {
        'hidden_layer_sizes': str(hidden_layer_size),
        'activation': activation,
        'alpha': alpha,
        'accuracy': accuracy_value,
        'balanced_accuracy': balanced_accuracy_value,
        'precision': precision_value,
        'recall': recall_value,
        'f1': f1_score_value,
        'roc_auc': roc_auc_value
    }
    training_results_df = training_results_df.append(new_row, ignore_index=True)

    # Print metrics
    print()
    print("-" * 20)
    print("Hidden Layer Size:", hidden_layer_size)
    print("Activation:", activation)
    print("Alpha:", alpha)
    print("-" * 20)
    print("Accuracy:", accuracy_value, "(Best:", best_metrics['accuracy'], ")")
    print("Balanced Accuracy:", balanced_accuracy_value, "(Best:", best_metrics['balanced_accuracy'], ")")
    print("Precision:", precision_value, "(Best:", best_metrics['precision'], ")")
    print("Recall:", recall_value, "(Best:", best_metrics['recall'], ")")
    print("F1 Score:", f1_score_value, "(Best:", best_metrics['f1'], ")")
    print("ROC AUC Score:", roc_auc_value, "(Best:", best_metrics['roc_auc'], ")")
    print()

    # Update best model if metrics are better
    if balanced_accuracy_value > best_metrics['balanced_accuracy']:
        best_model = model
        best_parameters['hidden_layer_sizes'] = hidden_layer_size
        best_parameters['activation'] = activation
        best_parameters['alpha'] = alpha
        best_parameters['max_iter'] = max_iter
        best_metrics['accuracy'] = accuracy_value
        best_metrics['balanced_accuracy'] = balanced_accuracy_value
        best_metrics['precision'] = precision_value
        best_metrics['recall'] = recall_value
        best_metrics['f1'] = f1_score_value
        best_metrics['roc_auc'] = roc_auc_value

  training_results_df = training_results_df.append(new_row, ignore_index=True)
  6%|▌         | 1/18 [00:14<04:14, 14.98s/it]


--------------------
Hidden Layer Size: (33, 33, 33)
Activation: logistic
Alpha: 0.001
--------------------
Accuracy: 0.9180015860428232 (Best: 0 )
Balanced Accuracy: 0.4978257288641371 (Best: 0 )
Precision: 0.06923076923076923 (Best: 0 )
Recall: 0.0750971250971251 (Best: 0 )
F1 Score: 0.07076782866256551 (Best: 0 )
ROC AUC Score: 0.49782572886413706 (Best: 0 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 11%|█         | 2/18 [00:29<03:59, 14.98s/it]


--------------------
Hidden Layer Size: (33, 33, 33)
Activation: logistic
Alpha: 0.01
--------------------
Accuracy: 0.9180015860428232 (Best: 0.9180015860428232 )
Balanced Accuracy: 0.4978257288641371 (Best: 0.4978257288641371 )
Precision: 0.06923076923076923 (Best: 0.06923076923076923 )
Recall: 0.0750971250971251 (Best: 0.0750971250971251 )
F1 Score: 0.07076782866256551 (Best: 0.07076782866256551 )
ROC AUC Score: 0.49782572886413706 (Best: 0.49782572886413706 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 17%|█▋        | 3/18 [00:44<03:44, 14.99s/it]


--------------------
Hidden Layer Size: (33, 33, 33)
Activation: logistic
Alpha: 0.1
--------------------
Accuracy: 0.9180015860428232 (Best: 0.9180015860428232 )
Balanced Accuracy: 0.4978257288641371 (Best: 0.4978257288641371 )
Precision: 0.06923076923076923 (Best: 0.06923076923076923 )
Recall: 0.0750971250971251 (Best: 0.0750971250971251 )
F1 Score: 0.07076782866256551 (Best: 0.07076782866256551 )
ROC AUC Score: 0.49782572886413706 (Best: 0.49782572886413706 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 22%|██▏       | 4/18 [03:55<19:40, 84.32s/it]


--------------------
Hidden Layer Size: (33, 33, 33)
Activation: relu
Alpha: 0.001
--------------------
Accuracy: 0.9204033080321741 (Best: 0.9180015860428232 )
Balanced Accuracy: 0.6921303235693871 (Best: 0.4978257288641371 )
Precision: 0.47692307692307695 (Best: 0.06923076923076923 )
Recall: 0.46250138750138753 (Best: 0.0750971250971251 )
F1 Score: 0.46001597301906894 (Best: 0.07076782866256551 )
ROC AUC Score: 0.6921303235693871 (Best: 0.49782572886413706 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 28%|██▊       | 5/18 [07:00<26:07, 120.61s/it]


--------------------
Hidden Layer Size: (33, 33, 33)
Activation: relu
Alpha: 0.01
--------------------
Accuracy: 0.9204939390506401 (Best: 0.9204033080321741 )
Balanced Accuracy: 0.6973450918606579 (Best: 0.6921303235693871 )
Precision: 0.49230769230769234 (Best: 0.47692307692307695 )
Recall: 0.47288544788544795 (Best: 0.46250138750138753 )
F1 Score: 0.4734620728428777 (Best: 0.46001597301906894 )
ROC AUC Score: 0.6973450918606579 (Best: 0.6921303235693871 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 33%|███▎      | 6/18 [10:12<28:57, 144.76s/it]


--------------------
Hidden Layer Size: (33, 33, 33)
Activation: relu
Alpha: 0.1
--------------------
Accuracy: 0.920267361504475 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.680987880211911 (Best: 0.6973450918606579 )
Precision: 0.4538461538461539 (Best: 0.49230769230769234 )
Recall: 0.44028471528471524 (Best: 0.47288544788544795 )
F1 Score: 0.4381509443428948 (Best: 0.4734620728428777 )
ROC AUC Score: 0.680987880211911 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 39%|███▉      | 7/18 [12:37<26:32, 144.79s/it]


--------------------
Hidden Layer Size: (50, 50)
Activation: logistic
Alpha: 0.001
--------------------
Accuracy: 0.9197235753936785 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.6293377712924865 (Best: 0.6973450918606579 )
Precision: 0.36153846153846153 (Best: 0.49230769230769234 )
Recall: 0.3372571872571873 (Best: 0.47288544788544795 )
F1 Score: 0.3432333748123222 (Best: 0.4734620728428777 )
ROC AUC Score: 0.6293377712924862 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 44%|████▍     | 8/18 [15:12<24:43, 148.32s/it]


--------------------
Hidden Layer Size: (50, 50)
Activation: logistic
Alpha: 0.01
--------------------
Accuracy: 0.9198142064121446 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.632216037737443 (Best: 0.6973450918606579 )
Precision: 0.37692307692307686 (Best: 0.49230769230769234 )
Recall: 0.342968142968143 (Best: 0.47288544788544795 )
F1 Score: 0.3529564003248214 (Best: 0.4734620728428777 )
ROC AUC Score: 0.632216037737443 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 50%|█████     | 9/18 [17:52<22:45, 151.74s/it]


--------------------
Hidden Layer Size: (50, 50)
Activation: logistic
Alpha: 0.1
--------------------
Accuracy: 0.9199048374306107 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.639882510937166 (Best: 0.6973450918606579 )
Precision: 0.3923076923076923 (Best: 0.49230769230769234 )
Recall: 0.35825563325563337 (Best: 0.47288544788544795 )
F1 Score: 0.3681248575985419 (Best: 0.4734620728428777 )
ROC AUC Score: 0.639882510937166 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 56%|█████▌    | 10/18 [20:10<19:40, 147.55s/it]


--------------------
Hidden Layer Size: (50, 50)
Activation: relu
Alpha: 0.001
--------------------
Accuracy: 0.9204033080321741 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.6933374127228931 (Best: 0.6973450918606579 )
Precision: 0.47692307692307695 (Best: 0.49230769230769234 )
Recall: 0.46491563991563994 (Best: 0.47288544788544795 )
F1 Score: 0.4623933685853191 (Best: 0.4734620728428777 )
ROC AUC Score: 0.6933374127228931 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 61%|██████    | 11/18 [22:29<16:55, 145.01s/it]


--------------------
Hidden Layer Size: (50, 50)
Activation: relu
Alpha: 0.01
--------------------
Accuracy: 0.920357992522941 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.6887639356730441 (Best: 0.6973450918606579 )
Precision: 0.4692307692307693 (Best: 0.49230769230769234 )
Recall: 0.4557914307914308 (Best: 0.47288544788544795 )
F1 Score: 0.4547702383615696 (Best: 0.4734620728428777 )
ROC AUC Score: 0.6887639356730441 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 67%|██████▋   | 12/18 [24:52<14:26, 144.46s/it]


--------------------
Hidden Layer Size: (50, 50)
Activation: relu
Alpha: 0.1
--------------------
Accuracy: 0.920357992522941 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.6906356852430139 (Best: 0.6973450918606579 )
Precision: 0.4692307692307693 (Best: 0.49230769230769234 )
Recall: 0.4595349095349095 (Best: 0.47288544788544795 )
F1 Score: 0.4554506037478173 (Best: 0.4734620728428777 )
ROC AUC Score: 0.690635685243014 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 72%|███████▏  | 13/18 [27:48<12:50, 154.05s/it]


--------------------
Hidden Layer Size: (100,)
Activation: logistic
Alpha: 0.001
--------------------
Accuracy: 0.9204033080321741 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.6936260400020254 (Best: 0.6973450918606579 )
Precision: 0.476923076923077 (Best: 0.49230769230769234 )
Recall: 0.46549284049284056 (Best: 0.47288544788544795 )
F1 Score: 0.4613205164288755 (Best: 0.4734620728428777 )
ROC AUC Score: 0.6936260400020253 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 78%|███████▊  | 14/18 [30:34<10:30, 157.52s/it]


--------------------
Hidden Layer Size: (100,)
Activation: logistic
Alpha: 0.01
--------------------
Accuracy: 0.9204939390506401 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.7006417784007424 (Best: 0.6973450918606579 )
Precision: 0.49230769230769234 (Best: 0.49230769230769234 )
Recall: 0.4794788544788545 (Best: 0.47288544788544795 )
F1 Score: 0.4759725310808902 (Best: 0.4734620728428777 )
ROC AUC Score: 0.7006417784007424 (Best: 0.6973450918606579 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 83%|████████▎ | 15/18 [33:43<08:20, 166.99s/it]


--------------------
Hidden Layer Size: (100,)
Activation: logistic
Alpha: 0.1
--------------------
Accuracy: 0.9206298855783392 (Best: 0.9204939390506401 )
Balanced Accuracy: 0.7145938582219851 (Best: 0.7006417784007424 )
Precision: 0.5153846153846154 (Best: 0.49230769230769234 )
Recall: 0.5073149073149074 (Best: 0.4794788544788545 )
F1 Score: 0.5019734307660004 (Best: 0.4759725310808902 )
ROC AUC Score: 0.7145938582219851 (Best: 0.7006417784007424 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 89%|████████▉ | 16/18 [37:19<06:03, 181.83s/it]


--------------------
Hidden Layer Size: (100,)
Activation: relu
Alpha: 0.001
--------------------
Accuracy: 0.9204939390506401 (Best: 0.9206298855783392 )
Balanced Accuracy: 0.7012827838321571 (Best: 0.7145938582219851 )
Precision: 0.49230769230769234 (Best: 0.5153846153846154 )
Recall: 0.4807609057609058 (Best: 0.5073149073149074 )
F1 Score: 0.4772706481684809 (Best: 0.5019734307660004 )
ROC AUC Score: 0.7012827838321571 (Best: 0.7145938582219851 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
 94%|█████████▍| 17/18 [40:53<03:11, 191.52s/it]


--------------------
Hidden Layer Size: (100,)
Activation: relu
Alpha: 0.01
--------------------
Accuracy: 0.9204486235414071 (Best: 0.9206298855783392 )
Balanced Accuracy: 0.6967384578057244 (Best: 0.7145938582219851 )
Precision: 0.4846153846153847 (Best: 0.5153846153846154 )
Recall: 0.47169497169497177 (Best: 0.5073149073149074 )
F1 Score: 0.4690755752675258 (Best: 0.5019734307660004 )
ROC AUC Score: 0.6967384578057244 (Best: 0.7145938582219851 )



  training_results_df = training_results_df.append(new_row, ignore_index=True)
100%|██████████| 18/18 [44:38<00:00, 148.80s/it]


--------------------
Hidden Layer Size: (100,)
Activation: relu
Alpha: 0.1
--------------------
Accuracy: 0.9204486235414071 (Best: 0.9206298855783392 )
Balanced Accuracy: 0.6961265730455554 (Best: 0.7145938582219851 )
Precision: 0.48461538461538467 (Best: 0.5153846153846154 )
Recall: 0.4704711954711955 (Best: 0.5073149073149074 )
F1 Score: 0.4685871747791252 (Best: 0.5019734307660004 )
ROC AUC Score: 0.6961265730455554 (Best: 0.7145938582219851 )






In [35]:
# Parameters in best model:
print("-" * 20)
print("Best Model:")
print("-" * 20)
print("Hidden Layer Size:", best_parameters['hidden_layer_sizes'])
print("Activation:", best_parameters['activation'])
print("Alpha:", best_parameters['alpha'])
print("-" * 20)
print("Accuracy:         ", round(best_metrics['accuracy'], 4))
print("Balanced Accuracy:", round(best_metrics['balanced_accuracy'], 4))
print("Precision:        ", round(best_metrics['precision'], 4))
print("Recall:           ", round(best_metrics['recall'], 4))
print("F1 Score:         ", round(best_metrics['f1'], 4))
print("ROC AUC Score:    ", round(best_metrics['roc_auc'], 4))

--------------------
Best Model:
--------------------
Hidden Layer Size: (100,)
Activation: logistic
Alpha: 0.1
--------------------
Accuracy:          0.9206
Balanced Accuracy: 0.7146
Precision:         0.5154
Recall:            0.5073
F1 Score:          0.502
ROC AUC Score:     0.7146


In [44]:
# Save training results to CSV
training_results_df.to_csv('training_results.csv', index=False)

In [43]:
training_results_df

Unnamed: 0,hidden_layer_sizes,activation,alpha,accuracy,balanced_accuracy,precision,recall,f1,roc_auc
0,"(33, 33, 33)",logistic,0.001,0.918002,0.497826,0.069231,0.075097,0.070768,0.497826
1,"(33, 33, 33)",logistic,0.01,0.918002,0.497826,0.069231,0.075097,0.070768,0.497826
2,"(33, 33, 33)",logistic,0.1,0.918002,0.497826,0.069231,0.075097,0.070768,0.497826
3,"(33, 33, 33)",relu,0.001,0.920403,0.69213,0.476923,0.462501,0.460016,0.69213
4,"(33, 33, 33)",relu,0.01,0.920494,0.697345,0.492308,0.472885,0.473462,0.697345
5,"(33, 33, 33)",relu,0.1,0.920267,0.680988,0.453846,0.440285,0.438151,0.680988
6,"(50, 50)",logistic,0.001,0.919724,0.629338,0.361538,0.337257,0.343233,0.629338
7,"(50, 50)",logistic,0.01,0.919814,0.632216,0.376923,0.342968,0.352956,0.632216
8,"(50, 50)",logistic,0.1,0.919905,0.639883,0.392308,0.358256,0.368125,0.639883
9,"(50, 50)",relu,0.001,0.920403,0.693337,0.476923,0.464916,0.462393,0.693337


## Evaluate model

In [48]:
# Determine the accuracy, classification report, and confusion maxtrix for each category
# Preprocess the data
X_df_test_transformed = pd.DataFrame(model.named_steps['preprocessor'].transform(X_df_test))

# Extract feature names
feature_names = X_df.columns.to_numpy()

# Extract class names
class_names = y_df.columns.to_numpy()

# Create a result table where each row is a category
result_df = pd.DataFrame(class_names, columns=['category'])
result_columns = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'TP', 'FP', 'FN', 'TN']
result_df[result_columns] = 0

# Create a new y_test with identifier
y_df_test_with_identifier = identifiers_df_test.copy()
unique_years = y_df_test_with_identifier['releaseYear'].unique()

y_pred, result_df, y_df_test_with_identifier = predict_by_year(best_model, X_df_test, y_df_test, identifiers_df_test)

# Iterate through each category
for i, category in enumerate(class_names):
    # Classification report
    report = classification_report(y_df_test.iloc[:, i], y_pred[category])

    # Confusion matrix
    matrix = confusion_matrix(y_df_test.iloc[:, i], y_pred[category])

    print("\n-------------------------------------------------------")
    print(f"{category}:")
    print("-------------------------------------------------------")

    # Print classification report
    print(f"\nClassification Report ({category}):")
    print(report)

    # Print confusion matrix
    print(f"\nConfusion Matrix ({category}):")
    print(matrix)


-------------------------------------------------------
CINEMATOGRAPHY:
-------------------------------------------------------

Classification Report (CINEMATOGRAPHY):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7891
           1       0.20      0.24      0.22        21

    accuracy                           1.00      7912
   macro avg       0.60      0.62      0.61      7912
weighted avg       1.00      1.00      1.00      7912


Confusion Matrix (CINEMATOGRAPHY):
[[7871   20]
 [  16    5]]

-------------------------------------------------------
DIRECTING:
-------------------------------------------------------

Classification Report (DIRECTING):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7889
           1       0.48      0.52      0.50        23

    accuracy                           1.00      7912
   macro avg       0.74      0.76      0.75      7912
weighted

In [49]:
result_df.drop(columns=['TP', 'FP', 'FN', 'TN'])

Unnamed: 0,category,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc
0,CINEMATOGRAPHY,0.99545,0.61778,0.2,0.238095,0.217391,0.61778
1,DIRECTING,0.996967,0.760046,0.48,0.521739,0.5,0.760046
2,FILM EDITING,0.997219,0.781848,0.52,0.565217,0.541667,0.781848
3,ACTOR IN A SUPPORTING ROLE,0.997851,0.849303,0.56,0.7,0.622222,0.849303
4,ACTRESS IN A SUPPORTING ROLE,0.997093,0.771903,0.48,0.545455,0.510638,0.771903
5,COSTUME DESIGN,0.99545,0.61778,0.2,0.238095,0.217391,0.61778
6,SOUND,0.996587,0.49842,0.0,0.0,0.0,0.49842
7,BEST PICTURE,0.996967,0.756376,0.76,0.513514,0.612903,0.756376
8,ACTOR IN A LEADING ROLE,0.998231,0.869058,0.68,0.73913,0.708333,0.869058
9,ACTRESS IN A LEADING ROLE,0.998104,0.90549,0.52,0.8125,0.634146,0.90549


## Analyze feature importance

In [57]:
# Extract feature and class names from preprocessor
original_feature_names = X_df.columns
transformed_feature_names = preprocessor.get_feature_names_out(original_feature_names)

# Print feature and label names
print("Features", transformed_feature_names.shape)
print("Classes  ", class_names.shape)

Features (109,)
Classes   (13,)


In [58]:
transformed_feature_names.shape

(109,)

In [59]:
# Determine feature importance for each category
for i, category in enumerate(class_names):
    # Extract coefficients from the first layer
    coefficients = best_model.named_steps['multi_target_classifier'].estimators_[i].coefs_[0]

    # Calculate feature importances as the mean absolute value of coefficients
    feature_importance = np.mean(np.abs(coefficients), axis=1)

    # Create a DataFrame with feature names and importances
    feature_importance_df = pd.DataFrame({'Feature': transformed_feature_names, 'Importance': feature_importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Save top 3 most important feature to result_df
    result_df.loc[result_df['category'] == category, '1st_important_feature'] = feature_importance_df.iloc[0]['Feature']
    result_df.loc[result_df['category'] == category, '2nd_important_feature'] = feature_importance_df.iloc[1]['Feature']
    result_df.loc[result_df['category'] == category, '3rd_important_feature'] = feature_importance_df.iloc[2]['Feature']

    # Print feature importances
    print("----------------------------------------------------")
    print(f"{category}:")
    print("----------------------------------------------------")

    print("20 Most Important Features")
    print("--------------------------")
    print(feature_importance_df.head(20))

    print("\n20 Least Important Features")
    print("---------------------------")
    print(feature_importance_df.tail(20))
    print()

----------------------------------------------------
CINEMATOGRAPHY:
----------------------------------------------------
20 Most Important Features
--------------------------
                                              Feature  Importance
4                                            numVotes    0.738326
2                                      runtimeMinutes    0.701756
85          gg_nominee_Best Director - Motion Picture    0.636078
71             gg_nominee_Best Motion Picture - Drama    0.530624
1                                             revenue    0.525189
0                                              budget    0.519149
38                             productionCompanyOther    0.513980
59                                           genreWar    0.489760
89    gg_nominee_Best Original Score - Motion Picture    0.452283
39             productionCountryUnitedStatesofAmerica    0.443941
3                                       averageRating    0.442122
48                              

In [62]:
result_df.drop(["accuracy", "balanced_accuracy", "precision", "recall", "f1_score", "roc_auc", "TP", "FP", "FN", "TN"], axis=1)

Unnamed: 0,category,1st_important_feature,2nd_important_feature,3rd_important_feature
0,CINEMATOGRAPHY,numVotes,runtimeMinutes,gg_nominee_Best Director - Motion Picture
1,DIRECTING,runtimeMinutes,numVotes,budget
2,FILM EDITING,runtimeMinutes,numVotes,gg_nominee_Best Motion Picture - Drama
3,ACTOR IN A SUPPORTING ROLE,runtimeMinutes,gg_nominee_Best Performance by an Actor in a S...,budget
4,ACTRESS IN A SUPPORTING ROLE,gg_nominee_Best Performance by an Actress in a...,runtimeMinutes,sag_nominee_SUPPORTING ROLE - FEMALE
5,COSTUME DESIGN,runtimeMinutes,numVotes,revenue
6,SOUND,runtimeMinutes,revenue,numVotes
7,BEST PICTURE,gg_nominee_Best Screenplay - Motion Picture,revenue,numVotes
8,ACTOR IN A LEADING ROLE,gg_nominee_Best Performance by an Actor in a M...,sag_nominee_LEADING ROLE - MALE,numVotes
9,ACTRESS IN A LEADING ROLE,gg_nominee_Best Performance by an Actress in a...,sag_nominee_LEADING ROLE - FEMALE,gg_nominee_Best Performance by an Actress in a...


In [63]:
result_df

Unnamed: 0,category,accuracy,balanced_accuracy,precision,recall,f1_score,roc_auc,TP,FP,FN,TN,1st_important_feature,2nd_important_feature,3rd_important_feature
0,CINEMATOGRAPHY,0.99545,0.61778,0.2,0.238095,0.217391,0.61778,5,16,20,7871,numVotes,runtimeMinutes,gg_nominee_Best Director - Motion Picture
1,DIRECTING,0.996967,0.760046,0.48,0.521739,0.5,0.760046,12,11,13,7876,runtimeMinutes,numVotes,budget
2,FILM EDITING,0.997219,0.781848,0.52,0.565217,0.541667,0.781848,13,10,12,7877,runtimeMinutes,numVotes,gg_nominee_Best Motion Picture - Drama
3,ACTOR IN A SUPPORTING ROLE,0.997851,0.849303,0.56,0.7,0.622222,0.849303,14,6,11,7881,runtimeMinutes,gg_nominee_Best Performance by an Actor in a S...,budget
4,ACTRESS IN A SUPPORTING ROLE,0.997093,0.771903,0.48,0.545455,0.510638,0.771903,12,10,13,7877,gg_nominee_Best Performance by an Actress in a...,runtimeMinutes,sag_nominee_SUPPORTING ROLE - FEMALE
5,COSTUME DESIGN,0.99545,0.61778,0.2,0.238095,0.217391,0.61778,5,16,20,7871,runtimeMinutes,numVotes,revenue
6,SOUND,0.996587,0.49842,0.0,0.0,0.0,0.49842,0,2,25,7885,runtimeMinutes,revenue,numVotes
7,BEST PICTURE,0.996967,0.756376,0.76,0.513514,0.612903,0.756376,19,18,6,7869,gg_nominee_Best Screenplay - Motion Picture,revenue,numVotes
8,ACTOR IN A LEADING ROLE,0.998231,0.869058,0.68,0.73913,0.708333,0.869058,17,6,8,7881,gg_nominee_Best Performance by an Actor in a M...,sag_nominee_LEADING ROLE - MALE,numVotes
9,ACTRESS IN A LEADING ROLE,0.998104,0.90549,0.52,0.8125,0.634146,0.90549,13,3,12,7884,gg_nominee_Best Performance by an Actress in a...,sag_nominee_LEADING ROLE - FEMALE,gg_nominee_Best Performance by an Actress in a...


In [23]:
# Save result_df to csv
result_df.to_csv('result_df.csv', index=False)