# Overview

**GENERAL THOUGHTS:**



**DATA PREPROCESSING:**

Imbalanced data:
- over_sampling for imbalanced data
- cost-sensitive learning for imbalanced data

continuous data:
- Impute missing data: SimpleImputer(strategy='median')
- Standardize data: StandardScaler()

categorical data:
- Impute missing data: SimpleImputer(strategy='most_frequent')
- Ordinal & Nominal data encoding: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
- Unknown values encoding: custom encoder "OrdinalEncoderExtensionUnknowns()"

target data:
- target encoding: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

**MULTI-CLASS CLASSIFIER:**
- Overview of models to be considered:  
  - [X] Neural Net: Multi Layer Perceptron (MLP)


In [None]:
# Runtime toggle: set to True when running in Google Colab (installs packages and reads the Colab config file)
colab = False

In [None]:
# Install the extra packages this notebook needs, only when running in Google Colab
if colab:
    !pip install optuna==3.5.0
    # !pip install optuna.integration
    !pip install lightning

In [None]:
# import os
import sys
import yaml
import copy

import pandas as pd
from sklearn.metrics import classification_report
import optuna
from optuna.integration import PyTorchLightningPruningCallback

import lightning as L
from lightning.pytorch.tuner import Tuner
import torch
from torchmetrics.classification import MulticlassF1Score
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# NOTE: if used in google colab, upload env_vars_colab.yml to current google colab directory!

# get config
# Both runtimes do the same work and differ only in where the environment file lives
env_file = './env_vars_colab.yml' if colab else '../env_vars.yml'
with open(env_file, 'r') as file:
    config = yaml.safe_load(file)

# custom imports: put the project directory on the module search path
sys.path.append(config['project_directory'])

from src.tabular_lightning import (
    TabularDataModuleClassificationPACKAGING,
    MulticlassTabularLightningModule,
    MulticlassTabularMLP
)
from src import tabular_lightning_utils as tl_utils

In [None]:
SEED = 42 # Ensure same data split as in other notebooks; also used to seed the Optuna sampler

# Models and Training/HPO

In [None]:
class OptunaObjective:
    """Optuna objective: train one MLP per trial and return its validation F1 score.

    The data module is created and set up once in ``__init__``. Every trial works
    on a deep copy of it, so the trial-specific batch size never touches the
    shared instance.
    """

    # Metric returned by ``__call__`` (the study is created with direction="maximize").
    # The pruning callback reports this same metric, so the intermediate values the
    # pruner compares are consistent with the value being optimized.
    OBJECTIVE_METRIC = "val_F1_macro_weighted"
    # Learning rate the Lightning module starts with; also the fallback when the
    # learning-rate finder yields no suggestion.
    DEFAULT_LEARNING_RATE = 0.001

    def __init__(self, optuna_config) -> None:
        """Build and set up the shared data module.

        Args:
            optuna_config: Mapping with study/trainer settings. This class reads
                ``optuna_config["trainer_max_epochs"]``.
        """
        self.optuna_config = optuna_config
        self.dm = TabularDataModuleClassificationPACKAGING(
            data_dir=f"{config['data_directory']}/output/df_ml.csv",
            continuous_cols=['material_weight'],
            categorical_cols=[
                'material_number',
                'brand',
                'product_area',
                'core_segment',
                'component',
                'manufactoring_location',
                'characteristic_value',
                'packaging_code'
            ],
            target=['packaging_category'],
            oversampling=True,
            test_size=0.2,
            val_size=0.2,
            batch_size=64,
            SEED=SEED
        )
        self.dm.prepare_data()
        self.dm.setup(stage='fit')
        # Sanity checks provided by the project's utility module
        tl_utils.check_data_consitancy(self.dm)
        tl_utils.check_dataloader_output(self.dm, next(iter(self.dm.train_dataloader())))

        # Embedding sizes are derived from the train, validation and test frames combined
        tabular_data_full = pd.concat(
            [
                self.dm.train_dataset.get_dataframe,
                self.dm.val_dataset.get_dataframe,
                self.dm.test_dataset.get_dataframe,
            ],
            axis=0,
            ignore_index=True,
        )
        self.embedding_sizes_cat_features = tl_utils.get_cat_feature_embedding_sizes(
            tabular_data_full, categorical_cols=self.dm.categorical_cols
        )

    def __call__(self, trial: optuna.Trial) -> float:
        """Train a model with the hyperparameters suggested for ``trial``.

        Args:
            trial: The Optuna trial that supplies the hyperparameter suggestions.

        Returns:
            The value of ``OBJECTIVE_METRIC`` found in the trainer's callback
            metrics after fitting.
        """
        # Define the hyperparameter search space
        hp_space_optuna = {
            'hidden_size': trial.suggest_categorical('hidden_size', [8, 16, 32, 64, 128]), # number of neurons in each layer
            'n_hidden_layers': trial.suggest_int("n_hidden_layers", 1, 6), # number of layers
            'batch_size': trial.suggest_categorical("batch_size", [16, 32, 64]), # number of samples per batch
            'dropout': trial.suggest_categorical("dropout", [0.0, 0.1, 0.2, 0.4]), # dropout rate
        }
        # Create a datamodule: deep copy (for distributed training) so that the
        # batch size set below stays local to this trial
        dm = copy.deepcopy(self.dm)
        dm.batch_size = hp_space_optuna['batch_size']
        # Create a model
        model = MulticlassTabularMLP(
            input_size=len(dm.feature_cols),
            output_size=dm.n_classes,
            hidden_size=hp_space_optuna['hidden_size'],
            n_hidden_layers=hp_space_optuna['n_hidden_layers'],
            dropout=hp_space_optuna['dropout'],
            norm=True,
        )
        # Create a LightningModule
        lightningmodel = MulticlassTabularLightningModule(
            model=model,
            learning_rate=self.DEFAULT_LEARNING_RATE,
            train_acc = MulticlassF1Score(num_classes=dm.n_classes, average='weighted'),
            val_acc = MulticlassF1Score(num_classes=dm.n_classes, average='weighted'),
            test_acc = MulticlassF1Score(num_classes=dm.n_classes, average='weighted'),
        )
        # Create a trainer
        # NOTE(review): optuna's callback may be built against the `pytorch_lightning`
        # package rather than `lightning` - confirm it is accepted by `L.Trainer`
        # with the installed versions.
        trainer = L.Trainer(
            devices="auto", # (os.cpu_count() / 2)
            callbacks=[
                # Report the optimized metric (not val_loss) so that pruning
                # decisions agree with the direction of the study.
                PyTorchLightningPruningCallback(trial, monitor=self.OBJECTIVE_METRIC),
            ],
            max_epochs=self.optuna_config["trainer_max_epochs"],
            precision='bf16-mixed',
            default_root_dir="lightning_logs/",
        )
        # Create a Tuner
        tuner = Tuner(trainer)
        lr_finder = tuner.lr_find(lightningmodel, datamodule=dm) # finds learning rate automatically
        new_lr = lr_finder.suggestion()
        if new_lr is None:
            # No usable suggestion: keep the initial learning rate instead of
            # handing `None` to the optimizer.
            new_lr = self.DEFAULT_LEARNING_RATE
        lightningmodel.learning_rate = new_lr # update hparams of the model
        trial.set_user_attr("learning_rate", new_lr) # Track learning_rate as a user attribute
        # Train the model
        trainer.fit(
            model=lightningmodel,
            train_dataloaders=dm.train_dataloader(),
            val_dataloaders=dm.val_dataloader()
        )

        return trainer.callback_metrics[self.OBJECTIVE_METRIC].item()

In [None]:
# Experiment name; it names the Optuna study and its SQLite database file
model_name = "MLP-v2"

# Settings for the Optuna study and for the Lightning trainer used in each trial
optuna_config = dict(
    experiment_name=model_name,
    study_storage_directory=config['optuna_storage_directory'],
    study_n_trials=50,
    study_timeout=25000,  # 3600 seconds/hour
    study_n_jobs=-1,
    trainer_max_epochs=100,
)

# SQLite URL under which the study is stored
optuna_config["storage_name"] = (
    f"sqlite:///{optuna_config['study_storage_directory']}/{optuna_config['experiment_name']}.db"
)

In [None]:
# The objective bundles the hyper-parameter space, model + training and the optimization metric
objective = OptunaObjective(optuna_config)

# Create the study (or load it if it already exists in the storage)
study = optuna.create_study(
    direction="maximize",
    study_name=optuna_config['experiment_name'],
    storage=optuna_config["storage_name"],
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.MedianPruner(),
)

# Run the optimization; its duration is bounded by the number of trials and/or the timeout
study.optimize(
    objective,
    n_jobs=optuna_config["study_n_jobs"],
    n_trials=optuna_config["study_n_trials"],
    timeout=optuna_config["study_timeout"],
    show_progress_bar=True,
)

### Analyse Optuna study

In [None]:
# print optimization results
try:
    # Reload the study from storage; on failure the `study` object already in memory is used
    study = optuna.load_study(study_name=optuna_config['experiment_name'], storage=optuna_config["storage_name"])
except Exception as err:
    # `except Exception` instead of a bare `except`, so KeyboardInterrupt/SystemExit
    # are not swallowed; the reason is printed to make storage problems visible.
    print("Study not saved to storage. Loading study from memory.")
    print(f"  Reason: {type(err).__name__}: {err}")
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
best_trial = study.best_trial
print("  Performance: ", best_trial.value)
print("  Params: ", best_trial.params)

In [None]:
# export the history of all trials as a DataFrame and preview the first rows
hist = study.trials_dataframe()
hist.head()

In [None]:
# plot the optimization history, i.e. the performance of all trials of the study
optuna.visualization.plot_optimization_history(study)

In [None]:
# slice plot: relationship between each hyperparameter and the performance
optuna.visualization.plot_slice(study)

In [None]:
# interactive parallel-coordinate plot of the high-dimensional hyperparameter relationships
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# interactive contour plot of the hyperparameter relationships; helps to choose which part of the search space to explore
optuna.visualization.plot_contour(study)

### Evaluate best model

In [None]:
# display the hyperparameters of the best trial
best_trial.params

In [None]:
# Hyperparameters of the best trial; they define the best model

best_params = best_trial.params

# Evaluate best model on test data again
def eval_best_model(best_params, optuna_config) -> tuple:
    """Retrain the MLP with the given hyperparameters and evaluate it on the test data.

    The function builds a fresh data module, model and trainer, runs the
    learning-rate finder, fits the model with early stopping and checkpointing,
    saves the learning-rate plot, plots the logged training metrics and prints
    the test score.

    Args:
        best_params: Hyperparameters of the best trial. The keys ``batch_size``,
            ``hidden_size``, ``n_hidden_layers`` and ``dropout`` are read.
        optuna_config: Experiment settings. ``experiment_name`` is required;
            ``trainer_max_epochs`` is optional and defaults to 100.

    Returns:
        A tuple ``(parameters, learning_rate)``: the (uncalled) ``parameters``
        method of the trained Lightning module and the learning rate it was
        trained with.
    """
    from pathlib import Path  # local import: only needed to create the output directory

    experiment_name = optuna_config['experiment_name']
    checkpoint_dir = f"lightning_logs/checkpoints/{experiment_name}"
    initial_lr = 0.001

    # datamodule
    dm = TabularDataModuleClassificationPACKAGING(
        data_dir=f"{config['data_directory']}/output/df_ml.csv",
        continuous_cols=['material_weight'],
        categorical_cols=[
            'material_number',
            'brand',
            'product_area',
            'core_segment',
            'component',
            'manufactoring_location',
            'characteristic_value',
            'packaging_code'
        ],
        target=['packaging_category'],
        oversampling=True,
        test_size=0.2,
        val_size=0.2,
        batch_size=best_params['batch_size'],
        SEED=SEED
    )
    dm.prepare_data()
    dm.setup(stage='fit')
    # model
    best_model = MulticlassTabularMLP(
        input_size=len(dm.feature_cols),
        output_size=dm.n_classes,
        hidden_size=best_params['hidden_size'],
        n_hidden_layers=best_params['n_hidden_layers'],
        dropout=best_params['dropout'],
        norm=True,
    )
    # lightningmodel
    lightningmodel = MulticlassTabularLightningModule(
        model=best_model,
        learning_rate=initial_lr,
        train_acc = MulticlassF1Score(num_classes=dm.n_classes, average='weighted'),
        val_acc = MulticlassF1Score(num_classes=dm.n_classes, average='weighted'),
        test_acc = MulticlassF1Score(num_classes=dm.n_classes, average='weighted'),
    )
    # trainer
    trainer = L.Trainer(
        devices="auto", # (os.cpu_count() / 2)
        callbacks=[
            EarlyStopping(monitor='val_loss', min_delta=0.00, patience=5),
            ModelCheckpoint(
                monitor="val_loss",
                mode="min",
                save_top_k=1,
                every_n_epochs=3,
                enable_version_counter=False,
                dirpath=checkpoint_dir,
                filename=f"best_model_{experiment_name}",
            ),
        ],
        logger=CSVLogger(save_dir="logs/", name=experiment_name),
        # Same epoch budget as the trials of the study when the config provides it
        max_epochs=optuna_config.get("trainer_max_epochs", 100),
        precision='bf16-mixed',
        default_root_dir="lightning_logs/",
    )
    # find learning rate
    tuner = Tuner(trainer)
    lr_finder = tuner.lr_find(lightningmodel, datamodule=dm) # finds learning rate automatically
    new_lr = lr_finder.suggestion()
    fig_lr = lr_finder.plot(suggest=True)
    if new_lr is None:
        # No usable suggestion: keep the initial learning rate instead of
        # handing `None` to the optimizer.
        new_lr = initial_lr
    lightningmodel.learning_rate = new_lr # update hparams of the model
    # train model
    trainer.fit(
        model=lightningmodel,
        train_dataloaders=dm.train_dataloader(),
        val_dataloaders=dm.val_dataloader()
    )
    # Make sure the target directory exists before the figure is written into it
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    fig_lr.savefig(f"{checkpoint_dir}/learning_rate_best_model_{experiment_name}.pdf")
    # plot training metrics
    metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv")
    tl_utils.plot_training_metrics(metrics)
    # evaluate model on test data
    # NOTE(review): this evaluates the weights as they are at the end of `fit`,
    # not the checkpoint saved by ModelCheckpoint - confirm this is intended.
    score = trainer.test(model=lightningmodel, dataloaders=dm.test_dataloader())
    print(f"test_F1_macro_weighted: {score[0]['test_F1_macro_weighted']}")

    return lightningmodel.parameters, lightningmodel.learning_rate

# Retrain the best model and evaluate it on the test data; keep the returned parameters accessor and learning rate
lm_parameters, lm_learning_rate = eval_best_model(best_params, optuna_config)

# Load model from checkpoint and make predictions

In [None]:
def load_model_and_predict(
    study: optuna.Study = None,
    checkpoint_path: str = None,
) -> None:
    """Load the best model from a checkpoint and print a classification report for the test set.

    All hyperparameters (including the batch size) and the learning rate are
    taken from ``study.best_trial``, so the function does not depend on
    notebook-level variables holding the best parameters.

    Args:
        study (optuna.Study): The study whose best trial describes the model.
        checkpoint_path (str): The path to the checkpoint.
    Returns:
        None
    Raises:
        ValueError: If ``study`` or ``checkpoint_path`` is not provided.
    """
    if study is None or checkpoint_path is None:
        raise ValueError("Both `study` and `checkpoint_path` must be provided.")

    # Look the best trial up once and reuse it below
    best_trial = study.best_trial
    params = best_trial.params

    # datamodule
    dm = TabularDataModuleClassificationPACKAGING(
        data_dir=f"{config['data_directory']}/output/df_ml.csv",
        continuous_cols=['material_weight'],
        categorical_cols=[
            'material_number',
            'brand',
            'product_area',
            'core_segment',
            'component',
            'manufactoring_location',
            'characteristic_value',
            'packaging_code'
        ],
        target=['packaging_category'],
        oversampling=True,
        test_size=0.2,
        val_size=0.2,
        batch_size=params['batch_size'],
        SEED=SEED
    )
    dm.prepare_data()
    dm.setup(stage='fit')
    # model
    best_model = MulticlassTabularMLP(
        input_size=len(dm.feature_cols),
        output_size=dm.n_classes,
        hidden_size=params['hidden_size'],
        n_hidden_layers=params['n_hidden_layers'],
        dropout=params['dropout'],
        norm=True,
    )
    # Parameters that were not tracked (excluded), they need to be provided at the time of loading
    # NOTE: Those parameters are either complicated to track or were excluded to reduce logging of those parameters during training
    # NOTE(review): train_acc/val_acc/test_acc are not passed here - presumably the
    # module has defaults for them or restores them from the checkpoint; confirm.
    lightning_model_args = {
        "model": best_model,
        "learning_rate": best_trial.user_attrs["learning_rate"],
    }
    # lightning model
    best_trained_lightning_model = MulticlassTabularLightningModule.load_from_checkpoint(
        checkpoint_path=checkpoint_path,
        strict=True,
        **lightning_model_args,
    )
    # trainer (used for prediction only, so no training callbacks are attached)
    trainer = L.Trainer(
        devices="auto", # (os.cpu_count() / 2)
        precision='bf16-mixed',
        default_root_dir="lightning_logs/",
    )
    # predict
    preds_y_test = torch.cat(trainer.predict(model=best_trained_lightning_model, dataloaders=dm.test_dataloader()))
    preds_y_test = dm.label_encoder_target.inverse_transform(preds_y_test.reshape(-1, 1))
    # assumes the encoded target is the last column of the test dataframe - TODO confirm
    y_test = dm.label_encoder_target.inverse_transform(dm.test_dataset.get_dataframe.iloc[:, -1].values.reshape(-1, 1))
    # calculate classification report
    print(classification_report(y_test, preds_y_test))


# Settings needed to locate the stored study and the checkpoint of the best model
load_config = dict(
    experiment_name=model_name,
    study_storage_directory=config['optuna_storage_directory'],
)
load_config["storage_name"] = (
    f"sqlite:///{load_config['study_storage_directory']}/{load_config['experiment_name']}.db"
)

# Load the study from storage and evaluate the checkpointed model
load_model_and_predict(
    study=optuna.load_study(
        study_name=load_config['experiment_name'],
        storage=load_config["storage_name"],
    ),
    checkpoint_path=(
        f"lightning_logs/checkpoints/{load_config['experiment_name']}/best_model_{load_config['experiment_name']}.ckpt"
    ),
)
    