In [None]:
import pandas as pd
from optuna.visualization import plot_optimization_history

from qsar.utils.visualizer import Visualizer
from qsar.utils.cross_validator import CrossValidator
from qsar.utils.extractor import Extractor
from qsar.utils.hyperparameter_optimizer import HyperParameterOptimizer

from qsar.models.xgboost import XgboostModel
from sklearn.preprocessing import StandardScaler

## 1. Data Import

In [None]:
# Dataset key -> CSV file it is loaded from (paths are relative to this notebook).
data_paths = {
    "full_train": "../../data/full/train/full_train_unfiltered.csv",
    "full_test": "../../data/full/test/full_test_unfiltered.csv",
}
extractor = Extractor(data_paths)

# TODO: move the split after the preprocessing.
# The split results are indexed by dataset key in the cells below;
# "Log_MP_RATIO" is presumably the target column -- confirm against Extractor.
target_column = "Log_MP_RATIO"
x_dfs, y_dfs = extractor.split_x_y(target_column)

# NOTE(review): "df_full_tain" looks like a typo for "df_full_train"; the name is
# kept unchanged because later cells reference it.
df_full_tain = extractor.get_df("full_train")

## 3. Cross-validation

In [None]:
# Helpers reused by the evaluation and prediction cells further down.
cross_validator = CrossValidator(df_full_tain)
visualizer = Visualizer()

# Build the folds, then plot them; only df, y and n_folds are needed for the plot.
cv_folds = cross_validator.create_cv_folds()
X_list, y_list, df, y, n_folds = cv_folds
visualizer.display_cv_folds(df, y, n_folds)

## 4. Model evaluation before hyperparameter optimization

In [None]:
# Baseline: evaluate a model constructed without arguments, before any tuning,
# on the full train/test split.
xgboost_model = XgboostModel()
metrics = cross_validator.evaluate_model_performance(
    xgboost_model.model,
    x_dfs["full_train"],
    y_dfs["full_train"],
    x_dfs["full_test"],
    y_dfs["full_test"],
)
visualizer.display_model_performance("XGBoost", metrics)

## 5. Hyperparameter optimization

In [None]:
# A new model instance is created for the search; it replaces the one used for
# the baseline evaluation.
xgboost_model = XgboostModel()

optimizer = HyperParameterOptimizer(
    model=xgboost_model,
    data=df_full_tain,
    direction='maximize',
    trials=100,
)
study = optimizer.optimize()

# Print the objective value and parameter set of the best trial.
trial = study.best_trial
print(trial.value, trial.params)

## 6. Model evaluation after hyperparameter optimization

In [None]:
# Apply the best parameters found by the study, then repeat the same evaluation
# as the baseline cell so the two sets of metrics can be compared.
xgboost_model.set_hyperparameters(**study.best_params)
metrics = cross_validator.evaluate_model_performance(
    xgboost_model.model,
    x_dfs["full_train"],
    y_dfs["full_train"],
    x_dfs["full_test"],
    y_dfs["full_test"],
)
visualizer.display_model_performance("XGBoost", metrics)

In [None]:
# Render the optimization history of `study` (objective value per trial).
# NOTE(review): `display` is not imported in the visible cells; presumably the
# IPython built-in available in notebook kernels -- confirm if run elsewhere.
display(plot_optimization_history(study))

## 7. Prediction and results

In [None]:
# Get predictions for the train and test features from the tuned model, then
# plot them against the true target values.
y_full_train_pred, y_full_test_pred = cross_validator.get_predictions(
    xgboost_model.model,
    x_dfs["full_train"],
    y_dfs["full_train"],
    x_dfs["full_test"],
)
visualizer.display_true_vs_predicted(
    "XGBoost",
    y_dfs["full_train"],
    y_dfs["full_test"],
    y_full_train_pred,
    y_full_test_pred,
)