In [1]:
import warnings
warnings.filterwarnings("ignore", category=Warning)

import pickle
from kaggle_titanic.config import cfg
from kaggle_titanic.evaluation.main import evaluation

import pandas as pd

from ml_assemblr.main_components.data_pod import DataPod
from ml_assemblr.transformer.model.xgb_model import XGBModel

In [2]:
# Remove the column cap so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

In [3]:
# Restore the cached DataPod from "01_dp.pkl" in the research cache directory
# (presumably written by an earlier step of this project — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load cache files that this project produced itself.
dp_cache_path = cfg.research_cache_path / "01_dp.pkl"
with open(dp_cache_path, "rb") as f:
    dp: DataPod = pickle.load(f)

In [4]:
from sklearn.metrics import accuracy_score
def custom_accuracy_varied_thres(preds, dtrain, threshold):
    """Compute accuracy of thresholded predictions against the labels of ``dtrain``.

    Args:
        preds: Predicted scores. Must support element-wise ``>`` and
            ``.astype`` (e.g. a numpy array).
        dtrain: Object exposing ``get_label()`` that returns the true labels
            (presumably an ``xgboost.DMatrix`` — TODO confirm).
        threshold: Scores strictly greater than this value are mapped to
            class 1, all others to class 0.

    Returns:
        A ``('accuracy', value)`` tuple, where ``value`` is the result of
        ``accuracy_score`` on the true labels and the binarised predictions.
    """
    y_true = dtrain.get_label()
    # Strict comparison: a score exactly equal to the threshold becomes class 0.
    y_pred = (preds > threshold).astype(int)
    return 'accuracy', accuracy_score(y_true, y_pred)

In [5]:
from functools import partial
# Bind threshold=0.5 so the resulting callable takes only (preds, dtrain).
# Its only visible use is the commented-out `custom_metric=custom_accuracy`
# option in the training loop below.
custom_accuracy = partial(custom_accuracy_varied_thres, threshold=0.5)

In [6]:
# Keyword arguments unpacked into every XGBModel built in the training loop:
# logistic objective for binary classification, AUC as the evaluation metric.
xgb_config = {
    "xgb_params": {"objective": "binary:logistic", "eval_metric": "auc"},
}

# The training loop runs once per entry of this mapping
# (presumably one entry per cross-validation split — TODO confirm).
cv_split_columns = dp.variables["cv_idx_map"]['cv_split_idx_in_column_type']
cv_count = len(cv_split_columns)

In [7]:
# Fit one XGBModel per CV index on the "train" split and thread the DataPod
# through each fit, so `dp` is rebound on every iteration.
# NOTE(review): `dp` is overwritten with its own transform; re-running this
# cell without reloading the cache applies the models a second time.
#
# Options currently disabled (kept for reference):
#   val_on_split="valid", num_boost_round=200, early_stopping_rounds=20,
#   is_maximize_metric=True, verbose_eval=True, custom_metric=custom_accuracy
for i in range(cv_count):
    xgb_model = XGBModel(fit_on_split="train", cv_idx=i, **xgb_config)
    dp: DataPod = dp.fit_transform(xgb_model)

In [8]:
# Store the decision threshold on the DataPod before evaluation runs.
# Presumably evaluation() reads it to binarise predicted scores — TODO confirm.
dp.variables["model_threshold"] = 0.5

In [9]:
# Run the project's evaluation step; the following cells read its results
# from dp.variables["evaluation"].
dp = evaluation(dp)

In [10]:
# Show the evaluation table: one row per metric, one value_cv_* column per
# CV index, plus value_mean and value_std.
dp.variables["evaluation"]["df_evaluation"]

Unnamed: 0,metric,value_cv_0,value_cv_1,value_cv_2,value_cv_3,value_cv_4,value_cv_5,value_cv_6,value_cv_7,value_cv_8,value_cv_9,value_mean,value_std
0,accuracy_train,0.893258,0.90309,0.897472,0.886236,0.891854,0.900281,0.894663,0.898876,0.890449,0.898876,0.895506,0.004873
1,accuracy_valid,0.804469,0.804469,0.798883,0.854749,0.826816,0.815642,0.77095,0.826816,0.871508,0.821229,0.819553,0.02703
2,accuracy_test,,,,,,,,,,,,


In [11]:
# Show the single objective value; in the saved output it matches the
# value_mean of the accuracy_valid row in the evaluation table.
dp.variables["evaluation"]["objective_accuracy_valid"]

0.8195530726256983

In [15]:
# Show the "features" columns of the DataPod's frame (None presumably means
# "no split filter", i.e. all rows — TODO confirm).
# NOTE(review): this cell's execution count (15) is out of sequence with the
# previous cell (11); restart the kernel and run all cells before sharing.
dp.slice_df(None, "features")

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Embarked_infrequent_sklearn
0,male,22.0,1,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,female,38.0,1,0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,female,26.0,0,0,7.9250,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,female,35.0,1,0,53.1000,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,male,35.0,0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,male,,0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1305,female,39.0,0,0,108.9000,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1306,male,38.5,0,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1307,male,,0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0,0.0
