In [1]:
import sys
import mlflow
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Notebook-wide settings: data locations, label column and MLflow experiment.
ROOT = Path("../data")

# Train and test parquet files sit side by side under ROOT/parquet_files.
TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

TARGET = "target"  # column read as the training label further down
EXPERIMENT_NAME = "Home_Credit_2024"  # MLflow experiment the runs are logged under

In [3]:
# Point MLflow at the tracking server before anything gets logged.
mlflow.set_tracking_uri("http://mlflow:5000")

# ISO-8601 timestamp taken when this cell runs; appended to the parent run
# name so that repeated executions stay distinguishable.
dt_now = datetime.datetime.now()
run_postfix = dt_now.isoformat()

# Kept as the final expression so the cell echoes the active Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///tmp/working/mlruns/836281514613721981', creation_time=1730029393721, experiment_id='836281514613721981', last_update_time=1730029393721, lifecycle_stage='active', name='Home_Credit_2024', tags={}>

In [4]:
# Extend the import path with the project's script directory *before* the
# imports below; `process` and `common` are presumably modules living there.
sys.path.append("../02_script/")
from process import read_file, read_files, to_pandas, feature_eng
from common import reduce_mem_usage, Common_XGB_Modelling

# Train feature engineering

In [5]:
# Raw training tables, keyed by the keyword arguments that feature_eng is
# called with (df_base, depth_0, depth_1, depth_2).  The trailing integer
# passed to the readers mirrors the depth of the table (presumably a depth
# hint for the reader -- confirm in process.py).
# Reads happen in the listed order; the generators leak no extra globals, so
# deleting train_data_store later still releases every table.
train_data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        # Tables split across several files are matched with a glob pattern.
        *(
            read_files(TRAIN_DIR / f"train_{table}_1_*.parquet", 1)
            for table in ("applprev", "credit_bureau_a")
        ),
        # Tables stored as a single file.
        *(
            read_file(TRAIN_DIR / f"train_{table}_1.parquet", 1)
            for table in (
                "credit_bureau_b",
                "deposit",
                "debitcard",
                "tax_registry_a",
                "tax_registry_b",
                "tax_registry_c",
                "person",
                "other",
            )
        ),
    ],
    "depth_2": [
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        *(
            read_file(TRAIN_DIR / f"train_{table}_2.parquet", 2)
            for table in ("credit_bureau_b", "person")
        ),
    ],
}

../data/parquet_files/train/train_static_0_*.parquet
../data/parquet_files/train/train_applprev_1_*.parquet
../data/parquet_files/train/train_credit_bureau_a_1_*.parquet
../data/parquet_files/train/train_credit_bureau_a_2_*.parquet


In [6]:
# Build the training feature table from the base frame and the per-depth
# table lists; the keywords spell out exactly what the store contains.
train_overall = feature_eng(
    df_base=train_data_store["df_base"],
    depth_0=train_data_store["depth_0"],
    depth_1=train_data_store["depth_1"],
    depth_2=train_data_store["depth_2"],
)

In [7]:
# Drop the only reference to the raw tables so their memory can be reclaimed
# now that the engineered frame has been built.
del train_data_store

# Post feature engineering

In [8]:
# Convert the engineered frame to pandas. to_pandas also returns str_features,
# which the following cells use as the list of columns to drop.
# The name is rebound in place on purpose: keeping the previous frame alive
# under another name would hold a second multi-GB copy in memory.
train_overall, str_features = to_pandas(train_overall)
# Summary of dtypes and memory footprint after the conversion.
train_overall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), float32(4), float64(660), int64(4), int8(2), object(146)
memory usage: 9.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), category(146), float32(4), float64(660), int64(4), int8(2)
memory usage: 7.9 GB


In [9]:
# Shrink the frame's memory footprint (the captured output shows numeric
# columns ending up as float16/float32 and small ints).
# NOTE(review): float16 has limited range and precision -- confirm that no
# feature needs more than that.
train_overall = reduce_mem_usage(train_overall)
# Summary of dtypes and memory footprint after the downcast.
train_overall.info()

Memory usage of dataframe is 8060.49 MB
Memory usage after optimization is: 2986.56 MB
Decreased by 62.9%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: category(146), float16(416), float32(247), float64(2), int16(1), int32(1), int8(4)
memory usage: 2.9 GB


In [10]:
# Remove every column listed in str_features (presumably the string /
# categorical columns reported by to_pandas) from the training frame.
# Consequence: any later filter of the remaining columns against
# str_features comes out empty.
drop_cols = str_features
train_overall = train_overall.drop(columns=drop_cols)

In [11]:
# Model inputs: every remaining column except identifiers, time keys and the
# label.  The label name comes from TARGET so that it cannot drift from the
# column used to build y_train.
_non_feature_cols = {"case_id", "MONTH", "WEEK_NUM", TARGET}
selected_features = sorted(col for col in train_overall.columns if col not in _non_feature_cols)

# Subset of the selected features that is also listed in str_features.
# A set makes each membership test O(1) instead of scanning the list.
_str_feature_set = set(str_features)
selected_cat_features = [col for col in selected_features if col in _str_feature_set]

In [12]:
# Labels as a float32 numpy array.
y_train = train_overall[TARGET].astype(np.float32).values

# WEEK_NUM is the grouping key for cross-validation, so that all rows of a
# given week land in the same fold.  It is re-read from the base table.
train = read_file(TRAIN_DIR / "train_base.parquet")
train_week_df = train.select("WEEK_NUM").to_pandas()

# The groups come from a separate read, so they only describe train_overall
# if feature engineering kept the base table's row order.  Fail loudly
# instead of silently splitting on misaligned groups.
assert np.array_equal(
    train_week_df["WEEK_NUM"].to_numpy(), train_overall["WEEK_NUM"].to_numpy()
), "WEEK_NUM from train_base.parquet is not row-aligned with train_overall"

# Five folds, stratified on the label and grouped by week; no shuffling, so
# the split is deterministic.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# Week/label pair kept for the overall out-of-fold score computed after training.
tmp = train_overall[["WEEK_NUM", "target"]]

# modelling

In [13]:
# Hyper-parameters handed to the XGBoost training helper.
# NOTE(review): with only 10 trees, early stopping after 10 stale rounds can
# never trigger -- this looks like a smoke-test configuration.
# NOTE(review): "verbose_eval" is not an XGBClassifier constructor argument;
# presumably Common_XGB_Modelling consumes it -- confirm in common.py.
xgb_params = dict(
    # boosting schedule / objective
    n_estimators=10,
    objective="binary:logistic",
    eval_metric="auc",
    importance_type="gain",
    enable_categorical=True,
    learning_rate=0.02,
    # extra weight on the positive class
    scale_pos_weight=10,
    booster="gbtree",
    verbosity=0,
    seed=42,
    # L1 / L2 regularisation
    reg_alpha=0.1,
    reg_lambda=10,
    # hardware and logging
    device="gpu",
    early_stopping_rounds=10,
    verbose_eval=10,
)

# Project training helper built around the XGBClassifier class; its
# train_and_valid method is called once per fold in the training cell.
xgb_modelling = Common_XGB_Modelling(XGBClassifier)

In [14]:
# Have MLflow record the XGBoost training runs automatically.
mlflow.xgboost.autolog(log_input_examples=True, log_datasets=False, silent=True)

# Out-of-fold predictions, filled in one validation fold at a time.
xgb_output = np.zeros(len(train_overall))

# scikit-learn documents `groups` as shape (n_samples,); train_week_df is a
# single-column frame, so flatten it to 1-D before splitting.
cv_groups = np.ravel(train_week_df)

# One parent run for the whole cross-validation, one nested run per fold.
with mlflow.start_run(run_name="xgb_" + run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=cv_groups)):
        print("Fold :", fold + 1)
        with mlflow.start_run(run_name="fold_" + str(fold + 1), nested=True) as child_run:
            # cv.split yields *positional* indices, so select rows with iloc;
            # label-based .loc is only equivalent while the index happens to
            # be a default RangeIndex.
            xgb_model, xgb_val_output = xgb_modelling.train_and_valid(
                train_overall.iloc[tr_idx][selected_features], y_train[tr_idx],
                train_overall.iloc[val_idx][selected_features], y_train[val_idx],
                xgb_params,
            )
        xgb_output[val_idx] = xgb_val_output

    # Overall out-of-fold AUC, logged on the parent run.  Scored against the
    # same y_train array the folds were trained and validated on.
    mlflow.log_metric("overall score", roc_auc_score(y_train, xgb_output))

Fold : 1
[0]	validation_0-auc:0.75307
[0]	validation_0-auc:0.75307
[1]	validation_0-auc:0.75950
[2]	validation_0-auc:0.75961
[3]	validation_0-auc:0.76190
[4]	validation_0-auc:0.76657
[5]	validation_0-auc:0.76809
[6]	validation_0-auc:0.76882
[7]	validation_0-auc:0.76919
[8]	validation_0-auc:0.77178
[9]	validation_0-auc:0.77252
[9]	validation_0-auc:0.77252


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/04 03:03:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_1 at: http://mlflow:5000/#/experiments/836281514613721981/runs/dfaf7134b80843ea9c1ddc62e8b58d00.
2024/11/04 03:03:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 2
[0]	validation_0-auc:0.74977
[0]	validation_0-auc:0.74977
[1]	validation_0-auc:0.75133
[2]	validation_0-auc:0.75546
[3]	validation_0-auc:0.76480
[4]	validation_0-auc:0.76940
[5]	validation_0-auc:0.77005
[6]	validation_0-auc:0.77232
[7]	validation_0-auc:0.77279
[8]	validation_0-auc:0.77385
[9]	validation_0-auc:0.77594
[9]	validation_0-auc:0.77594


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/04 03:03:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_2 at: http://mlflow:5000/#/experiments/836281514613721981/runs/ae44086c338a48b79fa1b20a51a52b89.
2024/11/04 03:03:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 3
[0]	validation_0-auc:0.75508
[0]	validation_0-auc:0.75508
[1]	validation_0-auc:0.75605
[2]	validation_0-auc:0.76461
[3]	validation_0-auc:0.76570
[4]	validation_0-auc:0.76637
[5]	validation_0-auc:0.76741
[6]	validation_0-auc:0.76824
[7]	validation_0-auc:0.77219
[8]	validation_0-auc:0.77445
[9]	validation_0-auc:0.77620
[9]	validation_0-auc:0.77620


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/04 03:04:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_3 at: http://mlflow:5000/#/experiments/836281514613721981/runs/b580be0b1e004aa583ae959ab0e18ead.
2024/11/04 03:04:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 4
[0]	validation_0-auc:0.75155
[0]	validation_0-auc:0.75155
[1]	validation_0-auc:0.75899
[2]	validation_0-auc:0.75987
[3]	validation_0-auc:0.76271
[4]	validation_0-auc:0.76876
[5]	validation_0-auc:0.76954
[6]	validation_0-auc:0.77176
[7]	validation_0-auc:0.77228
[8]	validation_0-auc:0.77327
[9]	validation_0-auc:0.77459
[9]	validation_0-auc:0.77459


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/04 03:04:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_4 at: http://mlflow:5000/#/experiments/836281514613721981/runs/dd5a248403d34f4a9c329f5397025bbe.
2024/11/04 03:04:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 5
[0]	validation_0-auc:0.74941
[0]	validation_0-auc:0.74941
[1]	validation_0-auc:0.75521
[2]	validation_0-auc:0.75877
[3]	validation_0-auc:0.76010
[4]	validation_0-auc:0.76125
[5]	validation_0-auc:0.76204
[6]	validation_0-auc:0.76664
[7]	validation_0-auc:0.76692
[8]	validation_0-auc:0.76756
[9]	validation_0-auc:0.77001
[9]	validation_0-auc:0.77001


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/04 03:05:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_5 at: http://mlflow:5000/#/experiments/836281514613721981/runs/93cacc49d30c4489af0c3bbe0c1c8545.
2024/11/04 03:05:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.
2024/11/04 03:05:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgb_2024-11-04T03:00:45.679504 at: http://mlflow:5000/#/experiments/836281514613721981/runs/a90c6d06208942d28260a0aad6568b8f.
2024/11/04 03:05:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.
