In [1]:
import sys
import mlflow
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Root directory of the input data; every parquet path below is built from it.
ROOT = Path("data")

# Directories holding the train / test parquet files.
TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

# Label column name, and the MLflow experiment that runs are logged under.
TARGET = "target"
EXPERIMENT_NAME = "Home_Credit_2024"

In [3]:
# Point the MLflow client at the tracking server used by this notebook.
mlflow.set_tracking_uri("http://mlflow:5000")

# Capture the start time once; its ISO-8601 form is appended to run names
# so that repeated executions of the notebook produce distinct runs.
dt_now = datetime.datetime.now()
run_postfix = dt_now.isoformat()

# Activate the experiment. Kept as the last expression so the cell still
# displays the returned Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///tmp/working/mlruns/868492480804425921', creation_time=1731147408334, experiment_id='868492480804425921', last_update_time=1731147408334, lifecycle_stage='active', name='Home_Credit_2024', tags={}>

In [4]:
from unique_script.util import read_file, read_files, to_pandas, feature_eng

In [5]:
from ml_common.process import reduce_mem_usage
from ml_common.model import Common_CB_Modelling

# Train feature engineering

In [6]:
# Raw training tables grouped the way `feature_eng` consumes them.
# The trailing integer passed to read_file / read_files is presumably the
# table depth (it matches the "depth_N" key it sits under) - TODO confirm
# against unique_script.util.
train_data_store = dict(
    # Base table for the training set.
    df_base=read_file(TRAIN_DIR / "train_base.parquet"),
    # Depth-0 tables.
    depth_0=[
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    # Depth-1 tables.
    depth_1=[
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
    ],
    # Depth-2 tables.
    depth_2=[
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2),
    ],
)

data/parquet_files/train/train_static_0_*.parquet
data/parquet_files/train/train_applprev_1_*.parquet
data/parquet_files/train/train_credit_bureau_a_1_*.parquet
data/parquet_files/train/train_credit_bureau_a_2_*.parquet


In [7]:
# Build the single training frame from the base table and the per-depth
# tables; arguments are spelled out so the call's inputs are visible here.
train_overall = feature_eng(
    df_base=train_data_store["df_base"],
    depth_0=train_data_store["depth_0"],
    depth_1=train_data_store["depth_1"],
    depth_2=train_data_store["depth_2"],
)

In [8]:
# Drop the name holding the raw input tables now that `train_overall` has
# been built - presumably so their memory can be reclaimed.
del train_data_store

# Post-feature-engineering processing

In [9]:
# Convert the engineered frame to pandas. `to_pandas` returns the converted
# frame plus `str_features`, which later cells use as a list of column names
# (presumably the string columns, which show up as `category` dtype in the
# .info() output - confirm in unique_script.util).
# Note: `train_overall` is rebound here, so this cell is not safe to re-run
# on its own without re-running the feature-engineering cell first.
train_overall, str_features = to_pandas(train_overall)
# Summarise dtypes and memory footprint after the conversion.
train_overall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), float32(4), float64(660), int64(4), int8(2), object(146)
memory usage: 9.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), category(146), float32(4), float64(660), int64(4), int8(2)
memory usage: 7.9 GB


In [10]:
# Shrink the frame's memory footprint with the shared helper.
# NOTE(review): per the .info() output most float columns end up as float16;
# confirm that precision is acceptable for the model.
train_overall = reduce_mem_usage(train_overall)
# Show the resulting dtypes and memory usage.
train_overall.info()

Memory usage of dataframe is 8060.49 MB
Memory usage after optimization is: 2986.56 MB
Decreased by 62.9%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: category(146), float16(416), float32(247), float64(2), int16(1), int32(1), int8(4)
memory usage: 2.9 GB


In [11]:
# Remove every column listed in `str_features` from the training frame.
# Because all of them are dropped, no `str_features` column remains available
# as a model feature afterwards.
drop_cols = str_features
train_overall = train_overall.drop(columns=drop_cols)

In [12]:
# Model inputs: every remaining column except identifiers, time keys and the
# label, in sorted order so the feature order is stable between runs.
_non_feature_cols = {"case_id", "MONTH", "WEEK_NUM", TARGET}
selected_features = sorted(
    col for col in train_overall.columns if col not in _non_feature_cols
)

# Categorical inputs: the selected features that are also in `str_features`.
# A set makes each membership test O(1) instead of scanning the list.
# Note: when the `str_features` columns have already been dropped from
# `train_overall`, this list is expected to be empty.
_str_feature_set = set(str_features)
selected_cat_features = [col for col in selected_features if col in _str_feature_set]

In [13]:
# Labels as a float32 NumPy array, row-aligned with `train_overall`.
y_train = train_overall[TARGET].astype(np.float32).values

# Base table, still loaded because cells outside this view may reference
# `train`; it is no longer used to build the CV groups.
train = read_file(TRAIN_DIR / "train_base.parquet")

# Groups for the grouped CV split. They are taken from `train_overall` itself
# rather than from a separate read of the base file, so they are guaranteed
# to be row-aligned with `train_overall` and `y_train` regardless of how the
# feature-engineering step ordered the rows.
train_week_df = train_overall[["WEEK_NUM"]]

# 5 folds, stratified on the label, with all rows of a WEEK_NUM kept in the
# same fold. shuffle=False keeps the split deterministic.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# Week / label pair kept under its original name for downstream cells.
tmp = train_overall[["WEEK_NUM", TARGET]]

# Modelling: CatBoost with 5-fold cross-validation

In [14]:
# CatBoost hyper-parameters shared by every fold; the same dict is also
# logged to MLflow as the run's parameters.
cb_params = dict(
    random_seed=42,
    learning_rate=0.04,
    use_best_model=True,
    # NOTE(review): the original line carried a trailing "#00", presumably a
    # 10000-iteration setting shortened for a quick run - confirm.
    iterations=100,
    reg_lambda=10,
    scale_pos_weight=10,
    task_type="GPU",
    loss_function="Logloss",
)

# Shared training/validation wrapper, configured for the classifier variant.
cb_modelling = Common_CB_Modelling(CatBoostClassifier)

In [15]:
# Out-of-fold predictions, filled in one validation fold at a time.
cb_output = np.zeros(len(train_overall))

# `cv.split` yields positional row indices, so rows and columns are selected
# positionally in a single `.iloc` call. This avoids the intermediate
# full-width copy that chained `.loc[rows][cols]` creates, and does not rely
# on the frame having a default RangeIndex.
feature_pos = train_overall.columns.get_indexer(selected_features)

# One-row sample of the feature frame, logged with each fold as the run's
# input dataset; it is identical for every fold, so it is built once.
dataset = mlflow.data.from_pandas(train_overall.head(1)[selected_features])

with mlflow.start_run(run_name="cb_" + run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=train_week_df)):
        print("Fold :", fold + 1)
        # Each fold gets its own nested run under the parent run.
        with mlflow.start_run(run_name="fold_" + str(fold + 1), nested=True) as child_run:
            cb_model, cb_val_output = cb_modelling.train_and_valid(
                train_overall.iloc[tr_idx, feature_pos], y_train[tr_idx],
                train_overall.iloc[val_idx, feature_pos], y_train[val_idx],
                cb_params,
                selected_cat_features,
            )
            mlflow.catboost.log_model(cb_model, "artifacts")
            mlflow.log_params(cb_params)
            mlflow.log_input(dataset)
        # Store this fold's validation predictions at their original positions.
        cb_output[val_idx] = cb_val_output
    # Overall out-of-fold ROC AUC, scored against the same label array the
    # folds were built from and logged on the parent run.
    mlflow.log_metric("overall score", roc_auc_score(y_train, cb_output))

Fold : 1
0:	learn: 0.6746160	test: 0.6753495	best: 0.6753495 (0)	total: 87.2ms	remaining: 8.63s
99:	learn: 0.4324914	test: 0.4502862	best: 0.4502862 (99)	total: 4.31s	remaining: 0us
bestTest = 0.4502861693
bestIteration = 99


2024/11/09 10:31:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_1 at: http://mlflow:5000/#/experiments/868492480804425921/runs/9a3789284f69459e94fa23ca275efc20.
2024/11/09 10:31:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 2
0:	learn: 0.6752579	test: 0.6747738	best: 0.6747738 (0)	total: 41.6ms	remaining: 4.12s
99:	learn: 0.4366362	test: 0.4336192	best: 0.4336192 (99)	total: 4.23s	remaining: 0us
bestTest = 0.4336191565
bestIteration = 99


2024/11/09 10:33:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_2 at: http://mlflow:5000/#/experiments/868492480804425921/runs/87a90de797bf46028622126bbce77e3b.
2024/11/09 10:33:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 3
0:	learn: 0.6749544	test: 0.6746393	best: 0.6746393 (0)	total: 45.4ms	remaining: 4.49s
99:	learn: 0.4375973	test: 0.4320629	best: 0.4320629 (99)	total: 4.36s	remaining: 0us
bestTest = 0.4320629143
bestIteration = 99


2024/11/09 10:35:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_3 at: http://mlflow:5000/#/experiments/868492480804425921/runs/7c75dd25f234454a90e418bb569a02c0.
2024/11/09 10:35:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 4
0:	learn: 0.6749245	test: 0.6749950	best: 0.6749950 (0)	total: 44.2ms	remaining: 4.38s
99:	learn: 0.4357353	test: 0.4381679	best: 0.4381679 (99)	total: 4.37s	remaining: 0us
bestTest = 0.4381679112
bestIteration = 99


2024/11/09 10:36:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_4 at: http://mlflow:5000/#/experiments/868492480804425921/runs/a9fc9a5d8dc14c26ba9f2fba43294431.
2024/11/09 10:36:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 5
0:	learn: 0.6749239	test: 0.6748681	best: 0.6748681 (0)	total: 42.3ms	remaining: 4.19s
99:	learn: 0.4362203	test: 0.4365731	best: 0.4365731 (99)	total: 4.25s	remaining: 0us
bestTest = 0.436573118
bestIteration = 99


2024/11/09 10:38:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_5 at: http://mlflow:5000/#/experiments/868492480804425921/runs/3ad4c2ef57c64b17807a39ea3e597df1.
2024/11/09 10:38:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.
2024/11/09 10:38:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run cb_2024-11-09T10:28:04.779125 at: http://mlflow:5000/#/experiments/868492480804425921/runs/722a1109641049faa5a1b83e634eec84.
2024/11/09 10:38:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.
