In [1]:
import sys
import mlflow
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

from common_script.process import reduce_mem_usage
from common_script.model import Common_CB_Modelling

from unique_script.util import read_file, read_files, to_pandas, feature_eng

In [2]:
# Root of the competition data, relative to the notebook's working directory.
ROOT = Path("data")

# Train / test parquet folders live under ROOT/parquet_files.
TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

# Name of the label column and of the MLflow experiment used by this notebook.
TARGET = 'target'
EXPERIMENT_NAME = "Home_Credit_2024"

In [3]:
# Point the MLflow client at the tracking server before anything is logged.
mlflow.set_tracking_uri("http://mlflow:5000")

# ISO-8601 timestamp appended to run names so repeated executions stay distinguishable.
dt_now = datetime.datetime.now()
run_postfix = dt_now.isoformat()

# Kept as the final expression so the selected Experiment is shown as the cell output.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///tmp/working/mlruns/308750597566159464', creation_time=1731200360498, experiment_id='308750597566159464', last_update_time=1731200360498, lifecycle_stage='active', name='Home_Credit_2024', tags={}>

# Train feature engineering

In [4]:
# Raw training tables, keyed the way feature_eng(**train_data_store) expects them.
# The dict is filled in the same order as the tables are read (base, depth 0, 1, 2),
# and nothing is bound to extra names so `del train_data_store` can release everything.
train_data_store = {}

train_data_store["df_base"] = read_file(TRAIN_DIR / "train_base.parquet")

train_data_store["depth_0"] = [
    read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
    read_files(TRAIN_DIR / "train_static_0_*.parquet"),
]

# Depth-1 tables: two sharded sources (glob patterns) followed by single-file sources.
train_data_store["depth_1"] = [
    read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
    read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
    *[
        read_file(TRAIN_DIR / file_name, 1)
        for file_name in (
            "train_credit_bureau_b_1.parquet",
            "train_deposit_1.parquet",
            "train_debitcard_1.parquet",
            "train_tax_registry_a_1.parquet",
            "train_tax_registry_b_1.parquet",
            "train_tax_registry_c_1.parquet",
            "train_person_1.parquet",
            "train_other_1.parquet",
        )
    ],
]

# Depth-2 tables: one sharded source, then two single-file sources.
train_data_store["depth_2"] = [
    read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
    *[
        read_file(TRAIN_DIR / file_name, 2)
        for file_name in (
            "train_credit_bureau_b_2.parquet",
            "train_person_2.parquet",
        )
    ],
]

data/parquet_files/train/train_static_0_*.parquet
data/parquet_files/train/train_applprev_1_*.parquet
data/parquet_files/train/train_credit_bureau_a_1_*.parquet
data/parquet_files/train/train_credit_bureau_a_2_*.parquet


In [5]:
# Combine the raw tables into one training frame; the dict keys
# (df_base, depth_0, depth_1, depth_2) are passed as keyword arguments.
train_overall = feature_eng(**train_data_store)

In [6]:
# Drop the notebook's reference to the raw tables now that train_overall has been built.
del train_data_store

# Post feature-engineering processing

In [7]:
# Convert the engineered frame to pandas. to_pandas returns the frame together with
# a second value, str_features, which later cells use as a list of column names
# (presumably the string/categorical columns — confirm in unique_script.util).
train_overall, str_features = to_pandas(train_overall)
# Summarise dtypes and memory footprint of the converted frame.
train_overall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), float32(4), float64(660), int64(4), int8(2), object(146)
memory usage: 9.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), category(146), float32(4), float64(660), int64(4), int8(2)
memory usage: 7.9 GB


In [8]:
# Shrink the frame's memory footprint via the shared helper; the helper prints
# before/after sizes (see cell output). NOTE(review): the output shows many columns
# becoming float16 — confirm that precision is acceptable for the features.
train_overall = reduce_mem_usage(train_overall)
# Re-check dtypes and memory after optimisation.
train_overall.info()

Memory usage of dataframe is 8060.49 MB
Memory usage after optimization is: 2986.56 MB
Decreased by 62.9%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: category(146), float16(416), float32(247), float64(2), int16(1), int32(1), int8(4)
memory usage: 2.9 GB


In [9]:
# Remove every column listed in str_features (returned by to_pandas) from the
# training frame, so only the remaining columns are available to the model.
drop_cols = str_features
train_overall = train_overall.drop(columns=drop_cols)

In [10]:
# Model inputs: every remaining column except identifiers, time keys and the label,
# in alphabetical order so the feature order is stable between runs.
selected_features = sorted(
    col
    for col in train_overall.columns
    if col not in ("case_id", "MONTH", "WEEK_NUM", "target")
)

# Subset of the selected features that appear in str_features, preserving the order above.
# NOTE(review): the preceding cell drops all str_features columns from train_overall,
# so with that cell applied this list is empty.
selected_cat_features = [col for col in selected_features if col in str_features]

In [11]:
# Label vector as float32 NumPy array, indexed positionally by the CV splits.
y_train = train_overall[TARGET].astype(np.float32).values

# WEEK_NUM is re-read from the base table and used as the grouping key for CV,
# so that all rows of a given week land in the same fold.
train = read_file(TRAIN_DIR / "train_base.parquet")
train_week_df = train.select("WEEK_NUM").to_pandas()

# The groups are matched to the feature rows purely by position. Fail fast if the
# base table and the engineered frame differ in length or row order; otherwise the
# folds would silently be built from the wrong weeks.
assert np.array_equal(
    train_week_df["WEEK_NUM"].to_numpy(),
    train_overall["WEEK_NUM"].to_numpy(),
), "train_base.parquet rows are not aligned with train_overall; CV groups would be wrong"

# Deterministic (unshuffled) 5-fold split, stratified on the label and grouped by week.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# Week and label columns kept side by side; later cells read the label from here.
tmp = train_overall[["WEEK_NUM", "target"]]

# Modelling: CatBoost with week-grouped cross-validation

In [12]:
# CatBoost hyper-parameters shared by every fold; the same dict is logged to MLflow.
cb_params = {
    "random_seed": 42,
    "learning_rate": 0.04,
    "use_best_model": True,
    # Short run; the original cell carried a trailing "#00" here, which presumably
    # marks 10000 as the intended full-run value — confirm before a real experiment.
    "iterations": 100,
    "reg_lambda": 10,
    "scale_pos_weight": 10,
    "task_type": "GPU",
    "loss_function": "Logloss",
    "allow_writing_files": False,
}

# Project training helper, parameterised with the CatBoost estimator class to fit.
cb_modelling = Common_CB_Modelling(CatBoostClassifier)

In [13]:
# Out-of-fold predictions: each validation slice is filled in by the fold that held it out.
cb_output = np.zeros(len(train_overall))

# Loop-invariant: a one-row sample of the feature columns, logged as the input
# dataset of every fold run. Built once instead of once per fold.
dataset = mlflow.data.from_pandas(train_overall.head(1)[selected_features])

# Parent run for the whole cross-validation; each fold is logged as a nested child run.
with mlflow.start_run(run_name="cb_" + run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=train_week_df)):
        print("Fold :", fold + 1)
        with mlflow.start_run(run_name='fold_' + str(fold + 1), nested=True) as child_run:
            # Rows and columns are selected in a single .loc call so pandas does not
            # first materialise a full-width copy of the row subset. The split indices
            # are positional; .loc is equivalent here because train_overall has a
            # default RangeIndex (same assumption as before).
            cb_model, cb_val_output = cb_modelling.train_and_valid(
                train_overall.loc[tr_idx, selected_features], y_train[tr_idx],
                train_overall.loc[val_idx, selected_features], y_train[val_idx],
                cb_params,
                selected_cat_features,
            )
            mlflow.catboost.log_model(cb_model, "artifacts")
            mlflow.log_params(cb_params)
            mlflow.log_input(dataset)
            # Record the fold's own validation AUC so child runs can be compared.
            mlflow.log_metric("fold_auc", roc_auc_score(y_train[val_idx], cb_val_output))
        cb_output[val_idx] = cb_val_output
    # Overall out-of-fold AUC on the parent run; y_train holds the same labels as
    # the target column of train_overall.
    mlflow.log_metric("overall score", roc_auc_score(y_train, cb_output))

Fold : 1
0:	learn: 0.6746161	test: 0.6753495	best: 0.6753495 (0)	total: 89.1ms	remaining: 8.82s
99:	learn: 0.4324911	test: 0.4502857	best: 0.4502857 (99)	total: 4.31s	remaining: 0us
bestTest = 0.4502856587
bestIteration = 99


2024/11/10 01:17:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_1 at: http://mlflow:5000/#/experiments/308750597566159464/runs/dc0a82a917bc4e0ca1f5890671f20858.
2024/11/10 01:17:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/308750597566159464.


Fold : 2
0:	learn: 0.6752581	test: 0.6747738	best: 0.6747738 (0)	total: 41.6ms	remaining: 4.11s
99:	learn: 0.4366365	test: 0.4336193	best: 0.4336193 (99)	total: 4.27s	remaining: 0us
bestTest = 0.4336192771
bestIteration = 99


2024/11/10 01:18:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_2 at: http://mlflow:5000/#/experiments/308750597566159464/runs/850ae0ba821a4cbeb78faae5850318a2.
2024/11/10 01:18:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/308750597566159464.


Fold : 3
0:	learn: 0.6749544	test: 0.6746395	best: 0.6746395 (0)	total: 43.1ms	remaining: 4.27s
99:	learn: 0.4375709	test: 0.4319687	best: 0.4319687 (99)	total: 4.24s	remaining: 0us
bestTest = 0.4319687041
bestIteration = 99


2024/11/10 01:20:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_3 at: http://mlflow:5000/#/experiments/308750597566159464/runs/234b1c3a096a466499a082e75b46433d.
2024/11/10 01:20:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/308750597566159464.


Fold : 4
0:	learn: 0.6749244	test: 0.6749950	best: 0.6749950 (0)	total: 41.8ms	remaining: 4.14s
99:	learn: 0.4357355	test: 0.4381682	best: 0.4381682 (99)	total: 4.26s	remaining: 0us
bestTest = 0.4381681503
bestIteration = 99


2024/11/10 01:22:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_4 at: http://mlflow:5000/#/experiments/308750597566159464/runs/f27471e28a2a435d8b2e6cce6e93d889.
2024/11/10 01:22:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/308750597566159464.


Fold : 5
0:	learn: 0.6749237	test: 0.6748682	best: 0.6748682 (0)	total: 42.3ms	remaining: 4.19s
99:	learn: 0.4358613	test: 0.4363581	best: 0.4363581 (99)	total: 4.24s	remaining: 0us
bestTest = 0.4363580971
bestIteration = 99


2024/11/10 01:24:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_5 at: http://mlflow:5000/#/experiments/308750597566159464/runs/09654ed1d9dd46faab1b44f6bc86a1c2.
2024/11/10 01:24:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/308750597566159464.
2024/11/10 01:24:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run cb_2024-11-10T01:13:30.293932 at: http://mlflow:5000/#/experiments/308750597566159464/runs/ecdbc591a82f412ba64ca1be34911ce7.
2024/11/10 01:24:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/308750597566159464.
