In [1]:
import sys
import mlflow
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# --- Notebook configuration --------------------------------------------------
# Dataset root, given relative to the kernel's working directory.
ROOT = Path("data")

# Train and test parquet files sit in sibling folders under ROOT/parquet_files.
TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

# Name of the label column in the training frame.
TARGET = "target"
# Experiment name passed to mlflow.set_experiment.
EXPERIMENT_NAME = "Home_Credit_2024"

In [3]:
# Timestamp captured once when this cell runs (naive local time); its ISO-8601
# string is appended to the parent MLflow run name so runs can be told apart.
dt_now = datetime.datetime.now()
run_postfix = dt_now.isoformat()
# Point the MLflow client at the tracking server and select the experiment
# that all runs started from this notebook are recorded under.
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

2024/11/09 10:16:48 INFO mlflow.tracking.fluent: Experiment with name 'Home_Credit_2024' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///tmp/working/mlruns/868492480804425921', creation_time=1731147408334, experiment_id='868492480804425921', last_update_time=1731147408334, lifecycle_stage='active', name='Home_Credit_2024', tags={}>

In [4]:
from unique_script.util import read_file, read_files, to_pandas, feature_eng

In [5]:
from ml_common.process import reduce_mem_usage
from ml_common.model import Common_LGB_Modelling

# Train feature engineering

In [6]:
# Raw training tables read from TRAIN_DIR, grouped under the keyword names that
# the next cell unpacks into feature_eng (df_base, depth_0, depth_1, depth_2).
# read_file is given a single parquet path; read_files is given a glob pattern
# (note the "*" in the file name).
# NOTE(review): the trailing 1 / 2 argument looks like the table depth, since it
# matches both the dict key and the "_1" / "_2" file-name suffix — confirm
# against unique_script.util.
train_data_store = {
    # Base table, one per dataset.
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    # Tables whose file names carry the "_0" suffix.
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    # Tables whose file names carry the "_1" suffix.
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
    ],
    # Tables whose file names carry the "_2" suffix.
    "depth_2": [
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2),
    ]
}

data/parquet_files/train/train_static_0_*.parquet
data/parquet_files/train/train_applprev_1_*.parquet
data/parquet_files/train/train_credit_bureau_a_1_*.parquet
data/parquet_files/train/train_credit_bureau_a_2_*.parquet


In [7]:
# Build the modelling table: every key of train_data_store is passed to
# feature_eng as a keyword argument (df_base, depth_0, depth_1, depth_2).
train_overall = feature_eng(**train_data_store)

In [8]:
# Remove the notebook-level reference to the raw tables so their memory can be
# reclaimed once nothing else refers to them.
del train_data_store

# Post-processing after feature engineering

In [9]:
# Convert the engineered table with the project helper to_pandas, which also
# returns str_features; later cells use str_features as the list of columns to
# drop. train_overall is rebound here, so re-running this cell on its own
# would feed the already-converted frame back into to_pandas.
# NOTE(review): str_features is presumably the names of the string columns
# (the info() output shows 146 object columns becoming category) — confirm in
# unique_script.util.
train_overall, str_features = to_pandas(train_overall)
# Print dtype counts and memory footprint of the converted frame.
train_overall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), float32(4), float64(660), int64(4), int8(2), object(146)
memory usage: 9.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), category(146), float32(4), float64(660), int64(4), int8(2)
memory usage: 7.9 GB


In [10]:
# Shrink the frame with the shared helper from ml_common.process and rebind
# the result to the same name.
# NOTE(review): the info() output after this step lists several hundred float16
# columns; confirm the reduced precision is acceptable for these features.
train_overall = reduce_mem_usage(train_overall)
# Print dtype counts and memory footprint after the reduction.
train_overall.info()

Memory usage of dataframe is 8060.49 MB
Memory usage after optimization is: 2986.56 MB
Decreased by 62.9%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: category(146), float16(416), float32(247), float64(2), int16(1), int32(1), int8(4)
memory usage: 2.9 GB


In [11]:
# Remove the columns listed in str_features (returned by to_pandas) before
# modelling.
# NOTE(review): this discards every column in str_features, so none of them
# reaches the model — confirm that is intended.
drop_cols = str_features
# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
train_overall = train_overall.drop(columns=drop_cols, errors="ignore")

In [12]:
# Model inputs: every remaining column except the identifier, the time keys and
# the label, in alphabetical order.
selected_features = sorted(
    column
    for column in train_overall.columns
    if column not in ["case_id", "MONTH", "WEEK_NUM", "target"]
)
# Subset of the model inputs that is also listed in str_features.
# NOTE(review): the previous cell drops the str_features columns from
# train_overall, so this list looks like it is always empty — confirm.
selected_cat_features = [name for name in selected_features if name in str_features]

In [13]:
# Cross-validation splitter: 5 folds, no shuffling, stratified on the label
# while keeping all rows of one group in the same fold.
cv = StratifiedGroupKFold(shuffle=False, n_splits=5)

# Label vector as a float32 NumPy array.
y_train = train_overall[TARGET].astype(np.float32).to_numpy()

# WEEK_NUM is re-read from the base table; the training loop passes it to
# cv.split as the groups.
# NOTE(review): this relies on train_base.parquet having the same row order as
# train_overall, which already carries a WEEK_NUM column — confirm alignment.
train = read_file(TRAIN_DIR / "train_base.parquet")
train_week_df = train.select(pl.col("WEEK_NUM")).to_pandas()

# Week number and label side by side; the label column is read back when the
# overall score is computed.
tmp = train_overall.loc[:, ["WEEK_NUM", "target"]]

# Modelling

In [14]:
# Hyper-parameters handed to lgb_modelling.train_and_valid for every fold.
lgb_params = {
    # Task and evaluation metric.
    "objective": "binary",
    "metric": "auc",
    # Boosting schedule.
    # NOTE(review): the original line carried a trailing "#00", which suggests
    # 10000 rounds were used at some point — confirm the intended value.
    "n_estimators": 100,
    "learning_rate": 0.02,
    # Extra weight on the positive class.
    "scale_pos_weight": 10,
    "boosting_type": "gbdt",
    "verbose": -1,
    "seed": 42,
    # Tree size and regularisation.
    "num_leaves": 64,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "cat_smooth": 20,
    # Train on the GPU.
    "device": "gpu",
}

# Wrap the LGBMClassifier class in the shared training helper from
# ml_common.model; the training loop calls its train_and_valid method per fold.
lgb_modelling = Common_LGB_Modelling(LGBMClassifier)

In [15]:
# Enable MLflow's LightGBM autologging before any model is fitted.
mlflow.lightgbm.autolog(log_input_examples=True, log_datasets=False, silent=True)

# Out-of-fold predictions, filled at the validation positions of each fold.
lgb_output = np.zeros(len(train_overall))

# cv.split yields positional row indices, so rows and feature columns are taken
# with a single positional .iloc lookup. The previous label-based .loc was only
# correct while the frame kept its default RangeIndex.
feature_pos = train_overall.columns.get_indexer(selected_features)
# scikit-learn documents `groups` as 1-D of shape (n_samples,); flatten the
# single-column WEEK_NUM frame accordingly.
cv_groups = np.ravel(train_week_df)

# Parent run for the whole cross-validation, with one nested child run per fold.
with mlflow.start_run(run_name="lgb_" + run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=cv_groups)):
        print("Fold :", fold + 1)
        with mlflow.start_run(run_name="fold_" + str(fold + 1), nested=True) as child_run:
            lgb_model, lgb_val_output = lgb_modelling.train_and_valid(
                train_overall.iloc[tr_idx, feature_pos], y_train[tr_idx],
                train_overall.iloc[val_idx, feature_pos], y_train[val_idx],
                lgb_params,
            )
        # Store this fold's validation predictions at their original positions.
        lgb_output[val_idx] = lgb_val_output

    # Overall out-of-fold AUC, scored against the same labels used for
    # splitting and training, and logged on the parent run.
    mlflow.log_metric("overall score", roc_auc_score(y_train, lgb_output))

Fold : 1
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.831631
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.831631


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/09 10:19:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_1 at: http://mlflow:5000/#/experiments/868492480804425921/runs/8b778913e65840ebba53246ffbd3a7f6.
2024/11/09 10:19:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 2
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.833736
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.833736


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/09 10:19:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_2 at: http://mlflow:5000/#/experiments/868492480804425921/runs/60264d6fcf534b30b5c42b74bb4aa6b1.
2024/11/09 10:19:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 3
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.837535
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.837535


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/09 10:20:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_3 at: http://mlflow:5000/#/experiments/868492480804425921/runs/ad6c8f69fd3f4b85ae145f73067c27c7.
2024/11/09 10:20:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 4
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.835525
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.835525


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/09 10:20:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_4 at: http://mlflow:5000/#/experiments/868492480804425921/runs/4c6a63b126e64a65af2f713bc85e032b.
2024/11/09 10:20:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.


Fold : 5
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.831998
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.831998


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/09 10:21:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_5 at: http://mlflow:5000/#/experiments/868492480804425921/runs/1e3ddf6b7216409c99024b11f240fef4.
2024/11/09 10:21:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.
2024/11/09 10:21:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run lgb_2024-11-09T10:16:48.292248 at: http://mlflow:5000/#/experiments/868492480804425921/runs/3f727e949ea04ac8beb7c7e37fc4ddc4.
2024/11/09 10:21:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/868492480804425921.
