In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [11]:
# Make the project root importable from this notebook.
# root_path is the absolute path two directory levels above the current
# working directory, so this cell depends on where the kernel was started.
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
# Guard keeps sys.path free of duplicates when the cell is re-run.
if root_path not in sys.path:
    sys.path.append(root_path)
# Project-local imports; these must stay below the sys.path update above
# (presumably `fileDir` and `modules` live under root_path — confirm).
from fileDir import getDataDir
from modules.v1.catboostModel import trainCatboost, testCatboost

In [None]:
# Experiment configuration.
# VERSION is passed to getDataDir for the cleaned-data paths and to
# trainCatboost / testCatboost further down.
VERSION = 5
# Raw input files (getDataDir is a project helper; presumably it returns a
# file path for the given split — confirm against fileDir).
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
# Destinations for the cleaned CSVs written by cleanDataset.
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

In [13]:
# Quick look at the raw training schema; the bare last expression displays
# the column index as the cell output.
df = pd.read_csv(TRAIN_PATH)
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

## Clean Datasets

In [14]:
# Column names passed as `cat_features` to trainCatboost / testCatboost.
# Order is preserved exactly; the five r_generalcode columns are generated
# rather than spelled out.
cat_features = [
    # applicant / location
    "Area", "Province", "gender", "marital_status",
    "tel_category", "type_of_residence",
    # employment (c_*)
    "c_business_type", "c_position", "c_occupation",
    "c_employment_status", "c_salary_payment_methods",
    # contact channel
    "media", "place_for_sending_information",
    # request (r_*)
    "r_propose",
    *(f"r_generalcode{i}" for i in range(1, 6)),
    "apply",
]

In [15]:
def cleanDataset(df, cleanPath=None):
    """Turn a raw application DataFrame into the engineered feature table.

    Steps, in order:
      1. Drop the ``ID`` and ``Shop Name`` columns (callers that still need
         the IDs must copy them before calling).
      2. Parse ``pms_i_ymd`` and ``date_of_birth`` as datetimes, derive
         calendar parts and ``age`` (in years of 365.25 days), then drop the
         two raw date columns.
      3. Fold the year/month duration pairs into total months and add the
         income, ratio and flag features.
      4. Strip surrounding whitespace from string cells and replace the
         placeholder strings ``"NaN"``, ``"nan"``, ``"None"`` and ``""`` with
         ``np.nan``.
      5. Optionally write the result to ``cleanPath`` as CSV (no index).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns read below. The caller's frame is
        not modified; all work happens on the copy returned by ``drop``.
    cleanPath : str or path-like, optional
        Where to write the cleaned CSV. When ``None`` no file is written.

    Returns
    -------
    pandas.DataFrame
        The cleaned, feature-engineered frame.

    Raises
    ------
    KeyError
        If a column the feature engineering reads is missing from ``df``.
    """
    # =====================================================
    # 1. Drop identifier columns (ID is NOT kept)
    # =====================================================
    # drop() returns a new frame, so nothing below touches the caller's data.
    df = df.drop(columns=["ID", "Shop Name"], errors="ignore")

    # =====================================================
    # 2. Handle date columns
    # =====================================================
    # NOTE(review): no explicit format is given, so pandas infers it; values
    # that cannot be parsed become NaT instead of raising. Confirm the source
    # columns are date strings pandas infers correctly.
    for col in ("pms_i_ymd", "date_of_birth"):
        df[col] = pd.to_datetime(df[col], errors="coerce")

    applied = df["pms_i_ymd"]
    born = df["date_of_birth"]

    df["app_year"] = applied.dt.year
    df["app_month"] = applied.dt.month
    df["app_day"] = applied.dt.day
    df["app_weekday"] = applied.dt.weekday

    df["birth_year"] = born.dt.year
    df["birth_month"] = born.dt.month
    df["birth_day"] = born.dt.day

    # Age at application time; NaT on either side yields NaN.
    df["age"] = ((applied - born).dt.days / 365.25).astype(float)

    df = df.drop(columns=["pms_i_ymd", "date_of_birth"], errors="ignore")

    # =====================================================
    # 3. Engineer numeric features
    # =====================================================
    # Collapse (years, months) pairs into a single total-months column.
    df["living_period_month"] = df["living_period_year"] * 12 + df["living_period_month"]
    df = df.drop(columns=["living_period_year"])

    df["c_number_of_working_month"] = (
        df["c_number_of_working_year"] * 12 + df["c_number_of_working_month"]
    )
    df = df.drop(columns=["c_number_of_working_year"])

    # sum(axis=1) skips NaN, so a row with no income information totals 0.
    df["r_total_income"] = df[["c_monthly_salary", "r_additional_income", "r_spouse_income"]].sum(axis=1)
    # "+ 1" in the denominators avoids division by zero.
    df["r_debt_to_income_ratio"] = df["r_allloan_amount"] / (df["r_total_income"] + 1)
    df["loan_to_salary_ratio"] = df["r_expected_credit_limit"] / (df["c_monthly_salary"] + 1)
    # Comparisons against NaN are False, so missing values map to 0 here.
    df["has_other_loans"] = (df["r_allloan_case"] > 0).astype(int)
    # NOTE(review): assumes marital_status code 2 means "married" — confirm.
    df["is_married_and_has_children"] = (
        (df["marital_status"] == 2) & (df["number_of_children"] > 0)
    ).astype(int)

    # =====================================================
    # 4. Cleaning data
    # =====================================================
    # DataFrame.applymap is deprecated (pandas >= 2.1 emits a FutureWarning).
    # Series.map exists in every pandas version, and only non-numeric columns
    # can hold strings, so numeric columns are skipped entirely.
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    text_cols = [
        name for name, dtype in df.dtypes.items()
        if not pd.api.types.is_numeric_dtype(dtype)
    ]
    for name in text_cols:
        df[name] = df[name].map(_strip_if_str)

    # Normalise textual missing-value placeholders to real NaN.
    df = df.replace(["NaN", "nan", "None", ""], np.nan)

    # =====================================================
    # 5. Persist
    # =====================================================
    if cleanPath is not None:
        df.to_csv(cleanPath, index=False)
    return df

In [16]:
# Load the raw splits under their own names so train_df / test_df only ever
# refer to cleaned data.
raw_train = pd.read_csv(TRAIN_PATH)
raw_test = pd.read_csv(TEST_PATH)

# cleanDataset drops the "ID" column, so keep the test IDs aside first.
ids = raw_test["ID"].copy()

# Clean both splits; each call also writes its cleaned CSV to the given path.
train_df = cleanDataset(raw_train, CLEANED_TRAIN_PATH)
test_df = cleanDataset(raw_test, CLEANED_TEST_PATH)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


## Training and Prediction

In [17]:
# Train the CatBoost model via the project helper (modules.v1.catboostModel).
# The returned value is kept as `threshold` and passed to testCatboost below;
# per the recorded output this run tunes with Optuna and reports an
# "Optimal Threshold" — presumably that is the value returned (confirm).
threshold = trainCatboost(VERSION, train_df, cat_features)

[I 2025-10-28 23:36:42,989] A new study created in memory with name: no-name-4784903a-7ad3-47ae-b6d2-85d8351b817f


GPU detected -> using CatBoost GPU training.
Running hyperparameter tuning with Optuna...


  0%|          | 0/10 [00:00<?, ?it/s]

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:38:22,223] Trial 0 finished with value: 0.6385987255361646 and parameters: {'iterations': 759, 'depth': 8, 'learning_rate': 0.055324507790240535, 'l2_leaf_reg': 0.010533701072283543, 'random_strength': 0.3165940970057346, 'bagging_temperature': 0.40494281292404644, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 9, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 59, 'simple_ctr': 'FeatureFreq'}. Best is trial 0 with value: 0.6385987255361646.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:38:29,942] Trial 1 finished with value: 0.6296453496352898 and parameters: {'iterations': 840, 'depth': 7, 'learning_rate': 0.08899318024827496, 'l2_leaf_reg': 0.29822208427739155, 'random_strength': 0.24451256166725902, 'bagging_temperature': 0.7643667462522936, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 2, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 64, 'simple_ctr': 'Borders'}. Best is trial 0 with value: 0.6385987255361646.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:38:59,791] Trial 2 finished with value: 0.6364067485536102 and parameters: {'iterations': 709, 'depth': 7, 'learning_rate': 0.0693095945329627, 'l2_leaf_reg': 0.978967269155994, 'random_strength': 0.6402867785779537, 'bagging_temperature': 0.2281733402681284, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 2, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 36, 'simple_ctr': 'FloatTargetMeanValue'}. Best is trial 0 with value: 0.6385987255361646.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:39:30,132] Trial 3 finished with value: 0.6388385865655752 and parameters: {'iterations': 871, 'depth': 4, 'learning_rate': 0.08426473933711152, 'l2_leaf_reg': 0.021654597439826895, 'random_strength': 0.8178357048767091, 'bagging_temperature': 0.045460298649872, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 69, 'simple_ctr': 'Buckets'}. Best is trial 3 with value: 0.6388385865655752.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:39:47,382] Trial 4 finished with value: 0.6329753515334762 and parameters: {'iterations': 915, 'depth': 3, 'learning_rate': 0.036864296720492774, 'l2_leaf_reg': 4.1612821995131775, 'random_strength': 3.7120975224285866, 'bagging_temperature': 0.9165760959856203, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 5, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 39, 'simple_ctr': 'FeatureFreq'}. Best is trial 3 with value: 0.6388385865655752.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:40:06,160] Trial 5 finished with value: 0.6363387905363834 and parameters: {'iterations': 959, 'depth': 6, 'learning_rate': 0.023974667045899067, 'l2_leaf_reg': 6.824199594634483, 'random_strength': 0.24465409069605099, 'bagging_temperature': 0.09311148321300433, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 49, 'simple_ctr': 'Buckets'}. Best is trial 3 with value: 0.6388385865655752.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:40:52,590] Trial 6 finished with value: 0.6379437426943589 and parameters: {'iterations': 887, 'depth': 3, 'learning_rate': 0.06469934487198264, 'l2_leaf_reg': 0.03170187232034873, 'random_strength': 2.796408506015486, 'bagging_temperature': 0.07110353191632113, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 9, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 55, 'simple_ctr': 'FloatTargetMeanValue'}. Best is trial 3 with value: 0.6388385865655752.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:42:24,853] Trial 7 finished with value: 0.6407125615895913 and parameters: {'iterations': 1135, 'depth': 7, 'learning_rate': 0.015519324041274363, 'l2_leaf_reg': 0.37440848054545567, 'random_strength': 0.24817287808077262, 'bagging_temperature': 0.2751647462885364, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 10, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 49, 'simple_ctr': 'Borders'}. Best is trial 7 with value: 0.6407125615895913.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:42:41,675] Trial 8 finished with value: 0.6399594767927512 and parameters: {'iterations': 1010, 'depth': 4, 'learning_rate': 0.03667470828835537, 'l2_leaf_reg': 0.021639297453901845, 'random_strength': 0.4516569796176416, 'bagging_temperature': 0.9980590039946345, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 34, 'simple_ctr': 'FeatureFreq'}. Best is trial 7 with value: 0.6407125615895913.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-28 23:42:51,115] Trial 9 finished with value: 0.6417085525018009 and parameters: {'iterations': 856, 'depth': 4, 'learning_rate': 0.07230417046086798, 'l2_leaf_reg': 0.31370118379163453, 'random_strength': 0.5153130665495332, 'bagging_temperature': 0.3297346667154891, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 44, 'simple_ctr': 'Buckets'}. Best is trial 9 with value: 0.6417085525018009.

Best Parameters Found:
{'iterations': 856, 'depth': 4, 'learning_rate': 0.07230417046086798, 'l2_leaf_reg': 0.31370118379163453, 'random_strength': 0.5153130665495332, 'bagging_temperature': 0.3297346667154891, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 44, 'simple_ctr'

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6045438	best: 0.6045438 (0)	total: 19.7ms	remaining: 16.8s
200:	test: 0.6296477	best: 0.6390554 (80)	total: 2.57s	remaining: 8.37s
400:	test: 0.6195197	best: 0.6390554 (80)	total: 5.25s	remaining: 5.96s
600:	test: 0.6133069	best: 0.6390554 (80)	total: 7.97s	remaining: 3.38s
800:	test: 0.6045701	best: 0.6390554 (80)	total: 10.4s	remaining: 717ms
855:	test: 0.6044363	best: 0.6390554 (80)	total: 11.2s	remaining: 0us
bestTest = 0.6390554309
bestIteration = 80
Shrink model to first 81 iterations.

Optimal Threshold: 0.527

Confusion Matrix:
 [[3944 1719]
 [ 411  431]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.70      0.79      5663
           1       0.20      0.51      0.29       842

    accuracy                           0.67      6505
   macro avg       0.55      0.60      0.54      6505
weighted avg       0.81      0.67      0.72      6505

ROC-AUC Score: 0.639055430445493

CatBoost model training complete 

In [18]:
# Run the project prediction helper on the cleaned test set, passing the IDs
# saved before cleaning and the threshold obtained from training. Per the
# recorded output it saves a predictions CSV.
testCatboost(VERSION, test_df, ids, threshold, cat_features)


Predictions saved to: ../../prediction/prediction_catboost_v5.csv
Training + prediction complete.
