In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
from modules.v1.catboostModel import trainCatboost, testCatboost

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
VERSION = 5
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

In [4]:
df = pd.read_csv(TRAIN_PATH)
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

## Clean Datasets

In [5]:
cat_features = [
    "Area",
    "Province",
    "gender",
    "marital_status",
    "tel_category",
    "type_of_residence",
    "c_business_type",
    "c_position",
    "c_occupation",
    "c_employment_status",
    "c_salary_payment_methods",
    "media",
    "place_for_sending_information",
    "r_propose",
    "r_generalcode1",
    "r_generalcode2",
    "r_generalcode3",
    "r_generalcode4",
    "r_generalcode5",
    "apply"
]

In [6]:
def cleanDataset(df, cleanPath):
    # Drop unnecessary columns (keep ID)
    drop_cols = ["ID", "Shop Name"]
    df = df.drop(columns=drop_cols, errors='ignore')

    # =====================================================
    # 3. Handle date columns
    # =====================================================
    for col in ["pms_i_ymd", "date_of_birth"]:
        df[col] = pd.to_datetime(df[col], errors="coerce")
    
    df["app_year"] = df["pms_i_ymd"].dt.year
    df["app_month"] = df["pms_i_ymd"].dt.month
    df["app_day"] = df["pms_i_ymd"].dt.day
    df["app_weekday"] = df["pms_i_ymd"].dt.weekday

    df["birth_year"] = df["date_of_birth"].dt.year
    df["birth_month"] = df["date_of_birth"].dt.month
    df["birth_day"] = df["date_of_birth"].dt.day

    df["age"] = ((df["pms_i_ymd"] - df["date_of_birth"]).dt.days / 365.25).astype(float)

    df.drop(columns=["pms_i_ymd", "date_of_birth"], inplace=True, errors="ignore")

    # =====================================================
    # 4. Engineer numeric features
    # =====================================================
    df["living_period_month"] = df["living_period_year"] * 12 + df["living_period_month"]
    df.drop(columns=['living_period_year'], inplace=True)

    df["c_number_of_working_month"] = df["c_number_of_working_year"] * 12 + df["c_number_of_working_month"]
    df.drop(columns=['c_number_of_working_year'], inplace=True)

    df["r_total_income"] = df[["c_monthly_salary", "r_additional_income", "r_spouse_income"]].sum(axis=1)
    df["r_debt_to_income_ratio"] = df["r_allloan_amount"] / (df["r_total_income"] + 1)
    df["loan_to_salary_ratio"] = df["r_expected_credit_limit"] / (df["c_monthly_salary"] + 1)
    df["has_other_loans"] = (df["r_allloan_case"] > 0).astype(int)
    df["is_married_and_has_children"] = ((df["marital_status"] == 2) & (df["number_of_children"] > 0)).astype(int)

    # =====================================================
    # 5. Cleaning Data
    # =====================================================
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df.replace(["NaN", "nan", "None", ""], np.nan, inplace=True)

    df.to_csv(cleanPath, index=False)
    return df

In [7]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


## Testing

In [8]:
threshold = trainCatboost(VERSION, train_df, cat_features)

[I 2025-10-31 20:30:25,755] A new study created in memory with name: no-name-7b148031-8ad0-45d6-aec0-739b1b5c0961


GPU detected -> using CatBoost GPU training.
Running hyperparameter tuning with Optuna...


  0%|          | 0/10 [00:00<?, ?it/s]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 0. Best value: 0.632271:  10%|█         | 1/10 [00:13<01:58, 13.21s/it]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:30:38,963] Trial 0 finished with value: 0.6322714797183443 and parameters: {'bagging_temperature': 0.09447331822452454, 'iterations': 980, 'simple_ctr': 'Borders', 'depth': 6, 'learning_rate': 0.0526554621511328, 'l2_leaf_reg': 0.12524459110483516, 'random_strength': 0.16294637370586895, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 2, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 93}. Best is trial 0 with value: 0.6322714797183443.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 0. Best value: 0.632271:  20%|██        | 2/10 [00:36<02:33, 19.20s/it]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:31:02,365] Trial 1 finished with value: 0.6271154058564583 and parameters: {'bagging_temperature': 0.8822536309128737, 'iterations': 531, 'simple_ctr': 'FloatTargetMeanValue', 'depth': 4, 'learning_rate': 0.032537978318496856, 'l2_leaf_reg': 0.22685127346571968, 'random_strength': 0.1621112647616758, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 9, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 82}. Best is trial 0 with value: 0.6322714797183443.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 2. Best value: 0.636774:  30%|███       | 3/10 [02:07<06:02, 51.82s/it]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:32:32,990] Trial 2 finished with value: 0.6367738381725506 and parameters: {'bagging_temperature': 0.12725003578451666, 'iterations': 905, 'simple_ctr': 'FeatureFreq', 'depth': 5, 'learning_rate': 0.025441552497449157, 'l2_leaf_reg': 0.8591668724841058, 'random_strength': 1.4394911534626422, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 2, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 91}. Best is trial 2 with value: 0.6367738381725506.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266:  40%|████      | 4/10 [02:10<03:16, 32.72s/it]Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:32:36,433] Trial 3 finished with value: 0.6402660473551317 and parameters: {'bagging_temperature': 0.6568298764202317, 'iterations': 677, 'simple_ctr': 'FeatureFreq', 'depth': 3, 'learning_rate': 0.09668396187936132, 'l2_leaf_reg': 0.06410012662899707, 'random_strength': 3.301423575269754, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 5, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 37}. Best is trial 3 with value: 0.6402660473551317.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266:  50%|█████     | 5/10 [03:32<04:12, 50.43s/it]

[I 2025-10-31 20:33:58,264] Trial 4 finished with value: 0.6393174563414203 and parameters: {'bagging_temperature': 0.4914317360388344, 'iterations': 770, 'simple_ctr': 'FloatTargetMeanValue', 'depth': 6, 'learning_rate': 0.026068676756367124, 'l2_leaf_reg': 0.4828336322113433, 'random_strength': 0.8394749865738709, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 5, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 59}. Best is trial 3 with value: 0.6402660473551317.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266:  60%|██████    | 6/10 [04:04<02:57, 44.29s/it]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:34:30,625] Trial 5 finished with value: 0.634661321865979 and parameters: {'bagging_temperature': 0.04306178855178788, 'iterations': 572, 'simple_ctr': 'Buckets', 'depth': 5, 'learning_rate': 0.016189308199993778, 'l2_leaf_reg': 5.137421357925463, 'random_strength': 3.047302069780262, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 50}. Best is trial 3 with value: 0.6402660473551317.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266:  70%|███████   | 7/10 [04:13<01:38, 32.73s/it]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:34:39,552] Trial 6 finished with value: 0.6348968031751062 and parameters: {'bagging_temperature': 0.2876170889086477, 'iterations': 852, 'simple_ctr': 'FloatTargetMeanValue', 'depth': 7, 'learning_rate': 0.03214342209671157, 'l2_leaf_reg': 0.057873279981271036, 'random_strength': 0.7943417301137915, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 48}. Best is trial 3 with value: 0.6402660473551317.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266:  80%|████████  | 8/10 [05:06<01:18, 39.07s/it]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 20:35:32,193] Trial 7 finished with value: 0.6263598049653042 and parameters: {'bagging_temperature': 0.9478796110898939, 'iterations': 627, 'simple_ctr': 'Buckets', 'depth': 5, 'learning_rate': 0.010434692600577352, 'l2_leaf_reg': 0.014571509342442569, 'random_strength': 0.3519376706356959, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 88}. Best is trial 3 with value: 0.6402660473551317.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266:  90%|█████████ | 9/10 [05:11<00:28, 28.55s/it]

[I 2025-10-31 20:35:37,609] Trial 8 finished with value: 0.6375636026729424 and parameters: {'bagging_temperature': 0.7515038345862806, 'iterations': 935, 'simple_ctr': 'FloatTargetMeanValue', 'depth': 3, 'learning_rate': 0.07946225279180148, 'l2_leaf_reg': 0.10397726353576307, 'random_strength': 0.12161533948275262, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 99}. Best is trial 3 with value: 0.6402660473551317.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 3. Best value: 0.640266: 100%|██████████| 10/10 [06:45<00:00, 40.52s/it]


[I 2025-10-31 20:37:10,930] Trial 9 finished with value: 0.6331595465921646 and parameters: {'bagging_temperature': 0.4219520770008628, 'iterations': 657, 'simple_ctr': 'Borders', 'depth': 7, 'learning_rate': 0.032253985804386924, 'l2_leaf_reg': 1.1468175848387636, 'random_strength': 1.8177462151121297, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 10, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 58}. Best is trial 3 with value: 0.6402660473551317.

Best Parameters Found: {'bagging_temperature': 0.6568298764202317, 'iterations': 677, 'simple_ctr': 'FeatureFreq', 'depth': 3, 'learning_rate': 0.09668396187936132, 'l2_leaf_reg': 0.06410012662899707, 'random_strength': 3.301423575269754, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 5, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od

FileNotFoundError: [Errno 2] No such file or directory: '../../model/v5/param_log_catboost.json'

In [None]:
testCatboost(VERSION, test_df, ids, threshold, cat_features)


Predictions saved to: ../../prediction/prediction_catboost_v5.csv
Training + prediction complete.
