In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
# from modules.v1.xgboostModel import trainXgboost, testXgboost
from modules.v1.catboostModel import trainCatboost, testCatboost

In [3]:
VERSION = 6
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

In [4]:
df = pd.read_csv(TRAIN_PATH)
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

## Clean Datasets

In [5]:
cat_features = [
    "Area",
    "Province",
    "gender",
    "marital_status",
    "tel_category",
    "type_of_residence",
    "c_postal_code",
    "c_business_type",
    "c_position",
    "c_occupation",
    "c_employment_status",
    "c_salary_payment_methods",
    "media",
    "place_for_sending_information",
    "r_propose",
    "r_generalcode1",
    "r_generalcode2",
    "r_generalcode3",
    "r_generalcode4",
    "r_generalcode5",
    "apply"
]



In [6]:
def cleanDataset(df, cleanPath):
    # Drop unnecessary columns (keep ID)

    # drop_cols = ['pms_i_ymd','Area','Province','Shop Name','gender','date_of_birth_week',
    #              'marital_status','number_of_children','postal_code','number_of_resident',
    #              'c_postal_code','c_number_of_employee','c_salary_payment_method',
    #              'c_date_of_salary_payment','media','place_for_sending_information','r_propose',
    #              'r_alloan_case','r_allloan_amount', 'r_additional_income', 'r_spouse_income',
    #              'r_generalcode4','r_generalcode5','apply']

    drop_cols = ['pms_i_ymd','Area','Province','Shop Name','date_of_birth_week',
                 'c_postal_code','c_date_of_salary_payment','media',
                 'place_for_sending_information','r_generalcode4', 'r_generalcode5','r_allloan_case']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Combine year/month features safely
    if {'living_period_month','living_period_year'}.issubset(df.columns):
        df['living_period_month'] = df['living_period_month'].fillna(0) + df['living_period_year'].fillna(0) * 12
        df.drop(columns=['living_period_year'], inplace=True)

    if {'c_number_of_working_month','c_number_of_working_year'}.issubset(df.columns):
        df['c_number_of_working_month'] = df['c_number_of_working_month'].fillna(0) + df['c_number_of_working_year'].fillna(0) * 12
        df.drop(columns=['c_number_of_working_year'], inplace=True)

    #Combine Income & Calculate net worth
    other_income_cols = ['r_additional_income','r_spouse_income','r_allloan_amount']
    df[other_income_cols] = df[other_income_cols].fillna(0)
    df['c_monthly_salary'] = df['c_monthly_salary'].fillna(df['c_monthly_salary'].median())

    if {'c_monthly_salary','r_additional_income','r_spouse_income'}.issubset(df.columns):
        df['income'] = df['c_monthly_salary'] + df['r_additional_income'] + df['r_spouse_income']
        df.drop(columns=['c_monthly_salary','r_additional_income','r_spouse_income'], inplace=True)

    df['networth'] = df['income'] - df['r_allloan_amount']
    df.drop(columns=['r_allloan_amount'], inplace=True)

    # Convert birth date to age
    if 'date_of_birth' in df.columns:
        df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = 2025 - df['date_of_birth'].dt.year
        df.drop(columns=['date_of_birth'], inplace=True)

    # Cut incorrect data
    # 1. Keep only valid values (1–2), replace others with 0
    df['r_generalcode1'] = df['r_generalcode1'].where(df['r_generalcode1'].between(1, 2), np.nan)

    # 2. Clamp salary payment date between 1 and 31
    #df['c_date_of_salary_payment'] = df['c_date_of_salary_payment'].clip(lower=1, upper=31)

    # 3. Replace invalid postal codes (<10000) with NaN
    df['postal_code'] = df['postal_code'].where(df['postal_code'].astype(str).str.len() == 5, np.nan)

    # Ensure numeric columns are numeric
    num_cols = ['number_of_children','number_of_resident','living_period_month','c_number_of_employee',
                'c_number_of_working_month','r_expected_credit_limit','age','networth']
    
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')
    num_cols = [c for c in num_cols if c in df.columns]
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill categorical columns
    cat_cols = ['gender','marital_status','postal_code','tel_category','type_of_residence',
                'c_business_type','c_position','c_occupation','c_employment_status',
                'c_salary_payment_methods','r_propose','r_generalcode1','r_generalcode2',
                'r_generalcode3','r_generalcode5','apply']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna(np.nan)

    print(df.head(3))
    df.to_csv(cleanPath, index=False)

    #All Categoric feature -> string
    # Keep only columns that exist in df
    existing_cat_features = [c for c in cat_features if c in df.columns]

    # Convert them to string
    df[existing_cat_features] = df[existing_cat_features].astype(str)
    return df

In [7]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

             ID gender  marital_status  number_of_children  postal_code  \
0  202412000001     F2               1                   2      10400.0   
1  202412000002      M               1                   0      10500.0   
2  202412000003     F2               1                   0      10170.0   

   tel_category  number_of_resident  living_period_month  type_of_residence  \
0             3                   2                   60                  6   
1             3                   3                   48                  6   
2             3                   6                   62                  3   

   c_business_type  ...  r_expected_credit_limit  r_propose  r_generalcode1  \
0                7  ...                  40000.0        5.0             NaN   
1                7  ...                  20000.0        6.0             NaN   
2                7  ...                  20000.0        5.0             NaN   

   r_generalcode2  r_generalcode3  apply  default_12month   incom

## Testing

In [8]:
# threshold = trainXgboost(VERSION, train_df)
# testXgboost(VERSION, test_df, ids, threshold)

In [9]:
threshold = trainCatboost(VERSION, train_df, cat_features)

[I 2025-10-31 23:14:33,615] A new study created in memory with name: no-name-035c2398-f912-4e05-8296-baee46ebde3f


No GPU detected -> using CPU training.
Running hyperparameter tuning with Optuna...


  0%|          | 0/10 [00:00<?, ?it/s]

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:14:59,594] Trial 0 finished with value: 0.6245682427025265 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.8822583463454028, 'iterations': 837, 'simple_ctr': 'Buckets', 'depth': 3, 'learning_rate': 0.0291152029270211, 'l2_leaf_reg': 0.0543890580479135, 'random_strength': 0.7083281517411809, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 6, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 86}. Best is trial 0 with value: 0.6245682427025265.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:15:21,363] Trial 1 finished with value: 0.6168619312603066 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.874199222051885, 'iterations': 826, 'simple_ctr': 'Borders', 'depth': 3, 'learning_rate': 0.0106418283324601, 'l2_leaf_reg': 0.030677777488419596, 'random_strength': 3.981049415287959, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 3, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 76}. Best is trial 0 with value: 0.6245682427025265.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:15:46,877] Trial 2 finished with value: 0.609214082457945 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.9797816471318617, 'iterations': 408, 'simple_ctr': 'Buckets', 'depth': 4, 'learning_rate': 0.016436787964743224, 'l2_leaf_reg': 0.37754577297857445, 'random_strength': 4.40426634128591, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 2, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 96}. Best is trial 0 with value: 0.6245682427025265.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:16:30,403] Trial 3 finished with value: 0.6257435169680087 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.7573667718827705, 'iterations': 929, 'simple_ctr': 'Borders', 'depth': 5, 'learning_rate': 0.040010085952529084, 'l2_leaf_reg': 0.0117914351782389, 'random_strength': 1.124208748895785, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 4, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 47}. Best is trial 3 with value: 0.6257435169680087.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:16:43,303] Trial 4 finished with value: 0.6250655421261125 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.5899451639375689, 'iterations': 940, 'simple_ctr': 'BinarizedTargetMeanValue', 'depth': 6, 'learning_rate': 0.05606864184819115, 'l2_leaf_reg': 0.022838740772844084, 'random_strength': 3.882361317737478, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 5, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 56}. Best is trial 3 with value: 0.6257435169680087.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:17:00,212] Trial 5 finished with value: 0.6267724431212759 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.5758814138853083, 'iterations': 841, 'simple_ctr': 'Counter', 'depth': 6, 'learning_rate': 0.030172666509001658, 'l2_leaf_reg': 0.34693584264259075, 'random_strength': 2.9208663428988766, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 7, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 60}. Best is trial 5 with value: 0.6267724431212759.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:17:17,719] Trial 6 finished with value: 0.6201202321381976 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.686436532931067, 'iterations': 592, 'simple_ctr': 'Counter', 'depth': 3, 'learning_rate': 0.013418129998097729, 'l2_leaf_reg': 0.01636935377339296, 'random_strength': 2.707508906980087, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 7, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 77}. Best is trial 5 with value: 0.6267724431212759.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:17:23,969] Trial 7 finished with value: 0.6286599725079389 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.6043543927928057, 'iterations': 562, 'simple_ctr': 'Counter', 'depth': 7, 'learning_rate': 0.055203662158240026, 'l2_leaf_reg': 0.4521232224262562, 'random_strength': 0.7501649703412521, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 10, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 93}. Best is trial 7 with value: 0.6286599725079389.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:17:56,053] Trial 8 finished with value: 0.6222251252979815 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.9755561146286078, 'iterations': 446, 'simple_ctr': 'BinarizedTargetMeanValue', 'depth': 8, 'learning_rate': 0.01068500108236556, 'l2_leaf_reg': 3.447440932015337, 'random_strength': 0.19440302063455786, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 5, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 87}. Best is trial 7 with value: 0.6286599725079389.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-10-31 23:18:25,972] Trial 9 finished with value: 0.6237106419462829 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.761635411281699, 'iterations': 908, 'simple_ctr': 'BinarizedTargetMeanValue', 'depth': 4, 'learning_rate': 0.012071230385970654, 'l2_leaf_reg': 1.5791753240045088, 'random_strength': 0.23776013986206954, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 4, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 95}. Best is trial 7 with value: 0.6286599725079389.

Best Parameters Found: {'bootstrap_type': 'Bernoulli', 'subsample': 0.6043543927928057, 'iterations': 562, 'simple_ctr': 'Counter', 'depth': 7, 'learning_rate': 0.055203662158240026, 'l2_leaf_reg': 0.4521232224262562, 'random_strength': 0.7501649703412521, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 10, 'max_ctr_c

FileNotFoundError: [Errno 2] No such file or directory: '../../model/v6/param_log_catboost.json'