In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
from modules.v1.catboostModel import trainCatboost, testCatboost

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
VERSION = 3
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

## Clean Datasets

In [4]:
cat_features = [
    'gender','tel_category','type_of_residence','postal_code',
    'c_business_type','c_position','c_occupation','c_employment_status', 'r_generalcode1',
    'c_salary_payment_methods','r_propose','r_generalcode2',
    'r_generalcode3','apply'
]

In [5]:
def cleanDataset(df, cleanPath):
    # Drop unnecessary columns (keep ID)
    drop_cols = ['pms_i_ymd','Area','Province','Shop Name','date_of_birth_week','marital_status',
                 'c_postal_code','nummber_of_resident','c_number_of_employee', 'media','number_of_children',
                 'place_for_sending_information','r_generalcode4', 'r_generalcode5', 'r_allloan_case']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Cut incorrect data
    df.loc[df['r_generalcode1'] > 2, 'r_generalcode1'] = 0
    df.loc[df['r_generalcode1'] < 1, 'r_generalcode1'] = 0
    df.loc[df['c_date_of_salary_payment'] > 31, 'c_date_of_salary_payment'] = 31
    df.loc[df['c_date_of_salary_payment'] < 1, 'c_date_of_salary_payment'] = 1
    df['c_date_of_salary_payment'] = df['c_date_of_salary_payment'].fillna(31)
    df.loc[df['postal_code'] < 10000, 'postal_code'] = np.nan
    df['1gen'] = df['r_generalcode1']


    #spouse
    df.loc[df['1gen'] == 2, '1gen'] = 0
    df['r_spouse_income'] = df['1gen'].fillna(0)*df['r_spouse_income'].fillna(0)
    df.drop(columns=['1gen'], inplace=True)

    # Combine year/month features safely
    if {'living_period_month','living_period_year'}.issubset(df.columns):
        df['living_period_year'] = (df['living_period_month'].fillna(0) + df['living_period_year'].fillna(0) * 12)/12
        df.drop(columns=['living_period_month'], inplace=True)

    if {'c_number_of_working_month','c_number_of_working_year'}.issubset(df.columns):
        df['c_number_of_working_month'] = df['c_number_of_working_month'].fillna(0) + df['c_number_of_working_year'].fillna(0) * 12
        df.drop(columns=['c_number_of_working_year'], inplace=True)

    # Convert birth date to age
    if 'date_of_birth' in df.columns:
        df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = 2025 - df['date_of_birth'].dt.year
        df.drop(columns=['date_of_birth'], inplace=True)

    # Ensure numeric columns are numeric
    num_cols = ['living_period_year','c_date_of_salary_payment', 'all_income',
                'c_monthly_salary','c_number_of_working_month','r_expected_credit_limit',
                'r_allloan_amount','r_additional_income','age','r_spouse_income']

    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')
    num_cols = [c for c in num_cols if c in df.columns]
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())


    # Fill categorical columns
    cat_cols = ['gender','tel_category','type_of_residence','postal_code',
                'c_business_type','c_position','c_occupation','c_employment_status','r_generalcode1',
                'c_salary_payment_methods','r_propose','r_generalcode2',
                'r_generalcode3','apply']
    df.replace(["NaN", "nan", "None", ""], np.nan, inplace=True)
    df.loc[df['postal_code'] < 10000, 'postal_code'] = np.nan
    df.loc[df['r_generalcode3'] < 1, 'r_generalcode3'] = np.nan
    
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna('Unknown')
    
    # for c in cat_cols:
    #     if c in df.columns:
    #         df[c] = df[c].astype('category')
    #         print(c)
    #         print(df[c].cat.categories)
    for c in cat_features:
        df[c] = df[c].astype(str)
            
    print(df.head(8))
    print(df.shape)
    print(df.isna().sum().sum())
    df.to_csv(cleanPath, index=False)
    return df

In [6]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)


             ID gender postal_code tel_category  number_of_resident  \
0  202412000001     F2     10400.0            3                   2   
1  202412000002      M     10500.0            3                   3   
2  202412000003     F2     10170.0            3                   6   
3  202412000004      M     10500.0            3                   2   
4  202412000005     F2     10120.0            3                   2   
5  202412000006     F2     10400.0            3                   2   
6  202412000007     F2     10120.0            3                   8   
7  202412000008     F2     10120.0            3                   4   

   living_period_year type_of_residence c_business_type c_position  \
0            5.000000                 6               7          5   
1            4.000000                 6               7          5   
2            5.166667                 3               7          4   
3            4.250000                 5               3          5   
4         

## Testing

In [None]:
threshold = trainCatboost(VERSION, train_df, cat_features)
testCatboost(VERSION, test_df, ids, threshold)

[I 2025-10-31 22:20:30,253] A new study created in memory with name: no-name-9c833956-0a0a-4aee-b286-ebb3252105aa


No GPU detected -> using CPU training.
Running hyperparameter tuning with Optuna...


  0%|          | 0/10 [00:00<?, ?it/s]Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 0. Best value: 0.618256:  10%|█         | 1/10 [01:10<10:33, 70.44s/it]

[I 2025-10-31 22:21:40,689] Trial 0 finished with value: 0.6182561959168966 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.5459633397433823, 'iterations': 777, 'simple_ctr': 'Buckets', 'depth': 7, 'learning_rate': 0.08397839971063478, 'l2_leaf_reg': 0.1743538925138322, 'random_strength': 2.226901097449556, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 6, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 54}. Best is trial 0 with value: 0.6182561959168966.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 1. Best value: 0.621:  20%|██        | 2/10 [02:28<10:00, 75.04s/it]   

[I 2025-10-31 22:22:58,943] Trial 1 finished with value: 0.6210001830673295 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.9824557384096, 'iterations': 978, 'simple_ctr': 'BinarizedTargetMeanValue', 'depth': 3, 'learning_rate': 0.0584762481240694, 'l2_leaf_reg': 2.5611991841050474, 'random_strength': 1.997677522767386, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 4, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 68}. Best is trial 1 with value: 0.6210001830673295.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 2. Best value: 0.625358:  30%|███       | 3/10 [03:43<08:45, 75.05s/it]

[I 2025-10-31 22:24:14,004] Trial 2 finished with value: 0.625358464373518 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.6711898397865865, 'iterations': 622, 'simple_ctr': 'Buckets', 'depth': 6, 'learning_rate': 0.05075165327681962, 'l2_leaf_reg': 0.03149266794850976, 'random_strength': 0.17270770750086112, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 7, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 49}. Best is trial 2 with value: 0.625358464373518.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 2. Best value: 0.625358:  40%|████      | 4/10 [05:28<08:40, 86.69s/it]

[I 2025-10-31 22:25:58,541] Trial 3 finished with value: 0.607692025332588 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.9511791645268823, 'iterations': 673, 'simple_ctr': 'Counter', 'depth': 3, 'learning_rate': 0.024352210391897854, 'l2_leaf_reg': 2.9613531632300796, 'random_strength': 0.12426203728338577, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 2, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 77}. Best is trial 2 with value: 0.625358464373518.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 2. Best value: 0.625358:  50%|█████     | 5/10 [08:53<10:47, 129.46s/it]

[I 2025-10-31 22:29:23,828] Trial 4 finished with value: 0.6211253223890972 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.7411137522992517, 'iterations': 592, 'simple_ctr': 'Buckets', 'depth': 8, 'learning_rate': 0.010996580800985251, 'l2_leaf_reg': 0.03483580882637811, 'random_strength': 1.5460845709998845, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 9, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 74}. Best is trial 2 with value: 0.625358464373518.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Best trial: 2. Best value: 0.625358:  60%|██████    | 6/10 [12:30<10:36, 159.13s/it]

[I 2025-10-31 22:33:00,554] Trial 5 finished with value: 0.6184160080060255 and parameters: {'bootstrap_type': 'Bernoulli', 'subsample': 0.9857099010353176, 'iterations': 695, 'simple_ctr': 'BinarizedTargetMeanValue', 'depth': 5, 'learning_rate': 0.03429550572600413, 'l2_leaf_reg': 0.473040134512008, 'random_strength': 2.471132953387456, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 3, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 87}. Best is trial 2 with value: 0.625358464373518.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


In [None]:
# testCatboost(VERSION, test_df, ids, threshold, cat_features)

NameError: name 'threshold' is not defined

In [None]:
print(len(test_df))
print(len(ids))
print(len(train_df))

8619
8619
32524
