In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [18]:
import sys
import os
# Put the directory two levels above the current working directory on sys.path
# (presumably the project root that holds `fileDir` and `modules` - confirm),
# guarding against duplicate entries when this cell is re-run.
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
from modules.v1.catboostModel import trainCatboost, testCatboost
from modules.v1.xgboostModel import trainXgboost, testXgboost
from modules.v1.lightgbmModel import trainLGBM, testLGBM
from modules.v1.stackModel import trainTestStack

In [3]:
# Experiment version: selects the cleaned-data paths below and is passed to
# every train*/test* helper call later in the notebook.
VERSION = 6
# Input CSVs read with pd.read_csv before cleaning.
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
# Destinations that cleanDataset writes the cleaned frames to.
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

In [4]:
# Quick look at the column names of the training file before any cleaning.
# NOTE: `df` is only used for this inspection; the cleaning cell re-reads the file.
df = pd.read_csv(TRAIN_PATH)
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

## Clean Datasets

In [5]:
# Columns treated as categorical. cleanDataset casts to `str` whichever of
# these are still present after its column drops; names that it drops
# (e.g. "Area", "Province", "media") are simply ignored there.
cat_features = [
    "Area",
    "Province",
    "gender",
    "marital_status",
    "tel_category",
    "type_of_residence",
    "c_postal_code",
    "c_business_type",
    "c_position",
    "c_occupation",
    "c_employment_status",
    "c_salary_payment_methods",
    "media",
    "place_for_sending_information",
    "r_propose",
    "r_generalcode1",
    "r_generalcode2",
    "r_generalcode3",
    "r_generalcode4",
    "r_generalcode5",
    "apply"
]



In [6]:
def cleanDataset(df, cleanPath, catFeatures=None, referenceYear=2025):
    """Clean a raw application DataFrame, save it as CSV and return it.

    Steps, in order:
      1. drop the columns listed in ``drop_cols``;
      2. fold ``*_year`` / ``*_month`` duration pairs into a single month count;
      3. build ``income`` (salary + additional + spouse income) and
         ``networth`` (``income`` minus ``r_allloan_amount``);
      4. derive ``age`` from ``date_of_birth``;
      5. blank out (NaN) invalid ``r_generalcode1`` and ``postal_code`` values;
      6. coerce the numeric columns to numbers and fill gaps with the median;
      7. print a 3-row preview and write the frame to ``cleanPath``;
      8. cast the categorical columns to ``str`` (after the CSV is written, so
         the file keeps real missing values instead of the string ``"nan"``).

    Each step is skipped when the columns it needs are absent, so the function
    also works on frames that carry only a subset of the raw columns.

    Args:
        df: Raw DataFrame. It is not modified; all work happens on the new
            frame returned by the initial ``drop``.
        cleanPath: Path the cleaned CSV is written to (``index=False``).
        catFeatures: Names of the columns to cast to ``str``. Defaults to the
            notebook-level ``cat_features`` list.
        referenceYear: Year that the birth year is subtracted from to get
            ``age``. Defaults to 2025.

    Returns:
        The cleaned DataFrame (with categorical columns cast to ``str``).

    Note:
        Medians are computed from ``df`` itself, so separate calls (e.g. one for
        train and one for test) impute with their own statistics.
    """
    if catFeatures is None:
        # Fall back to the list defined in the notebook's configuration cell.
        catFeatures = cat_features

    # Drop unnecessary columns (ID is kept). `drop` returns a new frame, so the
    # caller's DataFrame is left untouched by everything below.
    drop_cols = ['pms_i_ymd', 'Area', 'Province', 'Shop Name', 'date_of_birth_week',
                 'c_postal_code', 'c_date_of_salary_payment', 'media',
                 'place_for_sending_information', 'r_generalcode4', 'r_generalcode5',
                 'r_allloan_case']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Combine year/month durations into a total number of months.
    duration_pairs = (('living_period_month', 'living_period_year'),
                      ('c_number_of_working_month', 'c_number_of_working_year'))
    for month_col, year_col in duration_pairs:
        if {month_col, year_col}.issubset(df.columns):
            df[month_col] = df[month_col].fillna(0) + df[year_col].fillna(0) * 12
            df = df.drop(columns=[year_col])

    # Combine income and calculate net worth.
    # Missing side incomes / loan amounts count as zero; a missing salary is
    # replaced by the median salary of this frame.
    zero_fill_cols = [c for c in ('r_additional_income', 'r_spouse_income', 'r_allloan_amount')
                      if c in df.columns]
    if zero_fill_cols:
        df[zero_fill_cols] = df[zero_fill_cols].fillna(0)
    if 'c_monthly_salary' in df.columns:
        df['c_monthly_salary'] = df['c_monthly_salary'].fillna(df['c_monthly_salary'].median())

    income_parts = ['c_monthly_salary', 'r_additional_income', 'r_spouse_income']
    if set(income_parts).issubset(df.columns):
        df['income'] = df[income_parts[0]] + df[income_parts[1]] + df[income_parts[2]]
        df = df.drop(columns=income_parts)

    # The loan column is only removed once it has been folded into networth.
    if {'income', 'r_allloan_amount'}.issubset(df.columns):
        df['networth'] = df['income'] - df['r_allloan_amount']
        df = df.drop(columns=['r_allloan_amount'])

    # Convert birth date to age (unparseable dates become NaN and are
    # median-filled with the other numeric columns below).
    if 'date_of_birth' in df.columns:
        birth_dates = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = referenceYear - birth_dates.dt.year
        df = df.drop(columns=['date_of_birth'])

    # Cut incorrect data
    # 1. Keep only valid r_generalcode1 values (1-2); anything else becomes NaN.
    if 'r_generalcode1' in df.columns:
        df['r_generalcode1'] = df['r_generalcode1'].where(df['r_generalcode1'].between(1, 2), np.nan)

    # 2. Keep only 5-digit postal codes (10000-99999); anything else becomes NaN.
    #    The check is numeric: comparing the string length breaks on float
    #    columns, where 10400.0 stringifies to "10400.0" (7 characters).
    if 'postal_code' in df.columns:
        postal = pd.to_numeric(df['postal_code'], errors='coerce')
        df['postal_code'] = postal.where(postal.between(10000, 99999))

    # Ensure numeric columns are numeric, then median-impute what is missing.
    num_cols = ['number_of_children', 'number_of_resident', 'living_period_month',
                'c_number_of_employee', 'c_number_of_working_month',
                'r_expected_credit_limit', 'age', 'networth']
    num_cols = [c for c in num_cols if c in df.columns]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    if num_cols:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Categorical columns: missing values are deliberately left missing.
    # fillna(np.nan) does not impute anything (NaN stays NaN); the call is
    # retained so the function's behavior on these columns is unchanged.
    cat_cols = ['gender', 'marital_status', 'postal_code', 'tel_category', 'type_of_residence',
                'c_business_type', 'c_position', 'c_occupation', 'c_employment_status',
                'c_salary_payment_methods', 'r_propose', 'r_generalcode1', 'r_generalcode2',
                'r_generalcode3', 'r_generalcode5', 'apply']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna(np.nan)

    print(df.head(3))
    df.to_csv(cleanPath, index=False)

    # All categorical features -> string, restricted to the columns that still
    # exist. Done after the CSV export on purpose (see docstring).
    existing_cat_features = [c for c in catFeatures if c in df.columns]
    if existing_cat_features:
        df[existing_cat_features] = df[existing_cat_features].astype(str)

    return df

In [7]:
# Load the raw train/test files.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
# Keep the test IDs aside; they are handed to the test*/stack helpers later.
ids = test_df["ID"].copy()

# Clean both sets and write the cleaned CSVs. Each call computes its own
# medians, so train and test are imputed independently of each other.
train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

             ID gender  marital_status  number_of_children  postal_code  \
0  2.024120e+11     F2               1                   2      10400.0   
1  2.024120e+11      M               1                   0      10500.0   
2  2.024120e+11     F2               1                   0      10170.0   

   tel_category  number_of_resident  living_period_month  type_of_residence  \
0             3                   2                   60                  6   
1             3                   3                   48                  6   
2             3                   6                   62                  3   

   c_business_type  ...  r_expected_credit_limit  r_propose  r_generalcode1  \
0                7  ...                  40000.0        5.0             NaN   
1                7  ...                  20000.0        6.0             NaN   
2                7  ...                  20000.0        5.0             NaN   

   r_generalcode2  r_generalcode3  apply  default_12month   incom

## Testing

In [8]:
# Train XGBoost. NOTE: `threshold` is a shared name that every later training
# cell overwrites, so the XGBoost prediction cell must run right after this one.
threshold, xgb_proba_train = trainXgboost(VERSION, train_df)

[I 2025-11-01 01:51:04,126] A new study created in memory with name: no-name-bbb66d7a-7ee1-43a8-be2f-38e7f4d765db


GPU detected -> using XGBoost GPU training (gpu_hist).


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-01 01:51:06,308] Trial 0 finished with value: 0.6032429280827304 and parameters: {'max_depth': 5, 'learning_rate': 0.2536999076681772, 'subsample': 0.8659969709057025, 'colsample_bytree': 0.7993292420985183, 'gamma': 0.7800932022121826, 'min_child_weight': 2.403950683025824, 'lambda': 0.0017073967431528124, 'alpha': 2.9154431891537547}. Best is trial 0 with value: 0.6032429280827304.
[I 2025-11-01 01:51:08,712] Trial 1 finished with value: 0.5974628779774571 and parameters: {'max_depth': 7, 'learning_rate': 0.11114989443094977, 'subsample': 0.5102922471479012, 'colsample_bytree': 0.9849549260809971, 'gamma': 4.162213204002109, 'min_child_weight': 2.9110519961044856, 'lambda': 0.005337032762603957, 'alpha': 0.00541524411940254}. Best is trial 0 with value: 0.6032429280827304.
[I 2025-11-01 01:51:11,433] Trial 2 finished with value: 0.6203499224028894 and parameters: {'max_depth': 5, 'learning_rate': 0.05958389350068958, 'subsample': 0.7159725093210578, 'colsample_bytree': 0.6

Parameters: { "loss_function", "random_seed" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# Predict with XGBoost using whatever `threshold` currently holds - this is the
# XGBoost threshold only if no other training cell has run in between.
xgb_proba_test = testXgboost(VERSION, test_df, ids, threshold)


Predictions saved to: ../../prediction/prediction_xgboost_v6.csv
Training + prediction complete.


In [10]:
# Train CatBoost; this overwrites `threshold` from the XGBoost cell.
threshold, cat_proba_train = trainCatboost(VERSION, train_df)
cat_proba_train

[I 2025-11-01 01:55:12,721] A new study created in memory with name: no-name-b9d8412f-cee3-4d0d-9415-93b0a656fd41


GPU detected -> using CatBoost GPU training.
Running hyperparameter tuning with Optuna...


  0%|          | 0/10 [00:00<?, ?it/s]

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:55:38,703] Trial 0 finished with value: 0.6223818238827664 and parameters: {'bagging_temperature': 0.35847250289619137, 'iterations': 869, 'simple_ctr': 'Borders', 'depth': 3, 'learning_rate': 0.022617223474868456, 'l2_leaf_reg': 3.561369315139788, 'random_strength': 1.485579941451387, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 2, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 85}. Best is trial 0 with value: 0.6223818238827664.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:56:06,344] Trial 1 finished with value: 0.6197507683823634 and parameters: {'bagging_temperature': 0.9913773456280232, 'iterations': 446, 'simple_ctr': 'Buckets', 'depth': 7, 'learning_rate': 0.011699593729391434, 'l2_leaf_reg': 0.0265220739940866, 'random_strength': 3.274922652375787, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 7, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 81}. Best is trial 0 with value: 0.6223818238827664.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:56:19,836] Trial 2 finished with value: 0.618388812858771 and parameters: {'bagging_temperature': 0.3014209888234407, 'iterations': 525, 'simple_ctr': 'Borders', 'depth': 4, 'learning_rate': 0.08688246685234789, 'l2_leaf_reg': 0.011127564600775734, 'random_strength': 2.6528866253440837, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 2, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 89}. Best is trial 0 with value: 0.6223818238827664.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:56:29,844] Trial 3 finished with value: 0.6141191630177796 and parameters: {'bagging_temperature': 0.024047241999834434, 'iterations': 986, 'simple_ctr': 'Borders', 'depth': 8, 'learning_rate': 0.042471018931373554, 'l2_leaf_reg': 0.02187429513121552, 'random_strength': 0.351318238144981, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 5, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 79}. Best is trial 0 with value: 0.6223818238827664.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:57:14,306] Trial 4 finished with value: 0.6245749985269285 and parameters: {'bagging_temperature': 0.9419586751051815, 'iterations': 835, 'simple_ctr': 'Borders', 'depth': 7, 'learning_rate': 0.0221478209930868, 'l2_leaf_reg': 0.40744575115746434, 'random_strength': 1.4971444168105337, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 10, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 45}. Best is trial 4 with value: 0.6245749985269285.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:57:30,482] Trial 5 finished with value: 0.6229264048914507 and parameters: {'bagging_temperature': 0.6222202983838984, 'iterations': 657, 'simple_ctr': 'Buckets', 'depth': 8, 'learning_rate': 0.023183259035330597, 'l2_leaf_reg': 0.029428414496819665, 'random_strength': 0.9171963562370143, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 3, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 41}. Best is trial 4 with value: 0.6245749985269285.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:57:55,149] Trial 6 finished with value: 0.6095139403256028 and parameters: {'bagging_temperature': 0.21226541434107016, 'iterations': 644, 'simple_ctr': 'Buckets', 'depth': 3, 'learning_rate': 0.010747746794565294, 'l2_leaf_reg': 2.5977802058303903, 'random_strength': 0.36403610603954173, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 6, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 63}. Best is trial 4 with value: 0.6245749985269285.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:58:15,146] Trial 7 finished with value: 0.6219647126006347 and parameters: {'bagging_temperature': 0.8250366234690918, 'iterations': 910, 'simple_ctr': 'Buckets', 'depth': 5, 'learning_rate': 0.06635326893773477, 'l2_leaf_reg': 8.361334117854263, 'random_strength': 0.2541449893305224, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 7, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 79}. Best is trial 4 with value: 0.6245749985269285.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:58:30,611] Trial 8 finished with value: 0.6254400826316892 and parameters: {'bagging_temperature': 0.8560052058039547, 'iterations': 591, 'simple_ctr': 'Buckets', 'depth': 4, 'learning_rate': 0.02503440517117324, 'l2_leaf_reg': 7.574158474399039, 'random_strength': 0.15077006735730086, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 6, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 31}. Best is trial 8 with value: 0.6254400826316892.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:59:13,961] Trial 9 finished with value: 0.6224936046767533 and parameters: {'bagging_temperature': 0.45507495858288616, 'iterations': 579, 'simple_ctr': 'Buckets', 'depth': 4, 'learning_rate': 0.02567691194059127, 'l2_leaf_reg': 0.2855086766347162, 'random_strength': 0.9570231143027366, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 2, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 75}. Best is trial 8 with value: 0.6254400826316892.

Best Parameters Found: {'bagging_temperature': 0.8560052058039547, 'iterations': 591, 'simple_ctr': 'Buckets', 'depth': 4, 'learning_rate': 0.02503440517117324, 'l2_leaf_reg': 7.574158474399039, 'random_strength': 0.15077006735730086, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 6, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncT

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5822333	best: 0.5822333 (0)	total: 17.7ms	remaining: 10.4s
200:	test: 0.6231912	best: 0.6234225 (195)	total: 2.5s	remaining: 4.84s
400:	test: 0.6233715	best: 0.6241901 (300)	total: 5.07s	remaining: 2.4s
590:	test: 0.6225885	best: 0.6241901 (300)	total: 7.64s	remaining: 0us
bestTest = 0.6241900921
bestIteration = 300
Shrink model to first 301 iterations.

Optimal Threshold: 0.482

Confusion Matrix:
 [[3040 2623]
 [ 293  549]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.54      0.68      5663
           1       0.17      0.65      0.27       842

    accuracy                           0.55      6505
   macro avg       0.54      0.59      0.47      6505
weighted avg       0.82      0.55      0.62      6505

ROC-AUC Score: 0.62419011099679

CatBoost model training complete and saved!


In [11]:
# Predict with CatBoost using the current `threshold` (the CatBoost one when the
# cells are run top to bottom).
cat_proba_test = testCatboost(VERSION, test_df, ids, threshold)
cat_proba_test


Predictions saved to: ../../prediction/prediction_catboost_v6.csv
Training + prediction complete.


array([0.57509467, 0.43165674, 0.50187301, ..., 0.55052708, 0.45773104,
       0.48276046])

In [12]:
# Train LightGBM; this overwrites `threshold` again.
# NOTE(review): the meaning of the third positional argument (True) is defined
# in modules/v1/lightgbmModel and is not visible here - confirm.
threshold, lgbm_proba_train = trainLGBM(VERSION, train_df, True)
lgbm_proba_train

[I 2025-11-01 01:59:22,386] A new study created in memory with name: no-name-be55df37-e2f9-4c11-a3f6-0e41002f13a8


  0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 3370, number of negative: 22649
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2336
[LightGBM] [Info] Number of data points in the train set: 26019, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.129521 -> initscore=-1.905203
[LightGBM] [Info] Start training from score -1.905203
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's auc: 0.614993	valid_0's binary_logloss: 0.376163
[LightGBM] [Info] Number of positive: 3370, number of negative: 22649
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2356
[LightGBM] [Inf

array([0.07604924, 0.18283581, 0.06565587, ..., 0.19623214, 0.20360685,
       0.12831925])

In [13]:
# Predict with LightGBM using the current `threshold` (the LightGBM one when the
# cells are run top to bottom).
lgbm_proba_test = testLGBM(VERSION, test_df, ids, threshold)
lgbm_proba_test


Predictions saved to: ../../prediction/prediction_lightgbm_v6.csv


array([0.15935866, 0.11290956, 0.1334572 , ..., 0.16000949, 0.11898981,
       0.14442508])

In [15]:
# Per-model probability arrays for stacking, in the same order in both lists:
# XGBoost, CatBoost, LightGBM.
p_train = [xgb_proba_train, cat_proba_train, lgbm_proba_train]
p_test = [xgb_proba_test, cat_proba_test, lgbm_proba_test]

In [19]:


# NOTE(review): the recorded run of this cell fails inside roc_auc_score with
# "inconsistent numbers of samples: [32524, 6505]". Presumably the arrays in
# `p_train` do not cover the same rows as the labels used by trainTestStack
# (full training set vs. a validation split) - confirm in modules/v1 and make
# the train* helpers return probabilities for a consistent set of rows.
trainTestStack(VERSION, ids, train_df, p_train, p_test)

[I 2025-11-01 02:05:13,996] A new study created in memory with name: no-name-c6b9ea75-6673-4015-aa02-6d8c3f55d321
[W 2025-11-01 02:05:14,000] Trial 0 failed with parameters: {'w_xgb': 0.9994632353950159, 'w_cat': 0.2978523439720645, 'w_lgbm': 0.9612665261726747} because of the following error: ValueError('Found input variables with inconsistent numbers of samples: [32524, 6505]').
Traceback (most recent call last):
  File "c:\Users\Toa\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "d:\CU\Aihack_aikokkak\modules\v1\stackModel.py", line 170, in objective
    w = study.best_params
               ^^^^^^^^^^
  File "c:\Users\Toa\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\Toa\anaconda3\Lib\site-packages\sklearn\metrics\_ranking.py", line 641, in roc_auc_score
    return _average_binary_score(
        partial(_binary_roc_auc_sc

ValueError: Found input variables with inconsistent numbers of samples: [32524, 6505]