In [113]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [114]:
import sys
import os
# Put the directory two levels above the current working directory on
# sys.path (once) so the project-local imports that follow can be resolved.
root_path = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
from modules.v1.catboostModel import trainCatboost, testCatboost
from modules.v1.xgboostModel import trainXgboost, testXgboost

In [115]:
# Version tag: selects the versioned cleaned-data paths below and is also
# passed to the training / prediction helpers later in the notebook.
VERSION = 5

_SPLITS = ("train", "test")

# Raw CSVs that are read with pd.read_csv.
TRAIN_PATH, TEST_PATH = (getDataDir(split) for split in _SPLITS)
# Versioned destinations the cleaning functions write their output to.
CLEANED_TRAIN_PATH, CLEANED_TEST_PATH = (getDataDir(split, VERSION) for split in _SPLITS)

In [116]:
# Load the raw training CSV once just to inspect which columns are available.
df = pd.read_csv(TRAIN_PATH)
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

## Clean Datasets

In [117]:
# Columns that cleanDataset / cleanDataset2 cast to `str` as their last step.
# NOTE(review): the name suggests these are what the models treat as
# categorical features, but that wiring lives in the project modules — confirm.
cat_features = [
    # raw categorical / coded columns
    "gender",
    "c_business_type",
    "c_position",
    "c_occupation",
    "c_employment_status",
    "c_monthly_salary",
    "r_expected_credit_limit",
    "r_additional_income",
    "r_generalcode3",
    "apply",
    # engineered columns created during cleaning
    "age",
    "months_at_residence",
    "months_at_job",
    "total_income",
]


In [118]:
def cleanDataset(df, cleanPath):
    """Clean the raw *training* frame, write it to ``cleanPath`` and return it.

    Pipeline, in order:

    1. Parse ``pms_i_ymd`` / ``date_of_birth`` and derive ``age``,
       ``months_at_residence``, ``months_at_job`` and ``total_income``.
    2. Drop identifier and unused raw columns, then every column that has
       fewer than 60% non-null values, then three further named columns.
    3. Impute int64/float64 columns with the median and object/category
       columns with the most frequent value.
    4. Label-encode a first group of columns, including the target
       ``default_12month``.
    5. For each numeric feature (target excluded), keep only rows inside
       ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``; the filters are applied one column
       after another, so later bounds are computed on already-filtered rows.
    6. Bin six numeric features into terciles and label-encode them together
       with the remaining categorical columns.
    7. Cast every column listed in the notebook-level ``cat_features`` to str.

    All imputers, encoders and quantile bins are fitted on the frame passed
    in. NOTE(review): the test frame is cleaned separately by
    ``cleanDataset2``, so codes and bin edges are not guaranteed to line up
    between the two frames — confirm this is acceptable for the models.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data; must contain every column referenced below,
        including ``default_12month``. The caller's frame is not modified.
    cleanPath : str or path-like
        Destination handed to ``DataFrame.to_csv`` (written without index).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to ``cleanPath``.

    Raises
    ------
    KeyError
        If an expected column is missing from ``df``.
    """
    # Kept local so the function does not rely on other cells for these two.
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import LabelEncoder

    target_col = "default_12month"
    raw_cols_to_drop = [
        "ID", "pms_i_ymd", "Area", "Province", "Shop Name", "date_of_birth_week",
        "date_of_birth", "marital_status", "number_of_children", "postal_code",
        "tel_category", "number_of_resident", "living_period_year",
        "living_period_month", "c_number_of_working_year", "c_postal_code",
        "c_number_of_working_month", "c_salary_payment_methods",
        "c_date_of_salary_payment", "media", "place_for_sending_information",
        "r_propose", "r_allloan_case", "r_allloan_amount", "r_generalcode4",
        "r_generalcode5",
    ]
    extra_cols_to_drop = ["r_spouse_income", "c_number_of_employee", "type_of_residence"]
    first_encoded_cols = ["gender", "apply", "r_generalcode3", "r_additional_income", target_col]
    tercile_cols = [
        "c_monthly_salary", "r_expected_credit_limit", "age",
        "months_at_residence", "months_at_job", "total_income",
    ]
    second_encoded_cols = [
        "c_employment_status", "c_occupation", "c_position", "c_business_type",
    ] + tercile_cols

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # --- feature engineering -------------------------------------------------
    df["pms_i_ymd"] = pd.to_datetime(df["pms_i_ymd"], errors="coerce")
    df["date_of_birth"] = pd.to_datetime(df["date_of_birth"], errors="coerce")

    # errors="coerce" turns unparseable dates into NaT, which makes the age NaN
    # and would crash the integer cast; fill with the mean age first (same
    # policy as cleanDataset2). The explicit int64 keeps the column inside the
    # int64/float64 dtype selection below whatever the platform's default int is.
    age_years = (df["pms_i_ymd"] - df["date_of_birth"]).dt.days / 365
    df["age"] = age_years.fillna(age_years.mean()).astype("int64")

    df["months_at_residence"] = df["living_period_year"] * 12 + df["living_period_month"]
    df["months_at_job"] = df["c_number_of_working_year"] * 12 + df["c_number_of_working_month"]
    df["total_income"] = df["c_monthly_salary"] + df["r_additional_income"]

    # --- column pruning ------------------------------------------------------
    df = df.drop(columns=raw_cols_to_drop)
    # Keep only columns with at least 60% non-null values.
    df = df.dropna(axis=1, thresh=len(df) * 0.6)
    df = df.drop(columns=extra_cols_to_drop)

    # --- imputation ----------------------------------------------------------
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns

    df[numeric_cols] = SimpleImputer(strategy="median").fit_transform(df[numeric_cols])
    df[categorical_cols] = SimpleImputer(strategy="most_frequent").fit_transform(df[categorical_cols])

    # --- first encoding pass (includes the target) ---------------------------
    df[first_encoded_cols] = df[first_encoded_cols].apply(LabelEncoder().fit_transform)

    # --- IQR outlier removal on numeric features -----------------------------
    # numeric_cols was computed before the encoding above; the target is
    # skipped so that rows are never dropped based on the label.
    outlier_cols = [col for col in numeric_cols if col != target_col]
    for col in outlier_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    # Detach from the filtered parent so the assignments below act on an
    # independent frame.
    df = df.copy()

    # --- tercile binning + second encoding pass ------------------------------
    for col in tercile_cols:
        df[col] = pd.qcut(df[col], q=3, labels=["low", "medium", "high"])

    # LabelEncoder assigns codes in sorted label order, so the tercile labels
    # become high=0, low=1, medium=2; they are cast to str right after.
    df[second_encoded_cols] = df[second_encoded_cols].apply(LabelEncoder().fit_transform)

    # --- final dtype cast + persist ------------------------------------------
    for col in cat_features:
        df[col] = df[col].astype("str")

    df.to_csv(cleanPath, index=False)
    return df

In [119]:
def cleanDataset2(df, cleanPath):
    """Clean the raw *test* frame, write it to ``cleanPath`` and return it.

    Same pipeline as ``cleanDataset`` with two differences: the target column
    ``default_12month`` is never touched, and no rows are removed (there is no
    outlier filter), so the output has one row per input row.

    Pipeline, in order:

    1. Parse ``pms_i_ymd`` / ``date_of_birth`` and derive ``age`` (missing
       ages are filled with the mean age), ``months_at_residence``,
       ``months_at_job`` and ``total_income``.
    2. Drop identifier and unused raw columns, then every column that has
       fewer than 60% non-null values, then three further named columns.
    3. Impute int64/float64 columns with the median and object/category
       columns with the most frequent value.
    4. Label-encode a first group of columns.
    5. Bin six numeric features into terciles and label-encode them together
       with the remaining categorical columns.
    6. Cast every column listed in the notebook-level ``cat_features`` to str.

    All imputers, encoders and quantile bins are fitted on the frame passed
    in. NOTE(review): nothing fitted on the training frame is reused here, so
    codes and bin edges are not guaranteed to match the training data —
    confirm this is acceptable for the models.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test data; must contain every column referenced below. The
        caller's frame is not modified.
    cleanPath : str or path-like
        Destination handed to ``DataFrame.to_csv`` (written without index).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to ``cleanPath``.

    Raises
    ------
    KeyError
        If an expected column is missing from ``df``.
    """
    # Kept local so the function does not rely on other cells for these two.
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import LabelEncoder

    raw_cols_to_drop = [
        "ID", "pms_i_ymd", "Area", "Province", "Shop Name", "date_of_birth_week",
        "date_of_birth", "marital_status", "number_of_children", "postal_code",
        "tel_category", "number_of_resident", "living_period_year",
        "living_period_month", "c_number_of_working_year", "c_postal_code",
        "c_number_of_working_month", "c_salary_payment_methods",
        "c_date_of_salary_payment", "media", "place_for_sending_information",
        "r_propose", "r_allloan_case", "r_allloan_amount", "r_generalcode4",
        "r_generalcode5",
    ]
    extra_cols_to_drop = ["r_spouse_income", "c_number_of_employee", "type_of_residence"]
    first_encoded_cols = ["gender", "apply", "r_generalcode3", "r_additional_income"]
    tercile_cols = [
        "c_monthly_salary", "r_expected_credit_limit", "age",
        "months_at_residence", "months_at_job", "total_income",
    ]
    second_encoded_cols = [
        "c_employment_status", "c_occupation", "c_position", "c_business_type",
    ] + tercile_cols

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # --- feature engineering -------------------------------------------------
    df["pms_i_ymd"] = pd.to_datetime(df["pms_i_ymd"], errors="coerce")
    df["date_of_birth"] = pd.to_datetime(df["date_of_birth"], errors="coerce")

    # Fill missing ages with the mean by plain assignment. The previous
    # `df["age"].fillna(..., inplace=True)` was a chained in-place call that
    # pandas warns will stop updating `df` in 3.0, which would leave NaN in
    # the column and break the integer cast. The explicit int64 keeps the
    # column inside the int64/float64 dtype selection below whatever the
    # platform's default int is.
    age_years = (df["pms_i_ymd"] - df["date_of_birth"]).dt.days / 365
    df["age"] = age_years.fillna(age_years.mean()).astype("int64")

    df["months_at_residence"] = df["living_period_year"] * 12 + df["living_period_month"]
    df["months_at_job"] = df["c_number_of_working_year"] * 12 + df["c_number_of_working_month"]
    df["total_income"] = df["c_monthly_salary"] + df["r_additional_income"]

    # --- column pruning ------------------------------------------------------
    df = df.drop(columns=raw_cols_to_drop)
    # Keep only columns with at least 60% non-null values.
    df = df.dropna(axis=1, thresh=len(df) * 0.6)
    df = df.drop(columns=extra_cols_to_drop)

    # --- imputation ----------------------------------------------------------
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns

    df[numeric_cols] = SimpleImputer(strategy="median").fit_transform(df[numeric_cols])
    df[categorical_cols] = SimpleImputer(strategy="most_frequent").fit_transform(df[categorical_cols])

    # --- first encoding pass -------------------------------------------------
    df[first_encoded_cols] = df[first_encoded_cols].apply(LabelEncoder().fit_transform)

    # No outlier removal here: every input row must survive to the output.

    # --- tercile binning + second encoding pass ------------------------------
    for col in tercile_cols:
        df[col] = pd.qcut(df[col], q=3, labels=["low", "medium", "high"])

    # LabelEncoder assigns codes in sorted label order, so the tercile labels
    # become high=0, low=1, medium=2; they are cast to str right after.
    df[second_encoded_cols] = df[second_encoded_cols].apply(LabelEncoder().fit_transform)

    # --- final dtype cast + persist ------------------------------------------
    for col in cat_features:
        df[col] = df[col].astype("str")

    df.to_csv(cleanPath, index=False)
    return df

In [120]:
# Read both raw splits fresh from disk for the cleaning step.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
# Capture the test IDs now: the cleaning functions drop the "ID" column.
ids = test_df["ID"].copy()

# Train goes through cleanDataset (encodes the target and filters IQR outliers);
# test goes through cleanDataset2 (no target handling, no row filtering).
train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset2(test_df, CLEANED_TEST_PATH)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].mean(), inplace=True)


In [121]:
# Sanity check: columns that survive cleaning on the training frame.
train_df.columns

Index(['gender', 'c_business_type', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'r_expected_credit_limit',
       'r_additional_income', 'r_generalcode3', 'apply', 'default_12month',
       'age', 'months_at_residence', 'months_at_job', 'total_income'],
      dtype='object')

In [122]:
# Sanity check: row/column count of the cleaned test frame.
test_df.shape

(8619, 14)

## Train and Predict

In [123]:
# Train CatBoost on the cleaned training frame via the project helper.
# NOTE(review): the return value is presumably the tuned decision threshold
# (the run log prints "Optimal Threshold") — confirm in modules.v1.catboostModel.
threshold = trainCatboost(VERSION, train_df)

[I 2025-11-01 01:34:32,183] A new study created in memory with name: no-name-e2cc5c18-caec-4218-ad2a-63eeca3d1ff0


GPU detected -> using CatBoost GPU training.
Running hyperparameter tuning with Optuna...


  0%|          | 0/10 [00:00<?, ?it/s]

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:34:37,388] Trial 0 finished with value: 0.5428302738189434 and parameters: {'bagging_temperature': 0.4525298972747116, 'iterations': 786, 'simple_ctr': 'Buckets', 'depth': 5, 'learning_rate': 0.03860553079921787, 'l2_leaf_reg': 5.285096913092954, 'random_strength': 2.567186448492727, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 9, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 48}. Best is trial 0 with value: 0.5428302738189434.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:34:55,026] Trial 1 finished with value: 0.5651120763864559 and parameters: {'bagging_temperature': 0.5435287464098958, 'iterations': 792, 'simple_ctr': 'Buckets', 'depth': 5, 'learning_rate': 0.07383500613642323, 'l2_leaf_reg': 3.980181043132135, 'random_strength': 0.9599984961373049, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 10, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 56}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:35:03,985] Trial 2 finished with value: 0.5453351645772807 and parameters: {'bagging_temperature': 0.8012277434111841, 'iterations': 579, 'simple_ctr': 'Buckets', 'depth': 6, 'learning_rate': 0.014340386870869515, 'l2_leaf_reg': 0.05523783439324127, 'random_strength': 1.887797192700648, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 10, 'max_ctr_complexity': 2, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 32}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:35:34,222] Trial 3 finished with value: 0.5642740761254346 and parameters: {'bagging_temperature': 0.6231602968954744, 'iterations': 814, 'simple_ctr': 'Borders', 'depth': 6, 'learning_rate': 0.029719966770933692, 'l2_leaf_reg': 0.17934587034429433, 'random_strength': 2.4136804027285184, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 5, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 64}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:35:48,628] Trial 4 finished with value: 0.5504475964370071 and parameters: {'bagging_temperature': 0.724826003381539, 'iterations': 977, 'simple_ctr': 'Buckets', 'depth': 8, 'learning_rate': 0.062085304467391876, 'l2_leaf_reg': 0.10252394122703197, 'random_strength': 1.2873984456705172, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 4, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 82}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:36:07,764] Trial 5 finished with value: 0.5640866283911601 and parameters: {'bagging_temperature': 0.3563111888883721, 'iterations': 851, 'simple_ctr': 'Buckets', 'depth': 6, 'learning_rate': 0.016746594138907764, 'l2_leaf_reg': 0.011558829187935077, 'random_strength': 2.145716604830029, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 9, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 47}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:36:13,498] Trial 6 finished with value: 0.5580322860205885 and parameters: {'bagging_temperature': 0.317547599892267, 'iterations': 871, 'simple_ctr': 'Buckets', 'depth': 4, 'learning_rate': 0.06407385406515846, 'l2_leaf_reg': 0.024455114923640067, 'random_strength': 0.1568023769042231, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 1, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 3, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 51}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:36:16,611] Trial 7 finished with value: 0.5584569598797594 and parameters: {'bagging_temperature': 0.7341312419346472, 'iterations': 491, 'simple_ctr': 'Buckets', 'depth': 3, 'learning_rate': 0.09846552072632485, 'l2_leaf_reg': 0.5512864996804028, 'random_strength': 0.19327646527176276, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 2, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 58}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:36:24,024] Trial 8 finished with value: 0.5579781030174743 and parameters: {'bagging_temperature': 0.08392689965806921, 'iterations': 717, 'simple_ctr': 'Borders', 'depth': 5, 'learning_rate': 0.023168656134770756, 'l2_leaf_reg': 4.896160349094909, 'random_strength': 0.16889133196016365, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Newton', 'one_hot_max_size': 8, 'max_ctr_complexity': 1, 'auto_class_weights': 'Balanced', 'od_type': 'IncToDec', 'od_wait': 80}. Best is trial 1 with value: 0.5651120763864559.


Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.


[I 2025-11-01 01:36:29,867] Trial 9 finished with value: 0.5510036040069393 and parameters: {'bagging_temperature': 0.99676429407285, 'iterations': 564, 'simple_ctr': 'Buckets', 'depth': 8, 'learning_rate': 0.09407543948053992, 'l2_leaf_reg': 2.314742661898409, 'random_strength': 0.7792307662391913, 'grow_policy': 'Lossguide', 'leaf_estimation_iterations': 5, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 4, 'max_ctr_complexity': 1, 'auto_class_weights': None, 'od_type': 'IncToDec', 'od_wait': 34}. Best is trial 1 with value: 0.5651120763864559.

Best Parameters Found: {'bagging_temperature': 0.5435287464098958, 'iterations': 792, 'simple_ctr': 'Buckets', 'depth': 5, 'learning_rate': 0.07383500613642323, 'l2_leaf_reg': 3.980181043132135, 'random_strength': 0.9599984961373049, 'grow_policy': 'SymmetricTree', 'leaf_estimation_iterations': 4, 'leaf_estimation_method': 'Gradient', 'one_hot_max_size': 10, 'max_ctr_complexity': 3, 'auto_class_weights': None, 'od_type': 'IncToDec',

Change of simpleCtr will not affect combinations ctrs.
Change of simpleCtr will not affect combinations ctrs.
Exception in thread Thread-8 (start):
Traceback (most recent call last):
  File [35m"c:\Users\chins\AppData\Local\Programs\Python\Python313\Lib\threading.py"[0m, line [35m1043[0m, in [35m_bootstrap_inner[0m
    [31mself.run[0m[1;31m()[0m
    [31m~~~~~~~~[0m[1;31m^^[0m
  File [35m"C:\Users\chins\AppData\Roaming\Python\Python313\site-packages\ipykernel\ipkernel.py"[0m, line [35m788[0m, in [35mrun_closure[0m
    [31m_threading_Thread_run[0m[1;31m(self)[0m
    [31m~~~~~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^[0m
  File [35m"c:\Users\chins\AppData\Local\Programs\Python\Python313\Lib\threading.py"[0m, line [35m994[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"c:\Users\chins\AppData\Local\Programs\Python\Python313\Lib\site-packages\catboost\widg

0:	test: 0.5303386	best: 0.5303386 (0)	total: 21.7ms	remaining: 17.1s
200:	test: 0.5635578	best: 0.5640232 (165)	total: 2.74s	remaining: 8.07s
400:	test: 0.5658157	best: 0.5658157 (400)	total: 5.45s	remaining: 5.32s
600:	test: 0.5656327	best: 0.5662449 (425)	total: 8.21s	remaining: 2.61s
791:	test: 0.5657886	best: 0.5662449 (425)	total: 10.8s	remaining: 0us
bestTest = 0.5662449002
bestIteration = 425
Shrink model to first 426 iterations.

Optimal Threshold: 0.128

Confusion Matrix:
 [[ 707 1476]
 [  89  284]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.32      0.47      2183
           1       0.16      0.76      0.27       373

    accuracy                           0.39      2556
   macro avg       0.52      0.54      0.37      2556
weighted avg       0.78      0.39      0.44      2556

ROC-AUC Score: 0.5662332255461715

CatBoost model training complete and saved!


In [124]:
# Predict on the cleaned test frame, passing the saved IDs and the threshold
# from training. NOTE(review): the helper appears to write the prediction CSV
# itself (see the path printed in the output) — confirm in modules.v1.catboostModel.
testCatboost(VERSION, test_df, ids, threshold)


Predictions saved to: ../../prediction/prediction_catboost_v5.csv
Training + prediction complete.
