In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
from modules.v1.xgboostModel import trainXgboost, testXgboost

In [3]:
VERSION = 3
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

## Clean Datasets

In [None]:
def cleanDataset(df, cleanPath):
    # Drop unnecessary columns (keep ID)
    drop_cols = ['pms_i_ymd','Area','Province','Shop Name','date_of_birth_week','marital_status',
                 'c_postal_code','nummber_of_resident','c_number_of_employee', 'media','number_of_children',
                 'place_for_sending_information','r_generalcode4', 'r_generalcode5', 'r_allloan_case']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Combine year/month features safely
    if {'living_period_month','living_period_year'}.issubset(df.columns):
        df['living_period_year'] = (df['living_period_month'].fillna(0) + df['living_period_year'].fillna(0) * 12)/12
        df.drop(columns=['living_period_month'], inplace=True)

    if {'c_number_of_working_month','c_number_of_working_year'}.issubset(df.columns):
        df['c_number_of_working_month'] = df['c_number_of_working_month'].fillna(0) + df['c_number_of_working_year'].fillna(0) * 12
        df.drop(columns=['c_number_of_working_year'], inplace=True)

    # Convert birth date to age
    if 'date_of_birth' in df.columns:
        df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = 2025 - df['date_of_birth'].dt.year
        df.drop(columns=['date_of_birth'], inplace=True)

    # Ensure numeric columns are numeric
    num_cols = ['living_period_year','c_date_of_salary_payment',
                'c_monthly_salary','c_number_of_working_month','r_expected_credit_limit',
                'r_allloan_amount','r_additional_income','r_spouse_income','age']

    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')
    num_cols = [c for c in num_cols if c in df.columns]
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill categorical columns
    cat_cols = ['gender','tel_category','type_of_residence','postal_code',
                'c_business_type','c_position','c_occupation','c_employment_status',
                'c_salary_payment_methods','r_propose','r_generalcode1','r_generalcode2',
                'r_generalcode3','apply']
    df.replace(["NaN", "nan", "None", ""], np.nan, inplace=True)
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna('Unknown')

    print(df.head(3))
    print(df['c_monthly_salary'].min(), df['c_monthly_salary'].max())
    df.to_csv(cleanPath, index=False)
    return df

In [8]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

             ID gender  postal_code  tel_category  number_of_resident  \
0  202412000001     F2        10400             3                   2   
1  202412000002      M        10500             3                   3   
2  202412000003     F2        10170             3                   6   

   living_period_year  type_of_residence  c_business_type  c_position  \
0            5.000000                  6                7           5   
1            4.000000                  6                7           5   
2            5.166667                  3                7           4   

   c_occupation  ...  c_date_of_salary_payment  r_propose  r_allloan_amount  \
0            55  ...                      30.0        5.0                 0   
1            55  ...                      28.0        6.0                 0   
2            53  ...                      30.0        5.0             10000   

   r_spouse_income  r_generalcode1 r_generalcode2  r_generalcode3  apply  \
0              0.0   

## Testing

In [9]:
threshold = trainXgboost(VERSION, train_df)
testXgboost(VERSION, test_df, ids, threshold)

SMOTE applied. Imbalance ratio before: 6.72
No GPU detected -> using XGBoost CPU training (hist).
Tuning hyperparameters with early stopping (using validation set)...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters Found: {'subsample': 0.8, 'scale_pos_weight': 1, 'reg_lambda': 1, 'reg_alpha': 0.01, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.02, 'gamma': 0, 'colsample_bytree': 0.6}
Logged best parameters to ../../model/v3/param_log_xgboost.json.json


  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-logloss:0.68714
[1]	validation_0-logloss:0.68175
[2]	validation_0-logloss:0.67677
[3]	validation_0-logloss:0.67193
[4]	validation_0-logloss:0.66719
[5]	validation_0-logloss:0.66159
[6]	validation_0-logloss:0.65638
[7]	validation_0-logloss:0.65159
[8]	validation_0-logloss:0.64649
[9]	validation_0-logloss:0.64197
[10]	validation_0-logloss:0.63738
[11]	validation_0-logloss:0.63346
[12]	validation_0-logloss:0.62895
[13]	validation_0-logloss:0.62471
[14]	validation_0-logloss:0.62002
[15]	validation_0-logloss:0.61596
[16]	validation_0-logloss:0.61227
[17]	validation_0-logloss:0.60966
[18]	validation_0-logloss:0.60526
[19]	validation_0-logloss:0.60166
[20]	validation_0-logloss:0.59847
[21]	validation_0-logloss:0.59498
[22]	validation_0-logloss:0.59171
[23]	validation_0-logloss:0.58811
[24]	validation_0-logloss:0.58457
[25]	validation_0-logloss:0.58116
[26]	validation_0-logloss:0.57785
[27]	validation_0-logloss:0.57517
[28]	validation_0-logloss:0.57221
[29]	validation_0-loglos