In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [10]:
import sys
import os
# Resolve the directory two levels above the current working directory and make it
# importable. NOTE(review): presumably this is the project root that holds fileDir.py
# and the modules/ package — confirm against the repository layout.
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:  # guard so re-running the cell does not append duplicates
    sys.path.append(root_path)
# Project-local imports; they must come after the sys.path adjustment above.
from fileDir import getDataDir
from modules.v1.xgboostModel import trainXgboost, testXgboost

In [11]:
# Version tag shared by the cleaned-data paths below and by the trainXgboost/testXgboost calls.
VERSION = 1
# Paths resolved by the project helper getDataDir.
# NOTE(review): presumably the one-argument form points at the raw CSV and the
# two-argument form at the versioned cleaned CSV — confirm in fileDir.getDataDir.
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

## Clean Datasets

In [12]:
def cleanDataset(df, cleanPath, reference_year=2025):
    """Clean a raw loan-application frame, write it to CSV, and return it.

    Steps, each applied only when the columns it needs are present:
      1. Drop columns that are not used downstream.
      2. Fold ``*_year`` durations into their ``*_month`` counterparts.
      3. Build ``income`` (salary + additional + spouse income) and
         ``networth`` (``income`` minus ``r_allloan_amount``).
      4. Replace ``date_of_birth`` with ``age`` (``reference_year`` - birth year).
      5. Blank out invalid ``r_generalcode1`` and ``postal_code`` values.
      6. Coerce numeric columns and fill their gaps with the column median.
      7. Fill gaps in categorical columns with the string ``"Unknown"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified; a new frame is returned.
    cleanPath : str or path-like
        Destination of the cleaned CSV (written without the index).
    reference_year : int, default 2025
        Year used to turn a birth year into an age.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (the same data that was written to ``cleanPath``).

    Notes
    -----
    Medians are computed from the frame passed in, so separate calls (for
    example train and test) are imputed independently of each other.
    """
    drop_cols = ['pms_i_ymd', 'Area', 'Province', 'Shop Name', 'date_of_birth_week',
                 'c_postal_code', 'c_date_of_salary_payment', 'media',
                 'place_for_sending_information', 'r_generalcode4', 'r_generalcode5',
                 'r_allloan_case']
    # drop() returns a new frame, so the caller's frame is never mutated below.
    df = df.drop(columns=drop_cols, errors='ignore')

    # Express durations in months only: months + 12 * years (missing parts count as 0).
    duration_pairs = (
        ('living_period_month', 'living_period_year'),
        ('c_number_of_working_month', 'c_number_of_working_year'),
    )
    for month_col, year_col in duration_pairs:
        if {month_col, year_col}.issubset(df.columns):
            df[month_col] = df[month_col].fillna(0) + df[year_col].fillna(0) * 12
            df = df.drop(columns=[year_col])

    # Income components: a missing side income / loan amount is treated as 0,
    # a missing salary is replaced by the median salary of this frame.
    other_income_cols = ['r_additional_income', 'r_spouse_income', 'r_allloan_amount']
    present_income_cols = [c for c in other_income_cols if c in df.columns]
    if present_income_cols:
        df[present_income_cols] = df[present_income_cols].fillna(0)
    if 'c_monthly_salary' in df.columns:
        df['c_monthly_salary'] = df['c_monthly_salary'].fillna(df['c_monthly_salary'].median())

    income_parts = ['c_monthly_salary', 'r_additional_income', 'r_spouse_income']
    if set(income_parts).issubset(df.columns):
        df['income'] = df['c_monthly_salary'] + df['r_additional_income'] + df['r_spouse_income']
        df = df.drop(columns=income_parts)

    # networth depends on the derived income column, so only build it when both exist.
    if {'income', 'r_allloan_amount'}.issubset(df.columns):
        df['networth'] = df['income'] - df['r_allloan_amount']
        df = df.drop(columns=['r_allloan_amount'])

    # Convert birth date to age; unparseable dates become NaN and are median-filled later.
    if 'date_of_birth' in df.columns:
        birth_dates = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = reference_year - birth_dates.dt.year
        df = df.drop(columns=['date_of_birth'])

    # r_generalcode1: only 1 and 2 are valid; anything else becomes NaN
    # (and is later labelled "Unknown" by the categorical fill).
    if 'r_generalcode1' in df.columns:
        df['r_generalcode1'] = df['r_generalcode1'].where(df['r_generalcode1'].between(1, 2), np.nan)

    # postal_code: keep 5-digit codes only (10000-99999). The comparison is numeric,
    # because a string-length test misjudges float columns ("10400.0" has 7 characters,
    # "999.0" has 5) and would accept negative 4-digit values.
    if 'postal_code' in df.columns:
        postal = pd.to_numeric(df['postal_code'], errors='coerce')
        df['postal_code'] = postal.where(postal.between(10000, 99999))

    # Ensure numeric columns are numeric, then fill gaps with the per-column median.
    num_cols = ['number_of_children', 'number_of_resident', 'living_period_month',
                'c_number_of_employee', 'c_number_of_working_month',
                'r_expected_credit_limit', 'age', 'networth']
    num_cols = [c for c in num_cols if c in df.columns]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    if num_cols:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill categorical columns with an explicit "Unknown" label.
    # NOTE(review): 'c_salary_payment_methods' may be a typo for
    # 'c_salary_payment_method'; a name that does not match is skipped silently.
    cat_cols = ['gender', 'marital_status', 'postal_code', 'tel_category', 'type_of_residence',
                'c_business_type', 'c_position', 'c_occupation', 'c_employment_status',
                'c_salary_payment_methods', 'r_propose', 'r_generalcode1', 'r_generalcode2',
                'r_generalcode3', 'apply']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna("Unknown")

    print(df.head(3))
    df.to_csv(cleanPath, index=False)

    return df

In [13]:
# Load the raw train and test CSVs.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
# Keep a copy of the test IDs taken before cleaning; they are passed to testXgboost separately.
ids = test_df["ID"].copy()

# Clean each split and write it to its cleaned-CSV path. The names are rebound to the
# cleaned frames. NOTE(review): cleanDataset fills gaps with medians of the frame it
# receives, so train and test are imputed independently of each other.
train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

             ID gender  marital_status  number_of_children postal_code  \
0  202412000001     F2               1                   2     10400.0   
1  202412000002      M               1                   0     10500.0   
2  202412000003     F2               1                   0     10170.0   

   tel_category  number_of_resident  living_period_month  type_of_residence  \
0             3                   2                   60                  6   
1             3                   3                   48                  6   
2             3                   6                   62                  3   

   c_business_type  ...  r_expected_credit_limit  r_propose  r_generalcode1  \
0                7  ...                  40000.0        5.0         Unknown   
1                7  ...                  20000.0        6.0         Unknown   
2                7  ...                  20000.0        5.0         Unknown   

   r_generalcode2  r_generalcode3  apply  default_12month   income ne

## Training and Testing

In [14]:
# Train on the cleaned training frame and keep the returned value as `threshold`.
# NOTE(review): presumably a classification cut-off chosen during training — confirm
# in modules.v1.xgboostModel.trainXgboost.
threshold = trainXgboost(VERSION, train_df)
# Run the test step on the cleaned test frame, passing the IDs captured before cleaning
# and the threshold returned by training.
testXgboost(VERSION, test_df, ids, threshold)

SMOTE applied. Imbalance ratio before: 6.72
No GPU detected -> using XGBoost CPU training (hist).
Tuning hyperparameters with early stopping (using validation set)...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters Found: {'subsample': 0.8, 'scale_pos_weight': 1, 'reg_lambda': 1, 'reg_alpha': 0.01, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.02, 'gamma': 0, 'colsample_bytree': 0.6}
Logged best parameters to ../../model/v1/param_log_xgboost.json.json
[0]	validation_0-logloss:0.68848
[1]	validation_0-logloss:0.68390
[2]	validation_0-logloss:0.67824


  self.starting_round = model.num_boosted_rounds()


[3]	validation_0-logloss:0.67324
[4]	validation_0-logloss:0.66832
[5]	validation_0-logloss:0.66176
[6]	validation_0-logloss:0.65791
[7]	validation_0-logloss:0.65277
[8]	validation_0-logloss:0.64825
[9]	validation_0-logloss:0.64240
[10]	validation_0-logloss:0.63775
[11]	validation_0-logloss:0.63345
[12]	validation_0-logloss:0.62927
[13]	validation_0-logloss:0.62493
[14]	validation_0-logloss:0.62173
[15]	validation_0-logloss:0.61732
[16]	validation_0-logloss:0.61420
[17]	validation_0-logloss:0.60972
[18]	validation_0-logloss:0.60554
[19]	validation_0-logloss:0.60196
[20]	validation_0-logloss:0.59860
[21]	validation_0-logloss:0.59450
[22]	validation_0-logloss:0.59141
[23]	validation_0-logloss:0.58870
[24]	validation_0-logloss:0.58557
[25]	validation_0-logloss:0.58250
[26]	validation_0-logloss:0.58012
[27]	validation_0-logloss:0.57761
[28]	validation_0-logloss:0.57464
[29]	validation_0-logloss:0.57226
[30]	validation_0-logloss:0.56877
[31]	validation_0-logloss:0.56656
[32]	validation_0-log