In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import sys
import os
# Resolve the directory two levels above the current working directory and
# add it to sys.path (only if it is not already there).
# NOTE(review): this depends on the kernel's working directory at run time —
# presumably the notebook's own folder; confirm before running from elsewhere.
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
# Project-local helpers; presumably resolved through the sys.path entry added
# above, which is why these imports come after it.
from fileDir import getDataDir
from modules.v1.xgboostModel import trainXgboost, testXgboost

In [3]:
# Version tag used for the cleaned-data paths below and passed to
# trainXgboost / testXgboost later in the notebook.
VERSION = 1
# Locations of the raw train/test CSVs (read with pd.read_csv further down).
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
# Destinations that cleanDataset writes the cleaned CSVs to.
# NOTE(review): presumably getDataDir(name, VERSION) returns a version-specific
# path distinct from the raw one — confirm against fileDir.getDataDir.
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

## Clean Datasets

In [4]:
def cleanDataset(df, cleanPath, reference_year=2025):
    """Clean a raw dataset and optionally persist the result as CSV.

    Steps, each applied only to the columns that are actually present:

    1. Drop a fixed list of unused columns (``ID`` is kept).
    2. Fold each ``*_year`` duration column into its ``*_month`` partner
       (``months + years * 12``) and drop the year column.
    3. Replace ``date_of_birth`` with ``age = reference_year - birth year``.
    4. Coerce the known numeric columns to numbers and fill the remaining
       gaps with that column's median *of this frame*.
    5. Fill missing values in the known categorical columns with ``'Unknown'``.

    The caller's DataFrame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data.
    cleanPath : str or path-like or None
        Where to write the cleaned data (CSV, without the index). ``None``
        skips writing.
    reference_year : int, default 2025
        Year that birth years are subtracted from to obtain ``age``. Kept as a
        fixed default so that the derived feature is reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Its first three rows are also printed.
    """
    # Drop unnecessary columns (keep ID). ``drop`` returns a new frame, so all
    # later assignments operate on a copy rather than on the caller's data.
    drop_cols = ['pms_i_ymd','Area','Province','Shop Name','date_of_birth_week',
                 'c_postal_code','c_date_of_salary_payment','media',
                 'place_for_sending_information','r_generalcode4']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Combine year/month features safely. Both parts are coerced to numbers
    # *before* the arithmetic: on string-typed columns ``"2" * 12`` would
    # otherwise repeat the text instead of multiplying, or raise a TypeError.
    duration_pairs = (
        ('living_period_month', 'living_period_year'),
        ('c_number_of_working_month', 'c_number_of_working_year'),
    )
    for month_col, year_col in duration_pairs:
        if {month_col, year_col}.issubset(df.columns):
            months = pd.to_numeric(df[month_col], errors='coerce').fillna(0)
            years = pd.to_numeric(df[year_col], errors='coerce').fillna(0)
            df[month_col] = months + years * 12
            df = df.drop(columns=[year_col])

    # Convert birth date to age; unparseable dates become NaT -> NaN age and
    # are handled by the median fill below.
    if 'date_of_birth' in df.columns:
        birth_dates = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = reference_year - birth_dates.dt.year
        df = df.drop(columns=['date_of_birth'])

    # Ensure numeric columns are numeric, then fill gaps with the column median.
    num_cols = ['number_of_children','number_of_resident','living_period_month','c_number_of_employee',
                'c_monthly_salary','c_number_of_working_month','r_expected_credit_limit',
                'r_allloan_case','r_allloan_amount','r_additional_income','r_spouse_income','age']
    num_cols = [c for c in num_cols if c in df.columns]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    if num_cols:  # nothing to impute when none of the known columns exist
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill categorical columns
    cat_cols = ['gender','marital_status','postal_code','tel_category','type_of_residence',
                'c_business_type','c_position','c_occupation','c_employment_status',
                'c_salary_payment_methods','r_propose','r_generalcode1','r_generalcode2',
                'r_generalcode3','r_generalcode5','apply']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna('Unknown')

    print(df.head(3))
    if cleanPath is not None:
        df.to_csv(cleanPath, index=False)
    return df

In [5]:
# Load the raw train and test CSVs.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
# Keep an independent copy of the test IDs, taken before cleaning; it is
# passed to testXgboost later in the notebook.
ids = test_df["ID"].copy()

# Clean both splits. cleanDataset writes each result to the given path and
# returns it; the names are rebound, so from here on train_df / test_df refer
# to the cleaned frames, not the raw ones.
train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

             ID gender  marital_status  number_of_children  postal_code  \
0  2.024120e+11     F2               1                   2        10400   
1  2.024120e+11      M               1                   0        10500   
2  2.024120e+11     F2               1                   0        10170   

   tel_category  number_of_resident  living_period_month  type_of_residence  \
0             3                   2                   60                  6   
1             3                   3                   48                  6   
2             3                   6                   62                  3   

   c_business_type  ...  r_allloan_amount  r_additional_income  \
0                7  ...                 0                  0.0   
1                7  ...                 0               5000.0   
2                7  ...             10000                  0.0   

   r_spouse_income  r_generalcode1  r_generalcode2  r_generalcode3  \
0              0.0         Unknown         Unkn

## Training and Testing

In [6]:
# Train on the cleaned training frame and keep the returned value for the
# test step below.
# NOTE(review): presumably a classification decision threshold — confirm in
# modules.v1.xgboostModel.trainXgboost.
threshold = trainXgboost(VERSION, train_df)
# Run the test step on the cleaned test frame, together with the IDs saved
# before cleaning and the value returned by training.
testXgboost(VERSION, test_df, ids, threshold)

SMOTE applied. Imbalance ratio before: 6.72
GPU detected -> using XGBoost GPU training (gpu_hist).
Tuning hyperparameters with early stopping (using validation set)...
Fitting 4 folds for each of 1 candidates, totalling 4 fits




Best Parameters Found: {'subsample': 0.8, 'scale_pos_weight': 1, 'reg_lambda': 1, 'reg_alpha': 0.01, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.02, 'gamma': 0, 'colsample_bytree': 0.6}
Logged best parameters to ../../model/v1/param_log_xgboost.json.json
[0]	validation_0-logloss:0.68564
[1]	validation_0-logloss:0.67841
[2]	validation_0-logloss:0.67232
[3]	validation_0-logloss:0.66583
[4]	validation_0-logloss:0.65948
[5]	validation_0-logloss:0.65257
[6]	validation_0-logloss:0.64603
[7]	validation_0-logloss:0.63991
[8]	validation_0-logloss:0.63542
[9]	validation_0-logloss:0.62968
[10]	validation_0-logloss:0.62402
[11]	validation_0-logloss:0.61833
[12]	validation_0-logloss:0.61328
[13]	validation_0-logloss:0.60843
[14]	validation_0-logloss:0.60333
[15]	validation_0-logloss:0.59842
[16]	validation_0-logloss:0.59373
[17]	validation_0-logloss:0.58919
[18]	validation_0-logloss:0.58477
[19]	validation_0-logloss:0.58069
[20]	validation_0-logloss:0.57629
[21]	

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
