In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import sys
import os
# Resolve the directory two levels above the current working directory and
# register it on the import path once, so the project-local imports that
# follow in this cell can be found.
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)
from fileDir import getDataDir, getModelDir, getPredDir
from modules.v1.xgboostModel import trainTestXgboost

In [3]:
# Version tag shared by the cleaned-data paths below and the model run.
VERSION = 3

# Raw CSVs read by the loading cell.
TRAIN_PATH, TEST_PATH = (getDataDir(split) for split in ("train", "test"))

# Where the cleaned copies for this VERSION are written by cleanDataset.
CLEANED_TRAIN_PATH, CLEANED_TEST_PATH = (
    getDataDir(split, VERSION) for split in ("train", "test")
)

## Clean Datasets

In [4]:
def cleanDataset(df, cleanPath, reference_year=2025):
    """Clean a raw dataset, write the result to CSV, and return it.

    Steps, in order (each one is skipped for columns that are absent):
      1. Drop the columns listed in ``drop_cols`` (``ID`` is kept).
      2. Fold each ``*_year`` duration column into its ``*_month`` partner
         (total months = months + 12 * years) and drop the year column.
      3. Replace ``date_of_birth`` with ``age = reference_year - birth year``.
      4. Coerce the known numeric columns to numbers and fill missing values
         with that column's median computed on *this* frame.
      5. Convert the placeholder strings "NaN", "nan", "None" and "" to NaN
         everywhere, then fill the known categorical columns with 'Unknown'.

    The caller's DataFrame is not modified; all work happens on a copy.

    Args:
        df: Raw input DataFrame.
        cleanPath: Destination passed to ``DataFrame.to_csv`` (written
            without the index).
        reference_year: Year used to turn a birth year into an age.
            Defaults to 2025, the value previously hard-coded.

    Returns:
        The cleaned DataFrame (also printed as a 3-row preview).
    """
    # Drop unnecessary columns (keep ID). drop() returns a new frame, so every
    # later assignment touches the copy rather than the caller's object.
    drop_cols = ['pms_i_ymd','Area','Province','Shop Name','date_of_birth_week','marital_status',
                 'c_postal_code','nummber_of_resident','c_number_of_employee', 'media',
                 'place_for_sending_information','r_generalcode4', 'r_generalcode5']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Combine year/month features safely.
    duration_pairs = [('living_period_month', 'living_period_year'),
                      ('c_number_of_working_month', 'c_number_of_working_year')]
    for month_col, year_col in duration_pairs:
        if month_col in df.columns and year_col in df.columns:
            # Coerce before doing arithmetic: on object/string columns
            # `year * 12` would repeat the string and `+` would concatenate
            # or raise, instead of producing a month count.
            months = pd.to_numeric(df[month_col], errors='coerce').fillna(0)
            years = pd.to_numeric(df[year_col], errors='coerce').fillna(0)
            df[month_col] = months + years * 12
            df = df.drop(columns=[year_col])

    # Convert birth date to age; unparseable dates become NaN here and are
    # median-filled below because 'age' is in num_cols.
    if 'date_of_birth' in df.columns:
        birth_dates = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = reference_year - birth_dates.dt.year
        df = df.drop(columns=['date_of_birth'])

    # Ensure numeric columns are numeric, then impute with the column median.
    num_cols = ['number_of_children','living_period_month','c_date_of_salary_payment',
                'c_monthly_salary','c_number_of_working_month','r_expected_credit_limit',
                'r_allloan_case','r_allloan_amount','r_additional_income','r_spouse_income','age']
    num_cols = [c for c in num_cols if c in df.columns]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    if num_cols:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill categorical columns. Placeholder strings are normalised to NaN
    # across the whole frame first so they are treated as missing too.
    cat_cols = ['gender','tel_category','type_of_residence','postal_code',
                'c_business_type','c_position','c_occupation','c_employment_status',
                'c_salary_payment_methods','r_propose','r_generalcode1','r_generalcode2',
                'r_generalcode3','apply']
    df = df.replace(["NaN", "nan", "None", ""], np.nan)
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna('Unknown')

    print(df.head(3))
    df.to_csv(cleanPath, index=False)
    return df

In [5]:
# Load both raw splits first, then keep the test IDs aside before cleaning
# so they can be handed to the model step alongside the cleaned frames.
train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

# Clean each split; cleanDataset also writes the cleaned CSV to the given path.
train_df, test_df = (
    cleanDataset(train_df, CLEANED_TRAIN_PATH),
    cleanDataset(test_df, CLEANED_TEST_PATH),
)

             ID gender  number_of_children  postal_code  tel_category  \
0  2.024120e+11     F2                   2        10400             3   
1  2.024120e+11      M                   0        10500             3   
2  2.024120e+11     F2                   0        10170             3   

   number_of_resident  living_period_month  type_of_residence  \
0                   2                   60                  6   
1                   3                   48                  6   
2                   6                   62                  3   

   c_business_type  c_position  ...  r_allloan_case  r_allloan_amount  \
0                7           5  ...               0                 0   
1                7           5  ...               0                 0   
2                7           4  ...               1             10000   

   r_additional_income  r_spouse_income  r_generalcode1  r_generalcode2  \
0                  0.0              0.0         Unknown         Unknown   
1  

## Training and Testing

In [6]:
# Hand the dataset version, the cleaned train/test frames and the test-set IDs
# (captured before cleaning) to the project's XGBoost routine.
# NOTE(review): the recorded output below shows SMOTE resampling and a
# hyperparameter search of 30 candidates x 4 folds (120 fits) that was stopped
# by a KeyboardInterrupt, so this cell has no completed result saved.
trainTestXgboost(VERSION, train_df, test_df, ids)

SMOTE applied. Imbalance ratio before: 6.72
GPU detected -> using XGBoost GPU training (gpu_hist).
Tuning hyperparameters with early stopping (using validation set)...
Fitting 4 folds for each of 30 candidates, totalling 120 fits


KeyboardInterrupt: 