In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [4]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir
from modules.v1.xgboostModel import trainXgboost, testXgboost

In [5]:
VERSION = 3
TRAIN_PATH = getDataDir("train")
TEST_PATH = getDataDir("test")
CLEANED_TRAIN_PATH = getDataDir("train", VERSION)
CLEANED_TEST_PATH = getDataDir("test", VERSION)

In [14]:
def cleanDataset(df, cleanPath):
    # Drop unnecessary columns (keep ID)
    drop_cols = ['pms_i_ymd','Area','Province','Shop Name','date_of_birth_week','marital_status',
                 'c_postal_code','nummber_of_resident','c_number_of_employee', 'media','number_of_children',
                 'place_for_sending_information','r_generalcode4', 'r_generalcode5', 'r_allloan_case']
    df = df.drop(columns=drop_cols, errors='ignore')

    # Cut incorrect data
    df.loc[df['r_generalcode1'] > 2, 'r_generalcode1'] = 0
    df.loc[df['r_generalcode1'] < 1, 'r_generalcode1'] = 0

    df.loc[df['c_date_of_salary_payment'] > 31, 'c_date_of_salary_payment'] = 31
    df.loc[df['c_date_of_salary_payment'] < 1, 'c_date_of_salary_payment'] = 1
    df['c_date_of_salary_payment'] = df['c_date_of_salary_payment'].fillna(31)

    # Replace data
    df.loc[df['r_generalcode1'] == 2, 'r_generalcode1'] = 0


    df['r_spouse_income'] = df['r_generalcode1'].fillna(0)*df['r_spouse_income'].fillna(0)

    # df['r_spouse_income'] = df['r_spouse_income'].astype('category')
    # print(df['r_spouse_income'].cat.categories)

    df.drop(columns=['r_generalcode1'], inplace=True)


    # Combine year/month features safely
    if {'living_period_month','living_period_year'}.issubset(df.columns):
        df['living_period_year'] = (df['living_period_month'].fillna(0) + df['living_period_year'].fillna(0) * 12)/12
        df.drop(columns=['living_period_month'], inplace=True)

    if {'c_number_of_working_month','c_number_of_working_year'}.issubset(df.columns):
        df['c_number_of_working_month'] = df['c_number_of_working_month'].fillna(0) + df['c_number_of_working_year'].fillna(0) * 12
        df.drop(columns=['c_number_of_working_year'], inplace=True)

    # Convert birth date to age
    if 'date_of_birth' in df.columns:
        df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')
        df['age'] = 2025 - df['date_of_birth'].dt.year
        df.drop(columns=['date_of_birth'], inplace=True)

    # Ensure numeric columns are numeric
    num_cols = ['living_period_year','c_date_of_salary_payment', 'all_income',
                'c_monthly_salary','c_number_of_working_month','r_expected_credit_limit',
                'r_allloan_amount','r_additional_income','age','r_spouse_income']

    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')
    num_cols = [c for c in num_cols if c in df.columns]
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())


    # Fill categorical columns
    cat_cols = ['gender','tel_category','type_of_residence','postal_code',
                'c_business_type','c_position','c_occupation','c_employment_status',
                'c_salary_payment_methods','r_propose','r_generalcode2',
                'r_generalcode3','apply']
    df.replace(["NaN", "nan", "None", ""], np.nan, inplace=True)
    df.loc[df['postal_code'] < 10000, 'postal_code'] = np.nan
    df.loc[df['r_generalcode3'] < 1, 'r_generalcode3'] = np.nan
    
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna('Unknown')
    
    # for c in cat_cols:
    #     if c in df.columns:
    #         df[c] = df[c].astype('category')
    #         print(c)
    #         print(df[c].cat.categories)
            
    print(df.head(3))
    print(df.shape)
    print(df.isna().sum().sum())
    df.to_csv(cleanPath, index=False)
    return df

In [15]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)


             ID gender postal_code  tel_category  number_of_resident  \
0  202412000001     F2     10400.0             3                   2   
1  202412000002      M     10500.0             3                   3   
2  202412000003     F2     10170.0             3                   6   

   living_period_year  type_of_residence  c_business_type  c_position  \
0            5.000000                  6                7           5   
1            4.000000                  6                7           5   
2            5.166667                  3                7           4   

   c_occupation  ...  r_expected_credit_limit  r_propose  r_allloan_amount  \
0            55  ...                  40000.0        5.0                 0   
1            55  ...                  20000.0        6.0                 0   
2            53  ...                  20000.0        5.0             10000   

   r_additional_income  r_spouse_income  r_generalcode2 r_generalcode3  apply  \
0                  0.0  