# Cleaning

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datetime import datetime

In [13]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir, getModelDir, getPredDir

In [14]:
df = pd.read_csv(getDataDir("train", 0))

In [15]:
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

In [16]:
df["pms_i_ymd"] = pd.to_datetime(df["pms_i_ymd"], errors="coerce")
df["date_of_birth"] = pd.to_datetime(df["date_of_birth"], errors="coerce")

Feature Engineering

In [17]:
df["age"] = ((df["pms_i_ymd"] - df["date_of_birth"]).dt.days / 365).astype(float)
df["years_at_residence"] = df["living_period_year"] + df["living_period_month"] / 12
df["years_at_job"] = df["c_number_of_working_year"] + df["c_number_of_working_month"] / 12
df["total_income"] = df["c_monthly_salary"] + df["r_additional_income"] + df["r_spouse_income"]
df["debt_to_income_ratio"] = df["r_allloan_amount"] / (df["total_income"] + 1)
df["loan_burden_per_income"] = df["r_expected_credit_limit"] / (df["total_income"] + 1)

Handle categorical variables (for CatBoost or encoding later)

In [18]:
categorical_cols = [
    "gender", "marital_status", "tel_category", "type_of_residence",
    "c_business_type", "c_position", "c_occupation",
    "c_employment_status", "c_salary_payment_methods",
    "media", "place_for_sending_information", "r_propose",
    "r_generalcode1", "r_generalcode2", "r_generalcode3",
    "r_generalcode4", "r_generalcode5", "apply"
]

for col in categorical_cols:
    df[col] = df[col].astype("category")


Handle missing values

In [19]:
# Fill numeric NaN with 0 or median, depending on context
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

In [20]:
# Fill categorical NaN with 'Unknown'
for col in categorical_cols:
    df[col] = df[col].cat.add_categories("Unknown").fillna("Unknown")

In [21]:
# --- Drop unneeded original date columns (after derived features) ---
df.drop(["pms_i_ymd", "date_of_birth", "living_period_year",
         "living_period_month", "c_number_of_working_year",
         "c_number_of_working_month"], axis=1, inplace=True)

Last Check

In [22]:
print("Shape of cleaned data:", df.shape)
print("Sample rows:")
print(df.head())

Shape of cleaned data: (32524, 42)
Sample rows:
             ID                         Area Province      Shop Name gender  \
0  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex     F2   
1  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex      M   
2  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex     F2   
3  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex      M   
4  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex     F2   

   date_of_birth_week marital_status  number_of_children  postal_code  \
0                   6              1                   2        10400   
1                   4              1                   0        10500   
2                   5              1                   0        10170   
3                   7              1                   2        10500   
4                   6              3                   1        10120   

  tel_category  ...  r_generalcode4 r_

Save to CSV

In [23]:
df.to_csv(getDataDir("clean", 0))

',ID,Area,Province,Shop Name,gender,date_of_birth_week,marital_status,number_of_children,postal_code,tel_category,number_of_resident,type_of_residence,c_postal_code,c_business_type,c_number_of_employee,c_position,c_occupation,c_employment_status,c_monthly_salary,c_salary_payment_methods,c_date_of_salary_payment,media,place_for_sending_information,r_expected_credit_limit,r_propose,r_allloan_case,r_allloan_amount,r_additional_income,r_spouse_income,r_generalcode1,r_generalcode2,r_generalcode3,r_generalcode4,r_generalcode5,apply,default_12month,age,years_at_residence,years_at_job,total_income,debt_to_income_ratio,loan_burden_per_income\r\n0,202412000000.0,Bangkok Metropolitan Region,Bangkok,Silom Complex,F2,6,1,2,10400,3,2,6,10330.0,7,9999,5,55,5,9070,1,30.0,7,1,40000.0,5.0,0,0,0.0,0.0,Unknown,Unknown,2.0,4.0,4.0,WI,0,46.90684931506849,5.0,5.0,9070.0,0.0,4.409657149156653\r\n1,202412000000.0,Bangkok Metropolitan Region,Bangkok,Silom Complex,M,4,1,0,10500,3,3,6,10500.0,7,500,5,55,5,11765,1