# Cleaning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datetime import datetime
from fileDir import *

In [None]:
# Load the raw training split. getDataDir is not defined in this notebook
# (presumably supplied by the star-import of fileDir); it is called with
# ("train", 0) and whatever it returns is handed to pandas as the CSV location.
train_path = getDataDir("train", 0)
df = pd.read_csv(train_path)

In [None]:
# Show the column labels of the freshly loaded frame so the available fields
# are visible before any cleaning is applied.
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

In [5]:
# Parse the two date fields into datetimes. errors="coerce" turns any value
# that cannot be parsed into NaT instead of raising.
for date_col in ("pms_i_ymd", "date_of_birth"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

Feature Engineering

In [6]:
# Derive model features from the raw application fields.

# Age in years at the pms_i_ymd date. Dividing by 365.25 (rather than 365)
# accounts for leap years, so ages are not overstated by about one day per
# four years. The division already yields float64, so no cast is needed.
df["age"] = (df["pms_i_ymd"] - df["date_of_birth"]).dt.days / 365.25

# Tenure in fractional years. Series.add(..., fill_value=0) treats a missing
# year or month part as 0 when the other part is present; the result is NaN
# only when both parts are missing.
df["years_at_residence"] = df["living_period_year"].add(
    df["living_period_month"] / 12, fill_value=0
)
df["years_at_job"] = df["c_number_of_working_year"].add(
    df["c_number_of_working_month"] / 12, fill_value=0
)

# Total income. A plain `+` would turn the whole sum into NaN whenever one
# optional component is missing, and the median imputation performed later in
# this notebook would then replace a known salary with an unrelated value.
# NOTE(review): assumes a missing additional/spouse income means "none
# reported" - confirm against the data dictionary. A missing salary still
# yields NaN so that it is handled by the imputation step.
df["total_income"] = (
    df["c_monthly_salary"]
    + df["r_additional_income"].fillna(0)
    + df["r_spouse_income"].fillna(0)
)

# Ratios against income; the +1 in the denominator avoids division by zero
# when total income is 0.
income_denominator = df["total_income"] + 1
df["debt_to_income_ratio"] = df["r_allloan_amount"] / income_denominator
df["loan_burden_per_income"] = df["r_expected_credit_limit"] / income_denominator

Handle categorical variables (for CatBoost or encoding later)

In [7]:
# Columns to be stored with pandas' category dtype (kept as a list because the
# missing-value step further down iterates over the same names).
categorical_cols = [
    "gender",
    "marital_status",
    "tel_category",
    "type_of_residence",
    "c_business_type",
    "c_position",
    "c_occupation",
    "c_employment_status",
    "c_salary_payment_methods",
    "media",
    "place_for_sending_information",
    "r_propose",
    "r_generalcode1",
    "r_generalcode2",
    "r_generalcode3",
    "r_generalcode4",
    "r_generalcode5",
    "apply",
]

# Cast every listed column in one astype call; columns not named in the
# mapping keep their current dtype.
df = df.astype({name: "category" for name in categorical_cols})


Handle missing values

In [8]:
# Impute missing numeric values with each column's median.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

# The row identifier and the target label must not be imputed: a median ID is
# meaningless, and a median-filled label fabricates training targets.
# NOTE(review): rows whose label is missing therefore stay NaN here; decide
# downstream whether to drop them.
non_feature_cols = {"ID", "default_12month"}
impute_cols = [col for col in numeric_cols if col not in non_feature_cols]

# One median per column, computed in a single pass. fillna with a Series fills
# only the columns named in its index and leaves all other columns untouched.
df = df.fillna(df[impute_cols].median())

In [9]:
# Fill missing categorical values with the explicit label 'Unknown'.
# A categorical column only accepts values that are declared categories, so
# the label has to be registered before fillna. add_categories raises
# ValueError if the category already exists (e.g. when this cell is re-run or
# the data already contains 'Unknown'), hence the membership check.
for col in categorical_cols:
    column = df[col]
    if "Unknown" not in column.cat.categories:
        column = column.cat.add_categories("Unknown")
    df[col] = column.fillna("Unknown")

In [10]:
# --- Drop the raw inputs that are no longer needed once the derived features
# exist: the two date columns and the year/month tenure parts ---
superseded_cols = [
    "pms_i_ymd",
    "date_of_birth",
    "living_period_year",
    "living_period_month",
    "c_number_of_working_year",
    "c_number_of_working_month",
]
df = df.drop(columns=superseded_cols)

Last Check

In [11]:
# Final sanity check: report the frame's dimensions, then preview the first rows.
cleaned_shape = df.shape
print("Shape of cleaned data:", cleaned_shape)

print("Sample rows:")
sample_rows = df.head()
print(sample_rows)

Shape of cleaned data: (32524, 42)
Sample rows:
             ID                         Area Province      Shop Name gender  \
0  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex     F2   
1  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex      M   
2  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex     F2   
3  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex      M   
4  2.024120e+11  Bangkok Metropolitan Region  Bangkok  Silom Complex     F2   

   date_of_birth_week marital_status  number_of_children  postal_code  \
0                   6              1                   2        10400   
1                   4              1                   0        10500   
2                   5              1                   0        10170   
3                   7              1                   2        10500   
4                   6              3                   1        10120   

  tel_category  ...  r_generalcode4 r_

Save to CSV

In [None]:
# Persist the cleaned frame as CSV at the location returned by
# getDataDir("clean", 0). to_csv runs with its defaults, so the row index is
# written as the first column of the file.
clean_path = getDataDir("clean", 0)
df.to_csv(clean_path)