In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
import sys
import os
# Resolve the directory two levels above the current working directory and
# put it on the import search path (once) so the `fileDir` import can be found.
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)
from fileDir import getDataDir, getModelDir, getPredDir

In [None]:
# File locations for this notebook, resolved through the project's fileDir helper.
# TRAIN_PATH is the CSV read by pd.read_csv; CLEANED_PATH is where the cleaned
# frame is written by df.to_csv.
TRAIN_PATH = getDataDir("train")
# NOTE(review): the meaning of the second argument (1) is not visible here —
# presumably a version/variant selector; confirm against fileDir.getDataDir.
CLEANED_PATH = getDataDir("cleaned", 1)

In [None]:
# Load the training CSV located at TRAIN_PATH, then show which columns it provides
df = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
df.columns


Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

In [4]:
# Discard the columns that are not carried into the cleaned dataset
df = df.drop(columns=[
    'ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name',
    'date_of_birth_week', 'c_postal_code', 'c_date_of_salary_payment',
    'media', 'place_for_sending_information', 'r_generalcode4',
])

# Fold each (year, month) pair into a single total-months column
# (months + 12 * years), then remove the year columns that are now redundant.
df = df.assign(
    living_period_month=df['living_period_month'] + df['living_period_year'] * 12,
    c_number_of_working_month=df['c_number_of_working_month'] + df['c_number_of_working_year'] * 12,
).drop(columns=['living_period_year', 'c_number_of_working_year'])

df.columns

Index(['gender', 'date_of_birth', 'marital_status', 'number_of_children',
       'postal_code', 'tel_category', 'number_of_resident',
       'living_period_month', 'type_of_residence', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_month',
       'c_salary_payment_methods', 'r_expected_credit_limit', 'r_propose',
       'r_allloan_case', 'r_allloan_amount', 'r_additional_income',
       'r_spouse_income', 'r_generalcode1', 'r_generalcode2', 'r_generalcode3',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

In [5]:
# Year that ages are measured against. Kept as a named constant so it can be
# changed in one place (e.g. to the dataset's snapshot year); a fixed value,
# rather than the run date, keeps the derived ages reproducible across reruns.
AGE_REFERENCE_YEAR = 2025

# Derive `age` from the birth date. Values that cannot be parsed become NaT
# (errors='coerce'), which leaves `age` as NaN for those rows.
df['age'] = AGE_REFERENCE_YEAR - pd.to_datetime(df['date_of_birth'], errors='coerce').dt.year

# The raw birth date is no longer needed once `age` exists
df = df.drop(columns=['date_of_birth'])

df.columns

Index(['gender', 'marital_status', 'number_of_children', 'postal_code',
       'tel_category', 'number_of_resident', 'living_period_month',
       'type_of_residence', 'c_business_type', 'c_number_of_employee',
       'c_position', 'c_occupation', 'c_employment_status', 'c_monthly_salary',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode5',
       'apply', 'default_12month', 'age'],
      dtype='object')

In [6]:
# Impute missing values

# Numeric features: NaN is replaced by the median of the same column
num_cols = [
    'number_of_children', 'number_of_resident', 'living_period_month',
    'c_number_of_employee', 'c_monthly_salary', 'c_number_of_working_month',
    'r_expected_credit_limit', 'r_allloan_case', 'r_allloan_amount',
    'r_additional_income', 'r_spouse_income', 'age',
]
numeric_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(numeric_medians)

# Categorical features: NaN is replaced by the literal label 'Unknown'
# NOTE(review): several of these columns (e.g. marital_status, postal_code,
# tel_category) print as numeric codes in df.head(); wherever they contained
# NaN, filling with a string produces a mixed-type column — confirm that the
# downstream encoding step expects this.
cols = [
    'gender', 'marital_status', 'postal_code', 'tel_category',
    'type_of_residence', 'c_business_type', 'c_position', 'c_occupation',
    'c_employment_status', 'c_salary_payment_methods', 'r_propose',
    'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode5',
    'apply',
]
df[cols] = df[cols].fillna('Unknown')

In [7]:
# Quick sanity check of the cleaned frame: its dimensions and the first rows
print(f"Shape of cleaned data: {df.shape}")
print("Sample rows:", df.head(), sep="\n")

Shape of cleaned data: (32524, 29)
Sample rows:
  gender  marital_status  number_of_children  postal_code  tel_category  \
0     F2               1                   2        10400             3   
1      M               1                   0        10500             3   
2     F2               1                   0        10170             3   
3      M               1                   2        10500             3   
4     F2               3                   1        10120             3   

   number_of_resident  living_period_month  type_of_residence  \
0                   2                   60                  6   
1                   3                   48                  6   
2                   6                   62                  3   
3                   2                   51                  5   
4                   2                    4                  4   

   c_business_type  c_number_of_employee  ...  r_allloan_amount  \
0                7                  9999  .

In [None]:
# Persist the cleaned frame to CLEANED_PATH as CSV; index=False leaves the
# row index out of the file so only the data columns are written.
df.to_csv(CLEANED_PATH, index=False)