# Data Processing

Notebook ini berfungsi untuk melakukan proses pengolahan data agar dataset siap digunakan. Adapun prosesnya meliputi hal berikut.
1. Membaca data train dan test mentah (application_train.csv dan application_test.csv)
2. Melakukan feature engineering, penanganan missing atau outlier, binning dan WOE pada fitur tertentu, imputasi, encoding kategori, serta scaling.
3. Menyimpan dataset hasil olahan dalam format CSV (`../data/dataset_hasil_data_processing/`) serta artefak berupa WOE bins dan scaler (`../outputs/preprocessing/`) agar preprocessing yang sama dapat dipakai secara konsisten saat modeling maupun inference.

## 1) Imports dan Paths

In [22]:

from pathlib import Path
import json, pickle, warnings
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# The project root is taken to be the parent of the current working directory.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# Input folder, processed-data folder and preprocessing-artefact folder.
DATA_RAW_DIR = PROJECT_ROOT.joinpath("data", "dataset_awal")
DATA_OUT_DIR = PROJECT_ROOT.joinpath("data", "dataset_hasil_data_processing")
ARTIFACT_DIR = PROJECT_ROOT.joinpath("outputs", "preprocessing")

TRAIN_PATH = DATA_RAW_DIR.joinpath("application_train.csv")
TEST_PATH = DATA_RAW_DIR.joinpath("application_test.csv")

# Create both output folders up front so later cells can write without checks.
for _out_dir in (DATA_OUT_DIR, ARTIFACT_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

print("TRAIN_PATH:", TRAIN_PATH)
print("TEST_PATH :", TEST_PATH)


TRAIN_PATH: d:\Home Credit Virtual Internship\home_credit_scorecard_model\data\dataset_awal\application_train.csv
TEST_PATH : d:\Home Credit Virtual Internship\home_credit_scorecard_model\data\dataset_awal\application_test.csv


## 2) Load Data

In [23]:

# Read both raw application tables; they are kept untouched and copied before
# any feature engineering.
train_raw, test_raw = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
print("Train shape:", train_raw.shape, "| Test shape:", test_raw.shape)
display(train_raw.head(3))


Train shape: (307511, 122) | Test shape: (48744, 121)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## 3) Monotonic WOE Binning

In [24]:

def _woe(good, bad, eps=1e-6):
    tot = good + bad + eps
    p_good = (good + eps) / tot
    p_bad  = (bad  + eps) / tot
    return np.log(p_good / p_bad)

def _merge_adjacent_bins(rows, keep, absorb):
    """Fold bin ``absorb`` into its neighbour ``keep`` (in place) and delete it."""
    kept, gone = rows[keep], rows[absorb]
    # Widen the surviving bin so it also covers the absorbed range.
    kept["left"] = min(kept["left"], gone["left"])
    kept["right"] = max(kept["right"], gone["right"])
    kept["count"] += gone["count"]
    kept["bad"] += gone["bad"]
    del rows[absorb]


def _bin_woe_values(rows):
    """Return the WOE of every bin in ``rows`` as a float ndarray."""
    counts = np.array([r["count"] for r in rows], dtype=float)
    bads = np.array([r["bad"] for r in rows], dtype=float)
    return np.asarray(_woe(counts - bads, bads), dtype=float)


def compute_woe_monotonic_bins(df, feature, target="TARGET", n_bins=8, min_bin=0.05):
    """Fit quantile bins on ``feature`` and merge them until WOE is monotonic.

    Steps:
      1. Quantile-cut the non-missing values into up to ``n_bins`` bins.
      2. While any bin holds less than ``min_bin`` of the rows (and more than
         three bins remain), merge the smallest bin into a neighbour.
      3. Pick the trend direction from the first and last bin, then merge
         every adjacent pair that violates it until none is left (or only two
         bins remain).

    Merged bins have their edges joined, so the returned intervals stay
    contiguous. Previously only the counts were merged and the absorbed
    interval was dropped, leaving holes in which values could not be mapped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature`` and ``target``.
    feature : str
        Numeric column to bin.
    target : str, default "TARGET"
        Binary column; its per-bin sum is used as the "bad" count.
    n_bins : int, default 8
        Requested number of quantile bins (at least 3 are requested).
    min_bin : float, default 0.05
        Minimum share of rows a bin must hold before the monotonic pass.

    Returns
    -------
    dict
        ``bin_intervals`` (list of ``pd.Interval``, ascending),
        ``woe_per_interval`` (``{interval: woe}``) and ``direction``
        (``"increasing"`` or ``"decreasing"``).

    Raises
    ------
    ValueError
        If no row has both ``feature`` and ``target`` present.
    """
    x = df[feature]
    y = df[target]
    present = x.notna() & y.notna()
    x_n, y_n = x[present], y[present]
    if x_n.empty:
        raise ValueError(f"No non-missing rows available to bin feature '{feature}'.")

    q = min(max(3, n_bins), max(3, x_n.nunique()))
    binned = pd.qcut(x_n, q=q, duplicates="drop")

    # observed=False keeps every quantile bin in the table (explicit to avoid
    # relying on the changing pandas default).
    stats = (
        pd.DataFrame({"bin": binned, "y": y_n})
        .groupby("bin", observed=False)["y"]
        .agg(["count", "sum"])
    )
    closed = binned.cat.categories.closed
    rows = [
        {"left": iv.left, "right": iv.right, "count": int(cnt), "bad": float(bad)}
        for iv, cnt, bad in zip(stats.index, stats["count"], stats["sum"])
    ]

    # --- Pass 1: enforce the minimum bin size -------------------------------
    total = sum(r["count"] for r in rows)
    while len(rows) > 3:
        counts = [r["count"] for r in rows]
        if not any(c / total < min_bin for c in counts):
            break
        pos = counts.index(min(counts))  # first smallest bin
        if pos == 0:
            keep = 1
        elif pos == len(rows) - 1:
            keep = pos - 1
        else:
            # Merge into the smaller of the two neighbours.
            keep = pos - 1 if counts[pos - 1] <= counts[pos + 1] else pos + 1
        _merge_adjacent_bins(rows, keep, pos)

    # --- Pass 2: enforce a monotonic WOE trend ------------------------------
    woe = _bin_woe_values(rows)
    direction = "increasing" if woe[-1] >= woe[0] else "decreasing"
    while len(rows) > 2:
        steps = np.diff(woe)
        breaks = np.flatnonzero(steps < 0 if direction == "increasing" else steps > 0)
        if breaks.size == 0:
            break
        first = int(breaks[0])
        _merge_adjacent_bins(rows, first, first + 1)
        woe = _bin_woe_values(rows)

    bins_sorted = [pd.Interval(r["left"], r["right"], closed=closed) for r in rows]
    woe_map = {iv: float(w) for iv, w in zip(bins_sorted, woe)}
    return {"bin_intervals": bins_sorted, "woe_per_interval": woe_map, "direction": direction}

def apply_woe(series, artifact):
    """Map raw feature values to the WOE of the bin they fall into.

    The lookup is done against the bin edges directly instead of handing a
    list of ``Interval`` objects to ``pd.cut`` (not a documented input type,
    and ``include_lowest`` has no effect on interval bins).

    Behaviour:
      * missing (or non-numeric) values stay ``NaN``;
      * values below the lowest bin or above the highest bin take the WOE of
        the nearest edge bin, so unseen extremes are not silently turned into
        missing values;
      * the result is a plain float Series aligned to ``series``.

    Parameters
    ----------
    series : pandas.Series or array-like
        Raw feature values.
    artifact : dict
        Must provide ``bin_intervals`` (iterable of ``pd.Interval``) and
        ``woe_per_interval`` (``{interval: woe}``).

    Returns
    -------
    pandas.Series of float

    Raises
    ------
    ValueError
        If ``artifact`` contains no bins.
    """
    raw = series if isinstance(series, pd.Series) else pd.Series(series)

    intervals = pd.IntervalIndex(list(artifact["bin_intervals"])).sort_values()
    if len(intervals) == 0:
        raise ValueError("artifact['bin_intervals'] is empty; nothing to map to.")

    woe_lookup = artifact["woe_per_interval"]
    woe_by_bin = np.array([woe_lookup[iv] for iv in intervals], dtype=float)

    values = pd.to_numeric(raw, errors="coerce").to_numpy(dtype=float)
    right_edges = intervals.right.to_numpy(dtype=float)

    # For right-closed bins a value equal to an edge belongs to the bin that
    # ends there; for left-closed bins it belongs to the next one.
    side = "left" if intervals.closed in ("right", "both") else "right"
    position = np.searchsorted(right_edges, values, side=side)
    position = np.clip(position, 0, len(intervals) - 1)

    mapped = woe_by_bin[position]
    mapped[np.isnan(values)] = np.nan
    return pd.Series(mapped, index=raw.index, name=raw.name)


## 4) Cleaning dan Feature Engineering

In [None]:

# Work on copies so the raw frames stay intact and this cell can be re-run.
train = train_raw.copy()
test = test_raw.copy()

for frame in (train, test):
    # DAYS_* columns are negated before being converted to years.
    if "DAYS_BIRTH" in frame.columns:
        frame["AGE_YEARS"] = -frame["DAYS_BIRTH"] / 365.25
    if "DAYS_EMPLOYED" in frame.columns:
        # 365243 is replaced with NaN before the conversion.
        frame["DAYS_EMPLOYED"] = frame["DAYS_EMPLOYED"].replace(365243, np.nan)
        frame["YEARS_EMPLOYED"] = -frame["DAYS_EMPLOYED"] / 365.25

ext_cols = [
    name
    for name in ("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3")
    if name in train.columns
]
for frame in (train, test):
    for col in ext_cols:
        # An exact 0 is treated like a missing score: flag it, then blank it.
        is_zero = frame[col] == 0
        frame[f"{col}_MISSING"] = frame[col].isna() | is_zero
        frame.loc[is_zero, col] = np.nan

def safe_div(a, b):
    """Element-wise ``a / b`` that yields NaN where ``b`` is zero or missing.

    Returns whatever ``np.where`` produces for the inputs (an ndarray for
    Series/array inputs), not a pandas object.
    """
    unusable = pd.isna(b) | (b == 0)
    quotient = a / b
    return np.where(unusable, np.nan, quotient)

# (new column, numerator, denominator) — each ratio is added only when both
# source columns exist in the frame.
_RATIO_SPECS = (
    ("ANNUITY_INCOME_RATIO", "AMT_ANNUITY", "AMT_INCOME_TOTAL"),
    ("CREDIT_INCOME_RATIO", "AMT_CREDIT", "AMT_INCOME_TOTAL"),
    ("CREDIT_ANNUITY_RATIO", "AMT_CREDIT", "AMT_ANNUITY"),
)

for frame in (train, test):
    for ratio_name, numerator, denominator in _RATIO_SPECS:
        if numerator in frame.columns and denominator in frame.columns:
            frame[ratio_name] = safe_div(frame[numerator], frame[denominator])

def winsorize_train_apply(train_s, test_s, lower=0.01, upper=0.99):
    """Clip both series to the ``lower``/``upper`` quantiles of ``train_s``.

    The bounds come from the training series only and are then applied to
    both inputs. Returns ``(clipped_train, clipped_test)``.
    """
    bounds = train_s.quantile([lower, upper])
    floor, ceiling = bounds.iloc[0], bounds.iloc[1]
    clipped_train = train_s.clip(lower=floor, upper=ceiling)
    clipped_test = test_s.clip(lower=floor, upper=ceiling)
    return clipped_train, clipped_test

ratio_cols = [
    name
    for name in ("ANNUITY_INCOME_RATIO", "CREDIT_INCOME_RATIO", "CREDIT_ANNUITY_RATIO")
    if name in train.columns
]
amt_cols = [
    name
    for name in ("AMT_CREDIT", "AMT_ANNUITY", "AMT_INCOME_TOTAL")
    if name in train.columns
]

# Winsorise ratios first, then the raw amounts, with bounds learned on train.
for col in (*ratio_cols, *amt_cols):
    clipped_train, clipped_test = winsorize_train_apply(train[col], test[col])
    train[col] = clipped_train
    test[col] = clipped_test

def to_emp_band(x):
    """Return the employment-length band label for ``x`` years (NaN stays NaN)."""
    if pd.isna(x):
        return np.nan
    # Upper bounds are inclusive and checked in ascending order.
    for upper_bound, label in ((1, "<=1y"), (3, "1-3y"), (5, "3-5y"), (10, "5-10y")):
        if x <= upper_bound:
            return label
    return ">10y"

# Derive the categorical employment band wherever YEARS_EMPLOYED exists.
for frame in (train, test):
    if "YEARS_EMPLOYED" not in frame.columns:
        continue
    frame["EMP_BAND"] = frame["YEARS_EMPLOYED"].map(to_emp_band)

print("FE selesai.")


## 5) Memilih Fitur

In [26]:

def _keep_available(candidates, available):
    """Return the candidates present in ``available``, in candidate order."""
    return [name for name in candidates if name in available]


TARGET = "TARGET" if "TARGET" in train.columns else None

# Numeric inputs considered by the pipeline.
num_features = _keep_available(
    [
        "AGE_YEARS", "YEARS_EMPLOYED",
        "AMT_CREDIT", "AMT_ANNUITY", "AMT_INCOME_TOTAL",
        "ANNUITY_INCOME_RATIO", "CREDIT_INCOME_RATIO", "CREDIT_ANNUITY_RATIO",
        "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3",
    ],
    train.columns,
)

# Subset of the numeric inputs that is WOE-encoded.
woe_features = _keep_available(
    ["AGE_YEARS", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"],
    num_features,
)
cat_features = _keep_available(["EMP_BAND"], train.columns)
flag_features = [name for name in train.columns if name.endswith("_MISSING")]

for label, names in (
    ("Numerik   :", num_features),
    ("WOE apply :", woe_features),
    ("Kategori  :", cat_features),
    ("Flags     :", flag_features),
):
    print(label, names)


Numerik   : ['AGE_YEARS', 'YEARS_EMPLOYED', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL', 'ANNUITY_INCOME_RATIO', 'CREDIT_INCOME_RATIO', 'CREDIT_ANNUITY_RATIO', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
WOE apply : ['AGE_YEARS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
Kategori  : ['EMP_BAND']
Flags     : ['EXT_SOURCE_1_MISSING', 'EXT_SOURCE_2_MISSING', 'EXT_SOURCE_3_MISSING']


## 6) WOE Binning (Fit Train dan Apply Test)

In [27]:

if TARGET is None:
    raise ValueError("Kolom TARGET tidak ditemukan pada train.")

woe_artifacts = {}
train_woe = pd.DataFrame(index=train.index)
test_woe = pd.DataFrame(index=test.index)

for col in woe_features:
    # Bins and WOE values are learned on train only, then applied to both sets.
    art_full = compute_woe_monotonic_bins(train, feature=col, target=TARGET, n_bins=8, min_bin=0.05)
    intervals = art_full["bin_intervals"]
    woe_artifacts[col] = {
        "bin_edges": [[float(b.left), float(b.right)] for b in intervals],
        # Store the WOE of every bin (same order as "bin_edges"); without it
        # the saved file cannot reproduce the encoding at inference time.
        "woe": [float(art_full["woe_per_interval"][b]) for b in intervals],
        "direction": art_full["direction"],
    }
    train_woe[col + "_WOE"] = apply_woe(train[col], art_full)
    test_woe[col + "_WOE"] = apply_woe(test[col], art_full)

with open(ARTIFACT_DIR / "woe_artifacts.json", "w", encoding="utf-8") as f:
    json.dump(woe_artifacts, f, indent=2)

display(train_woe.head(3))


Unnamed: 0,AGE_YEARS_WOE,EXT_SOURCE_1_WOE,EXT_SOURCE_2_WOE,EXT_SOURCE_3_WOE
0,2.041899,1.619996,2.072215,1.458008
1,2.496886,2.132457,2.856785,
2,2.580206,,2.508759,3.234264


## 7) Imputasi, Encoding, dan Assemble

In [28]:

# Numeric features that are WOE-encoded enter the matrix only through their
# *_WOE column, so they are left out of the raw numeric block.
num_raw = [c for c in num_features if c not in woe_features]

# Median imputation for raw numerics; statistics come from train only.
num_imputer = SimpleImputer(strategy="median")
train_num = pd.DataFrame(num_imputer.fit_transform(train[num_raw]), columns=num_raw, index=train.index)
test_num  = pd.DataFrame(num_imputer.transform(test[num_raw]),   columns=num_raw, index=test.index)

# Rows without a WOE value (missing input) receive the train median WOE.
woe_imputer = SimpleImputer(strategy="median")
train_woe_imp = pd.DataFrame(woe_imputer.fit_transform(train_woe), columns=train_woe.columns, index=train.index)
test_woe_imp  = pd.DataFrame(woe_imputer.transform(test_woe),  columns=test_woe.columns,  index=test.index)

if cat_features:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    train_cat = pd.DataFrame(cat_imputer.fit_transform(train[cat_features]), columns=cat_features, index=train.index)
    test_cat  = pd.DataFrame(cat_imputer.transform(test[cat_features]),   columns=cat_features, index=test.index)

    # Pin every categorical column to the levels seen in train. Encoding the
    # two frames independently with drop_first=True would drop a different
    # reference level whenever test lacks the first train level, and the
    # subsequent align() would then zero-fill a real category.
    # A test level never seen in train becomes NaN here and is therefore
    # encoded as all zeros (i.e. like the reference level).
    for c in cat_features:
        levels = pd.CategoricalDtype(categories=sorted(train_cat[c].unique()))
        train_cat[c] = train_cat[c].astype(levels)
        test_cat[c] = test_cat[c].astype(levels)

    train_cat = pd.get_dummies(train_cat, drop_first=True)
    test_cat  = pd.get_dummies(test_cat, drop_first=True)
    # Kept as a safety net; with pinned categories the columns already match.
    train_cat, test_cat = train_cat.align(test_cat, join="left", axis=1, fill_value=0)
else:
    train_cat = pd.DataFrame(index=train.index); test_cat = pd.DataFrame(index=test.index)

train_flags = train[flag_features].astype(int) if flag_features else pd.DataFrame(index=train.index)
test_flags  = test[flag_features].astype(int)  if flag_features else pd.DataFrame(index=test.index)

# Final design matrices: raw numerics | WOE columns | dummies | missing flags.
train_X = pd.concat([train_num, train_woe_imp, train_cat, train_flags], axis=1)
test_X  = pd.concat([test_num,  test_woe_imp,  test_cat,  test_flags], axis=1)

print("train_X:", train_X.shape, "| test_X:", test_X.shape)
display(train_X.head(3))


train_X: (307511, 18) | test_X: (48744, 18)


Unnamed: 0,YEARS_EMPLOYED,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,ANNUITY_INCOME_RATIO,CREDIT_INCOME_RATIO,CREDIT_ANNUITY_RATIO,AGE_YEARS_WOE,EXT_SOURCE_1_WOE,EXT_SOURCE_2_WOE,EXT_SOURCE_3_WOE,EMP_BAND_3-5y,EMP_BAND_5-10y,EMP_BAND_<=1y,EMP_BAND_>10y,EXT_SOURCE_1_MISSING,EXT_SOURCE_2_MISSING,EXT_SOURCE_3_MISSING
0,1.744011,406597.5,24700.5,202500.0,0.121978,2.007889,16.461104,2.041899,1.619996,2.072215,1.458008,False,False,False,False,0,0,0
1,3.252567,1293502.5,35698.5,270000.0,0.132217,4.79075,36.234085,2.496886,2.132457,2.856785,2.877896,True,False,False,False,0,0,1
2,0.616016,135000.0,6750.0,67500.0,0.1,2.0,20.0,2.580206,2.61992,2.508759,3.234264,False,False,True,False,1,0,0


## 8) Scaling (RobustScaler)

In [29]:

def _frame_like(values, template):
    """Wrap ``values`` in a DataFrame carrying ``template``'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# Fit on train only, then apply the same transformation to both sets.
scaler = RobustScaler()
train_scaled = _frame_like(scaler.fit_transform(train_X), train_X)
test_scaled = _frame_like(scaler.transform(test_X), test_X)

# Persist the fitted scaler next to the other preprocessing artefacts.
scaler_path = ARTIFACT_DIR / "robust_scaler.pkl"
with open(scaler_path, "wb") as handle:
    pickle.dump(scaler, handle)

display(train_scaled.head(3))


Unnamed: 0,YEARS_EMPLOYED,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,ANNUITY_INCOME_RATIO,CREDIT_INCOME_RATIO,CREDIT_ANNUITY_RATIO,AGE_YEARS_WOE,EXT_SOURCE_1_WOE,EXT_SOURCE_2_WOE,EXT_SOURCE_3_WOE,EMP_BAND_3-5y,EMP_BAND_5-10y,EMP_BAND_<=1y,EMP_BAND_>10y,EXT_SOURCE_1_MISSING,EXT_SOURCE_2_MISSING,EXT_SOURCE_3_MISSING
0,-0.553366,-0.198521,-0.011205,0.615,-0.357506,-0.40022,-0.30812,-0.682951,-0.999924,-0.836902,-2.072001,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
1,-0.251779,1.448012,0.597361,1.365,-0.267911,0.485699,1.41345,0.0,-0.487463,0.667201,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,1.0
2,-0.778872,-0.70274,-1.004482,-0.885,-0.549823,-0.402732,0.0,0.125066,0.0,0.0,0.520037,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## 9) Menyimpan Dataset

In [None]:

train_out = train_scaled.copy()
test_out = test_scaled.copy()

# Re-attach the label (by position) so the train file is ready for modelling.
if "TARGET" in train.columns:
    train_out["TARGET"] = train["TARGET"].values

out_train = DATA_OUT_DIR / "train_processed.csv"
out_test = DATA_OUT_DIR / "test_processed.csv"
for frame, destination in ((train_out, out_train), (test_out, out_test)):
    frame.to_csv(destination, index=False)

print("Saved:")
for destination in (out_train, out_test):
    print("-", destination)
print("Artefak :", ARTIFACT_DIR)
