
# Feature Engineering – Credit Default Dataset

This notebook prepares a cleaned and feature-rich version of the **UCI Credit Default dataset** for modeling.  

**Goals:**  
- Clean and preprocess the raw data.  
- Engineer new features based on domain knowledge and EDA findings.  
- Save a processed dataset (`credit_processed.csv`) for modeling.  


## 1. Load Data

In [1]:
import sys
sys.path.append("../src")
import pandas as pd
import numpy as np
from data_loader import load_from_file, load_from_ucimlrepo

# Where to read the raw data from:
#   "local" -> load_from_file(LOCAL_FILE_PATH)
#   "api"   -> load_from_ucimlrepo()
DATA_SOURCE = "local"
LOCAL_FILE_PATH = "../data/credit.xls"

if DATA_SOURCE == "local":
    df = load_from_file(LOCAL_FILE_PATH)
elif DATA_SOURCE == "api":
    df = load_from_ucimlrepo()
else:
    # Fail fast: merely printing here would leave `df` undefined and the
    # next line would crash with an unrelated NameError.
    raise ValueError(
        f"Unknown DATA_SOURCE {DATA_SOURCE!r}; expected 'local' or 'api'"
    )

print("Shape:", df.shape)
df.head()

Shape: (30000, 25)


Unnamed: 0,Unnamed: 0_level_0_ID,X1_LIMIT_BAL,X2_SEX,X3_EDUCATION,X4_MARRIAGE,X5_AGE,X6_PAY_0,X7_PAY_2,X8_PAY_3,X9_PAY_4,...,X15_BILL_AMT4,X16_BILL_AMT5,X17_BILL_AMT6,X18_PAY_AMT1,X19_PAY_AMT2,X20_PAY_AMT3,X21_PAY_AMT4,X22_PAY_AMT5,X23_PAY_AMT6,Y_default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## 2. Data Cleaning

In [3]:

# The ID column is not predictive, so it is removed before modeling.
# errors="ignore" lets the cell run even when the column is already gone.
df = df.drop(columns=["Unnamed: 0_level_0_ID"], errors="ignore")

# Category counts before recoding
for cat_col in ("X3_EDUCATION", "X4_MARRIAGE"):
    print(df[cat_col].value_counts())

# Fold out-of-range category codes into the existing 'other' code
education_other = {0: 4, 5: 4, 6: 4}
marriage_other = {0: 3}
df["X3_EDUCATION"] = df["X3_EDUCATION"].replace(education_other)
df["X4_MARRIAGE"] = df["X4_MARRIAGE"].replace(marriage_other)

# The target column must be integer-typed
target_col = "Y_default payment next month"
df[target_col] = df[target_col].astype(int)

# Category counts after recoding
for cat_col in ("X3_EDUCATION", "X4_MARRIAGE"):
    print(df[cat_col].value_counts())

# The dataset is relatively clean.

X3_EDUCATION
2    14030
1    10585
3     4917
4      468
Name: count, dtype: int64
X4_MARRIAGE
2    15964
1    13659
3      377
Name: count, dtype: int64
X3_EDUCATION
2    14030
1    10585
3     4917
4      468
Name: count, dtype: int64
X4_MARRIAGE
2    15964
1    13659
3      377
Name: count, dtype: int64


## 3. Binning Features

In [7]:

# Bin AGE into decade-wide categories.
# pd.cut uses right-closed intervals by default, which would file age 30 under
# "20-29" (and leave age 20 unbinned). right=False makes every bin
# [lower, upper) so the labels describe the actual ranges. The open-ended upper
# edge keeps every age >= 60 inside "60+".
bins = [20, 30, 40, 50, 60, np.inf]
labels = ["20-29","30-39","40-49","50-59","60+"]
df["AGE_BIN"] = pd.cut(df["X5_AGE"], bins=bins, labels=labels, right=False)

print("Shape: ",df.shape)
df[["X5_AGE","AGE_BIN"]].head(10)



Shape:  (30000, 25)


Unnamed: 0,X5_AGE,AGE_BIN
0,24,20-29
1,26,20-29
2,34,30-39
3,37,30-39
4,57,50-59
5,37,30-39
6,29,20-29
7,23,20-29
8,28,20-29
9,35,30-39


## 4. Encoding Categorical Features

In [8]:

# Expand SEX, EDUCATION, MARRIAGE and AGE_BIN into indicator columns.
# drop_first=True omits the first level of each feature, which then acts as
# the implicit baseline category.
categorical_cols = [
    "X2_SEX",
    "X3_EDUCATION",
    "X4_MARRIAGE",
    "AGE_BIN",
]
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)
print("Shape: ", df.shape)
df.head()


Shape:  (30000, 31)


Unnamed: 0,X1_LIMIT_BAL,X5_AGE,X6_PAY_0,X7_PAY_2,X8_PAY_3,X9_PAY_4,X10_PAY_5,X11_PAY_6,X12_BILL_AMT1,X13_BILL_AMT2,...,X2_SEX_2,X3_EDUCATION_2,X3_EDUCATION_3,X3_EDUCATION_4,X4_MARRIAGE_2,X4_MARRIAGE_3,AGE_BIN_30-39,AGE_BIN_40-49,AGE_BIN_50-59,AGE_BIN_60+
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,True,True,False,False,False,False,False,False,False,False
1,120000,26,-1,2,0,0,0,2,2682,1725,...,True,True,False,False,True,False,False,False,False,False
2,90000,34,0,0,0,0,0,0,29239,14027,...,True,True,False,False,True,False,True,False,False,False
3,50000,37,0,0,0,0,0,0,46990,48233,...,True,True,False,False,False,False,True,False,False,False
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,False,True,False,False,False,False,False,False,True,False


## 5. Feature Engineering

In [9]:

# Column groups used throughout this cell (numbered suffixes 1..6)
bill_cols = [f"X{11+i}_BILL_AMT{i}" for i in range(1,7)]
pay_amt_cols = [f"X{17+i}_PAY_AMT{i}" for i in range(1,7)]
pay_cols = ["X6_PAY_0","X7_PAY_2","X8_PAY_3","X9_PAY_4","X10_PAY_5","X11_PAY_6"]

# Utilization: each bill amount as a fraction of the credit limit
credit_limit = df["X1_LIMIT_BAL"]
for month, bill_col in enumerate(bill_cols, start=1):
    df[f"utilization{month}"] = df[bill_col] / credit_limit

# Repayment: payment amount relative to the same-numbered bill amount.
# Zero bills are swapped for NaN first, so the ratio comes out as NaN
# instead of dividing by zero.
for month, (paid_col, bill_col) in enumerate(zip(pay_amt_cols, bill_cols), start=1):
    nonzero_bill = df[bill_col].replace(0, np.nan)
    df[f"repay_ratio{month}"] = df[paid_col] / nonzero_bill

# Payment-delay summaries across the six PAY_* status columns
delays = df[pay_cols]
df["avg_delay"] = delays.mean(axis=1)
df["max_delay"] = delays.max(axis=1)

# Row-wise averages of bills and payments
df["avg_bill"] = df[bill_cols].mean(axis=1)
df["avg_payment"] = df[pay_amt_cols].mean(axis=1)

df.head()


Unnamed: 0,X1_LIMIT_BAL,X5_AGE,X6_PAY_0,X7_PAY_2,X8_PAY_3,X9_PAY_4,X10_PAY_5,X11_PAY_6,X12_BILL_AMT1,X13_BILL_AMT2,...,repay_ratio1,repay_ratio2,repay_ratio3,repay_ratio4,repay_ratio5,repay_ratio6,avg_delay,max_delay,avg_bill,avg_payment
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,0.0,0.222115,0.0,,,,-0.333333,2,1284.0,114.833333
1,120000,26,-1,2,0,0,0,2,2682,1725,...,0.0,0.57971,0.372856,0.305623,0.0,0.613309,0.5,2,2846.166667,833.333333
2,90000,34,0,0,0,0,0,0,29239,14027,...,0.051917,0.106937,0.073752,0.069779,0.066899,0.321564,0.0,0,16942.166667,1836.333333
3,50000,37,0,0,0,0,0,0,46990,48233,...,0.042562,0.041859,0.024345,0.03885,0.036914,0.033844,0.0,0,38555.666667,1398.0
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,0.232099,6.469312,0.279057,0.429799,0.035987,0.035492,-0.333333,0,18223.166667,9841.5


## 6. Scaling & Transformations

In [11]:

# Apply log transform to skewed monetary features
def signed_log(x):
    """Return sign(x) * log(1 + |x|).

    Compresses large magnitudes like a log transform while staying defined
    for zero and negative inputs: 0 maps to 0 and the sign of x is kept.
    """
    magnitude = np.log1p(abs(x))
    direction = np.sign(x)
    return direction * magnitude

# The original line referenced `col` without any enclosing loop, so it raised
# NameError on a fresh kernel. Loop explicitly over the bill and payment amount
# columns and add a "<col>_log" column for each.
# NOTE(review): add "X1_LIMIT_BAL" here if the credit limit should be
# log-transformed as well.
monetary_cols = (
    [f"X{11+i}_BILL_AMT{i}" for i in range(1, 7)]
    + [f"X{17+i}_PAY_AMT{i}" for i in range(1, 7)]
)

for col in monetary_cols:
    # signed_log is built from numpy ufuncs, so it can take the whole column
    # at once instead of being applied element by element.
    df[f"{col}_log"] = signed_log(df[col])

df.head()


Unnamed: 0,X1_LIMIT_BAL,X5_AGE,X6_PAY_0,X7_PAY_2,X8_PAY_3,X9_PAY_4,X10_PAY_5,X11_PAY_6,X12_BILL_AMT1,X13_BILL_AMT2,...,X14_BILL_AMT3_log,X15_BILL_AMT4_log,X16_BILL_AMT5_log,X17_BILL_AMT6_log,X18_PAY_AMT1_log,X19_PAY_AMT2_log,X20_PAY_AMT3_log,X21_PAY_AMT4_log,X22_PAY_AMT5_log,X23_PAY_AMT6_log
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,6.536692,0.0,0.0,0.0,0.0,6.536692,0.0,0.0,0.0,0.0
1,120000,26,-1,2,0,0,0,2,2682,1725,...,7.894691,8.093462,8.147867,8.090096,0.0,6.908755,6.908755,6.908755,0.0,7.601402
2,90000,34,0,0,0,0,0,0,29239,14027,...,9.51488,9.57025,9.6124,9.651816,7.325808,7.313887,6.908755,6.908755,6.908755,8.517393
3,50000,37,0,0,0,0,0,0,46990,48233,...,10.805517,10.251147,10.273671,10.293771,7.601402,7.610853,7.09091,7.003974,6.975414,6.908755
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,10.486708,9.949464,9.859901,9.859118,7.601402,10.510041,9.21044,9.105091,6.536692,6.522093


## 7. Save Processed Dataset

In [12]:

# Write the engineered frame to disk for the modeling notebook.
# index=False leaves the row index out of the CSV.
output_path = "../data/credit_processed.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"Processed dataset saved to {output_path}")


Processed dataset saved to ../data/credit_processed.csv



## 8. Observations  

- Engineered new behavioral features (utilization ratios, repayment ratios, delays).  
- Aggregated across months to create summary variables (average bill, average payment).  
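- Applied signed log transforms (`sign(x) * log1p(|x|)`) to reduce skew in the monetary features while preserving the sign of each value.  
- Final dataset is ready for modeling; note that the `repay_ratio*` columns contain missing values wherever the corresponding bill amount is zero, so impute or drop them before fitting models that cannot handle NaN.  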
