In [8]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# ----------------------------
# 0) Load dataset
# ----------------------------
# The CSV location defaults to the path this notebook was authored with.
# Set the LOAN_DEFAULT_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    "LOAN_DEFAULT_DATA_PATH",
    "D:/datasets/dpp/week6_loan_default_feature_construction_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

# Name of the label column; it is selected as `y` in the train/test split cell.
TARGET = "Defaulted"

# Fail here, next to the load, with a message that names the file, instead of
# with a bare KeyError several cells later.
if TARGET not in df.columns:
    raise KeyError(f"Target column {TARGET!r} not found in {DATA_PATH}")

df

Unnamed: 0,Age,Gender,Marital_Status,Dependents,Employment_Type,Industry,Monthly_Income,Monthly_Expense,Existing_EMI,Loan_Amount,...,Credit_Limit,Used_Credit,Credit_Score,Delinquencies_12M,Property_Value,Call_Count,Complaint_Flag,Review_Sentiment,Review_Length,Defaulted
0,35,Female,Married,1,Contract,Govt,45473.0,6000.0,5447.0,669638.0,...,117468.0,76660.0,742,0,1268505.0,1,0,0.369,65,1
1,35,Male,Single,1,Salaried,Healthcare,62855.0,6000.0,2772.0,470995.0,...,133482.0,60598.0,764,0,634418.0,2,0,0.918,55,1
2,46,Female,Married,0,Salaried,Healthcare,34801.0,7309.0,12223.0,580644.0,...,274807.0,201013.0,611,1,1287509.0,1,0,-0.064,72,1
3,23,Female,Married,1,Salaried,IT,138747.0,6598.0,40198.0,1820510.0,...,423747.0,53814.0,726,2,2482016.0,3,0,-0.600,83,1
4,43,Male,Married,0,Self-Employed,Govt,57378.0,6802.0,25633.0,1027175.0,...,256971.0,39347.0,708,0,2190849.0,3,0,0.377,99,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,21,Female,Single,1,Salaried,IT,81263.0,25320.0,42357.0,988881.0,...,221976.0,14737.0,670,0,1616807.0,1,0,0.923,47,1
1496,34,Male,Single,0,Salaried,Govt,36626.0,8657.0,9852.0,448097.0,...,204444.0,79946.0,747,0,790887.0,1,0,0.306,55,1
1497,21,Male,Single,0,Salaried,Manufacturing,139386.0,35086.0,18086.0,1520510.0,...,865071.0,380777.0,809,0,2616213.0,4,0,0.450,85,1
1498,48,Male,Married,2,Salaried,Retail,129704.0,14001.0,11350.0,2471405.0,...,1032244.0,570247.0,685,0,2735428.0,2,0,0.569,57,1


In [9]:
# ----------------------------
# 1) Feature Construction
# ----------------------------
# Work on a copy so the engineered columns added below never touch the
# originally loaded frame `df`; re-running this cell therefore starts clean.
df_feat = df.copy()

# Safe divisions
def safe_div(a, b):
    """Divide ``a`` by ``b``, yielding NaN wherever the denominator is zero.

    Zero denominators are swapped for NaN *before* dividing, so the result
    contains NaN at those positions instead of ``inf`` or a
    ``ZeroDivisionError``.

    Parameters
    ----------
    a : scalar, array-like, or pandas object
        Numerator.
    b : scalar, array-like, or pandas object
        Denominator. pandas Series/DataFrame, NumPy arrays, plain sequences
        and scalars are all accepted.

    Returns
    -------
    Whatever ``a / b`` produces for the given operand types.
    """
    if isinstance(b, (pd.Series, pd.DataFrame)):
        # pandas path: element-wise replacement keeps the index for alignment.
        b = b.replace(0, np.nan)
    elif np.ndim(b) > 0:
        # Array-likes: `b == 0` is element-wise here, so it must not be used
        # as a plain boolean (that raises "truth value is ambiguous").
        b = np.asarray(b, dtype=float)
        b = np.where(b == 0, np.nan, b)
    elif b == 0:
        # Scalar zero.
        b = np.nan
    return a / b

# All engineered columns are added through one `.assign` chain. Each stage may
# read columns created by an earlier stage (callables receive the frame as it
# stands at that point), and columns are appended in the order listed.
df_feat = (
    df_feat
    # (A) Ratio features (financial normalization)
    .assign(
        DTI_Existing=lambda d: safe_div(d["Existing_EMI"], d["Monthly_Income"]),
        DTI_Total=lambda d: safe_div(d["Existing_EMI"] + d["New_EMI"], d["Monthly_Income"]),
        Credit_Utilization=lambda d: safe_div(d["Used_Credit"], d["Credit_Limit"]),
        Savings_Ratio=lambda d: safe_div(
            d["Monthly_Income"] - d["Monthly_Expense"], d["Monthly_Income"]
        ),
        LTV=lambda d: safe_div(d["Loan_Amount"], d["Property_Value"]),
    )
    # (B) Difference features (gap / change)
    .assign(
        EMI_Gap=lambda d: d["New_EMI"] - d["Existing_EMI"],
        Disposable_Income=lambda d: d["Monthly_Income"] - d["Monthly_Expense"],
        PostLoan_Disposable=lambda d: d["Monthly_Income"]
        - (d["Monthly_Expense"] + d["Existing_EMI"] + d["New_EMI"]),
    )
    # (C) Interaction features (risk compounding)
    .assign(
        DTI_x_Utilization=lambda d: d["DTI_Total"] * d["Credit_Utilization"],
        LowScore_x_Delinquency=lambda d: d["Credit_Score"].lt(620).astype(int)
        * d["Delinquencies_12M"],
        Complaints_x_CallCount=lambda d: d["Complaint_Flag"] * d["Call_Count"],
    )
    # (D) Domain flags (simple underwriting style signals).
    # A missing ratio compares as False, so its flag is 0.
    .assign(
        High_LTV_Flag=lambda d: d["LTV"].gt(0.8).astype(int),
        High_DTI_Flag=lambda d: d["DTI_Total"].gt(0.45).astype(int),
        High_Util_Flag=lambda d: d["Credit_Utilization"].gt(0.8).astype(int),
        Low_CreditScore_Flag=lambda d: d["Credit_Score"].lt(650).astype(int),
    )
    # (E) Domain-based composite score (interpretable risk meter).
    # Missing ratios contribute 0 to the weighted sum.
    .assign(
        Risk_Index=lambda d: (
            2.8 * d["DTI_Total"].fillna(0)
            + 1.6 * d["Credit_Utilization"].fillna(0)
            + 0.9 * d["Delinquencies_12M"].gt(0).astype(int)
            + 0.6 * d["High_LTV_Flag"]
            + 0.5 * d["Credit_Score"].lt(620).astype(int)
            + 0.4 * d["Employment_Type"].eq("Contract").astype(int)
            - 0.3 * d["Savings_Ratio"].fillna(0).gt(0.25).astype(int)
            + 0.25 * d["Complaint_Flag"]
        ),
    )
    # Any remaining +/-inf anywhere in the frame becomes NaN.
    .replace([np.inf, -np.inf], np.nan)
)


In [10]:
# ----------------------------
# 2) Define feature sets
# ----------------------------
# Columns that are one-hot encoded.
categorical_cols = [
    "Gender",
    "Marital_Status",
    "Employment_Type",
    "Industry",
]

# Numeric columns taken from the dataset as-is (baseline model inputs).
raw_numeric_cols = [
    # applicant profile
    "Age", "Dependents",
    # cash flow
    "Monthly_Income", "Monthly_Expense",
    # loan terms
    "Existing_EMI", "Loan_Amount", "Interest_Rate", "Tenure_Months", "New_EMI",
    # credit history
    "Credit_Limit", "Used_Credit", "Credit_Score", "Delinquencies_12M",
    # collateral and service interactions
    "Property_Value", "Call_Count", "Complaint_Flag",
    # review text statistics
    "Review_Sentiment", "Review_Length",
]

# Raw numeric columns followed by every constructed feature.
engineered_numeric_cols = [
    *raw_numeric_cols,
    # ratios
    "DTI_Existing", "DTI_Total", "Credit_Utilization", "Savings_Ratio", "LTV",
    # differences
    "EMI_Gap", "Disposable_Income", "PostLoan_Disposable",
    # interactions
    "DTI_x_Utilization", "LowScore_x_Delinquency", "Complaints_x_CallCount",
    # threshold flags
    "High_LTV_Flag", "High_DTI_Flag", "High_Util_Flag", "Low_CreditScore_Flag",
    # composite score
    "Risk_Index",
]


In [11]:
# ----------------------------
# 3) Train/Test Split
# ----------------------------
# X holds the widest feature view (categoricals + raw + engineered numerics);
# narrower views are sliced from the resulting splits later on.
X = df_feat.loc[:, [*categorical_cols, *engineered_numeric_cols]]
y = df_feat.loc[:, TARGET]

# 75/25 hold-out, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

# Helper to build preprocessors
def make_preprocessor(numeric_cols, cat_cols, use_poly=False, poly_degree=2):
    """Build the ColumnTransformer shared by all models in this notebook.

    Numeric columns go through median imputation, optionally a polynomial
    expansion, and finally standardization. Categorical columns are one-hot
    encoded, with categories unseen at fit time ignored at transform time.
    Columns in neither list are dropped.

    Parameters
    ----------
    numeric_cols : list of str
        Columns routed through the numeric pipeline.
    cat_cols : list of str
        Columns routed through the one-hot encoder.
    use_poly : bool, default False
        Insert ``PolynomialFeatures`` between imputation and scaling.
    poly_degree : int, default 2
        Degree passed to ``PolynomialFeatures``; only used when ``use_poly``.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a "num" and a "cat" branch.
    """
    # The expansion sits after imputation and before scaling, so the generated
    # squares and cross terms are standardized as well.
    expansion = (
        [("poly", PolynomialFeatures(degree=poly_degree, include_bias=False))]
        if use_poly
        else []
    )
    numeric_pipeline = Pipeline(
        [
            ("imp", SimpleImputer(strategy="median")),
            *expansion,
            ("scaler", StandardScaler()),
        ]
    )

    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ],
        remainder="drop",
    )



In [12]:
# ----------------------------
# 4) Models: Baseline vs Engineered vs Engineered+Polynomial
# ----------------------------
def _logit_pipeline(numeric_cols, use_poly=False):
    """Preprocessing + logistic regression over ``numeric_cols`` and the shared categoricals."""
    return Pipeline([
        ("prep", make_preprocessor(numeric_cols, categorical_cols, use_poly=use_poly, poly_degree=2)),
        ("clf", LogisticRegression(max_iter=1200)),
    ])


# The baseline sees only the categorical + raw numeric columns (no engineered ones).
X_train_raw = X_train.loc[:, [*categorical_cols, *raw_numeric_cols]]
X_test_raw = X_test.loc[:, [*categorical_cols, *raw_numeric_cols]]

# Three variants that differ only in their numeric inputs / polynomial expansion.
baseline = _logit_pipeline(raw_numeric_cols)
engineered = _logit_pipeline(engineered_numeric_cols)
engineered_poly = _logit_pipeline(engineered_numeric_cols, use_poly=True)

# Train each model on the feature view it was built for. The last call is left
# as the cell's final expression so the fitted pipeline is displayed.
baseline.fit(X_train_raw, y_train)
engineered.fit(X_train, y_train)
engineered_poly.fit(X_train, y_train)


0,1,2
,steps,"[('prep', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1200


In [13]:
# ----------------------------
# 5) Evaluate
# ----------------------------
def evaluate(model, X_te, y_te, name, threshold=0.5):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba``; column 1 of its output is used as the
        positive-class score.
    X_te : DataFrame
        Features in the layout the model was fitted on.
    y_te : array-like
        True labels for ``X_te``.
    name : str
        Label stored under the "Model" key of the result.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted as class 1.
        Only the thresholded metrics depend on it; ROC AUC is computed from
        the raw scores.

    Returns
    -------
    dict
        Keys "Model", "ROC_AUC", "Accuracy", "Precision", "Recall", "F1".
    """
    proba = model.predict_proba(X_te)[:, 1]
    pred = (proba >= threshold).astype(int)

    return {
        "Model": name,
        "ROC_AUC": roc_auc_score(y_te, proba),
        "Accuracy": accuracy_score(y_te, pred),
        # zero_division=0: report 0 instead of warning when a class is never predicted.
        "Precision": precision_score(y_te, pred, zero_division=0),
        "Recall": recall_score(y_te, pred, zero_division=0),
        "F1": f1_score(y_te, pred, zero_division=0)
    }

# Score each fitted pipeline on the test view matching its training columns,
# then rank the rows by ROC AUC, best first.
results = pd.DataFrame(
    [
        evaluate(fitted, X_eval, y_test, label)
        for fitted, X_eval, label in (
            (baseline, X_test_raw, "Baseline (Raw Features)"),
            (engineered, X_test, "Engineered (Ratios/Diffs/Interactions/Domain)"),
            (engineered_poly, X_test, "Engineered + PolynomialFeatures(degree=2)"),
        )
    ]
).sort_values(by="ROC_AUC", ascending=False)

print("\n=== Model Comparison ===", results.to_string(index=False), sep="\n")



=== Model Comparison ===
                                        Model  ROC_AUC  Accuracy  Precision   Recall       F1
                      Baseline (Raw Features) 0.927224  0.989333   0.989333 1.000000 0.994638
Engineered (Ratios/Diffs/Interactions/Domain) 0.904313  0.984000   0.989276 0.994609 0.991935
    Engineered + PolynomialFeatures(degree=2) 0.883423  0.984000   0.989276 0.994609 0.991935


In [14]:
# ----------------------------
# 6) Preview a selection of raw and engineered columns (sanity check view)
# ----------------------------
preview_cols = [
    # raw inputs behind the ratios
    "Monthly_Income", "Monthly_Expense", "Existing_EMI", "New_EMI",
    # ratio features
    "DTI_Total", "Credit_Utilization", "Savings_Ratio", "LTV",
    # differences, interaction, composite score
    "EMI_Gap", "PostLoan_Disposable", "DTI_x_Utilization", "Risk_Index",
    # credit history and label
    "Credit_Score", "Delinquencies_12M", "Defaulted",
]

# First eight rows, printed as plain text without the index.
preview = df_feat.loc[:, preview_cols].iloc[:8]
print("\n=== Engineered Feature Preview (first 8 rows) ===")
print(preview.to_string(index=False))



=== Engineered Feature Preview (first 8 rows) ===
 Monthly_Income  Monthly_Expense  Existing_EMI  New_EMI  DTI_Total  Credit_Utilization  Savings_Ratio      LTV  EMI_Gap  PostLoan_Disposable  DTI_x_Utilization  Risk_Index  Credit_Score  Delinquencies_12M  Defaulted
        45473.0           6000.0        5447.0  17184.0   0.497680            0.652603       0.868054 0.527895  11737.0              16842.0           0.324788    2.537669           742                  0          1
        62855.0           6000.0        2772.0  12468.0   0.242463            0.453979       0.904542 0.742405   9696.0              41615.0           0.110073    1.105262           764                  0          1
        34801.0           7309.0       12223.0  27152.0   1.131433            0.731470       0.789977 0.450982  14929.0             -11883.0           0.827609    5.438364           611                  1          1
       138747.0           6598.0       40198.0  44495.0   0.610413            0.12699