# EMIPredict AI — Feature Engineering

Objective:
Create financial intelligence features that reflect repayment capacity and risk.

These engineered features (total expenses, income-relative ratios, and a composite risk score) are intended to improve downstream model performance.


In [2]:
import pandas as pd
import numpy as np
# Load the cleaned EMI dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../Data/cleaned_emi_dataset.csv")

print("Shape:", df.shape)

# A bare `df.head()` is only rendered when it is the last expression of a
# cell; with further statements after it the preview is silently discarded.
# Calling display() makes the preview appear regardless of what follows.
# (`display` is provided as a built-in inside IPython/Jupyter kernels.)
display(df.head())
# Monthly outflow columns; their row-wise sum becomes `total_expenses`.
expense_cols = [
    "monthly_rent",
    "school_fees",
    "college_fees",
    "travel_expenses",
    "groceries_utilities",
    "other_monthly_expenses",
    "current_emi_amount",
]

# Shared intermediates, named once so every ratio uses the same denominator.
income = df["monthly_salary"]
outflow = df[expense_cols].sum(axis=1)
leftover = income - outflow

# Every ratio below is a fraction of monthly salary. Rows where
# `monthly_salary` is 0 yield +/-inf (or NaN for 0/0) at this stage.
# NOTE(review): `max_monthly_emi` may be a prediction target — confirm that
# deriving `emi_burden_ratio` (and hence `risk_score`) from it does not leak
# the label into the feature set.
df = df.assign(
    total_expenses=outflow,
    debt_to_income_ratio=df["current_emi_amount"] / income,
    expense_to_income_ratio=outflow / income,
    disposable_income=leftover,
    savings_ratio=leftover / income,
    emi_burden_ratio=df["max_monthly_emi"] / income,
)

# Composite score: debt, expense and EMI burden raise it, savings lower it.
# The weights are fixed constants chosen by the notebook author.
df = df.assign(
    risk_score=(
        0.35 * df["debt_to_income_ratio"]
        + 0.25 * df["expense_to_income_ratio"]
        - 0.20 * df["savings_ratio"]
        + 0.20 * df["emi_burden_ratio"]
    )
)
# Turn +/-inf into NaN so the fills below handle both kinds of bad value.
df = df.replace([np.inf, -np.inf], np.nan)

# Resolve both column groups first, then fill each with its placeholder.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object", "string"]).columns

# Missing numerics (including the former infinities) become 0.
# NOTE(review): a ratio that was non-finite ends up as 0 here, which reads as
# "no burden / no risk" — confirm that is the intended encoding for such rows.
df[num_cols] = df[num_cols].fillna(0)

# Missing categoricals become the literal label "Unknown".
df[cat_cols] = df[cat_cols].fillna("Unknown")

# Summary statistics for the engineered columns — a quick look at their
# ranges before the frame is used further.
engineered_cols = [
    "debt_to_income_ratio",
    "expense_to_income_ratio",
    "savings_ratio",
    "emi_burden_ratio",
    "disposable_income",
    "risk_score",
]

# A bare `.describe()` expression is only rendered when it is the last
# expression of a cell; otherwise the table is computed and thrown away.
# display() shows it wherever this statement sits.
# (`display` is provided as a built-in inside IPython/Jupyter kernels.)
display(df[engineered_cols].describe())
# Write the frame to CSV without the row index, then confirm on stdout.
featured_path = "../Data/featured_emi_dataset.csv"
df.to_csv(featured_path, index=False)
print("✅ Featured dataset saved")


Shape: (404800, 27)
✅ Featured dataset saved
