In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:
np.random.seed(42)
N = 10000 # Number of loan applications

data = {
    'Annual_Income': np.random.lognormal(mean=10.5, sigma=0.5, size=N) * 1000,
    'Credit_Score': np.random.randint(500, 850, size=N),
    'Debt_To_Income': np.random.lognormal(mean=np.log(0.2), sigma=0.4, size=N),
    'Employment_Length': np.random.poisson(5, size=N), # Years
    'Home_Ownership': np.random.choice(['MORTGAGE', 'RENT', 'OWN'], size=N, p=[0.5, 0.4, 0.1]),
    'Loan_Amount': np.random.randint(5000, 50000, size=N),
}

df = pd.DataFrame(data)


In [9]:
 #Default probability is modeled primarily by high DTI, low credit score, and low income.
default_prob = (
    0.7 * (df['Debt_To_Income'] / df['Debt_To_Income'].max()) +
    0.5 * (1 - (df['Credit_Score'] / df['Credit_Score'].max())) +
    0.3 * (1 - (df['Annual_Income'] / df['Annual_Income'].max()))
)
# Add some randomness and scale down the overall default rate
default_prob = default_prob * 0.4 + np.random.rand(N) * 0.1
default_prob = np.clip(default_prob, 0.05, 0.25) # Clip to realistic default rates

df['Default'] = (np.random.rand(N) < default_prob).astype(int)


In [11]:
# Apply domain-specific cleaning/capping
df['Employment_Length'] = df['Employment_Length'].clip(0, 20) # Cap employment at 20 years
df['Debt_To_Income'] = df['Debt_To_Income'].clip(upper=1.0) # Cap DTI at 100%

print("--- Sample of Simulated Data (First 5 Rows) ---")
print(df.head())
print(f"\nDefault Rate: {df['Default'].mean():.2%}")
print("-" * 50)

--- Sample of Simulated Data (First 5 Rows) ---
   Annual_Income  Credit_Score  Debt_To_Income  Employment_Length  \
0   4.655348e+07           619        0.146511                  3   
1   3.388975e+07           836        0.175938                  5   
2   5.020371e+07           530        0.165753                  7   
3   7.777030e+07           680        0.199925                  8   
4   3.230326e+07           661        0.168185                  3   

  Home_Ownership  Loan_Amount  Default  
0           RENT        34246        0  
1       MORTGAGE        42979        0  
2       MORTGAGE        31832        0  
3       MORTGAGE        27588        0  
4       MORTGAGE        18403        0  

Default Rate: 23.61%
--------------------------------------------------


In [13]:
# --- 2. Feature Engineering & Preprocessing Pipeline ---

# Define feature types
numerical_features = ['Annual_Income', 'Credit_Score', 'Debt_To_Income', 'Employment_Length', 'Loan_Amount']
categorical_features = ['Home_Ownership']

# Create preprocessing steps
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [15]:
# Create a ColumnTransformer to apply transformations to the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

In [17]:
# --- 3. Model Training ---
X = df.drop('Default', axis=1)
y = df['Default']

In [19]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [21]:
# Create the final modeling pipeline (Preprocessor + Classifier)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')) # 'balanced' to help with lower default rate
])

print("Training Logistic Regression Model...")
model.fit(X_train, y_train)
print("Training Complete.")
print("-" * 50)

Training Logistic Regression Model...
Training Complete.
--------------------------------------------------


In [23]:
# --- 4. Evaluation and Risk Scoring ---

# Predict probabilities (Risk Score)
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of defaulting (Class 1)

# Predict class labels
y_pred = model.predict(X_test)

# Calculate key metrics
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"Model Evaluation (Test Set):")
print(f"Area Under ROC Curve (AUC): {auc_score:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display Confusion Matrix for context
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (True Positives, False Positives, etc.):")
print(cm)
print("-" * 50)

Model Evaluation (Test Set):
Area Under ROC Curve (AUC): 0.5149

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.50      0.60      1528
           1       0.24      0.53      0.33       472

    accuracy                           0.50      2000
   macro avg       0.51      0.51      0.47      2000
weighted avg       0.65      0.50      0.54      2000

Confusion Matrix (True Positives, False Positives, etc.):
[[757 771]
 [224 248]]
--------------------------------------------------


In [25]:
# --- 5. Model Interpretability (The "Why") ---

# 5a. Extract Feature Names
# Get the one-hot encoded feature names
ohe_feature_names = list(model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))
# Combine all feature names
feature_names = numerical_features + ohe_feature_names

# 5b. Extract Coefficients (Impact on Risk)
coefficients = model.named_steps['classifier'].coef_[0]
feature_importance = pd.Series(coefficients, index=feature_names).sort_values(ascending=False)

print("--- Model Interpretability: Feature Impact (Coefficients) ---")
print("These coefficients show how a 1-unit increase (after standardization) in a feature affects the *log-odds* of default.")
print("\nTop 10 Features Driving HIGHER Default Risk (Positive Coefficient):")
print(feature_importance.head(10).round(4))

print("\nTop 10 Features Driving LOWER Default Risk (Negative Coefficient):")
# Features with the largest negative coefficients decrease the risk of default
print(feature_importance.tail(10).round(4))
print("-" * 50)

--- Model Interpretability: Feature Impact (Coefficients) ---
These coefficients show how a 1-unit increase (after standardization) in a feature affects the *log-odds* of default.

Top 10 Features Driving HIGHER Default Risk (Positive Coefficient):
Debt_To_Income             0.0476
Home_Ownership_MORTGAGE    0.0310
Loan_Amount                0.0094
Home_Ownership_OWN         0.0063
Employment_Length         -0.0286
Home_Ownership_RENT       -0.0405
Credit_Score              -0.0607
Annual_Income             -0.0676
dtype: float64

Top 10 Features Driving LOWER Default Risk (Negative Coefficient):
Debt_To_Income             0.0476
Home_Ownership_MORTGAGE    0.0310
Loan_Amount                0.0094
Home_Ownership_OWN         0.0063
Employment_Length         -0.0286
Home_Ownership_RENT       -0.0405
Credit_Score              -0.0607
Annual_Income             -0.0676
dtype: float64
--------------------------------------------------


In [27]:
# 5c. Demonstrate Risk Score Output for New Applicants
def get_risk_score(applicant_data):
    """Predicts the probability of default for new applicants."""
    applicant_df = pd.DataFrame([applicant_data])
    risk_score = model.predict_proba(applicant_df)[:, 1][0]
    return risk_score

# Example Applicant 1: High Risk Profile
applicant_high_risk = {
    'Annual_Income': 45000.0,
    'Credit_Score': 550,
    'Debt_To_Income': 0.6,
    'Employment_Length': 1,
    'Home_Ownership': 'RENT',
    'Loan_Amount': 35000
}

# Example Applicant 2: Low Risk Profile
applicant_low_risk = {
    'Annual_Income': 180000.0,
    'Credit_Score': 800,
    'Debt_To_Income': 0.1,
    'Employment_Length': 15,
    'Home_Ownership': 'MORTGAGE',
    'Loan_Amount': 10000
}

risk_high = get_risk_score(applicant_high_risk)
risk_low = get_risk_score(applicant_low_risk)

print("--- Risk Score Application ---")
print(f"Applicant 1 (High Risk) Default Probability: {risk_high:.2%}")
print(f"Applicant 2 (Low Risk) Default Probability: {risk_low:.2%}")
print("This probability (or risk score) is what the bank would use to make a lending decision.")


--- Risk Score Application ---
Applicant 1 (High Risk) Default Probability: 60.33%
Applicant 2 (Low Risk) Default Probability: 46.88%
This probability (or risk score) is what the bank would use to make a lending decision.
