# Task 3

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [3]:
df = pd.read_csv('Task_3_and_4_Loan_Data.csv')
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [10]:
class DefaultRiskModel:
    def __init__(self, recovery_rate=0.1):
        self.recovery_rate = recovery_rate
        self.model = None
        self.scaler = None
        self.feature_columns = None

    def train(self, df, target_col='default', id_col='customer_id'):
        df = df.drop(columns=[id_col], errors='ignore') #the id doesnt give any info

        X = df.drop(columns=[target_col])
        y = df[target_col]

        self.feature_columns = X.columns.tolist()

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

        #trying multiple models
        logreg = LogisticRegression()
        rf = RandomForestClassifier(n_estimators=100, random_state=42)

        logreg.fit(X_train, y_train)
        rf.fit(X_train, y_train)

        auc_log = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])
        auc_rf = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

        self.model = rf if auc_rf > auc_log else logreg #whichever has better predictions
        print(f"Selected model: {'Random Forest' if self.model == rf else 'Logistic Regression'} (AUC = {max(auc_rf, auc_log):.4f})")

    def estimate_expected_loss(self, borrower_features):
        x = pd.DataFrame([borrower_features])
        x = x[self.feature_columns]  
        x_scaled = self.scaler.transform(x)

        pd_est = self.model.predict_proba(x_scaled)[0][1]
        loan_amt = borrower_features['loan_amt_outstanding']
        el = pd_est * loan_amt * (1 - self.recovery_rate)

        return {"probability_of_default": pd_est,"expected_loss": el}

In [11]:
model = DefaultRiskModel()
model.train(df)

Selected model: Logistic Regression (AUC = 1.0000)


In [12]:
borrower = {"credit_lines_outstanding": 3,"loan_amt_outstanding": 5000,"total_debt_outstanding": 10000,"income": 40000,"years_employed": 3,"fico_score": 590}
print(model.estimate_expected_loss(borrower))

{'probability_of_default': 0.9380528506946687, 'expected_loss': 4221.237828126009}
