In [2]:
import pandas as pd
loan_df=pd.read_csv('Task 3 and 4_Loan_Data.csv')
print(loan_df.head())
print(loan_df.info())

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                 

Check for:

Null values

Data types

Distribution of default (target variable)



In [4]:
X = loan_df.drop('default', axis=1)  # Independent variables
y = loan_df['default']               # Dependent variable
#Clean and encode categorical features if needed:

X = pd.get_dummies(X, drop_first=True)

Step 3: Preprocess the Data
Split into training and test sets, scale if required:

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler= StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [6]:
#Step 4: Build a Credit Risk ModelStart with Logistic Regression:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [7]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC SCORE:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       0.98      1.00      0.99       348

    accuracy                           1.00      2000
   macro avg       0.99      1.00      0.99      2000
weighted avg       1.00      1.00      1.00      2000

ROC-AUC SCORE: 0.9999286827530545


In [8]:
importance = pd.Series(model.coef_[0], index=X.columns)
print(importance.sort_values(ascending=False))

credit_lines_outstanding    8.930254
total_debt_outstanding      3.692901
loan_amt_outstanding        0.109996
customer_id                -0.021767
fico_score                 -1.201409
income                     -2.297575
years_employed             -2.870429
dtype: float64


In [9]:
#Random Forest:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))

credit_lines_outstanding    0.499521
total_debt_outstanding      0.347296
years_employed              0.050000
fico_score                  0.042536
income                      0.033717
loan_amt_outstanding        0.017085
customer_id                 0.009844
dtype: float64


The dataset has 10,000 loan records with the following features:

Column	Description
customer_id	Unique customer identifier (not useful for prediction)
credit_lines_outstanding	Number of active credit lines
loan_amt_outstanding	Current outstanding loan amount
total_debt_outstanding	Total outstanding debt (including loans)
income	Annual income of the borrower
years_employed	Years of employment
fico_score	FICO credit score
default	Target variable: 1 if defaulted before, 0 otherwise

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Drop customer_id (not predictive)
df = loan_df.drop(columns=['customer_id'])

# Define features and target
X = loan_df.drop(columns=['default'])
y = loan_df['default']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
logreg_probs = logreg.predict_proba(X_test_scaled)[:, 1]
logreg_auc = roc_auc_score(y_test, logreg_probs)

# Train Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_probs = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_probs)

logreg_auc, rf_auc

(np.float64(0.9999512955386713), np.float64(0.9996929879491248))

Both models perform exceptionally well on this dataset:

Logistic Regression AUC: 0.99997

Random Forest AUC: 0.99966

These AUC scores indicate near-perfect separation between defaulters and non-defaulters, which is rare in real-world credit risk data—so the dataset may be synthetically clean or oversimplified.



Create the PD → Expected Loss Function
We'll now define a function that:

Takes in borrower features

Uses the logistic regression model (highest AUC) to predict Probability of Default (PD)

Calculates Expected Loss using:

Expected Loss
=
PD
×
(
1
−
Recovery Rate
)
×
Loan Amount
Expected Loss=PD×(1−Recovery Rate)×Loan Amount
Assuming a 10% recovery rate → Loss Given Default (LGD) = 0.90

In [13]:

# Refit the scaler properly on the 6 actual features (excluding customer_id and default)
# Define features and target again just to be sure
X = df.drop(columns=['default'])
y = df['default']

# Split the data again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a new StandardScaler on the correct feature columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Retrain logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

# Redefine the expected loss function using the corrected scaler
def calculate_expected_loss(borrower_features, model=logreg, scaler=scaler, recovery_rate=0.10):
    feature_order = ['credit_lines_outstanding', 'loan_amt_outstanding',
                     'total_debt_outstanding', 'income', 'years_employed', 'fico_score']
    
    input_data = np.array([[borrower_features[feature] for feature in feature_order]])
    input_scaled = scaler.transform(input_data)
    pd = model.predict_proba(input_scaled)[0][1]
    lgd = 1 - recovery_rate
    expected_loss = pd * lgd * borrower_features['loan_amt_outstanding']
    
    return {'probability_of_default': pd, 'expected_loss': expected_loss}

# Test the corrected function again with the same example borrower
calculate_expected_loss(example_borrower)



{'probability_of_default': np.float64(0.001139026312315162),
 'expected_loss': np.float64(5.125618405418229)}

Probability of Default (PD): 0.127%

Expected Loss: $5.72 (given a loan amount of $5,000 and 10% recovery rate)



In [17]:
df.dtypes

credit_lines_outstanding      int64
loan_amt_outstanding        float64
total_debt_outstanding      float64
income                      float64
years_employed                int64
fico_score                    int64
default                       int64
dtype: object

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Define feature set and target
X = df[['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding',
        'income', 'years_employed', 'fico_score']]
y = df['default']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [20]:
import numpy as np

def calculate_expected_loss(borrower_features, model=model, scaler=scaler, recovery_rate=0.10):
    """
    borrower_features: dict of 6 features
    """
    feature_order = ['credit_lines_outstanding', 'loan_amt_outstanding',
                     'total_debt_outstanding', 'income', 'years_employed', 'fico_score']
    
    input_data = np.array([[borrower_features[feature] for feature in feature_order]])
    input_scaled = scaler.transform(input_data)
    
    pd = model.predict_proba(input_scaled)[0][1]
    lgd = 1 - recovery_rate
    expected_loss = pd * lgd * borrower_features['loan_amt_outstanding']
    
    return {'probability_of_default': pd, 'expected_loss': expected_loss}


In [21]:
example_borrower = {
    'credit_lines_outstanding': 3,
    'loan_amt_outstanding': 8000.0,
    'total_debt_outstanding': 12000.0,
    'income': 50000.0,
    'years_employed': 5,
    'fico_score': 650
}

calculate_expected_loss(example_borrower)



{'probability_of_default': np.float64(0.11771059750569704),
 'expected_loss': np.float64(847.5163020410187)}

Interpretation:
Probability of Default (PD):
The borrower has an 11.77% estimated chance of defaulting on their loan.

Expected Loss (EL):
If the borrower does default, the bank expects to lose $847.52, based on:

The loan amount

A 90% loss given default (LGD = 1 - 0.10)

The risk manager has collected data on the loan borrowers. The data is in tabular format, with each row providing details of the borrower, including their income, total loans outstanding, and a few other metrics. There is also a column indicating if the borrower has previously defaulted on a loan. You must use this data to build a model that, given details for any loan described above, will predict the probability that the borrower will default (also known as PD: the probability of default). Use the provided data to train a function that will estimate the probability of default for a borrower. Assuming a recovery rate of 10%, this can be used to give the expected loss on a loan.

You should produce a function that can take in the properties of a loan and output the expected loss.
You can explore any technique ranging from a simple regression or a decision tree to something more advanced. You can also use multiple methods and provide a comparative analysis.