# Task 3

## My works

In [1]:
import pandas as pd

In [3]:
loan = pd.read_csv("Loan_Data.csv")
loan

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.752520,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.830850,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0
...,...,...,...,...,...,...,...,...
9995,3972488,0,3033.647103,2553.733144,42691.62787,5,697,0
9996,6184073,1,4146.239304,5458.163525,79969.50521,8,615,0
9997,6694516,2,3088.223727,4813.090925,38192.67591,5,596,0
9998,3942961,0,3288.901666,1043.099660,50929.37206,2,647,0


In [5]:
missing_values = loan.isnull().sum()
print(missing_values)

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Add the 'debt_to_income_ratio' feature
loan['debt_to_income_ratio'] = loan['total_debt_outstanding'] / loan['income']

# Define the features (X) and target (y)
X = loan.drop(columns=['customer_id', 'default'])
y = loan['default']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Logistic Regression model
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)

# Evaluate the model
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
y_pred = logreg.predict(X_test)

print("AUC-ROC:", roc_auc_score(y_test, y_pred_prob))
# The AUC-ROC score tells us how well the model can distinguish between the classes, 
# with 1 being perfect and 0.5 being no better than random guessing.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

AUC-ROC: 0.9999812075148893

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2459
           1       1.00      0.99      0.99       541

    accuracy                           1.00      3000
   macro avg       1.00      0.99      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [9]:
def calculate_pd(loan_details):
    # Predict the probability of default
    pd = logreg.predict_proba(loan_details)[:, 1]
    return pd

In [11]:
def calculate_expected_loss(loan_details, loan_amt_outstanding, recovery_rate=0.10):
    # Calculate the Probability of Default (PD)
    pd = calculate_pd(loan_details)
    
    # Calculate the expected loss
    expected_loss = loan_amt_outstanding * pd * (1 - recovery_rate)
    return expected_loss

In [13]:
# Example loan details with the new feature
loan_details = pd.DataFrame({
    'credit_lines_outstanding': [4],
    'loan_amt_outstanding': [5000],
    'total_debt_outstanding': [7500],
    'income': [60000],
    'years_employed': [5],
    'fico_score': [650],
})

loan_details['debt_to_income_ratio'] = loan_details['total_debt_outstanding'] / loan_details['income']

# Calculate the expected loss
loan_amt_outstanding = loan_details['loan_amt_outstanding'][0]
expected_loss = calculate_expected_loss(loan_details, loan_amt_outstanding)

print(f"Expected Loss: ${expected_loss[0]:.2f}")

Expected Loss: $79.05


## JPMorgan Chase example

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
import pandas as pd

# Read in loan data from a CSV file
df = pd.read_csv('Loan_Data.csv')

# Define the variable features
features = ['credit_lines_outstanding', 'debt_to_income', 'payment_to_income', 'years_employed', 'fico_score']

# Calculate the payment_to_income ratio
df['payment_to_income'] = df['loan_amt_outstanding'] / df['income']
    
# Calculate the debt_to_income ratio
df['debt_to_income'] = df['total_debt_outstanding'] / df['income']

clf = LogisticRegression(random_state=0, solver='liblinear', tol=1e-5, max_iter=10000).fit(df[features], df['default'])
print(clf.coef_, clf.intercept_)

# Use the following code to check yourself
y_pred = clf.predict(df[features])

fpr, tpr, thresholds = metrics.roc_curve(df['default'], y_pred)
print((1.0*(abs(df['default']-y_pred)).sum()) / len(df))
print(metrics.auc(fpr, tpr))

[[ 8.18520373  0.54490854  0.01994244 -2.77630853 -0.02418391]] [-0.09162643]
0.0037
0.9925106069101026


# Task 4

#### Quantization
Quantization is the process of mapping a large range of input values to a smaller range of output values. In this task, we're focusing on mapping FICO scores (which can range from 300 to 850) into a smaller number of buckets (or categories). Each bucket will represent a range of FICO scores, and we’ll assign a rating to each bucket.

## My works

### MSE

In [22]:
import numpy as np

In [24]:
fico_scores = loan['fico_score']
defaults = loan['default']

# Define the number of buckets
num_buckets = 5

# Step 1: Determine the bucket boundaries (using quantiles for simplicity)
boundaries = np.percentile(fico_scores, np.linspace(0, 100, num_buckets + 1))

# Step 2: Assign each FICO score to a bucket
loan['bucket'] = pd.cut(fico_scores, bins=boundaries, labels=False, include_lowest=True)

# Step 3: Calculate the average FICO score for each bucket (to minimize MSE)
bucket_means = loan.groupby('bucket')['fico_score'].mean()

# Step 4: Map the bucket mean to each row in the DataFrame
loan['bucket_mean'] = loan['bucket'].map(bucket_means)

# Step 5: Calculate the squared error
loan['squared_error'] = (loan['fico_score'] - loan['bucket_mean']) ** 2

# Step 6: Calculate the overall MSE
mse = loan['squared_error'].mean()

print("Boundaries:", boundaries)
print("Bucket Means:", bucket_means)
print(f"Mean Squared Error (MSE): {mse:.2f}")

Boundaries: [408. 587. 623. 653. 688. 850.]
Bucket Means: bucket
0    552.611707
1    606.751395
2    638.579688
3    670.346520
4    721.524837
Name: fico_score, dtype: float64
Mean Squared Error (MSE): 392.65


#### Using K-Means to optimize

In [27]:
from sklearn.cluster import KMeans

fico_scores = loan['fico_score'].values.reshape(-1, 1)

# Define the number of clusters (buckets)
num_buckets = 5

# Step 1: Apply K-means clustering
kmeans = KMeans(n_clusters=num_buckets, random_state=42)
loan['bucket'] = kmeans.fit_predict(fico_scores)

# Step 2: Calculate the mean FICO score for each cluster (bucket)
bucket_means = loan.groupby('bucket')['fico_score'].mean()

# Step 3: Map the bucket mean to each row in the DataFrame
loan['bucket_mean'] = loan['bucket'].map(bucket_means)

# Step 4: Calculate the Mean Squared Error (MSE)
loan['squared_error'] = (loan['fico_score'] - loan['bucket_mean']) ** 2
mse = loan['squared_error'].mean()

print("Bucket Means:", bucket_means)
print(f"Mean Squared Error (MSE): {mse:.2f}")

Bucket Means: bucket
0    593.242118
1    640.889328
2    687.201020
3    531.374074
4    743.170478
Name: fico_score, dtype: float64
Mean Squared Error (MSE): 300.52


### Logical Regression

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Step 1: Select Features and Target
X = loan.drop(columns=['customer_id', 'default'])
y = loan['default']

# Step 2: Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Initialize Logistic Regression Model
logreg = LogisticRegression()

# Step 4: Train the Model
logreg.fit(X_train, y_train)

# Step 5: Make Predictions
y_pred = logreg.predict(X_test)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]  # Probability of default

# Step 6: Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.99
ROC AUC Score: 1.00
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2459
           1       0.98      0.96      0.97       541

    accuracy                           0.99      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.99      0.99      0.99      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## JPMorgan Chase example

### Maximum Likelihood Estimation
The log-likelihood function is defined to calculate the likelihood of a given set of parameters. The likelihood function is used to calculate the probability of observing the data given the parameter values. The code then initializes a three-dimensional array, dp, that is used to store the calculated log-likelihood values for different sets of observations. The first dimension represents the number of iterations performed, the second dimension represents the rank of the observation, and the third dimension represents the log-likelihood and the index of the previous observation.

In [33]:
import pandas as pd
from math import log

df = pd.read_csv('Loan_Data.csv')

x = df['default'].to_list()
y = df['fico_score'].to_list()
n = len(x)
print (len(x), len(y))

10000 10000


In [35]:
default = [0 for i in range(851)]
total = [0 for i in range(851)]

for i in range(n):
    y[i] = int(y[i])
    default[y[i]-300] += x[i]
    total[y[i]-300] += 1
    
for i in range(0, 551):
    default[i] += default[i-1]
    total[i] += total[i-1]
    
import numpy as np
    
def log_likelihood(n, k):
    p = k/n
    if (p==0 or p==1):
        return 0
    return k*np.log(p)+ (n-k)*np.log(1-p)

r = 10
dp = [[[-10**18, 0] for i in range(551)] for j in range(r+1)]

for i in range(r+1):
    for j in range(551):
        if (i==0):
            dp[i][j][0] = 0
        else:
            for k in range(j):
                if (total[j]==total[k]):
                    continue
                if (i==1):
                    dp[i][j][0] = log_likelihood(total[j], default[j])
                else:
                    if (dp[i][j][0] < (dp[i-1][k][0] + log_likelihood(total[j]-total[k], default[j] - default[k]))):
                        dp[i][j][0] = log_likelihood(total[j]-total[k], default[j]-default[k]) + dp[i-1][k][0]
                        dp[i][j][1] = k
                                                     
print (round(dp[r][550][0], 4))
                                                     
k = 550
l = []
while r >= 0:
    l.append(k+300)
    k = dp[r][k][1]
    r -= 1

print(l)

-4217.8245
[850, 753, 752, 732, 696, 649, 611, 580, 552, 520, 300]
