# Probability of Default Demo

## Introduction

#### Use Case

#### Import Libraries

In [None]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env

import statsmodels.api as sm
import pickle
import os
from typing import List
from datetime import datetime
import re
import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency
%matplotlib inline

#### Connect to ValidMind

In [None]:

import validmind as vm

# Initialise the ValidMind client for this notebook session.
# NOTE(review): the "..." values below look like placeholders; supply the real
# API key, secret and project identifier before running this cell.
vm.init(
  api_host = "https://api.prod.validmind.ai/api/v1/tracking",
  api_key = "...",
  api_secret = "...",
  project = "..."
)

#### Processing Functions

In [None]:
import datetime

def apply_default_probabilities(model_fit, df_scores, point_in_time=None, threshold=0.5):
    """
    Add predicted default probabilities and a binary default flag to a scored frame.

    Args:
        model_fit: Fitted model exposing ``predict``; it is called with a
            two-column design matrix (constant + 'score').
        df_scores: DataFrame containing a 'score' column.
        point_in_time: Value written to the 'point_in_time' column. When None,
            today's date formatted as ``YYYY-MM-DD`` is used.
        threshold: Probability at or above which a row is flagged as a
            predicted default. Defaults to 0.5.

    Returns:
        The DataFrame with 'point_in_time', 'predicted_default_probability'
        and 'predicted_default' columns added.

    Note:
        The 'point_in_time' column is always written into the frame passed in.
        If that frame has no 'const' column the remaining columns are written
        into it as well and the same object is returned; if it has a 'const'
        column, a new frame without 'const' is created and returned instead.
    """
    # Imported locally on purpose: the notebook binds the name `datetime` both
    # to the class (`from datetime import datetime`) and to the module
    # (`import datetime`), so relying on the global depends on cell order.
    from datetime import date

    # If point-in-time None set to today's date
    if point_in_time is None:
        point_in_time = date.today().strftime("%Y-%m-%d")

    df_scores['point_in_time'] = point_in_time

    # Remove 'const' column if it exists in the dataframe
    if 'const' in df_scores.columns:
        df_scores = df_scores.drop(columns=['const'])

    # Prepare the feature matrix (constant + score)
    X = sm.add_constant(df_scores['score'])

    # Compute the probabilities and attach them to the dataframe
    df_scores['predicted_default_probability'] = model_fit.predict(X)

    # Flag a predicted default when the probability reaches the threshold
    df_scores['predicted_default'] = (
        df_scores['predicted_default_probability'] >= threshold
    ).astype(int)

    return df_scores


def apply_credit_scores(model, X, target_score, target_odds, pdo):
    """
    Convert model coefficients and feature values into a credit score per row.

    The score for a row is
    ``sum(beta_i * x_i) * factor + alpha * factor + offset`` where
    ``factor = pdo / ln(2)`` and
    ``offset = target_score - factor * ln(target_odds)``.

    Args:
        model: Fitted model whose ``params`` holds the coefficients. The first
            entry is treated as the intercept (alpha); the remaining entries
            are matched *by position* with columns 1..k-1 of ``X``.
        X: DataFrame whose column 0 lines up with the intercept and whose next
            ``len(params) - 1`` columns hold the feature values (presumably
            WoE-encoded features, per the original variable naming). Any
            further columns are ignored.
        target_score: Score assigned at ``target_odds``.
        target_odds: Odds at which ``target_score`` applies.
        pdo: Points to double the odds.

    Returns:
        A copy of ``X`` with a 'score' column added; ``X`` itself is untouched.

    Raises:
        ValueError: If ``X`` has fewer columns than the model has parameters.
    """
    X_copy = X.copy()

    # Work on a plain float array so every lookup is explicitly positional;
    # integer keys on a label-indexed Series are deprecated in pandas.
    coefficients = np.asarray(model.params, dtype=float)
    n_params = coefficients.shape[0]
    if X_copy.shape[1] < n_params:
        raise ValueError(
            f"X has {X_copy.shape[1]} columns but the model has {n_params} parameters"
        )

    alpha = coefficients[0]
    factor = pdo / np.log(2)
    offset = target_score - (factor * np.log(target_odds))

    # Vectorised over rows. Assigning the whole column (rather than writing
    # through .loc[label]) keeps rows with duplicate index labels independent.
    feature_values = X_copy.iloc[:, 1:n_params].to_numpy(dtype=float)
    X_copy["score"] = (feature_values @ coefficients[1:]) * factor + alpha * factor + offset

    return X_copy


def get_risk_band(pd_value, pd_risk_bands):
    """
    Return the name of the risk band that contains ``pd_value``.

    Args:
        pd_value: Probability of default to classify.
        pd_risk_bands: Mapping of band name to a ``(low, high)`` pair. Bands
            are checked in the mapping's iteration order and the first match
            wins.

    Returns:
        The matching band name, or None when no band contains the value
        (including when the value is NaN or the mapping is empty).

    Each band covers ``low <= pd_value < high``. The band holding the largest
    upper bound additionally includes that bound, so a value sitting exactly
    on the top edge (e.g. a PD of 1.0 with bands ending at 1) is still banded.
    """
    top_edge = max((high for _, high in pd_risk_bands.values()), default=None)

    for band, (low, high) in pd_risk_bands.items():
        if low <= pd_value < high:
            return band
        # Close the overall range on the right-hand side.
        if high == top_edge and pd_value == high:
            return band
    return None

def apply_risk_bands(df, default_probabilities_column, pd_risk_bands):
    """
    Label each row of ``df`` with the risk band of its default probability.

    The band is looked up with ``get_risk_band`` and written to a 'risk_band'
    column on ``df`` itself; the same DataFrame object is returned.
    """
    pd_values = df[default_probabilities_column]
    df['risk_band'] = pd_values.apply(get_risk_band, args=(pd_risk_bands,))
    return df


def compute_point_in_time_pd(model, df, threshold=0.5, point_in_time_date='2023-01-01'):
    """
    Score ``df`` with ``model`` and return an annotated copy.

    The returned frame gains three columns: 'default_probabilities' (the
    output of ``model.predict``), 'predicted_default' (1 when the probability
    is at least ``threshold``, otherwise 0) and 'point_in_time' (the given
    date parsed with ``pd.to_datetime``). Any 'const' column is removed from
    the result. The input ``df`` is not modified.
    """
    def _flag_default(probability):
        return 1 if probability >= threshold else 0

    scored = df.copy()

    # Take the observed outcome out of the frame so it is not passed to the
    # model as a feature; it is re-attached after scoring.
    observed_default = scored.pop('default') if 'default' in scored.columns else None

    # 'const' is still present at this point and is part of the model input.
    scored["default_probabilities"] = model.predict(scored)

    if 'const' in scored.columns:
        scored = scored.drop(columns='const')

    if observed_default is not None:
        scored['default'] = observed_default

    scored['predicted_default'] = scored['default_probabilities'].apply(_flag_default)
    scored['point_in_time'] = pd.to_datetime(point_in_time_date)

    return scored



def load_model_and_df(file_name):
    """
    Load a ``(model, DataFrame)`` pair from a pickle file.

    ``file_name`` is joined onto the current working directory, the pickle is
    read, and its content is unpacked into exactly two objects which are
    returned as ``(model, df)``. Progress messages are printed before and
    after loading.
    """
    full_file_path = os.path.join(os.getcwd(), file_name)
    print(f"The full file path is: {full_file_path}")

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(full_file_path, 'rb') as handle:
        payload = pickle.load(handle)
    model, df = payload

    print(f"Model and DataFrame loaded from {full_file_path}")
    return model, df

## Data Description

#### Load Credit Risk Scorecard

In [None]:
# Load GLM Credit Risk Scorecard Model
# The pickle is resolved relative to the current working directory and is
# expected to contain a (model, DataFrame) pair.
file_name = 'model_fit_glm_scorecard_20230725_104439.pkl'
model_fit_glm, df = load_model_and_df(file_name)

# Compute credit scores
target_score = 500  # The score that you want to assign to the target odds
target_odds = 50  # The target odds (e.g., odds of being good vs bad)
pdo = 20  # Points to double the odds

# Compute risk scores from model's coefficients
# (df_scores is a copy of df with an added 'score' column; df is left as is)
df_scores = apply_credit_scores(model_fit_glm, df, target_score, target_odds, pdo)

# Define the scores and default columns
scores_column = 'score'
default_column = 'default'

# Extract the relevant columns from the dataframe
X = df_scores[scores_column]
y = df_scores[default_column]

# Add constant to the features (Statsmodels requires this step)
X = sm.add_constant(X)

# Fit the GLM model (using the binomial family to get logistic regression)
# This is a single-feature model: default regressed on the credit score.
model_fit_pit_pd = sm.GLM(y, X, family=sm.families.Binomial()).fit()

# Print the model summary
print(model_fit_pit_pd.summary())



In [None]:
# Attach predicted default probabilities and the predicted-default flag to the
# scored frame. No point_in_time is passed, so rows are stamped with today's date.
df_pit_pd = apply_default_probabilities(model_fit_pit_pd, df_scores)
# Display the resulting frame as the cell output
df_pit_pd

#### Validate PiT-PD Data

**Validate Point-in-Time PD Dataset**

In [None]:
from validmind.vm_models.test_context import TestContext
from validmind.tests.data_validation.PiTPDHistogram import PiTPDHistogram

# Define test context
test_context_pit_pd = TestContext(dataset=df_pit_pd)

# Configure test parameters
# (column names refer to the columns of df_pit_pd produced above)
params = {
    "default_column": 'default',
    "predicted_default_column": 'predicted_default',
    "default_probabilities_column": 'predicted_default_probability',
    "point_in_time_column": 'point_in_time',
    "title": "Histogram of Probability of Default",
}

# Run the test, log its result, then display it in the notebook
metric = PiTPDHistogram(test_context_pit_pd, params)
metric.run()
metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.PiTCreditScoresHistogram import PiTCreditScoresHistogram

# Define test context
# NOTE: TestContext is imported in the previous test cell; run that cell first.
test_context_pit_pd = TestContext(dataset=df_pit_pd)

# Configure test parameters
# (column names refer to the columns of df_pit_pd produced above)
params = {
    "default_column": 'default',
    "predicted_default_column": 'predicted_default',
    "scores_column": 'score',
    "point_in_time_column": 'point_in_time',
    "title": "Histogram of Credit Scores",
}

# Run the test, log its result, then display it in the notebook
metric = PiTCreditScoresHistogram(test_context_pit_pd, params)
metric.run()
metric.result.log()
metric.result.show()

## Data Preparation

#### Assign Risk Bands to PDs

In [None]:
# Risk band definitions: band name -> [lower bound, upper bound] on the
# predicted default probability. The lower bound is inclusive.
pd_risk_bands = {
    "A": [0, 0.1],
    "B": [0.1, 0.4],
    "C": [0.4, 0.6],
    "D": [0.6, 1],
}


# apply_risk_bands writes the 'risk_band' column into the frame it is given and
# returns that same frame, so df_risk_bands and df_pit_pd are the same object.
df_risk_bands = apply_risk_bands(df=df_pit_pd,
                                  default_probabilities_column='predicted_default_probability',
                                  pd_risk_bands=pd_risk_bands)

#### Testing

In [None]:
# The imported class name must match the name used to instantiate the test
# below; both now use the module's own spelling, "DefaultRatesbyRiskBandPlot".
from validmind.tests.data_validation.DefaultRatesbyRiskBandPlot import DefaultRatesbyRiskBandPlot

# Define test context
# NOTE: TestContext is imported in an earlier test cell; run that cell first.
test_context_risk_grade = TestContext(dataset=df_risk_bands)

# Configure test parameters
params = {
    "default_column": 'default',
    "risk_band_column": 'risk_band',
    "title": "Bar Plot of Default Rates per Risk Grade",
}

# Run the test, log its result, then display it in the notebook
metric = DefaultRatesbyRiskBandPlot(test_context_risk_grade, params)
metric.run()
metric.result.log()
metric.result.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px

def plot_total_counts_by_risk_band(df, risk_band_column):
    """
    Show a bar chart of the share of rows falling into each risk band.

    Shares are expressed as a percentage of ``len(df)``, i.e. of all rows in
    the frame, and bands are ordered by their label.
    """
    band_counts = df[risk_band_column].value_counts().sort_index()
    band_share = (band_counts / len(df)) * 100

    # 'Dark24' gives well-separated colours for the individual bars
    bars = go.Bar(
        x=band_share.index,
        y=band_share.values,
        marker_color=px.colors.qualitative.Dark24,
    )

    fig = go.Figure(data=[bars])
    fig.update_layout(
        title_text='Percentage of Total Accounts by Risk Band',
        xaxis_title='Risk Band',
        yaxis_title='Percentage of Total Accounts',
    )
    fig.show()


plot_total_counts_by_risk_band(df_risk_bands, risk_band_column='risk_band')


In [None]:
import plotly.graph_objects as go
import plotly.express as px

def plot_default_rates_by_risk_band(df, risk_band_column, default_column):
    """
    Show a bar chart of the mean of ``default_column`` within each risk band.

    Args:
        df: DataFrame holding one row per account.
        risk_band_column: Name of the column with the risk band label.
        default_column: Name of the column whose mean is plotted per band
            (the default rate when the column is a 0/1 flag).

    Rows without a risk band (None/NaN) are left out of the chart; bands are
    ordered by their label.
    """
    # groupby sorts the band labels and skips missing ones, so a frame with
    # un-banded rows no longer fails when labels are sorted. observed=True
    # keeps only bands that actually occur, as the original unique() did.
    default_rates = df.groupby(risk_band_column, observed=True)[default_column].mean()

    # Use 'Dark24' color sequence for more distinguishable colors
    colors = px.colors.qualitative.Dark24

    # Create the bar plot
    fig = go.Figure(data=[go.Bar(x=default_rates.index.tolist(),
                                 y=default_rates.tolist(),
                                 marker_color=colors)])

    # Customize the plot
    fig.update_layout(
        title_text='Default Rates by Risk Band',
        xaxis_title='Risk Band',
        yaxis_title='Default Rate'
    )

    fig.show()

plot_default_rates_by_risk_band(df_risk_bands, risk_band_column='risk_band', default_column='default')


#### Validate PiT-PD Risk Grades Data 

In [None]:
from validmind.tests.data_validation.RiskGradeDefaultRatesBarPlot import RiskGradeDefaultRatesBarPlot

# Define test context
# The banded frame built earlier in this notebook is `df_risk_bands`; the name
# `df_risk_grade` used here before is not defined anywhere above.
# NOTE: TestContext is imported in an earlier test cell; run that cell first.
test_context_risk_grade = TestContext(dataset=df_risk_bands)

# Configure test parameters
params = {
    "default_column": 'default',
    "predicted_default_column": 'predicted_default',
    "risk_band_column": 'risk_band',
    "point_in_time_column": 'point_in_time',
    "title": "Bar Plot of Default Rates per Risk Grade",
}

# Run the test, log its result, then display it in the notebook
metric = RiskGradeDefaultRatesBarPlot(test_context_risk_grade, params)
metric.run()
metric.result.log()
metric.result.show()

#### Validate Prepared Data

## Data Sampling

#### Sampling Method

#### Data Split

## Exploratory Data Analysis 