# Probability of Default Model using ValidMind

- Step 1: Connect Notebook to ValidMind Project
- Step 2: Import Raw Data
- Step 3: Data Description
- Step 4: Data Preparation
- Step 5: Data Description on Preprocessed Data 
- Step 6: Univariate Analysis
- Step 7: Multivariate Analysis
- Step 8: Model Training 

## Step 1: Connect to ValidMind Project

#### Import Libraries

In [1]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env

import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency
%matplotlib inline

#### Connect Notebook to ValidMind Project

In [None]:
import os

import validmind as vm

# Connect this notebook to the ValidMind project.
#
# The API key and secret are read from the environment (populated from `.env`
# by the `%dotenv` magic in the first cell) instead of being committed with the
# notebook. A missing key/secret fails loudly with a KeyError.
# NOTE(review): the credentials that were previously hard-coded here should be
# treated as leaked and rotated.
vm.init(
  api_host = os.environ.get("VM_API_HOST", "http://localhost:3000/api/v1/tracking"),
  api_key = os.environ["VM_API_KEY"],
  api_secret = os.environ["VM_API_SECRET"],
  project = os.environ.get("VM_API_PROJECT", "cliwzqjgv00001fy6869rlav9")
)

: 

## Step 2: Import Raw Data

#### Import Lending Club Dataset

In [None]:
# Path to the Lending Club CSV file (2007-2014 extract).
# NOTE(review): this is an absolute, machine-specific path -- adjust it before running elsewhere.
filepath = '/Users/juanvalidmind/Dev/datasets/lending club/data_2007_2014/loan_data_2007_2014.csv'
# Alternative (2007-2011 extract):
# filepath = '/Users/juanvalidmind/Dev/datasets/lending club/data_2007_2011/lending_club_loan_data_2007_2011.csv'
df = pd.read_csv(filepath)
# Print column names, dtypes and non-null counts of the raw data.
df.info()

: 

## Step 3: Data Description

### Describe Raw Dataset

**Create VM Dataset and Run Test**

In [None]:
from validmind.vm_models.test_context import TestContext
from validmind.tests.data_validation.TabularDescriptionTables import TabularDescriptionTables

# Wrap the raw DataFrame in a ValidMind dataset and build a test context for it.
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)

# Run the tabular description test and display its result.
metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

: 

### Identify Missing Values

In [None]:
from validmind.tests.data_validation.MissingValuesBarPlot import MissingValuesBarPlot

# Re-wrap the raw DataFrame (same `df` as in the previous cell).
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)

# Parameters forwarded to the test.
# NOTE(review): "threshold" is presumably a missing-value percentage -- confirm against MissingValuesBarPlot.
params = {"threshold": 80,
          "xticks_fontsize": 8}

metric = MissingValuesBarPlot(test_context, params)
metric.run()
metric.result.show()

: 

## Step 4: Data Preparation

### Identify Target Variable

**Definition of Default**

We categorize `Fully Paid` loans as "default = 0" and `Charged Off` loans as "default = 1". This binary classification is suitable for developing a credit scorecard, as it enables a distinction between applicants likely to fulfill their credit obligations (low risk) and those likely to fail (high risk).

Loans with `Current` status, which represents ongoing loans with an unresolved outcome, should be excluded from the model, as their final repayment status is still unknown and thus not suitable for a retrospective risk analysis.


**Add `default` Variable**

In [None]:
def add_target_column(df, target_column):
    """Return a copy of ``df`` restricted to resolved loans, with a binary target.

    ``loan_status`` is mapped to ``0`` for "Fully Paid" and ``1`` for
    "Charged Off". Rows with any other status (including missing values) are
    removed, because their outcome is not known.

    Parameters:
    df (pandas.DataFrame): Frame containing a ``loan_status`` column. It is
        not modified.
    target_column (str): Name of the integer target column to create.

    Returns:
    pandas.DataFrame: New frame holding only the resolved loans, with
        ``target_column`` added (or replaced if it already existed).
    """
    status_to_default = {"Fully Paid": 0, "Charged Off": 1}

    # Statuses missing from the mapping become NaN and are filtered out below.
    target = df['loan_status'].map(status_to_default)
    resolved = target.notna()

    # Work on an explicit copy so the caller's frame is left untouched and the
    # assignment below never writes into a view of it.
    result = df.loc[resolved].copy()
    result[target_column] = target[resolved].astype(int)
    return result


: 

In [None]:
# Name of the binary target column derived from `loan_status`.
target_column = 'default'
df = add_target_column(df, target_column)

# Drop 'loan_status' variable: it has been encoded into `target_column` above.
df.drop(columns='loan_status', axis=1, inplace=True)

: 

### Remove Unused Variables

Remove all the **Demographic** and **Customer Behavioural** features, which are of no use for default analysis at credit approval.

In [None]:
# Columns excluded from the analysis. With the default `errors='raise'`,
# `DataFrame.drop` raises a KeyError if any of them is absent from `df`.
unused_variables = ["id", "member_id", "funded_amnt", "emp_title", "url", "desc", "application_type",
                    "title", "zip_code", "delinq_2yrs", "mths_since_last_delinq", "mths_since_last_record",
                    "revol_bal", "total_rec_prncp", "total_rec_late_fee", "recoveries", "out_prncp_inv", "out_prncp", 
                    "collection_recovery_fee", "next_pymnt_d", "initial_list_status", "pub_rec",
                    "collections_12_mths_ex_med", "policy_code", "acc_now_delinq", "pymnt_plan",
                    "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim", "last_pymnt_d", "last_credit_pull_d",
                    'earliest_cr_line', 'issue_d']

df = df.drop(columns=unused_variables)

: 

### Remove Variables with Large Number of Missing Values

In [None]:
def variables_with_min_missing(df, min_missing_percentage):
    """List the columns whose share of missing values is too high.

    Parameters:
    df (pandas.DataFrame): Frame to inspect.
    min_missing_percentage (float): Threshold on a 0-100 scale; a column is
        selected when its missing percentage is strictly greater than this.

    Returns:
    list: Names of the selected columns, without duplicates and in no
        particular order. Columns that are entirely missing are always
        included, whatever the threshold.
    """
    is_missing = df.isnull()
    pct_missing = is_missing.mean() * 100

    above_threshold = set(pct_missing[pct_missing > min_missing_percentage].index.tolist())
    entirely_missing = set(df.columns[is_missing.all()].tolist())

    return list(above_threshold | entirely_missing)

# NOTE: despite its name, this value is passed as `min_missing_percentage`
# (a percentage on a 0-100 scale), not as a count.
min_missing_count = 80
variables_to_drop = variables_with_min_missing(df, min_missing_count)
df.drop(columns=variables_to_drop, axis=1, inplace=True)

# Drop the rows that have no value for `emp_length` or `revol_util`.
df.dropna(axis=0, subset=["emp_length"], inplace=True)
df.dropna(axis=0, subset=["revol_util"], inplace=True)

: 

### Data Cleaning

In [None]:
from typing import List
import pandas as pd
import numpy as np
from datetime import datetime
import re

def clean_term_column(df, column):
    """
    Strip the ' months' suffix from a loan-term column, in place.

    The column is stored back with ``object`` dtype; anything preceding the
    suffix (including a leading space) is kept as is.

    Parameters:
    df (pandas.DataFrame): DataFrame to be processed (modified in place).
    column (str): Name of the term column.

    Returns:
    pandas.DataFrame: The same ``df`` object.

    Raises:
    ValueError: If ``column`` is not a column of ``df``.
    """
    if column in df.columns:
        without_unit = df[column].str.replace(' months', '')
        df[column] = without_unit.astype('object')
        return df

    raise ValueError(f"The column '{column}' does not exist in the dataframe.")

def clean_rate_columns(df, column):
    """
    Clean interest rate column. Remove the '%' sign and convert to numeric.

    Parameters:
    df (pandas.DataFrame): DataFrame to be processed (modified in place).
    column (str): Name of the interest rate column to be cleaned.

    Returns:
    pandas.DataFrame: The same ``df`` object, returned for consistency with
        the other ``clean_*`` helpers in this notebook.
    """
    # regex=False: '%' is a literal character here, not a pattern.
    without_sign = df[column].str.replace('%', '', regex=False)
    df[column] = pd.to_numeric(without_sign)

    return df

def clean_emp_length_column(df, column):
    """
    Reduce an employment-length column to its digits, in place.

    '< 1 year' becomes '0', every other value keeps only its digits
    (e.g. '10+ years' -> '10'), and missing values (NaN or 'n/a') become '0'.
    The column is stored with ``object`` dtype and holds strings.

    Parameters:
    df (pandas.DataFrame): DataFrame to be processed (modified in place).
    column (str): Name of the employment-length column.

    Returns:
    pandas.DataFrame: The same ``df`` object.

    Raises:
    ValueError: If ``column`` is not a column of ``df``.
    """
    # Ensure the column exists in the dataframe
    if column not in df.columns:
        raise ValueError(f"The column '{column}' does not exist in the dataframe.")

    cleaned = df[column].replace('n/a', np.nan)
    cleaned = cleaned.str.replace('< 1 year', '0', regex=False)

    # The vectorised string method leaves NaN as NaN. The previous
    # `re.sub(..., str(x))` turned NaN into 'nan' and then into '', so the
    # fill below never applied.
    cleaned = cleaned.str.replace(r'\D', '', regex=True)

    # Fill with the string '0' so the column stays homogeneous.
    df[column] = cleaned.fillna('0').astype('object')

    return df

def clean_inq_last_6mths(df, column):
    """
    Cast a column to the pandas ``category`` dtype, in place.

    Parameters:
    df (pandas.DataFrame): DataFrame to be processed (modified in place).
    column (str): Name of the column to convert.

    Returns:
    pandas.DataFrame: The same ``df`` object.

    Raises:
    ValueError: If ``column`` is not a column of ``df``.
    """
    if column in df.columns:
        as_category = df[column].astype('category')
        df[column] = as_category
        return df

    raise ValueError(f"The column '{column}' does not exist in the dataframe.")

# Apply the cleaning helpers. Each one assigns back into `df[column]`, so `df`
# is updated in place and the returned frame does not need to be reassigned.
clean_emp_length_column(df, 'emp_length')
clean_term_column(df, 'term')
clean_inq_last_6mths(df, 'inq_last_6mths')

: 

### Outliers

In [None]:
def get_numerical_columns(df):
    """Return the names of the columns selected by the "int", "float" and "uint" dtypes."""
    numeric_frame = df.select_dtypes(include=["int", "float", "uint"])
    return numeric_frame.columns.tolist()

def get_categorical_columns(df):
    """Return the names of the columns selected by the "object" and "category" dtypes."""
    categorical_frame = df.select_dtypes(include=["object", "category"])
    return categorical_frame.columns.tolist()

def compute_outliers(series, threshold=1.5):
    """Return the values of ``series`` that fall outside the IQR fences.

    The fences are ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``.
    Values exactly on a fence are kept; NaN is never reported as an outlier.

    Parameters:
    series (pandas.Series): Numeric values to inspect.
    threshold (float): Multiplier applied to the inter-quartile range.

    Returns:
    pandas.Series: The outlying values, with their original index.
    """
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    return series[(series < lower_bound) | (series > upper_bound)]

def remove_iqr_outliers(df, target_column, threshold=1.5):
    """Drop the rows that hold an IQR outlier in any numeric feature.

    Columns are processed one after another, so the fences of a later column
    are computed on the rows that survived the earlier ones.

    Parameters:
    df (pandas.DataFrame): Frame to filter. It is not modified.
    target_column (str): Column excluded from the outlier check.
    threshold (float): IQR multiplier forwarded to ``compute_outliers``.

    Returns:
    pandas.DataFrame: The filtered frame.
    """
    # Filter by name instead of `list.remove`, which raised ValueError when
    # the target was not among the numeric columns (e.g. a string or boolean
    # label).
    num_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col != target_column
    ]
    for col in num_cols:
        outliers = compute_outliers(df[col], threshold)
        df = df[~df[col].isin(outliers)]
    return df

# Filter IQR outliers out of every numeric feature; the target column is excluded from the check.
df = remove_iqr_outliers(df, target_column, threshold=1.5)

: 

In [None]:
from validmind.tests.data_validation.IQROutliersPlots import IQROutliersPlots

# Register the outlier-filtered frame, this time declaring the target column.
vm_df = vm.init_dataset(dataset=df,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

# NOTE(review): `get_numerical_columns` does not exclude the integer target
# column, so it is presumably part of `num_features` -- confirm this is intended.
num_features = get_numerical_columns(df)
params = {"num_features": num_features,
          "threshold": 1.5}

metric = IQROutliersPlots(test_context, params)
metric.run()
metric.result.show()

: 

## Step 5: Training Data  

### Sampling

We employ stratified sampling to create our training and testing sets. Stratified sampling is particularly important in this context. When the `stratify = y` parameter is set, it ensures that the distribution of the target variable (`y`) in the test set is the same as that in the original dataset. 

This is crucial for maintaining a consistent representation of the target variable classes, especially important in scenarios where the classes are imbalanced, which is often the case in credit risk scorecards.

In [None]:
# Split data into train and test (80/20), stratified on the target, with a fixed seed.
X = df.drop(target_column, axis = 1)
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, 
                                                    random_state = 42, stratify = y)

# Concatenate X_train with y_train to form df_train (target as the last column)
df_train = pd.concat([X_train, y_train], axis=1)

# Concatenate X_test with y_test to form df_test (target as the last column)
df_test = pd.concat([X_test, y_test], axis=1)

: 

### Class Imbalance

Class imbalance is a common issue in credit risk scorecards and datasets like the Lending Club's. This imbalance arises when the number of defaulting loans (the positive class, `default = 1`) is significantly smaller than the number of loans that are paid off (the negative class, `default = 0`). Such imbalance can lead to biased models that favor the majority class, thus affecting predictive performance.

Special techniques like oversampling, undersampling, or cost-sensitive learning are often needed to ensure that the minority class is appropriately represented during model training.

**Update VM Dataset and Run Test**

In [None]:
from validmind.tests.data_validation.ClassImbalance import ClassImbalance

# From here on the tests run on the training split only.
vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

# Run the class imbalance test and display its result.
metric = ClassImbalance(test_context)
metric.run()
metric.result.show()

: 

### Feature Selection

#### Chi-Squared Test on Categorical Features

**Run Test**

In [None]:
from validmind.tests.data_validation.ChiSquaredFeaturesTable import ChiSquaredFeaturesTable

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

# Test every object/category column of the training split.
cat_features = get_categorical_columns(df_train)
# NOTE(review): "p_threshold" is presumably the significance level -- confirm against ChiSquaredFeaturesTable.
params = {"cat_features": cat_features,
          "p_threshold": 0.05}

metric = ChiSquaredFeaturesTable(test_context, params)
metric.run()
metric.result.show()

: 

#### ANOVA Test on Numerical Features

**Run Test**

In [None]:
from validmind.tests.data_validation.ANOVAOneWayTable import ANOVAOneWayTable

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

# NOTE(review): `get_numerical_columns` does not exclude the integer target
# column, so it is presumably part of `num_features` -- confirm this is intended.
num_features = get_numerical_columns(df_train)
params = {"num_features": num_features,
          "p_threshold": 0.05}

metric = ANOVAOneWayTable(test_context, params)
metric.run()
metric.result.show()

: 

#### Heatmap Correlation of Numerical Features

**Run Test**

In [None]:
from validmind.tests.data_validation.HeatmapFeatureCorrelations import HeatmapFeatureCorrelations

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

# Parameters forwarded to the test; "features" is left as None (no explicit feature list).
params = {"declutter": False,
          "features": None,
          "fontsize": 13}

metric = HeatmapFeatureCorrelations(test_context, params)
metric.run()
metric.result.show()

: 

#### Correlations of Numerical Features with Target Variable

**Run Test**

In [None]:
from validmind.tests.data_validation.FeatureTargetCorrelationPlot import FeatureTargetCorrelationPlot

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

# Parameters forwarded to the test; "features" is left as None (no explicit feature list).
params = {"declutter": False,
          "features": None,
          "fontsize": 13}

metric = FeatureTargetCorrelationPlot(test_context, params)
metric.run()
metric.result.show()

: 

#### Selection of Features

In [None]:
# Features removed by hand after the selection tests above.
drop_categorical_features = ['addr_state']
drop_numerical_features = ['total_rec_int', 'loan_amnt',
                           'funded_amnt_inv', 'dti', 'revol_util', 'total_pymnt', 
                           'total_pymnt_inv', 'last_pymnt_amnt',]

df_train.drop(columns = drop_categorical_features + drop_numerical_features, inplace=True)

# Update df_test so both splits keep the same columns
df_test.drop(columns = drop_categorical_features + drop_numerical_features, inplace=True)

: 

## Step 8: Feature Engineering

### Encoding of Numerical Features

In [None]:
import pandas as pd
import numpy as np

def _bucketize(values, bins, missing=-1):
    """Cut ``values`` into left-closed intervals after filling NaN with ``missing``."""
    # A categorical column cannot be filled with a value outside its
    # categories, so bring it back to plain floats first.
    if isinstance(values.dtype, pd.CategoricalDtype):
        values = values.astype(float)
    return pd.cut(values.fillna(missing), bins=bins, right=False, include_lowest=True)


def encode_numerical_features(df):
    """Replace the numerical features by interval buckets, in place.

    ``term`` is relabelled (' 36' -> '36M', ' 60' -> '60M'). Each of
    ``emp_length``, ``inq_last_6mths``, ``total_acc``, ``annual_inc``,
    ``int_rate``, ``installment`` and ``open_acc`` is replaced by a
    ``<name>_bucket`` column of left-closed intervals.

    Missing values are filled with -1 before cutting. For the features whose
    bins start at -1 they land in the ``[-1, 0)`` bucket; for ``emp_length``
    (bins start at 0) they fall outside every bin and stay NaN.

    All fills use plain assignment rather than ``df[col].fillna(...,
    inplace=True)``, which is a chained in-place call and does not update
    ``df`` under pandas copy-on-write.

    Parameters:
    df (pandas.DataFrame): Frame holding the columns listed above
        (modified in place).

    Returns:
    pandas.DataFrame: The same ``df`` object.
    """
    # term
    df['term'] = df['term'].replace({' 36': '36M', ' 60': '60M'})

    # emp_length: '10+' -> '10', then numeric (unparseable values become NaN)
    emp_length = pd.to_numeric(df['emp_length'].replace('10+', '10'), errors='coerce')
    df['emp_length_bucket'] = _bucketize(emp_length, [0, 1, 2, 3, 5, 8, 10, 999])

    # inq_last_6mths
    df['inq_last_6mths_bucket'] = _bucketize(
        df['inq_last_6mths'], [-1, 0, 1, 2, 3, 4, 5, 10, 25, 50]
    )

    # total_acc
    df['total_acc_bucket'] = _bucketize(
        df['total_acc'], [-1, 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 999]
    )

    # annual_inc: bucketed in thousands; the -1 fill is applied before scaling
    annual_inc_1000 = df['annual_inc'].fillna(-1) / 1000
    df['annual_inc_bucket'] = _bucketize(
        annual_inc_1000, [-1, 0, 10, 20, 30, 40, 50, 75, 100, 150, 250, 1000, 10000]
    )

    # int_rate
    df['int_rate_bucket'] = _bucketize(df['int_rate'], [-1, 0, 1, 2, 3, 4, 5, 10, 25, 50])

    # installment
    df['installment_bucket'] = _bucketize(
        df['installment'], [-1, 0, 100, 200, 300, 400, 500, 750, 1000, 1500]
    )

    # open_acc: the literal "N/A" is treated as one open account
    open_acc = df['open_acc'].replace("N/A", 1)
    df['open_acc_bucket'] = _bucketize(open_acc, [-1, 0, 1, 2, 3, 4, 5, 8, 10, 100])

    # The raw columns are fully represented by their buckets.
    df.drop(
        columns=['emp_length', 'inq_last_6mths', 'total_acc', 'annual_inc',
                 'int_rate', 'installment', 'open_acc'],
        inplace=True,
    )

    return df


# Bucket the numerical features of the training split.
df_train = encode_numerical_features(df_train)

# Update df_test with the same fixed bin edges
df_test = encode_numerical_features(df_test)

: 

In [None]:
def find_categorical_features(df):
    """Return the names of the columns whose dtype is ``category``."""
    category_frame = df.select_dtypes(include='category')
    return category_frame.columns.tolist()


def convert_categorical_to_object(df):
    """Cast every ``category`` column of ``df`` to ``str``, in place.

    Returns:
    pandas.DataFrame: The same ``df`` object.
    """
    columns = find_categorical_features(df)

    # Missing categories become their string representation as well.
    as_text = df[columns].astype(str)
    df[columns] = as_text

    return df

# Turn the interval buckets (category dtype) into plain strings.
df_train = convert_categorical_to_object(df_train)

# Update df_test
df_test = convert_categorical_to_object(df_test)

: 

In [None]:
# Describe the training split after bucketing (no target column declared here).
vm_df_train = vm.init_dataset(dataset=df_train)
test_context = TestContext(dataset=vm_df_train)

metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

: 

### WoE and IV

In [None]:
from validmind.tests.data_validation.WOEIVTable import WOEIVTable

# Update vm dataset and test context
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Configure test parameters

params = {
    "features": None,
    "order_by": ["Feature", "WoE"]
}

# Run test
metric = WOEIVTable(test_context, params=params)
metric.run()
# Keep the WoE/IV table as a notebook global; it is read again further down.
woe_iv_df = metric.result.metric.value['woe_iv']
metric.result.show()

: 

### Group Buckets 

In [None]:
import pandas as pd

def coarse_classing(df, mappings):
    """Merge groups of categories into single, combined categories.

    Parameters:
    df (pandas.DataFrame): Source frame; it is copied, not modified.
    mappings (dict): Maps a feature name to a list of merge sets. Every
        category found in a merge set is replaced by the label
        ``"[a,b,...]"`` built from that set's members.

    Returns:
    pandas.DataFrame: A copy of ``df`` with the merged categories.
    """
    merged = df.copy()

    for feature, merge_sets in mappings.items():
        for members in merge_sets:

            def relabel(value, members=members):
                # The label is only built when a value actually matches.
                if value in members:
                    return f"[{','.join(members)}]"
                return value

            merged[feature] = merged[feature].apply(relabel)

    return merged

# Create a dictionary of features and the sets to merge
# NOTE(review): '[40, 50)' in 'emp_length_bucket' is not an interval produced by the
# emp_length bins ([0, 1, 2, 3, 5, 8, 10, 999]); it looks like a copy-paste leftover
# and presumably never matches -- confirm and remove.
# NOTE(review): 'home_ownership' is merged into a single group here and the column
# is dropped right after the merge below.
mappings = {
    'sub_grade': [['B2','B3','B4','B5','C3','D1'], ['C1','C2','C4','C5'], ['D3','D4','D5','E3','G4'], ['E1','E2','E4','E5','F1','F2','F3','F4','G1','G2','G3','G5','F5']],
    'grade': [['F','G']],
    'purpose': [['wedding','major_purchase'], ['credit_card','car'], ['debt_consolidation','other','vacation'], ['medical','moving','house','educational'], ['renewable_energy','small_business']],
    'home_ownership': [['MORTGAGE','OWN','RENT']],
    'annual_inc_bucket': [['[250, 1000)','[100, 150)','[150, 250)','[1000, 10000)'], ['[50, 75)','[40, 50)'], ['[10, 20)','[0, 10)']],
    'emp_length_bucket': [['[2, 3)','[40, 50)','[3, 5)','[1, 2)','[0, 1)','[5, 8)','[8, 10)']],
    'inq_last_6mths_bucket': [['[4, 5)','[1, 2)'], ['[5, 10)','[3, 4)']],
    'installment_bucket': [['[300, 400)','[200, 300)','[0, 100)'], ['[400, 500)', '[500, 750)']],
    'total_acc_bucket': [['[20, 25)','[30, 35)','[15, 20)','[45, 50)','[40, 45)','[35, 40)','[10, 15)','[5, 10)']],
    'open_acc_bucket': [['[5, 8)','[8, 10)','[10, 100)','[4, 5)'], ['[1, 2)','[2, 3)']]
}

# Merge the categories, remove the rows whose home_ownership is OTHER or NONE,
# then drop the home_ownership column itself.
df_train = coarse_classing(df_train, mappings)
df_train = df_train[~df_train['home_ownership'].isin(['OTHER', 'NONE'])]
df_train.drop(columns="home_ownership", inplace=True)

# Update df_test with the same mappings and filters
df_test = coarse_classing(df_test, mappings)
df_test = df_test[~df_test['home_ownership'].isin(['OTHER', 'NONE'])]
df_test.drop(columns="home_ownership", inplace=True)

: 

In [None]:
def shorten_category_names(df, max_length=20, suffix="..."):
    """Truncate long string values in the ``object`` columns of a frame.

    Parameters:
    df (pandas.DataFrame): Source frame; it is copied, not modified.
    max_length (int): Strings longer than this are cut to ``max_length``
        characters and ``suffix`` is appended.
    suffix (str): Marker appended to truncated values.

    Returns:
    pandas.DataFrame: A copy of ``df`` with shortened category names.
        Non-string values (e.g. NaN/None in an object column) are left
        unchanged instead of raising ``TypeError`` on ``len()``.
    """

    def _shorten(value):
        if isinstance(value, str) and len(value) > max_length:
            return value[:max_length] + suffix
        return value

    # Create a copy of the DataFrame to avoid modifying the original
    df_new = df.copy()

    for feature in df_new.columns:
        # Only columns with the "object" data type hold category names
        if df_new[feature].dtype.name == 'object':
            df_new[feature] = df_new[feature].apply(_shorten)

    return df_new

# Shorten the merged category labels to at most 15 characters plus "...".
df_train = shorten_category_names(df_train, max_length=15, suffix="...")

# Update df_test with the same settings so the labels stay identical across splits
df_test = shorten_category_names(df_test, max_length=15, suffix="...")


: 

In [None]:
# Update vm dataset and test context (training split after coarse classing)
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Configure test parameters
params = {
    "features": None,
    "order_by": ["Feature", "WoE"]
}

# Run test
metric = WOEIVTable(test_context, params=params)
metric.run()
# Overwrite `woe_iv_df` with the table computed on the grouped categories;
# this is the table later passed to `woe_encoder`.
woe_iv_df = metric.result.metric.value['woe_iv']
metric.result.show()

: 

In [None]:
from validmind.tests.data_validation.WOEIVPlots import WOEIVPlots

# Update vm dataset and test context
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Parameters forwarded to the test.
params = {
    "features": None,
    "label_rotation": 90
}

# Run test
metric = WOEIVPlots(test_context, params=params)
metric.run()
metric.result.show()

: 

### Add WoE as Features

In [None]:
def woe_encoder(woe_df, original_df, target):
    """Replace each category by its Weight of Evidence (WoE) value.

    Parameters:
    woe_df (pandas.DataFrame): WoE table with the columns ``Feature``,
        ``Category`` and ``WoE``. If a (feature, category) pair appears more
        than once, its first row is used.
    original_df (pandas.DataFrame): Frame holding the categorical features
        and the target. It is not modified.
    target (str): Name of the target column, copied to the result unchanged.

    Returns:
    pandas.DataFrame: One float column per feature listed in ``woe_df`` (in
        order of first appearance), followed by ``target``, on the index of
        ``original_df``. Missing feature values stay NaN.

    Raises:
    ValueError: If a feature holds a category that is not in ``woe_df``.
    """
    encoded = {}

    for feature in woe_df['Feature'].unique():
        feature_rows = woe_df[woe_df['Feature'] == feature].drop_duplicates(subset='Category')
        woe_by_category = dict(zip(feature_rows['Category'], feature_rows['WoE']))

        column = original_df[feature]

        # Fail with an explicit message rather than producing silent NaNs (or
        # an opaque float-conversion error) when a category was never seen
        # while the WoE table was built.
        is_unseen = column.notna() & ~column.isin(list(woe_by_category))
        if is_unseen.any():
            unseen = sorted(map(str, column[is_unseen].unique()))
            raise ValueError(
                f"Feature '{feature}' has categories without a WoE value: {unseen}"
            )

        # A single vectorised lookup instead of one `.loc` assignment per category.
        encoded[feature] = column.map(woe_by_category).astype(float).to_numpy()

    woe_encoded_df = pd.DataFrame(encoded, index=original_df.index)

    # Add the target column to the new DataFrame
    woe_encoded_df[target] = original_df[target].to_numpy()

    return woe_encoded_df


# Encode the training split with the WoE table computed on it above.
df_train = woe_encoder(woe_iv_df, df_train, target='default')

# Update df_test using the same (training) WoE table
df_test = woe_encoder(woe_iv_df, df_test, target='default')

: 

In [None]:
# Describe the WoE-encoded training split (no target column declared here).
vm_df_train = vm.init_dataset(dataset=df_train)
test_context = TestContext(dataset=vm_df_train)

metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

: 

In [None]:
from validmind.tests.data_validation.FeatureTargetCorrelationPlot import FeatureTargetCorrelationPlot

# Same test as in the feature-selection section, now on the WoE-encoded training split.
vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

params = {"declutter": False,
          "features": None,
          "fontsize": 13}

metric = FeatureTargetCorrelationPlot(test_context, params)
metric.run()
metric.result.show()

: 

In [None]:
# Same test as in the feature-selection section, now on the WoE-encoded training split.
vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

params = {"declutter": False,
          "features": None,
          "fontsize": 13}

metric = HeatmapFeatureCorrelations(test_context, params)
metric.run()
metric.result.show()

: 

In [None]:
import statsmodels.api as sm

# Rebuild the design matrices from the WoE-encoded frames.
y_train = df_train[target_column]
X_train = df_train.drop(target_column, axis=1)

# Add constant to X_train for intercept term
X_train = sm.add_constant(X_train)
df_train = pd.concat([X_train, y_train], axis=1)

# Update df_test
y_test = df_test[target_column]
X_test = df_test.drop(target_column, axis=1)
X_test = sm.add_constant(X_test)
df_test = pd.concat([X_test, y_test], axis=1)
# Align the columns of df_test with df_train; columns missing from df_test are filled with 0.
# NOTE(review): only df_test is realigned here -- X_test itself keeps its own columns.
df_test = df_test.reindex(labels=df_train.columns, axis=1, fill_value=0)

# Define the model (GLM with a Binomial family)
model = sm.GLM(y_train, X_train, family=sm.families.Binomial())

# Fit the model
model_fit_glm = model.fit()

# Print out the statistics
print(model_fit_glm.summary())

: 

In [None]:
from sklearn.metrics import roc_auc_score
import pandas as pd

def compute_auc(y_true, y_scores):
    """Return the area under the ROC curve (AUC) for the given labels and scores."""
    return roc_auc_score(y_true, y_scores)

def compute_gini(y_true, y_scores):
    """Return the Gini coefficient, derived from the AUC as ``2 * AUC - 1``."""
    area = compute_auc(y_true, y_scores)
    return 2*area - 1

def compute_metrics(model, X_train, y_train, X_test, y_test):
    """Build a table of AUC and GINI for the train and test sets.

    Parameters:
    model: Fitted model exposing ``predict(X)``.
    X_train, y_train: Training features and labels.
    X_test, y_test: Test features and labels.

    Returns:
    pandas.DataFrame: Two rows ("Train", "Test") with the columns
        ``Dataset``, ``AUC`` and ``GINI``.
    """
    auc_values = []
    gini_values = []

    for features, labels in ((X_train, y_train), (X_test, y_test)):
        # Scores predicted by the model for this split
        predicted = model.predict(features)

        auc_values.append(compute_auc(labels, predicted))
        gini_values.append(compute_gini(labels, predicted))

    # One row per split, for nicer display
    return pd.DataFrame({"Dataset": ["Train", "Test"],
                         "AUC": auc_values,
                         "GINI": gini_values})

# AUC / GINI of the fitted GLM on both splits.
metrics_df = compute_metrics(model_fit_glm, X_train, y_train, X_test, y_test)
display(metrics_df)

: 

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_scores):
    """Plot the ROC curve of ``y_scores`` against ``y_true`` with matplotlib.

    The legend shows the area under the curve; the dashed diagonal is the
    reference line from (0, 0) to (1, 1).
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_scores)
    area = auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %0.2f)' % area

    plt.figure()

    # Model curve first, then the reference diagonal.
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=curve_label)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Axes
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

# Score the test set with the fitted GLM and plot its ROC curve.
# `y_scores` has to be computed here: it was previously only a local variable
# inside `compute_metrics`, so this call raised a NameError.
y_scores = model_fit_glm.predict(X_test)
plot_roc_curve(y_test, y_scores)

: 

In [None]:
# Create the VM datasets for the train and test splits
vm_train_ds = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
vm_test_ds = vm.init_dataset(dataset=df_test,
                        target_column=target_column)

# Create VM model from the fitted GLM and both datasets
vm_model_glm = vm.init_model(
    model = model_fit_glm, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)

: 

In [None]:
from validmind.tests.model_validation.sklearn.ConfusionMatrix import ConfusionMatrix

# Model-level test: the context is built from the VM model rather than from a dataset.
test_context = TestContext(model= vm_model_glm)
metric = ConfusionMatrix(test_context)
metric.run()
metric.result.show()

: 

In [None]:
from validmind.tests.model_validation.sklearn.ROCCurve import ROCCurve

# Model-level test: the context is built from the VM model rather than from a dataset.
test_context = TestContext(model= vm_model_glm)

metric = ROCCurve(test_context)
metric.run()
metric.result.show()

: 

TODO: add more plots comparing in-sample and out-of-sample performance.

### Scorecard Development

#### Distribution of Probability of Default

In [None]:
def compute_pd(model_fit, X_train):
    """Return a copy of ``X_train`` with the predicted probability of default.

    Parameters:
    model_fit: Fitted model whose ``predict(X)`` returns one probability per
        row of ``X``.
    X_train (pandas.DataFrame): Design matrix the model was fitted on (or a
        frame with the same columns). It is not modified, so it can still be
        passed to ``model_fit.predict`` afterwards.

    Returns:
    pandas.DataFrame: Copy of ``X_train`` with an extra ``PD`` column.
    """
    # `predict` returns the probability of default directly: one value per
    # row, not one column per class.
    probabilities = model_fit.predict(X_train)

    # Add PD to a copy; adding it to the caller's frame would change the
    # design matrix and break any later `predict` call on it.
    scored = X_train.copy()
    scored['PD'] = probabilities

    return scored

# Attach the predicted PD to the training features, then append the target.
X_train_pd = compute_pd(model_fit_glm, X_train)
df_train_pd = pd.concat([X_train_pd, y_train], axis=1)

# Update df_test
X_test_pd = compute_pd(model_fit_glm, X_test)
df_test_pd = pd.concat([X_test_pd, y_test], axis=1)

: 

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_pd_histogram(df_train, df_test, pd_col, target_col):
    """Show overlaid PD histograms per target class, for train and test data.

    Parameters:
    df_train, df_test (pandas.DataFrame): Frames holding ``pd_col`` and ``target_col``.
    pd_col (str): Column with the probability of default.
    target_col (str): Binary target column (values 0 and 1).
    """

    def pd_for_class(frame, cls):
        return frame[frame[target_col] == cls][pd_col]

    # (subplot column, values, trace name), in the order the traces are added
    panels = [
        (1, pd_for_class(df_train, 0), f'Train {target_col} = 0'),
        (1, pd_for_class(df_train, 1), f'Train {target_col} = 1'),
        (2, pd_for_class(df_test, 0), f'Test {target_col} = 0'),
        (2, pd_for_class(df_test, 1), f'Test {target_col} = 1'),
    ]

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"))

    for subplot_col, values, trace_name in panels:
        histogram = go.Histogram(x=values, opacity=0.75, name=trace_name)
        fig.add_trace(histogram, row=1, col=subplot_col)

    # Overlay (rather than stack) the two classes within each subplot
    fig.update_layout(barmode='overlay', title_text='Histogram of Probability of Default')

    fig.show()

# PD distribution per target class, train vs. test.
plot_pd_histogram(df_train_pd,
                  df_test_pd, 
                  pd_col='PD', 
                  target_col=target_column)

: 

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_cumulative_pd(df_train, df_test, pd_col, target_col):
    """Show cumulative PD curves per target class, for train and test data.

    For each class the PDs are sorted in ascending order and plotted against
    their running sum divided by the class total. The y-axis is therefore the
    cumulative share of the summed PD, not the share of observations.

    Parameters:
    df_train, df_test (pandas.DataFrame): Frames holding ``pd_col`` and ``target_col``.
    pd_col (str): Column with the probability of default.
    target_col (str): Binary target column (values 0 and 1).
    """

    def sorted_pd(frame, cls):
        return np.sort(frame[frame[target_col] == cls][pd_col])

    def cumulative_share(values):
        return np.cumsum(values) / np.sum(values)

    # (subplot column, sorted PDs, trace name), in the order the traces are added
    panels = [
        (1, sorted_pd(df_train, 0), f'Train {target_col} = 0'),
        (1, sorted_pd(df_train, 1), f'Train {target_col} = 1'),
        (2, sorted_pd(df_test, 0), f'Test {target_col} = 0'),
        (2, sorted_pd(df_test, 1), f'Test {target_col} = 1'),
    ]
    curves = [
        (subplot_col, values, cumulative_share(values), trace_name)
        for subplot_col, values, trace_name in panels
    ]

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"))

    for subplot_col, values, cumulative, trace_name in curves:
        line = go.Scatter(x=values, y=cumulative, mode='lines', name=trace_name)
        fig.add_trace(line, row=1, col=subplot_col)

    fig.update_layout(title_text='Cumulative Probability of Default')

    fig.show()

# Cumulative PD curves per target class, train vs. test.
plot_cumulative_pd(df_train_pd,
                  df_test_pd, 
                  pd_col='PD', 
                  target_col=target_column)

: 

#### Distribution of Credit Scores

In [None]:
def compute_credit_score(model_fit, X_train, target_score, target_odds, pdo):
    """
    Convert a fitted logistic model's linear predictor into scorecard points.

    The score of each row is

        score = offset + factor * (alpha + sum_i beta_i * x_i)

    with factor = pdo / ln(2) and offset = target_score - factor * ln(target_odds).
    This is the usual per-feature scorecard sum
    sum_i ((beta_i * x_i + alpha / n) * factor + offset / n) written in closed form.

    Coefficients are matched to the columns of `X_train` by NAME (the index of
    `model_fit.params`), so column order and extra columns in `X_train` do not
    matter. A parameter named 'const' (the name `sm.add_constant` gives the
    intercept column) is used as the intercept alpha; if the model has no such
    parameter, alpha is 0.

    Parameters:
    model_fit: Fitted statsmodels results object whose `params` is a pandas Series
        indexed by feature name.
    X_train (pandas.DataFrame): Feature values; must contain every non-intercept
        feature named in `model_fit.params`.
    target_score (float): Score assigned at `target_odds`.
    target_odds (float): Odds at which the score equals `target_score`.
    pdo (float): Points needed to double the odds.

    Returns:
    pandas.DataFrame: The SAME `X_train` object, with a 'score' column added in place.

    Raises:
    KeyError: If `X_train` lacks a column for one of the model's features.
    """
    params = model_fit.params

    # Separate the intercept (if any) from the feature coefficients
    intercept_name = 'const'
    alpha = params[intercept_name] if intercept_name in params.index else 0.0
    betas = params.drop(labels=intercept_name, errors='ignore')

    missing = [name for name in betas.index if name not in X_train.columns]
    if missing:
        raise KeyError(f"X_train is missing model feature column(s): {missing}")

    # Calculate factor
    factor = pdo / np.log(2)

    # Calculate offset
    offset = target_score - (factor * np.log(target_odds))

    # Linear predictor for all rows at once, with columns aligned to the coefficients by name
    feature_values = X_train.loc[:, list(betas.index)].to_numpy(dtype=float)
    linear_predictor = feature_values @ betas.to_numpy(dtype=float) + alpha

    # Add scores as a new column in X_train (in place, as callers rely on the returned frame)
    X_train['score'] = linear_predictor * factor + offset

    return X_train


# Set target_score, target_odds, and pdo
target_score = 600
target_odds = 50
pdo = 20

# Compute credit scores and add to df_train
X_train_scores = compute_credit_score(model_fit_glm, X_train_pd, target_score, target_odds, pdo)
df_train_scores = pd.concat([X_train_scores, y_train], axis=1)

# Update df_test 
X_test_scores = compute_credit_score(model_fit_glm, X_test_pd, target_score, target_odds, pdo)
df_test_scores = pd.concat([X_test_scores, y_test], axis=1)

: 

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_score_histogram(df_train, df_test, score_col, target_col):
    """
    Show overlaid histograms of the scores of each target class, with the
    training data in the left subplot and the test data in the right one.

    Parameters:
    df_train (pandas.DataFrame): Training data containing `score_col` and `target_col`.
    df_test (pandas.DataFrame): Test data containing `score_col` and `target_col`.
    score_col (str): Name of the column holding the scores.
    target_col (str): Name of the binary (0/1) target column.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    figure = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"))

    datasets = (("Train", df_train), ("Test", df_test))
    for subplot_col, (label, frame) in enumerate(datasets, start=1):
        # One histogram per target class, class 0 first
        for target_value in (0, 1):
            class_scores = frame[frame[target_col] == target_value][score_col]
            histogram = go.Histogram(
                x=class_scores,
                opacity=0.75,
                name=f'{label} {target_col} = {target_value}',
            )
            figure.add_trace(histogram, row=1, col=subplot_col)

    # Overlay (rather than stack) the two classes within each subplot
    figure.update_layout(barmode='overlay', title_text='Histogram of Scores')
    figure.show()

plot_score_histogram(df_train_scores, 
                     df_test_scores, 
                     score_col='score', 
                     target_col=target_column)

: 

Method B: convert each model coefficient directly into scorecard points.

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

def calculate_credit_scores(model, scaling_factor=None, base_points=None):
    """
    Map each coefficient of a fitted model to scorecard points.

    The points for a coefficient are `base_points - scaling_factor * coefficient`,
    i.e. `base_points - scaling_factor * ln(odds_ratio)` with
    odds_ratio = exp(coefficient). The coefficient is used directly instead of
    computing log(exp(coefficient)), which overflows to +/-inf for large
    coefficients and loses precision otherwise.

    Parameters:
    model: Fitted statsmodels results object whose `params` is a pandas Series.
    scaling_factor (float, optional): Points per unit of log-odds.
        Defaults to 20 / ln(2), i.e. 20 points to double the odds.
    base_points (float, optional): Points for a coefficient of zero. Defaults to 500.

    Returns:
    pandas.DataFrame: Columns 'Feature' and 'Score', sorted by 'Score' in descending order.
    """
    # Set default values if not provided
    if scaling_factor is None:
        scaling_factor = 20 / np.log(2)
    if base_points is None:
        base_points = 500

    # Coefficients are already log odds ratios
    log_odds_ratios = np.asarray(model.params.values, dtype=float).reshape(-1)

    # Calculate the scores for each coefficient
    points = base_points - scaling_factor * log_odds_ratios

    # Pair each feature name with its score, highest score first
    feature_scores = pd.DataFrame({'Feature': model.params.index, 'Score': points})
    return feature_scores.sort_values(by='Score', ascending=False)


scores = calculate_credit_scores(model_fit_glm)
display(scores)

: 

## Model Training 

Separating Features and Target Variables for Training and Test Sets.

In [None]:
X_train = df_train.drop(target_column, axis=1)  
y_train = df_train[target_column]  

X_test = df_test.drop(target_column, axis=1)  
y_test = df_test[target_column]

: 

In [None]:
vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

params = {"features": None, 
          "declutter": False,
          "fontsize": 13}

metric = HeatmapFeatureCorrelations(test_context, params)
metric.run()
metric.result.show()

: 

In [None]:
vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

features = get_numerical_columns(df_train)
params = {"declutter": True,
          "features": None,
          "fontsize": 13}

metric = FeatureTargetCorrelationPlot(test_context, params)
metric.run()
metric.result.show()

: 

In [None]:
vm_df = vm.init_dataset(dataset=X_train)
test_context = TestContext(dataset=vm_df)

metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

: 

### Feature Statistical Significance 

Train a GLM Logistic Regression Model.

In [None]:
import statsmodels.api as sm

# Add constant to X_train for intercept term
#X_train = sm.add_constant(X_train)

# Define the model
model = sm.GLM(y_train, X_train, family=sm.families.Binomial())

# Fit the model
model_fit = model.fit()

# Print out the statistics
print(model_fit.summary())

: 

**Run VM Test**

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionModelsCoeffs import RegressionModelsCoeffs

# Create VM test and train datasets
vm_train_ds = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
vm_test_ds = vm.init_dataset(dataset=df_test,
                        target_column=target_column)

# Create VM model
vm_model_reg = vm.init_model(
    model = model_fit, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)
list_of_models = [vm_model_reg]
test_context = TestContext(models=list_of_models)

# Run test
metric = RegressionModelsCoeffs(test_context)
metric.run()
metric.result.show()

: 

Statistical Significance of Features.

**Run VM Test**

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionFeatureSignificance import RegressionFeatureSignificance

params = {"p_threshold": 0.1,
          "fontsize": 12}

metric = RegressionFeatureSignificance(test_context, params)
metric.run()
metric.result.show()

: 

### Feature Importance

Build a Decision Tree model to calculate feature importance on all preliminary features.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
tree_model = DecisionTreeClassifier(random_state=0)

# Fit the model
tree_model_fit = tree_model.fit(X_train, y_train)

: 

In [None]:
from validmind.tests.model_validation.sklearn.PermutationFeatureImportance import PermutationFeatureImportance

# Create VM model
vm_model_pfi = vm.init_model(
    model = tree_model_fit, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)

test_context = TestContext(model=vm_model_pfi)

params = {"fontsize": None,
          "figure_height": 1000}

metric = PermutationFeatureImportance(test_context, params)
metric.run()
metric.result.show()

: 

### Feature Importance vs Significance

In [None]:
from validmind.tests.model_validation.statsmodels.FeatureImportanceAndSignificance import FeatureImportanceAndSignificance

test_context = TestContext(models=[vm_model_reg, vm_model_pfi])

params = {"fontsize": 12,
          "p_threshold": 0.05,
          "significant_only": False,
          "figure_height": 1000,
          "bar_width": 0.4}

metric = FeatureImportanceAndSignificance(test_context, params)
metric.run()
metric.result.show()

: 

### Drop Features

In [None]:
drop_features = ['total_acc', 'purpose__house', 'purpose__medical', 'home_ownership__OTHER', 
                 'purpose__vacation', 'purpose__renewable_energy', 'grade__F', 
                 'purpose__major_purchase', 'purpose__wedding', 'purpose__home_improvement', 'grade__G',
                 'purpose__moving', 'purpose__other', 'verification_status__Source Verified']

X_train.drop(drop_features, axis=1, inplace=True)
X_test.drop(drop_features, axis=1, inplace=True)

# If y_train and y_test are Series objects, convert them to DataFrame
if isinstance(y_train, pd.Series):
    y_train = y_train.to_frame()
if isinstance(y_test, pd.Series):
    y_test = y_test.to_frame()

# Concatenate X_train with y_train and X_test with y_test
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

: 

### Fit GLM Model 

In [None]:
# Update VM dataset
vm_train_ds = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
vm_test_ds = vm.init_dataset(dataset=df_test,
                        target_column=target_column)

# Fit model
# X_train = sm.add_constant(X_train) #BUG: need to fix model.py to support models with intercept
model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
model_fit_glm = model.fit()
print(model_fit_glm.summary())


# Add constant to the input data if necessary
# X_test = sm.add_constant(X_test) #BUG: fix model.py to support intercepts in regression models 
y_pred = model_fit_glm.predict(X_test)

: 

**Compute Metric Risk Scores**

: 

**Define Metric Risk Thresholds**

: 

**Compute Metric Risk Scoring** 

: 

In [None]:
# Create VM model
vm_model_glm = vm.init_model(
    model = model_fit_glm, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)

: 

### Fit Decision Tree Model

In [None]:
# Fit Decision Tree model
# Use a fresh estimator instead of re-fitting `tree_model`: `fit` returns the
# estimator itself, so `tree_model.fit(...)` would silently re-train the object
# already wrapped by `vm_model_pfi` (fitted on the full feature set) on the
# reduced feature set.
model_fit_tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Create VM model
vm_model_tree = vm.init_model(
    model = model_fit_tree, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)


: 

## Model Evaluation

### Model Performance Metrics

**Define Model Risk Scoring Thresholds**

In [None]:
import pandas as pd

def risk_scoring_thresholds(metric_ranges):
    """
    Split each metric's [min, max] range into three equal-width bands.

    Parameters:
    metric_ranges (dict): Maps a metric name to a (min_value, max_value) pair.

    Returns:
    pandas.DataFrame: One row per metric with columns 'Metric', 'RED', 'AMBER'
    and 'GREEN'; each band is a [start, end] list rounded to 2 decimals, with
    RED the lowest third and GREEN the highest third.
    """
    def _bands(low, high):
        # Cut points at one third and two thirds of the range
        first_cut = low + (high - low) / 3
        second_cut = low + 2 * (high - low) / 3
        return (
            [round(low, 2), round(first_cut, 2)],
            [round(first_cut, 2), round(second_cut, 2)],
            [round(second_cut, 2), round(high, 2)],
        )

    records = [
        [name, *_bands(low, high)]
        for name, (low, high) in metric_ranges.items()
    ]

    return pd.DataFrame(records, columns=["Metric", "RED", "AMBER", "GREEN"])

metric_ranges = {
    "Accuracy": [0.1, 0.9],
    "ROC-AUC": [0.1, 0.9],
    "Precision": [0.1, 0.9],
    "Recall": [0.1, 0.9],
    "F1": [0.1, 0.9]
}

risk_thresholds = risk_scoring_thresholds(metric_ranges)
display(risk_thresholds)

: 

: 

### Confusion Matrix

In [None]:
from validmind.tests.model_validation.sklearn.ConfusionMatrix import ConfusionMatrix

test_context = TestContext(model= vm_model_glm)

metric = ConfusionMatrix(test_context)
metric.run()
metric.result.show()

: 

### ROC-AUC Curve

In [None]:
from validmind.tests.model_validation.sklearn.ROCCurve import ROCCurve

test_context = TestContext(model= vm_model_glm)

metric = ROCCurve(test_context)
metric.run()
metric.result.show()

: 

In [None]:
from validmind.tests.model_validation.sklearn.MinimumROCAUCScore import MinimumROCAUCScore

test_context = TestContext(model= vm_model_glm)

metric = MinimumROCAUCScore(test_context)
metric.run()
metric.result.show()

: 

### GINI Coefficients

In [None]:
from sklearn.metrics import roc_auc_score

def gini(true, pred):
    """
    Return the Gini coefficient, computed as 2 * ROC-AUC - 1.

    Parameters:
    true: True labels, passed to `roc_auc_score` as its first argument.
    pred: Predicted scores, passed to `roc_auc_score` as its second argument.
    """
    area_under_curve = roc_auc_score(true, pred)
    return 2 * area_under_curve - 1

gini_coefficient = gini(y_test, y_pred)
(f"Gini Coefficient: {gini_coefficient}")


: 

: 

### Scorecard Development

### Map Model Fit Coefficients to Scores

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

def calculate_scores(model, scaling_factor=None, base_points=None):
    """
    Map each coefficient of a fitted model to scorecard points.

    The points for a coefficient are `base_points - scaling_factor * coefficient`,
    i.e. `base_points - scaling_factor * ln(odds_ratio)` with
    odds_ratio = exp(coefficient). The coefficient is used directly instead of
    computing log(exp(coefficient)), which overflows to +/-inf for large
    coefficients and loses precision otherwise.

    Parameters:
    model: Fitted statsmodels results object whose `params` is a pandas Series.
    scaling_factor (float, optional): Points per unit of log-odds.
        Defaults to 20 / ln(2), i.e. 20 points to double the odds.
    base_points (float, optional): Points for a coefficient of zero. Defaults to 500.

    Returns:
    pandas.DataFrame: Columns 'Feature' and 'Score', sorted by 'Score' in descending order.
    """
    # Set default values if not provided
    if scaling_factor is None:
        scaling_factor = 20 / np.log(2)
    if base_points is None:
        base_points = 500

    # Coefficients are already log odds ratios
    log_odds_ratios = np.asarray(model.params.values, dtype=float).reshape(-1)

    # Calculate the scores for each coefficient
    points = base_points - scaling_factor * log_odds_ratios

    # Pair each feature name with its score, highest score first
    feature_scores = pd.DataFrame({'Feature': model.params.index, 'Score': points})
    return feature_scores.sort_values(by='Score', ascending=False)


scores = calculate_scores(model_fit)
display(scores)

: 

# Appendix 1

## Model Risk Assessment

### **Model Fit Metric** Risk Scores

**Define Metric Risk Thresholds**

In [None]:
model_fit_risk_thresholds = {
    "D-Squared": {
        "red": [0, 0.4],
        "amber": [0.4, 0.7],
        "green": [0.7, 1.0]
    },
    "Ratio of Significant Features": {
        "red": [0, 40],
        "amber": [40, 70],
        "green": [70, 100]
    }
}

: 

**Compute Metric Risk Scores**

In [None]:
def regression_model_fit_risk_scores(model_fit_glm):
    """
    Compute the model-fit risk measures of a fitted GLM.

    Parameters:
    model_fit_glm: Fitted statsmodels GLM results exposing `deviance`,
        `null_deviance` and `pvalues`.

    Returns:
    pandas.DataFrame: Columns 'Metric Risk Measure', 'Description' and
    'Metric Risk Score' (rounded to 1 decimal), with one row for each of:
      - 'D-Squared': 1 - deviance / null_deviance.
      - 'Ratio of Significant Features': percentage (0-100) of parameters
        whose p-value is below 0.05.
    """
    # Risk Measure 1: D Squared
    d_squared = 1 - (model_fit_glm.deviance / model_fit_glm.null_deviance)

    # Risk Measure 2: Percentage of features with p-value less than 0.05.
    # Expressed on a 0-100 scale so that it matches its description and the
    # 0-100 bands in `model_fit_risk_thresholds`; a 0-1 fraction would always
    # land in the lowest band.
    pvalues = model_fit_glm.pvalues
    significant_features = np.sum(pvalues < 0.05)
    total_features = pvalues.shape[0]
    percent_significant_features = 100 * significant_features / total_features

    # Create DataFrame
    data = {
        "Metric Risk Measure": ["D-Squared", "Ratio of Significant Features"],
        "Description": [
            "D-Squared: Proportion of the variability in the response variable explained by the model.",
            "Ratio of Significant Features: Percentage of features with a p-value less than 0.05."
        ],
        "Metric Risk Score": [d_squared, percent_significant_features],
    }

    risk_scores = pd.DataFrame(data)

    # Round to 1 decimal place
    risk_scores["Metric Risk Score"] = risk_scores["Metric Risk Score"].round(1)

    return risk_scores

model_fit_risk_scores = regression_model_fit_risk_scores(model_fit_glm)
display(model_fit_risk_scores)

: 

### **Model Performance Metric** Risk Scores

**Define Metric Risk Thresholds**

In [None]:
model_performance_risk_thresholds = {
    "Accuracy": {
        "red": [0, 0.5],
        "amber": [0.5, 0.75],
        "green": [0.75, 1.0]
    },
    "ROC-AUC": {
        "red": [0, 0.6],
        "amber": [0.6, 0.85],
        "green": [0.85, 1.0]
    },
    "Precision": {
        "red": [0, 0.4],
        "amber": [0.4, 0.6],
        "green": [0.6, 1.0]
    },
    "Recall": {
        "red": [0, 0.4],
        "amber": [0.4, 0.6],
        "green": [0.6, 1.0]
    },
    "F1": {
        "red": [0, 0.4],
        "amber": [0.4, 0.6],
        "green": [0.6, 1.0]
    }
}

: 

**Compute Metric Risk Scores**

In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

def regression_performance_risk_scores(y_true, y_pred_probs, threshold=0.5):
    """
    Compute classification performance measures from predicted probabilities.

    Parameters:
    y_true: True binary labels.
    y_pred_probs: Predicted probabilities; must support `> threshold` followed
        by `.astype(int)` (e.g. a pandas Series or numpy array).
    threshold (float): A prediction counts as positive when its probability
        is strictly greater than this value.

    Returns:
    pandas.DataFrame: Columns 'Metric Risk Measure', 'Description' and
    'Metric Risk Score', with one row each for Accuracy, ROC-AUC, Precision,
    Recall and F1. ROC-AUC is computed from the probabilities; the other
    measures from the thresholded labels.
    """
    # Binary predictions derived from the probabilities
    predicted_labels = (y_pred_probs > threshold).astype(int)

    # (name, description, value) for every measure, in display order
    measures = [
        (
            "Accuracy",
            "Proportion of the total number of predictions that were correct.",
            accuracy_score(y_true, predicted_labels),
        ),
        (
            "ROC-AUC",
            "Aggregate measure of performance across all possible classification thresholds.",
            roc_auc_score(y_true, y_pred_probs),
        ),
        (
            "Precision",
            "Proportion of positive identifications that were actually correct.",
            precision_score(y_true, predicted_labels),
        ),
        (
            "Recall",
            "Proportion of actual positives that were identified correctly.",
            recall_score(y_true, predicted_labels),
        ),
        (
            "F1",
            "Harmonic mean of precision and recall, it tries to find the balance between precision and recall.",
            f1_score(y_true, predicted_labels),
        ),
    ]

    names, descriptions, values = zip(*measures)
    return pd.DataFrame({
        "Metric Risk Measure": list(names),
        "Description": list(descriptions),
        "Metric Risk Score": list(values),
    })

y_pred = model_fit_glm.predict(X_test)
model_performance_risk_scores = regression_performance_risk_scores(y_test, y_pred)
display(model_performance_risk_scores)


: 

### **Model Risk Assessment** 

In [None]:
def metric_risk_assessment(risk_thresholds, test_results):
    """
    Assign a risk level (RED / AMBER / GREEN / GREY) to every metric score.

    Thresholds are matched to results by metric NAME and bands by band NAME
    (case-insensitive 'red' / 'amber' / 'green'), so the order of the metrics
    and of the bands in `risk_thresholds` does not have to match the row order
    of `test_results`.

    Parameters:
    risk_thresholds (dict): Maps a metric name to a dict of bands, e.g.
        {"Accuracy": {"red": [0, 0.5], "amber": [0.5, 0.75], "green": [0.75, 1.0]}}.
        A band that is not a 2-element list/tuple is ignored.
    test_results (pandas.DataFrame): Must provide the columns 'Metric Risk Measure'
        (or 'Risk Measure'), 'Metric Risk Score' (or 'Risk Score') and 'Description'.
        It is not modified.

    Returns:
    pandas.DataFrame: Columns 'Metric Risk Measure', 'Description',
    'Metric Risk Score' and 'Metric Risk Assessment'. The assessment is the first
    of RED, AMBER, GREEN whose inclusive [start, end] band contains the score
    (so a score on a shared boundary gets the lower band), 'GREY' when the score
    is in no band, and None when the metric has no entry in `risk_thresholds`.

    Raises:
    IndexError: If a metric in `risk_thresholds` has no row in `test_results`.
    """
    measure_col = "Metric Risk Measure"
    score_col = "Metric Risk Score"
    band_levels = ("RED", "AMBER", "GREEN")

    # `rename` returns a new frame, so the caller's DataFrame is left untouched
    results = test_results.rename(columns={"Risk Measure": measure_col, "Risk Score": score_col})

    # Every thresholded metric must have been measured
    reported = set(results[measure_col])
    missing = [metric for metric in risk_thresholds if metric not in reported]
    if missing:
        raise IndexError(f"No test result found for metric(s): {missing}")

    def _assess(metric, score):
        bands = risk_thresholds.get(metric)
        if bands is None:
            return None
        bands_by_level = {str(key).upper(): bounds for key, bounds in bands.items()}
        for level in band_levels:
            bounds = bands_by_level.get(level)
            is_range = isinstance(bounds, (list, tuple)) and len(bounds) == 2
            if is_range and bounds[0] <= score <= bounds[1]:
                return level
        # The score is outside every defined band
        return "GREY"

    assessments = [
        _assess(metric, score)
        for metric, score in zip(results[measure_col], results[score_col])
    ]
    results["Metric Risk Assessment"] = pd.Series(assessments, index=results.index, dtype=object)

    # Reorder the columns to desired order
    return results[[measure_col, 'Description', score_col, 'Metric Risk Assessment']]

def extract_range(value):
    """
    Return the (start, end) bounds of a 2-element list or tuple.

    Any other value (wrong type or wrong length) yields (None, None).
    """
    is_pair = isinstance(value, (list, tuple)) and len(value) == 2
    if not is_pair:
        return None, None
    lower, upper = value
    return lower, upper

def color_cells(val):
    """
    Return the CSS background-color declaration for a risk level.

    `val` must be one of "GREEN", "AMBER", "RED" or "GREY"; any other value
    raises KeyError.
    """
    palette = {"GREEN": "green", "AMBER": "yellow", "RED": "red", "GREY": "grey"}
    shade = palette[val]
    return f'background-color: {shade}'

: 

In [None]:
# Compute risk assessments for all metrics
model_fit_risk_assessment = metric_risk_assessment(model_fit_risk_thresholds, model_fit_risk_scores)
model_performance_risk_assessment = metric_risk_assessment(model_performance_risk_thresholds, model_performance_risk_scores)

model_risk_assessment = pd.concat([model_performance_risk_assessment, model_fit_risk_assessment]).reset_index(drop=True)
model_risk_assessment.style.applymap(color_cells, subset=['Metric Risk Assessment'])

: 

# Appendix 2

## Step 8: Univariate Analysis

### Histograms of Numerical Features

**Update VM Dataset and Run Test**

In [None]:
from validmind.tests.data_validation.TabularNumericalHistograms import TabularNumericalHistograms

vm_df_train = vm.init_dataset(dataset=df_train)
test_context = TestContext(dataset=vm_df_train)

metric = TabularNumericalHistograms(test_context)
metric.run()
metric.result.show()

: 

In [None]:
# Inspect the distinct values taken by the 'inq_last_6mths' column of df
unique_values = df['inq_last_6mths'].unique()
print(unique_values)


: 

### High Cardinality of Categorical Features

**Run Test**

In [None]:
from validmind.tests.data_validation.HighCardinality import HighCardinality
metric = HighCardinality(test_context)
metric.run()
metric.result.show()

: 

### Bar Plots of Categorical Features

**Run Test**

In [None]:
from validmind.tests.data_validation.TabularCategoricalBarPlots import TabularCategoricalBarPlots
metric = TabularCategoricalBarPlots(test_context)
metric.run()
metric.result.show()

: 

### Default Ratios by Categorical Feature

**Run Test**

In [None]:
from validmind.tests.data_validation.DefaultRateBarPlots import DefaultRatioBarPlots

# Configure the metric
params = {
    "default_column": target_column,
    "columns": None
}

metric = DefaultRatioBarPlots(test_context, params=params)
metric.run()
metric.result.show()

: 

## Step 9: Multivariate Analysis

### Bivariate Bar Plots of Default Ratios

**Update VM Dataset and Run Test**

In [None]:
from validmind.tests.data_validation.BivariateFeaturesBarPlots import BivariateFeaturesBarPlots

# Pass target column to validmind dataset
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Configure the metric
features_pairs = {'home_ownership': 'grade', 
                  'purpose': 'grade',
                  'grade': 'verification_status'}

params = {
    "features_pairs": features_pairs,
}

metric = BivariateFeaturesBarPlots(test_context, params=params)
metric.run()

: 

### Scatter Plots by Default Status

**Run Test**

In [None]:
from validmind.tests.data_validation.BivariateScatterPlots import BivariateScatterPlots

# Each key is paired with its value. The original literal listed the
# 'int_rate': 'annual_inc' pair twice; a dict keeps only one entry per key,
# so the duplicate had no effect and is removed.
features_pairs = {'int_rate': 'annual_inc', 
                  'funded_amnt_inv': 'dti', 
                  'annual_inc': 'funded_amnt_inv',
                  'loan_amnt': 'int_rate',
                  'earliest_cr_line': 'int_rate'}

params = {
    "features_pairs": features_pairs,
    "target_filter": None
}

metric = BivariateScatterPlots(test_context, params=params)
metric.run()

: 

### Bivariate Histograms

**Run Test**

In [None]:
from validmind.tests.data_validation.BivariateHistograms import BivariateHistograms

# Each key is paired with its value. The original literal listed the
# 'int_rate': 'annual_inc' pair twice; a dict keeps only one entry per key,
# so the duplicate had no effect and is removed.
features_pairs = {'int_rate': 'annual_inc', 
                  'funded_amnt_inv': 'dti', 
                  'annual_inc': 'funded_amnt_inv',
                  'loan_amnt': 'int_rate',
                  'earliest_cr_line': 'int_rate'}

params = {
    "features_pairs": features_pairs,
    "target_filter": None
}

metric = BivariateHistograms(test_context, params=params)
metric.run()

: 

**Run Test**

In [None]:
from validmind.tests.data_validation.PearsonCorrelationMatrix import PearsonCorrelationMatrix

metric = PearsonCorrelationMatrix(test_context)
metric.run()
metric.result.show()

: 

## Step 10: Feature Engineering 

### Add Dummy Categorical Variables

In [None]:
def add_dummy_variables(df, columns_list):
    """
    Generate dummy variables for specified columns in the DataFrame,
    concatenate them with the original DataFrame.

    The original columns are kept. Each dummy column is named
    "<column>:_<value>" (prefix "<column>:" joined by pandas' default "_"
    separator), and no category is dropped.

    Parameters:
    df (pandas.DataFrame): DataFrame to be processed. It is not modified.
    columns_list (list): List of column names to be processed.

    Returns:
    pandas.DataFrame: `df` followed by the dummy columns of each listed column,
    in the order given. When `columns_list` is empty, `df` itself is returned.
    """
    dummy_frames = [
        pd.get_dummies(df[column], prefix=column + ":", drop_first=False)
        for column in columns_list
    ]
    if not dummy_frames:
        return df

    # Concatenate once: calling pd.concat inside the loop would re-copy the
    # growing frame for every encoded column.
    return pd.concat([df, *dummy_frames], axis=1)


: 

In [None]:
# df_train = add_dummy_variables(df_train, ['grade', 'home_ownership', 'verification_status', 'purpose'])
# df_test = add_dummy_variables(df_test, ['grade', 'home_ownership', 'verification_status', 'purpose'])

: 

In [None]:
# Adjust the X_test DataFrame to match the column structure of the X_train DataFrame
# df_test = df_test.reindex(labels=df_train.columns, axis=1, fill_value=0)

: 

### Weight of Evidence (WoE) Binning

From a modelling perspective, the **WoE** allows us to transform raw variables into a format which provides a more robust base for statistical analysis. Specifically, the WoE measures the predictive power of an individual class of a categorical variable, distinguishing between 'good' (non-defaulters) and 'bad' (defaulters) risks. This is accomplished by comparing the distribution of 'good' and 'bad' risks within a specific category to the overall 'good'/'bad' distribution. If the 'good'/'bad' ratio of a particular category is significantly divergent from the overall ratio, it suggests that category is a strong predictor of credit risk.

**Information Value (IV)**, on the other hand, is a fundamental metric we use to quantify the predictive power of each input variable in our scorecards. The IV of a variable is calculated by summing, over all of its categories, the difference between the share of 'good' and the share of 'bad' observations that fall in that category, multiplied by the WoE of that category: $IV = \sum_i (\%\text{good}_i - \%\text{bad}_i) \cdot WoE_i$, where $WoE_i = \ln(\%\text{good}_i / \%\text{bad}_i)$. In other words, IV measures the total amount of 'information' or predictive power a variable brings to the model. For example, variables with an IV between 0.1 and 0.3 provide a weak predictive power, those between 0.3 and 0.5 a medium predictive power, and those with an IV greater than 0.5 have strong predictive power. Therefore, we utilize the IV to prioritize variables for inclusion in the model and to ensure the model's stability and accuracy.

#### WoE and IV for Categorical Variables

: 

In [None]:
#categorical_woe_iv_df = calculate_woe_iv(df_train, target_column, categorical_features)
#display(categorical_woe_iv_df)

: 

**Update VM Dataset and Run Test**

In [None]:
from validmind.tests.data_validation.WOEIVPlots import WoEandIVPlots

# Update vm dataset and test context
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Configure test parameters
params = {
    "features": categorical_features,
    "label_rotation": 90
}

# Run test
metric = WoEandIVPlots(test_context, params=params)
metric.run()
metric.result.show()

: 

## Step 11: Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# First, we define the preprocessing steps
numeric_features = ['pub_rec', 'revol_util', 'funded_amnt_inv', 'int_rate', 'dti', 'annual_inc', 'loan_amnt', 'earliest_cr_line']
categorical_features = ['term', 'grade', 'purpose', 'annual_inc_range', 'loan_amnt_range']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000))])



# Train the model
clf.fit(X_train, y_train)

# We can now evaluate on the test set
print("model score: %.3f" % clf.score(X_test, y_test))


: 

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import pandas as pd

# First, we define the preprocessing steps
numeric_features = ['pub_rec', 'revol_util', 'funded_amnt_inv', 'int_rate', 'dti', 'annual_inc', 'loan_amnt', 'earliest_cr_line']
categorical_features = ['term', 'grade', 'purpose', 'annual_inc_range', 'loan_amnt_range', 'installment']  # Added 'installment'

# Handle categorical features
df_encoded = pd.get_dummies(df_multivariate, columns=categorical_features)

# Split the data
X = df_encoded.drop('loan_status', axis=1)
y = df_encoded['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Add a constant to the independent values
X_train = sm.add_constant(X_train)

# Define the (unfitted) model
glm_model = sm.GLM(y_train, X_train, family=sm.families.Binomial())

# Fit the model. `glm_model_fit` now names the FITTED results, as its name
# says and as the other `vm.init_model` calls in this notebook pass; it
# previously named the unfitted GLM object.
glm_model_fit = glm_model.fit()

# `results` is kept as an alias for code that refers to the fit by that name
results = glm_model_fit

# Print the summary
print(results.summary())

# Evaluate on the test set
X_test = sm.add_constant(X_test)  # Adding a constant to the test data
y_pred = results.predict(X_test)

# You can then further analyze y_pred to measure model performance on the test set.

: 

Scale variable X. 

In [None]:
import statsmodels.api as sm
from sklearn.preprocessing import scale

# Scale your variables
X_scaled = scale(X)

# Add a constant to the independent values
X_scaled = sm.add_constant(X_scaled)

# Define the model
model = sm.GLM(y, X_scaled, family=sm.families.Binomial())

# Fit the model
results = model.fit()

# Print the summary
print(results.summary())


: 

#### ValidMind Models 

In [None]:
# Initialize training and testing datasets for model A
vm_train_ds = vm.init_dataset(dataset=X_train, type="generic", target_column='loan_status')
vm_test_ds = vm.init_dataset(dataset=X_test, type="generic", target_column='loan_status')

# Initialize model A
vm_model_A = vm.init_model(
    model = glm_model_fit, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)

: 