# Credit Risk Scorecard Demo

## Introduction

#### Use Case

The **Credit Risk Scorecard** model built from the Lending Club dataset is used to compute the Probability of Default (PD), a key input to Expected Credit Loss (ECL) calculations. The scorecard assesses several credit characteristics of potential borrowers, such as their credit history, income, and outstanding debts, each of which is assigned a specific score. By combining these scores, we derive a total score for each borrower, which translates into an estimated Point-in-Time (PiT) PD. The PiT PD reflects the borrower's likelihood of default at a specific point in time, accounting for both current and foreseeable future conditions.

Additionally, for a holistic view of credit risk, it's essential to estimate the Lifetime PD. The Lifetime PD, as the name suggests, predicts the borrower's likelihood of default throughout the life of the exposure, taking into account potential future changes in the economic and financial conditions.

#### Import Libraries

In [1]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env

# Standard library imports
import re
import pickle
from datetime import datetime
from typing import List

# Data handling and analysis imports
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Visualization imports
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# File handling import
import zipfile


#### Connect to ValidMind Project

In [2]:

import os

import validmind as vm

# Credentials must never be committed with the notebook. They are read from the
# environment, which the `%dotenv .env` magic in the first cell populates.
# The `.env` file is expected to define VM_API_KEY and VM_API_SECRET; a missing
# variable raises KeyError naming the variable.
# NOTE(review): the key/secret previously hard-coded here should be rotated.
vm.init(
    api_host="http://localhost:3000/api/v1/tracking",
    api_key=os.environ["VM_API_KEY"],
    api_secret=os.environ["VM_API_SECRET"],
    project="clk00h0u800x9qjy67gduf5om",
)
  
  

2023-07-27 15:06:45,509 - INFO(validmind.api_client): Connected to ValidMind. Project: [6] Credit Risk Scorecard - Initial Validation (clk00h0u800x9qjy67gduf5om)


#### Processing Functions

In [3]:
def save_model_and_df(model, df, base_filename):
    """Pickle ``(model, df)`` as one tuple into a timestamped file.

    The file is named ``{base_filename}_{YYYYmmdd_HHMMSS}.pkl`` using the
    current local time, and the name is printed once the file is written.
    """
    timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'{base_filename}_{timestamp_str}.pkl'

    # Model and dataframe are stored together so they stay paired on reload
    with open(filename, 'wb') as file:
        pickle.dump((model, df), file)

    print(f"Model and dataframe saved as {filename}")


In [4]:
def get_numerical_columns(df):
    """Return the names of the columns selected by the int/float/uint dtypes."""
    numeric_dtypes = ["int", "float", "uint"]
    return df.select_dtypes(include=numeric_dtypes).columns.tolist()

def get_categorical_columns(df):
    """Return the names of the columns with ``object`` or ``category`` dtype."""
    categorical_dtypes = ["object", "category"]
    return df.select_dtypes(include=categorical_dtypes).columns.tolist()

In [5]:
def add_target_column(df, target_column):
    """Return a copy of ``df`` with a binary target derived from 'loan_status'.

    'Fully Paid' maps to 0 and 'Charged Off' to 1. Rows with any other status
    (or a missing status) are dropped. The input frame is left untouched; the
    result is an independent copy, so later in-place edits on it do not raise
    SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'loan_status' column.
    target_column : str
        Name of the integer target column to create.
    """
    status_to_target = {"Fully Paid": 0, "Charged Off": 1}
    target = df['loan_status'].map(status_to_target)

    # Keep only the rows whose status has a defined outcome
    has_outcome = target.notna()
    result = df.loc[has_outcome].copy()
    result[target_column] = target[has_outcome].astype(int)
    return result

In [6]:
# Raw columns removed from the dataset before any modelling step
unused_variables = [
    "id", "member_id", "funded_amnt", "emp_title", "url", "desc",
    "application_type", "title", "zip_code", "delinq_2yrs",
    "mths_since_last_delinq", "mths_since_last_record", "revol_bal",
    "total_rec_prncp", "total_rec_late_fee", "recoveries", "out_prncp_inv",
    "out_prncp", "collection_recovery_fee", "next_pymnt_d",
    "initial_list_status", "pub_rec", "collections_12_mths_ex_med",
    "policy_code", "acc_now_delinq", "pymnt_plan", "tot_coll_amt",
    "tot_cur_bal", "total_rev_hi_lim", "last_pymnt_d", "last_credit_pull_d",
    "earliest_cr_line", "issue_d",
]

In [7]:
def variables_with_min_missing(df, min_missing_percentage):
    """List the columns whose share of missing values exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
    min_missing_percentage : float
        Threshold on a 0-100 scale. Columns with strictly more missing values
        than this are returned. Columns that are entirely missing are always
        returned, even when the threshold is 100 or more.

    Returns
    -------
    list
        Column names in the order they appear in ``df``, without duplicates.
        The order is deterministic, unlike a ``set``-based de-duplication.
    """
    is_missing = df.isnull()
    missing_percentages = is_missing.mean() * 100

    should_drop = (missing_percentages > min_missing_percentage) | is_missing.all()
    selected = missing_percentages.index[should_drop.to_numpy()]

    # dict.fromkeys de-duplicates while keeping first-seen order
    return list(dict.fromkeys(selected))

In [8]:
def clean_term_column(df, column):
    """
    Strip the ' months' suffix from a term column, modifying ``df`` in place.

    The cleaned column is stored with object dtype. Raises ValueError when
    ``column`` is not present in the dataframe. Returns None.
    """
    if column not in df.columns:
        raise ValueError(f"The column '{column}' does not exist in the dataframe.")

    without_suffix = df[column].str.replace(' months', '')

    # Kept as object dtype so the term is handled as a categorical label
    df[column] = without_suffix.astype('object')

def clean_emp_length_column(df, column):
    """
    Reduce an employment-length column to digit strings, modifying ``df`` in place.

    '< 1 year' becomes '0'; any other value keeps only its digits (for example
    '10+ years' becomes '10'). 'n/a' and missing values become '0', which is
    what the original ``fillna(0)`` intended. That fill never took effect,
    because missing values had already been turned into empty strings. The
    result is stored with object dtype, with every value a string.

    Raises ValueError when ``column`` is not present in the dataframe.
    """
    # Ensure the column exists in the dataframe
    if column not in df.columns:
        raise ValueError(f"The column '{column}' does not exist in the dataframe.")

    cleaned = df[column].replace('n/a', np.nan)
    cleaned = cleaned.str.replace('< 1 year', '0', regex=False)
    # Raw-string pattern: strip every non-digit character, leaving missing values as NaN
    cleaned = cleaned.str.replace(r'\D', '', regex=True)
    cleaned = cleaned.fillna('0')

    # Kept as object dtype so the length is handled as a categorical label
    df[column] = cleaned.astype('object')

def clean_inq_last_6mths(df, column):
    """
    Cast the given column to pandas ``category`` dtype, modifying ``df`` in place.

    Raises ValueError when ``column`` is not present in the dataframe.
    """
    if column in df.columns:
        df[column] = df[column].astype('category')
        return

    raise ValueError(f"The column '{column}' does not exist in the dataframe.")

In [9]:
def compute_outliers(series, threshold=1.5):
    """Return the entries of ``series`` lying outside the IQR fences.

    The fences are ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``.
    Values exactly on a fence are not treated as outliers.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    whisker = threshold * (third_quartile - first_quartile)

    too_low = series < first_quartile - whisker
    too_high = series > third_quartile + whisker
    return series[too_low | too_high]

def remove_iqr_outliers(df, target_column, threshold=1.5):
    """Drop rows holding an IQR outlier in any numeric feature column.

    Columns are processed one after another, so the fences for each column
    are computed on the rows that survived the previous columns. The target
    column is excluded; ``list.remove`` raises ValueError if it is not among
    the numeric columns.
    """
    numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_features.remove(target_column)

    for feature in numeric_features:
        flagged = compute_outliers(df[feature], threshold)
        is_regular = ~df[feature].isin(flagged)
        df = df[is_regular]
    return df

In [10]:
import pandas as pd
import numpy as np

def class_binning(df, bin_mappings):
    """Replace numeric features with interval buckets.

    For each ``feature -> inner edges`` entry, the feature is coerced to
    numeric (non-numeric and missing values become -1), cut into left-closed
    intervals delimited by ``-inf``, the given edges and ``+inf``, and stored
    as ``'{feature}_bucket'``. The original feature column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Left unmodified.
    bin_mappings : dict
        Maps a column name to an increasing sequence of inner bin edges.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the bucket columns appended.
    """
    df_new = df.copy()

    for feature, bins in bin_mappings.items():
        # Plain assignment rather than a chained `fillna(..., inplace=True)`,
        # which does not reliably write back under pandas copy-on-write.
        numeric = pd.to_numeric(df_new[feature], errors='coerce').fillna(-1)

        # list(...) lets callers pass any sequence of edges, not only a list
        edges = [-np.inf] + list(bins) + [np.inf]
        df_new[f'{feature}_bucket'] = pd.cut(
            numeric, bins=edges, right=False, include_lowest=True
        )
        df_new = df_new.drop(columns=feature)

    return df_new


def class_coarsing(df, coarse_mappings):
    """Collapse groups of categories into coarse classes.

    For each ``feature -> [group_0, group_1, ...]`` entry, every value that
    belongs to ``group_i`` is relabelled ``'{feature}_group_{i}'`` in a new
    ``'{feature}_coarse'`` column. Values in no group keep their original
    value. When a value appears in several groups, the last group wins. The
    original feature column is removed.

    The coarse column is built with object dtype, so the string labels can be
    written regardless of the source dtype. Assigning them directly into a
    ``category`` or numeric column fails or triggers dtype warnings.

    Parameters
    ----------
    df : pandas.DataFrame
        Left unmodified.
    coarse_mappings : dict
        Maps a column name to a list of lists of category values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the coarse columns appended.
    """
    df_new = df.copy()

    for feature, coarse_bins in coarse_mappings.items():
        original = df_new[feature]
        coarse = original.astype(object)  # start with original categories

        for i, coarse_bin in enumerate(coarse_bins):
            coarse[original.isin(coarse_bin)] = f'{feature}_group_{i}'

        df_new[f'{feature}_coarse'] = coarse
        df_new = df_new.drop(columns=feature)

    return df_new

In [11]:
def find_categorical_features(df):
    """Return the names of the columns whose dtype is ``category``."""
    category_columns = df.select_dtypes(include='category').columns
    return category_columns.tolist()


def convert_categorical_to_object(df):
    """Cast every ``category``-dtype column of ``df`` to strings, in place.

    Values are converted with ``astype(str)``. Returns None.
    """
    category_columns = find_categorical_features(df)
    as_text = df[category_columns].astype(str)
    df[category_columns] = as_text

In [12]:
def check_categories(woe_df, original_df):
    """Print every category mismatch between a WoE table and a dataframe.

    For each feature listed in ``woe_df['Feature']``, the function first
    reports the WoE categories that are absent from ``original_df[feature]``,
    then the dataframe categories that are absent from the WoE table.
    Nothing is returned.
    """
    for feature in woe_df['Feature'].unique():
        in_woe = woe_df.loc[woe_df['Feature'] == feature, 'Category'].unique()
        in_data = original_df[feature].unique()

        # WoE-table categories with no counterpart in the data
        absent_from_data = [c for c in in_woe if c not in in_data]
        for category in absent_from_data:
            print(f"Category '{category}' not found in feature '{feature}' in original DataFrame.")

        # Data categories with no counterpart in the WoE table
        absent_from_woe = [c for c in in_data if c not in in_woe]
        for category in absent_from_woe:
            print(f"Category '{category}' in feature '{feature}' not found in WoE table.")

In [13]:
def woe_encoder(woe_df, original_df, target):
    """Replace each feature's categories with their Weight-of-Evidence value.

    Categories are matched on their string form on both sides, so a WoE table
    holding non-string categories (ints, intervals, ...) is matched correctly.

    Parameters
    ----------
    woe_df : pandas.DataFrame
        Needs 'Feature', 'Category' and 'WoE' columns.
    original_df : pandas.DataFrame
        Data to encode. Features listed in ``woe_df`` but absent here are
        skipped with a printed message.
    target : str
        Target column, copied unchanged into the result.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns and index of ``original_df``. Encoded
        features are float; columns not in the WoE table stay empty. Returns
        None (after printing a message) if ``target`` is not a column of
        ``original_df``.

    Raises
    ------
    ValueError
        If a non-missing value of an encoded feature has no entry in the WoE
        table. Previously such values were cast to float, which failed for
        text and silently used numeric-looking labels as their own WoE.
    """
    # Create a new DataFrame with the same columns as original_df
    woe_encoded_df = pd.DataFrame(columns=original_df.columns, index=original_df.index)

    for feature in woe_df['Feature'].unique():
        if feature not in original_df.columns:
            print(f"Feature {feature} not found in original DataFrame. Skipping...")
            continue

        feature_woe = woe_df[woe_df['Feature'] == feature]

        # Key the lookup by the string form, the same form used for the data
        woe_category_text = feature_woe['Category'].astype(str)
        woe_lookup = dict(zip(woe_category_text, feature_woe['WoE']))
        feature_as_text = original_df[feature].astype(str)

        original_categories = feature_as_text.unique()
        woe_categories = woe_category_text.unique()
        known_in_woe = set(woe_categories)
        known_in_original = set(original_categories)

        # Two-way check:
        # 1. Categories present in the data but missing from the WoE table
        missing_from_woe = [category for category in original_categories if category not in known_in_woe]
        if missing_from_woe:
            print(f"Categories {missing_from_woe} from original DataFrame not found in WoE DataFrame for feature {feature}.")

        # 2. Categories present in the WoE table but missing from the data
        missing_from_original = [category for category in woe_categories if category not in known_in_original]
        if missing_from_original:
            print(f"Categories {missing_from_original} from WoE DataFrame not found in original DataFrame for feature {feature}.")

        # Missing values stay NaN; any other unmapped value is an error
        unmapped = ~feature_as_text.isin(list(woe_lookup)) & original_df[feature].notna()
        if unmapped.any():
            unmapped_categories = feature_as_text[unmapped].unique().tolist()
            raise ValueError(
                f"Categories {unmapped_categories} of feature {feature} have no WoE value."
            )

        woe_encoded_df[feature] = feature_as_text.map(woe_lookup).astype(float)

    # Check that the target exists in the original DataFrame
    if target not in original_df.columns:
        print(f"Target {target} not found in original DataFrame. Returning None...")
        return None

    # Add the target column to the new DataFrame
    woe_encoded_df[target] = original_df[target]

    return woe_encoded_df

## Data Description

#### Import Raw Data

In [14]:
# Load the raw Lending Club loans (2007-2014) into memory.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the dataset before running the notebook elsewhere.
filepath = '/Users/juanvalidmind/Dev/datasets/lending club/data_2007_2014/loan_data_2007_2014.csv'
df_raw = pd.read_csv(filepath)

  df_raw = pd.read_csv(filepath)


#### Validate Raw Data

In [15]:
from validmind.vm_models.test_context import TestContext
from validmind.tests.data_validation.DescriptiveStatistics import DescriptiveStatistics

# Wrap the raw dataframe in a ValidMind dataset and build a test context on it;
# `test_context_raw` is reused by the next raw-data test cell.
vm_df_raw = vm.init_dataset(dataset=df_raw)
test_context_raw = TestContext(dataset=vm_df_raw)

# Run the test, log its result (the log call is awaited) and display it
metric = DescriptiveStatistics(test_context_raw)
metric.run()
await metric.result.log()
metric.result.show()

2023-07-27 15:06:48,329 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-07-27 15:06:48,329 - INFO(validmind.vm_models.dataset): Inferring dataset types...


VBox(children=(HTML(value='<p>This section provides descriptive statistics for numerical and categorical varia…

In [16]:
from validmind.tests.data_validation.MissingValuesBarPlot import MissingValuesBarPlot

# Test parameters.
# NOTE(review): `threshold` is presumably a missing-value percentage and
# `fig_height` a size in pixels — confirm against the test's documentation.
params = {"threshold": 70,
          "fig_height": 1100}

# Run on the raw-data test context, log the result (awaited) and display it
metric = MissingValuesBarPlot(test_context_raw, params)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Generates a visual analysis of missing values by plotting horizontal bar plots w…

## Data Preparation

#### Process Raw Data

In [17]:
# Definition of default
# Definition of default
target_column = 'default'
df_prep_1 = add_target_column(df_raw, target_column)

# Drop 'loan_status' (now encoded in the target) together with the unused
# variables. Non-inplace calls are used throughout: in-place edits on a frame
# derived from another one raise SettingWithCopyWarning.
df_prep_1 = df_prep_1.drop(columns=['loan_status'] + unused_variables)

# Remove columns with too many missing values.
# NOTE: despite its name, `min_missing_count` is a percentage (0-100); columns
# with more than this share of missing values are dropped.
min_missing_count = 80
variables_to_drop = variables_with_min_missing(df_prep_1, min_missing_count)
df_prep_1 = df_prep_1.drop(columns=variables_to_drop)

# Remove rows with a missing employment length or revolving utilisation. The
# explicit copy makes df_prep_1 an independent frame, so the cleaning helpers
# below can assign into it safely.
df_prep_1 = df_prep_1.dropna(subset=["emp_length", "revol_util"]).copy()

# Format variable types (these helpers modify df_prep_1 in place)
clean_emp_length_column(df_prep_1, 'emp_length')
clean_term_column(df_prep_1, 'term')
clean_inq_last_6mths(df_prep_1, 'inq_last_6mths')

# Remove outliers
df_prep_1 = remove_iqr_outliers(df_prep_1, target_column, threshold=1.5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### Validate Prepared Data

In [18]:
from validmind.tests.data_validation.ClassImbalance import ClassImbalance

# Register the prepared data, with its target column, as a ValidMind dataset;
# `test_context_prep_1` is reused by the outlier test cells that follow.
vm_df_prep_1 = vm.init_dataset(dataset=df_prep_1,
                        target_column=target_column)
test_context_prep_1 = TestContext(dataset=vm_df_prep_1)

# Run the test, log its result (awaited) and display it
metric = ClassImbalance(test_context_prep_1)
metric.run()
await metric.result.log()
metric.result.show()

2023-07-27 15:06:59,701 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-07-27 15:06:59,701 - INFO(validmind.vm_models.dataset): Inferring dataset types...


VBox(children=(HTML(value='\n            <h2>Class Imbalance ❌</h2>\n            <p>The class imbalance test m…

In [19]:
from validmind.tests.data_validation.IQROutliersTable import IQROutliersTable

# Restrict the test to the numerical columns of the prepared data. The 1.5
# threshold is the same value passed to remove_iqr_outliers during preparation.
num_features = get_numerical_columns(df_prep_1)
params = {"num_features": num_features,
          "threshold": 1.5
        }

# Run the test, log its result (awaited) and display it
metric = IQROutliersTable(test_context_prep_1, params)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Analyzes the distribution of outliers in numerical features using the Interquart…

In [20]:
from validmind.tests.data_validation.IQROutliersBarPlot import IQROutliersBarPlot

# Same numerical columns and 1.5 threshold as the outlier table, plus a
# figure width for the plot.
num_features = get_numerical_columns(df_prep_1)
params = {"num_features": num_features,
          "threshold": 1.5,
          "fig_width": 500}

# Run the test, log its result (awaited) and display it
metric = IQROutliersBarPlot(test_context_prep_1, params)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Generates a visual analysis of the outliers for numeric variables based on perce…

## Data Sampling

#### Sampling Method

We employ stratified sampling to create our training and testing sets. Setting the `stratify = y` parameter ensures that the distribution of the target variable (`y`) in both the training and test sets matches its distribution in the original dataset.

This keeps the representation of the target classes consistent across the splits, which matters most when the classes are imbalanced, as is often the case in credit risk scorecards.

#### Data Split

In [21]:
# Separate the target from the features
y = df_prep_1[target_column]
X = df_prep_1.drop(columns=target_column)

# Stratified 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-attach the target so each split is a single dataframe
df_train_1 = pd.concat([X_train, y_train], axis=1)
df_test_1 = pd.concat([X_test, y_test], axis=1)

## Exploratory Data Analysis 

#### Validate Train Data 1

In [22]:
from validmind.tests.data_validation.TabularNumericalHistograms import TabularNumericalHistograms

# Register the training split as a ValidMind dataset; `test_context_train_1`
# is reused by the following exploratory test cells.
vm_df = vm.init_dataset(dataset=df_train_1,
                                target_column=target_column)
test_context_train_1 = TestContext(dataset=vm_df)

# Run the test, log its result (awaited) and display it
metric = TabularNumericalHistograms(test_context_train_1)
metric.run()
await metric.result.log()
metric.result.show()

2023-07-27 15:07:22,116 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-07-27 15:07:22,116 - INFO(validmind.vm_models.dataset): Inferring dataset types...


VBox(children=(HTML(value='<p>Generates a visual analysis of numerical data by plotting the histogram. The inp…

In [23]:
from validmind.tests.data_validation.HighCardinality import HighCardinality
# Run the cardinality test on the training context with its default
# parameters, log the result (awaited) and display it
metric = HighCardinality(test_context_train_1)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='\n            <h2>Cardinality ✅</h2>\n            <p>The high cardinality test meas…

In [24]:
from validmind.tests.data_validation.TabularCategoricalBarPlots import TabularCategoricalBarPlots
# Run the categorical bar-plot test on the training context with its default
# parameters, log the result (awaited) and display it
metric = TabularCategoricalBarPlots(test_context_train_1)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Generates a visual analysis of categorical data by plotting bar plots. The input…

#### Process Data 

In [25]:
def _restrict_population(df):
    """Apply the population filters shared by the train and test sets."""
    # Select rows where purpose is 'debt_consolidation' or 'credit_card'
    df = df[df['purpose'].isin(['debt_consolidation', 'credit_card'])]

    # Remove rows where grade is 'F' or 'G'
    df = df[~df['grade'].isin(['F', 'G'])]

    # Remove rows where sub_grade starts with 'F' or 'G'
    df = df[~df['sub_grade'].str.startswith(('F', 'G'))]

    # Remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY'
    df = df[~df['home_ownership'].isin(['OTHER', 'NONE', 'ANY'])]
    return df


# The same filters are applied to both splits
df_train_1 = _restrict_population(df_train_1)
df_test_1 = _restrict_population(df_test_1)

# Update train and test
df_train_2 = df_train_1.copy()
df_test_2 = df_test_1.copy()

In [26]:
from validmind.tests.data_validation.TargetRateBarPlots import TargetRateBarPlots

# Register the filtered training data as a ValidMind dataset
vm_df = vm.init_dataset(dataset=df_train_2, target_column=target_column)
test_context_train_2 = TestContext(dataset=vm_df)

# Configure the metric: the default flag is the target column.
# NOTE(review): `columns=None` presumably lets the test choose the columns
# itself — confirm against the test's documentation.
params = {
    "default_column": target_column,
    "columns": None
}

# Run the test, log its result (awaited) and display it
metric = TargetRateBarPlots(test_context_train_2, params=params)
metric.run()
await metric.result.log()
metric.result.show()

2023-07-27 15:08:20,448 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-07-27 15:08:20,449 - INFO(validmind.vm_models.dataset): Inferring dataset types...


The column default is correct and contains only 1 and 0.


VBox(children=(HTML(value='<p>Generates a visual analysis of target ratios by plotting bar plots. The input da…

#### Process Data 

In [27]:
# Features excluded from the remaining analysis, split by type
drop_categorical_features = ['addr_state']
drop_numerical_features = [
    'total_rec_int',
    'loan_amnt',
    'funded_amnt_inv',
    'dti',
    'revol_util',
    'total_pymnt',
    'total_pymnt_inv',
    'last_pymnt_amnt',
]

In [28]:
# Drop variables from next analysis
df_train_3 = df_train_2.drop(columns = drop_categorical_features + drop_numerical_features, axis=1)

# Update df_test 
df_test_3 = df_test_2.drop(columns = drop_categorical_features + drop_numerical_features, axis=1)

In [29]:
from validmind.tests.data_validation.ChiSquaredFeaturesTable import ChiSquaredFeaturesTable

# Register the reduced training data; `test_context_train_3` is reused by the
# feature-selection test cells that follow.
vm_df = vm.init_dataset(dataset=df_train_3, target_column=target_column)
test_context_train_3 = TestContext(dataset=vm_df)

# Test the categorical (object/category) columns at a 0.05 p-value threshold
cat_features = get_categorical_columns(df_train_3)
params = {"cat_features": cat_features,
          "p_threshold": 0.05}

# Run the test, log its result (awaited) and display it
metric = ChiSquaredFeaturesTable(test_context_train_3, params)
metric.run()
await metric.result.log() 
metric.result.show()

2023-07-27 15:08:31,782 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-07-27 15:08:31,782 - INFO(validmind.vm_models.dataset): Inferring dataset types...


VBox(children=(HTML(value='<p>Perform a Chi-Squared test of independence for each categorical variable with th…

In [30]:
from validmind.tests.data_validation.ANOVAOneWayTable import ANOVAOneWayTable

# Test the numerical columns at a 0.05 p-value threshold
num_features = get_numerical_columns(df_train_3)
params = {"num_features": num_features,
          "p_threshold": 0.05}

# Run the test, log its result (awaited) and display it
metric = ANOVAOneWayTable(test_context_train_3, params)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Perform an ANOVA F-test for each numerical variable with the target. The input d…

In [31]:
from validmind.tests.data_validation.PearsonCorrelationMatrix import PearsonCorrelationMatrix

# Plot options for the correlation matrix.
# NOTE(review): `features=None` presumably selects all eligible columns —
# confirm against the test's documentation.
params = {"declutter": False,
          "features": None,
          "fontsize": 13}

# Run the test, log its result (awaited) and display it
metric = PearsonCorrelationMatrix(test_context_train_3, params)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Extracts the Pearson correlation coefficient for all pairs of numerical variable…

In [32]:
from validmind.tests.data_validation.FeatureTargetCorrelationPlot import FeatureTargetCorrelationPlot

# NOTE(review): `features=None` presumably selects all eligible columns —
# confirm against the test's documentation.
params = {"features": None}

# Run the test, log its result (awaited) and display it
metric = FeatureTargetCorrelationPlot(test_context_train_3, params)
metric.run()
await metric.result.log()
metric.result.show()

VBox(children=(HTML(value='<p>Generates a visual analysis of correlations between features and target by plott…

## Feature Engineering

#### Process Train Data I: Class Binning

In [33]:
import scorecardpy as sc
import pandas as pd

def binning_data(df, y):
    '''
    Run scorecardpy's automatic WoE binning and flatten the result.

    df: A pandas dataframe. Its columns that are neither int64 nor float64
        are converted to strings in place, so the caller's frame is modified.
    y: The target variable in quotes, e.g. 'target'

    Returns a single dataframe with one row per bin, identified by the
    'variable' and 'bin_number' columns. If the binning call raises, the
    error is printed and None is returned.
    '''
    # scorecardpy is given non-numeric columns as plain strings
    text_like_cols = df.select_dtypes(exclude=['int64', 'float64']).columns
    df[text_like_cols] = df[text_like_cols].astype(str)

    try:
        woe_bins = sc.woebin(df, y)
    except Exception as e:
        print("Error during binning: ")
        print(e)
        return None

    # Stack the per-variable tables; the concat keys become index levels 0/1.
    # The original 'variable' column duplicates level 0, so it is dropped
    # before the levels are renamed.
    bins_df = (
        pd.concat(woe_bins.values(), keys=woe_bins.keys())
        .reset_index()
        .drop(columns=['variable'])
        .rename(columns={'level_0': 'variable', 'level_1': 'bin_number'})
    )
    return bins_df


bins = binning_data(df_train_3, y=target_column)

[INFO] creating woe binning ...


In [34]:
display(bins)

Unnamed: 0,variable,bin_number,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,emp_length,0,"1%,%2%,%4",25128,0.237471,20830,4298,0.171044,-0.063836,0.000948,0.001936,"1%,%2%,%4",False
1,emp_length,1,"6%,%3",15853,0.149818,13020,2833,0.178704,-0.010742,1.7e-05,0.001936,"6%,%3",False
2,emp_length,2,"10%,%5",39904,0.377111,32691,7213,0.180759,0.003194,4e-06,0.001936,"10%,%5",False
3,emp_length,3,"7%,%8%,%0%,%9",24930,0.2356,20197,4733,0.189852,0.063434,0.000967,0.001936,"7%,%8%,%0%,%9",False
4,annual_inc,0,"[-inf,44000.0)",28661,0.27086,22263,6398,0.22323,0.267468,0.021052,0.057567,44000.0,False
5,annual_inc,1,"[44000.0,66000.0)",38276,0.361726,31012,7264,0.189779,0.062965,0.001463,0.057567,66000.0,False
6,annual_inc,2,"[66000.0,96000.0)",26692,0.252252,22749,3943,0.147722,-0.238171,0.013238,0.057567,96000.0,False
7,annual_inc,3,"[96000.0,inf)",12186,0.115163,10714,1472,0.120794,-0.470521,0.021814,0.057567,inf,False
8,open_acc,0,"[-inf,7.0)",17356,0.164022,14509,2847,0.164035,-0.114095,0.002058,0.003491,7.0,False
9,open_acc,1,"[7.0,11.0)",41370,0.390965,34005,7365,0.178028,-0.01536,9.2e-05,0.003491,11.0,False


#### Validate Dataset

In [36]:
from validmind.tests.data_validation.WOEIVTable import WOEIVTable

# Update vm dataset and test context
vm_df = vm.init_dataset(dataset=df_train_3, 
                              target_column=target_column)
test_context_train_3 = TestContext(dataset=vm_df)

# Run test, log its result (awaited) and display it
metric = WOEIVTable(test_context_train_3)
metric.run()
await metric.result.log()
# Keep the computed WoE/IV values under the 'woe_iv' key for later cells
woe_iv_dic = metric.result.metric.value['woe_iv']
metric.result.show()

2023-07-27 15:09:49,437 - INFO(validmind.client): Pandas dataset detected. Initializing VM Dataset instance...
2023-07-27 15:09:49,438 - INFO(validmind.vm_models.dataset): Inferring dataset types...


[INFO] creating woe binning ...


VBox(children=(HTML(value='<p>Calculate the Weight of Evidence (WoE) and Information Value (IV) of features. T…

#### Process Train Data II: Class Coarsing

In [None]:
import pandas as pd
import numpy as np

# Initialize the original DataFrame
df = pd.DataFrame({
    'Feature': ['verification_status'] * 3 + ['total_acc_bucket'] * 6 + ['term'] * 2,
    'Category': ['Not Verified', 'Source Verified', 'Verified', '[0.0, 9.0)', '[9.0, 18.0)', '[18.0, 27.0)', '[27.0, 36.0)', '[36.0, 45.0)', '[45.0, inf)', '36', '60'],
    'WoE': [0.2663, -0.1386, -0.1939, -0.1030, -0.0516, -0.0150, 0.0524, 0.0898, 0.1414, 0.2131, -0.9409],
    'IV': [0.0273, 0.0056, 0.0120, 0.0005, 0.0007, 0.0001, 0.0006, 0.0008, 0.0006, 0.0364, 0.1608]
})

def merge_bins(df, woe_diff=0.1):
    """Merge neighbouring categories until WoE values are well separated.

    Repeatedly picks the feature whose two closest distinct WoE values are
    nearest to each other. Within that feature, the pair of WoE-adjacent
    categories with the smallest gap is merged. Both rows receive the joined
    label ``'<lower>, <upper>'`` and the mean WoE of the merged rows. The
    loop stops once every feature's smallest WoE gap is at least ``woe_diff``.

    Two defects of the earlier version are fixed. The mean WoE is now taken
    before the rows are relabelled; taking it afterwards averaged an empty
    selection and wrote NaN. Adjacent pairs are now formed from distinct
    categories, so an already merged label is never paired with itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Feature', 'Category' and 'WoE' columns. Left unmodified.
    woe_diff : float
        Minimum WoE gap required between the categories of a feature.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same rows and updated 'Category'/'WoE'.
    """
    df = df.copy()

    while True:
        # Smallest gap between distinct WoE levels, per feature
        smallest_gap = {}
        for feature, woe in df.groupby('Feature')['WoE']:
            levels = np.sort(woe.dropna().unique())
            smallest_gap[feature] = np.diff(levels).min() if len(levels) > 1 else np.inf

        if not smallest_gap:
            break

        feature = min(smallest_gap, key=smallest_gap.get)
        if smallest_gap[feature] >= woe_diff:
            break

        in_feature = df['Feature'] == feature

        # One row per distinct category, ordered by WoE
        bins = (
            df.loc[in_feature]
            .dropna(subset=['WoE'])
            .drop_duplicates(subset='Category')
            .sort_values(by='WoE')
        )
        categories = bins['Category'].tolist()
        woes = bins['WoE'].tolist()

        # Position of the closest pair of adjacent categories
        closest = min(range(len(categories) - 1), key=lambda k: abs(woes[k + 1] - woes[k]))
        pair = (categories[closest], categories[closest + 1])

        # Average first, relabel second: after relabelling the rows no
        # longer match `pair`
        to_merge = in_feature & df['Category'].isin(pair)
        df.loc[to_merge, 'WoE'] = df.loc[to_merge, 'WoE'].mean()
        df.loc[to_merge, 'Category'] = ', '.join(pair)

    return df

df_new = merge_bins(df)
df_new

In [None]:
def _merge_feature_bins(bins, woe_diff):
    """Merge the WoE-adjacent bins of a single feature until gaps reach ``woe_diff``."""
    additive_columns = ['All', 'Good', 'Bad', 'Distr_Good', 'Distr_Bad']
    bins = bins.sort_values(by='WoE').reset_index(drop=True)
    has_category = 'Category' in bins.columns
    if has_category:
        # Object dtype so joined labels can be stored whatever the source dtype
        bins['Category'] = bins['Category'].astype(object)

    while len(bins) > 1:
        # gaps[i] is the distance between bin i and bin i + 1
        gaps = np.abs(np.diff(bins['WoE'].to_numpy(dtype=float)))
        if np.isnan(gaps).all():
            break
        lower = int(np.nanargmin(gaps))
        if gaps[lower] >= woe_diff:
            break
        upper = lower + 1

        # Counts and distributions add up; WoE and IV are recomputed from them
        for column in additive_columns:
            bins.at[lower, column] = bins.at[lower, column] + bins.at[upper, column]
        distr_good = bins.at[lower, 'Distr_Good']
        distr_bad = bins.at[lower, 'Distr_Bad']
        bins.at[lower, 'WoE'] = np.log(distr_good / distr_bad)
        bins.at[lower, 'IV'] = (distr_good - distr_bad) * bins.at[lower, 'WoE']
        if has_category:
            bins.at[lower, 'Category'] = f"{bins.at[lower, 'Category']}, {bins.at[upper, 'Category']}"

        bins = bins.drop(index=upper).reset_index(drop=True)

    return bins


def merge_bins(df, woe_diff=0.1):
    """Merge bins of a WoE/IV table whose WoE values are closer than ``woe_diff``.

    Bins are merged only within the same feature and only with their
    neighbour in WoE order, closest pair first. A merged bin sums 'All',
    'Good', 'Bad', 'Distr_Good' and 'Distr_Bad', recomputes
    ``WoE = log(Distr_Good / Distr_Bad)`` and
    ``IV = (Distr_Good - Distr_Bad) * WoE``. If a 'Category' column exists,
    the merged bin is labelled with the two labels joined by ', '.

    The earlier version sorted all features together, so bins of different
    features could be merged. It also tested the gap between rows ``idx - 1``
    and ``idx`` while merging rows ``idx`` and ``idx + 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Feature', 'WoE', 'IV' and the five additive columns. Left
        unmodified.
    woe_diff : float
        Minimum WoE gap required between the bins of a feature.

    Returns
    -------
    pandas.DataFrame
        One row per remaining bin, grouped by feature (sorted by name) and
        ordered by WoE within each feature, with a fresh integer index.
    """
    if df.empty:
        return df.copy()

    merged = [
        _merge_feature_bins(feature_bins, woe_diff)
        for _, feature_bins in df.groupby('Feature', sort=True, dropna=False)
    ]
    return pd.concat(merged, ignore_index=True)


woe_iv_df = pd.DataFrame(woe_iv_dic)
woe_iv_df_coarsed = merge_bins(woe_iv_df)
woe_iv_df_coarsed

In [None]:
woe_iv_df

In [None]:
# Coarse classing plan: for each feature, every inner list is one group of
# categories that class_coarsing collapses into a single
# '<feature>_group_<i>' label; categories not listed keep their own value.
# NOTE(review): the '*_bucket' features are expected to exist in df_train_2
# and df_test_2, but no cell visible in this section creates them
# (class_binning is defined but never called) — confirm before running.
coarse_mappings = {
    'sub_grade': [['B2','B3','B4','B5','C3','D1'], 
                  ['C1','C2','C4','C5'], 
                  ['D3','D4','D5','E3','G4'], 
                  ['E1','E2','E4','E5','F1','F2','F3','F4','G1','G2','G3','G5','F5']],
    'grade': [['F','G']],
    'purpose': [['wedding','major_purchase'], 
                ['credit_card','car'], 
                ['debt_consolidation','other','vacation'], 
                ['medical','moving','house','educational'], 
                ['renewable_energy','small_business']],
    'home_ownership': [['MORTGAGE','OWN','RENT']],
    'annual_inc_bucket': [['[250, 1000)','[100, 150)','[150, 250)','[1000, 10000)'],
                           ['[50, 75)','[40, 50)'],
                           ['[10, 20)','[0, 10)']],
    'emp_length_bucket': [['[2, 3)','[40, 50)','[3, 5)','[1, 2)','[0, 1)','[5, 8)','[8, 10)']],
    'inq_last_6mths_bucket': [['[4, 5)','[1, 2)'],
                              ['[5, 10)','[3, 4)']],
    'installment_bucket': [['[300, 400)','[200, 300)','[0, 100)'],
                           ['[400, 500)', '[500, 750)']],
    'total_acc_bucket': [['[20, 25)','[30, 35)','[15, 20)','[45, 50)','[40, 45)','[35, 40)','[10, 15)','[5, 10)']],
    'open_acc_bucket': [['[5, 8)','[8, 10)','[10, 100)','[4, 5)'], ['[1, 2)','[2, 3)']]
}

# Rebuild df_train_3 from df_train_2 with the coarse classes applied
df_train_3 = class_coarsing(df_train_2, coarse_mappings)

# Update df_test
df_test_3 = class_coarsing(df_test_2, coarse_mappings)

#### Validate Train Data 3

In [None]:
from validmind.tests.data_validation.WOEIVPlots import WOEIVPlots

# Update vm dataset and test context
vm_df_train_3 = vm.init_dataset(dataset=df_train_3, 
                              target_column=target_column)
test_context_train_3 = TestContext(dataset=vm_df_train_3)

params = {
    "features": None,
    "fig_height": 500,
    "fig_height": 500,
}

# Run test
metric = WOEIVPlots(test_context_train_3, params=params)
metric.run()
await metric.result.log()
metric.result.show()

## Model Training

#### Process Train Data III: Feature Encoding with WoE

In [None]:
import statsmodels.api as sm

# Build the WoE / IV table from the coarse-classed training context
metric = WOEIVTable(test_context_train_3, params=params)
metric.run()
woe_iv_dic = metric.result.metric.value['woe_iv']
woe_iv_df = pd.DataFrame(woe_iv_dic)
check_categories(woe_iv_df, df_train_3)

# WoE-encode the training frame, then the test frame, using the table
# derived from the training data.
# NOTE(review): the target name is hard-coded as 'default' here while the rest
# of the cell uses `target_column` -- confirm the two always agree.
df_train_4 = woe_encoder(woe_iv_df, df_train_3, target='default')
df_test_4 = woe_encoder(woe_iv_df, df_test_3, target='default')

# Training set: split off the target, add a constant column for the
# intercept term, and re-assemble features + target.
y_train = df_train_4[target_column]
X_train = sm.add_constant(df_train_4.drop(columns=target_column))
df_train_4 = pd.concat([X_train, y_train], axis=1)

# Test set: identical treatment
y_test = df_test_4[target_column]
X_test = sm.add_constant(df_test_4.drop(columns=target_column))
df_test_4 = pd.concat([X_test, y_test], axis=1)

#### Fit Model 1

In [None]:
# Binomial-family GLM on the WoE-encoded training data (intercept column included)
model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Binomial())

# Estimate the coefficients
model_fit_glm = model.fit()

# Show the fit statistics
print(model_fit_glm.summary())

#### Process Train Data 4

In [None]:
# Features excluded before the model is refitted
features_to_drop = ['total_acc_bucket_coarse']

# `columns=` already targets the column axis, so the extra `axis=1` that was
# passed alongside it was redundant and has been removed.
df_train_5 = df_train_4.drop(columns=features_to_drop)

# Drop the same features from the test frame so both frames keep matching columns
df_test_5 = df_test_4.drop(columns=features_to_drop)

#### Fit Model 2 

In [None]:
# Target and feature matrix from the reduced training frame
y_train = df_train_5[target_column]
X_train = df_train_5.drop(columns=target_column)

# Same split for the reduced test frame
y_test = df_test_5[target_column]
X_test = df_test_5.drop(columns=target_column)

# Binomial-family GLM on the reduced feature set
model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Binomial())

# Estimate the coefficients
model_fit_glm = model.fit()

# Optional persistence of the fitted model and training frame for PD
# development; switched off unless `save_data` is set to True.
save_data = False
if save_data:
    save_model_and_df(
        model_fit_glm,
        df=df_train_5,
        base_filename='model_fit_glm_scorecard',
    )

# Show the fit statistics
print(model_fit_glm.summary())

## Model Evaluation

#### Validate Model Fit 2

In [None]:
# Wrap the final training and test frames as VM datasets
vm_ds_train_5 = vm.init_dataset(dataset=df_train_5, target_column=target_column)
vm_ds_test_5 = vm.init_dataset(dataset=df_test_5, target_column=target_column)

# Register the fitted GLM together with both datasets
vm_model_glm = vm.init_model(
    model=model_fit_glm,
    train_ds=vm_ds_train_5,
    test_ds=vm_ds_test_5,
)

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionCoeffsPlot import RegressionCoeffsPlot

test_context_model = TestContext(models = [vm_model_glm])

metric = RegressionCoeffsPlot(test_context_model)
metric.run()
await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionModelsCoeffs import RegressionModelsCoeffs

metric = RegressionModelsCoeffs(test_context_model)
metric.run()
await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.LogRegressionConfusionMatrix import LogRegressionConfusionMatrix

test_context_model = TestContext(model= vm_model_glm)

# Configure test parameters
params = {
    "cut_off_threshold": 0.5,
}

metric = LogRegressionConfusionMatrix(test_context_model, params)
metric.run()
await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionROCCurve import RegressionROCCurve

metric = RegressionROCCurve(test_context_model)
metric.run()
await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.GINITable import GINITable

metric = GINITable(test_context_model)
metric.run()
await metric.result.log() 
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.LogisticRegPredictionHistogram import LogisticRegPredictionHistogram

# Configure test parameters
params = {
    "title": "Histogram of Probability of Default",
}

metric = LogisticRegPredictionHistogram(test_context_model, params)
metric.run()
await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.LogisticRegCumulativeProb import LogisticRegCumulativeProb

# Configure test parameters
params = {
    "title": "Cumulative Probability of Default",
}

metric = LogisticRegCumulativeProb(test_context_model, params)
metric.run()
await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.ScorecardHistogram import ScorecardHistogram

# Configure test parameters
params = {
    "target_score": 600,
    "target_odds": 50,
    "pdo": 20,
    "title": "Histogram of Credit Scores",
}

metric = ScorecardHistogram(test_context_model, params)
metric.run()
await metric.result.log()
metric.result.show()