# Credit Scorecard Demo

## Introduction

#### Connect to ValidMind Project

**Import Libraries**

In [1]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env

import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency
%matplotlib inline

**Connect to ValidMind Project**

In [2]:
import validmind as vm

vm.init(
  api_host = "http://localhost:3000/api/v1/tracking",
  api_key = "2494c3838f48efe590d531bfe225d90b",
  api_secret = "4f692f8161f128414fef542cab2a4e74834c75d01b3a8e088a1834f2afcfe838",
  project = "cliwzqjgv00001fy6869rlav9"
)

2023-07-05 13:19:34,413 - INFO - api_client - Connected to ValidMind. Project: [3] PD Model - Initial Validation (cliwzqjgv00001fy6869rlav9)


## Data Description

**Import Lending Club Dataset**

In [3]:
filepath = '/Users/juanvalidmind/Dev/datasets/lending club/data_2007_2014/loan_data_2007_2014.csv'
df = pd.read_csv(filepath)

  df = pd.read_csv(filepath)


**Describe Raw Dataset**

In [4]:
from validmind.vm_models.test_context import TestContext
from validmind.tests.data_validation.TabularDescriptionTables import TabularDescriptionTables

vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)

metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

2023-07-05 13:19:36,856 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:19:36,857 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>This section provides descriptive statistics for numerical, categorical and date…

**Identify Missing Values in Raw Dataset**

In [5]:
from validmind.tests.data_validation.MissingValuesBarPlot import MissingValuesBarPlot

vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)

params = {"threshold": 80,
          "xticks_fontsize": 8}

metric = MissingValuesBarPlot(test_context, params)
metric.run()
metric.result.show()

2023-07-05 13:19:43,206 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:19:43,207 - INFO - dataset - Inferring dataset types...
  ax.set_yticklabels(["{:.1f}%".format(x) for x in ax.get_yticks()])


VBox(children=(HTML(value='<p>Generates a visual analysis of missing values by plotting bar plots with colored…

## Data Preparation

#### Identify Target Variable

**Definition of Default**

We categorizing `Fully Paid` loans as "default = 0" and `Charged Off` loans as "default = 1". This binary classification is suitable for developing a credit scorecard, as it enables distinction between applicants likely to fulfill their credit obligations (low risk) and those likely to fail (high risk). 

Loans with `Current` status, which represents ongoing loans with an unresolved outcome, should be excluded from the model, as their final repayment status is still unknown and thus not suitable for a retrospective risk analysis.

**Add `default` Variable**

In [6]:
def add_target_column(df, target_column):
    # Assuming the column name is 'loan_status'
    df[target_column] = df['loan_status'].apply(lambda x: 0 if x == "Fully Paid" else 1 if x == "Charged Off" else np.nan)
    # Remove rows where the target column is NaN
    df = df.dropna(subset=[target_column])
    # Convert target column to integer
    df[target_column] = df[target_column].astype(int)
    return df

target_column = 'default'
df = add_target_column(df, target_column)

# Drop 'loan_status' variable 
df.drop(columns='loan_status', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_column] = df[target_column].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns='loan_status', axis=1, inplace=True)


#### Remove Unused Variables

In [7]:
unused_variables = ["id", "member_id", "funded_amnt", "emp_title", "url", "desc", "application_type",
                    "title", "zip_code", "delinq_2yrs", "mths_since_last_delinq", "mths_since_last_record",
                    "revol_bal", "total_rec_prncp", "total_rec_late_fee", "recoveries", "out_prncp_inv", "out_prncp", 
                    "collection_recovery_fee", "next_pymnt_d", "initial_list_status", "pub_rec",
                    "collections_12_mths_ex_med", "policy_code", "acc_now_delinq", "pymnt_plan",
                    "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim", "last_pymnt_d", "last_credit_pull_d",
                    'earliest_cr_line', 'issue_d']

df = df.drop(columns=unused_variables)

#### Remove Variables with Large Number of Missing Values

In [8]:
def variables_with_min_missing(df, min_missing_percentage):
    # Calculate the percentage of missing values in each column
    missing_percentages = df.isnull().mean() * 100

    # Get the variables where the percentage of missing values is greater than the specified minimum
    variables_to_drop = missing_percentages[missing_percentages > min_missing_percentage].index.tolist()

    # Also add any columns where all values are missing
    variables_to_drop.extend(df.columns[df.isnull().all()].tolist())

    # Remove duplicates (if any)
    variables_to_drop = list(set(variables_to_drop))

    return variables_to_drop

min_missing_count = 80
variables_to_drop = variables_with_min_missing(df, min_missing_count)
df.drop(columns=variables_to_drop, axis=1, inplace=True)

df.dropna(axis=0, subset=["emp_length"], inplace=True)
df.dropna(axis=0, subset=["revol_util"], inplace=True)

#### Format Type of Variables

In [9]:
from typing import List
import pandas as pd
import numpy as np
from datetime import datetime
import re

def clean_term_column(df, column):
    """
    Function to remove 'months' string from the 'term' column and convert it to categorical
    """
    # Ensure the column exists in the dataframe
    if column not in df.columns:
        raise ValueError(f"The column '{column}' does not exist in the dataframe.")
    
    df[column] = df[column].str.replace(' months', '')
    
    # Convert to categorical
    df[column] = df[column].astype('object')

def clean_emp_length_column(df, column):
    """
    Function to clean 'emp_length' column and convert it to categorical.
    """
    # Ensure the column exists in the dataframe
    if column not in df.columns:
        raise ValueError(f"The column '{column}' does not exist in the dataframe.")
    
    df[column] = df[column].replace('n/a', np.nan)
    df[column] = df[column].str.replace('< 1 year', str(0))
    df[column] = df[column].apply(lambda x: re.sub('\D', '', str(x)))
    df[column].fillna(value = 0, inplace=True)

    # Convert to categorical
    df[column] = df[column].astype('object')

def clean_inq_last_6mths(df, column):
    """
    Function to convert 'inq_last_6mths' column into categorical.
    """
    # Ensure the column exists in the dataframe
    if column not in df.columns:
        raise ValueError(f"The column '{column}' does not exist in the dataframe.")

    # Convert to categorical
    df[column] = df[column].astype('category')

clean_emp_length_column(df, 'emp_length')
clean_term_column(df, 'term')
clean_inq_last_6mths(df, 'inq_last_6mths')

#### Handle Outliers

**Identify Outliers**

In [10]:
def get_numerical_columns(df):
        numerical_columns = df.select_dtypes(
            include=["int", "float", "uint"]
        ).columns.tolist()
        return numerical_columns

def get_categorical_columns(df):
        categorical_columns = df.select_dtypes(
            include=["object", "category"]
        ).columns.tolist()
        return categorical_columns

In [11]:
from validmind.tests.data_validation.IQROutliersPlots import IQROutliersPlots

vm_df = vm.init_dataset(dataset=df,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

num_features = get_numerical_columns(df)
params = {"num_features": num_features,
          "threshold": 1.5}

metric = IQROutliersPlots(test_context, params)
metric.run()
metric.result.show()

2023-07-05 13:19:48,692 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:19:48,692 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Generates a visual analysis of the outliers for numeric variables. The input dat…

**Remove Outliers using IQR Method**

In [12]:
def compute_outliers(series, threshold=1.5):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    return series[(series < lower_bound) | (series > upper_bound)]

def remove_iqr_outliers(df, target_column, threshold=1.5):
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    num_cols.remove(target_column)  # Exclude target_column from numerical columns
    for col in num_cols:
        outliers = compute_outliers(df[col], threshold)
        df = df[~df[col].isin(outliers)]
    return df

df = remove_iqr_outliers(df, target_column, threshold=1.5)

## Data Sampling

#### Sampling Method

We employ stratified sampling to create our training and testing sets. Stratified sampling is particularly important in this context. When the `stratify = y` parameter is set, it ensures that the distribution of the target variable (`y`) in the test set is the same as that in the original dataset. 

This is crucial for maintaining a consistent representation of the target variable classes, especially important in scenarios where the classes are imbalanced, which is often the case in credit risk scorecards.

In [13]:
# Split data into train and test 
X = df.drop(target_column, axis = 1)
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, 
                                                    random_state = 42, stratify = y)

# Concatenate X_train with y_train to form df_train
df_train = pd.concat([X_train, y_train], axis=1)

# Concatenate X_test with y_test to form df_test
df_test = pd.concat([X_test, y_test], axis=1)

#### Class Imbalance

Class imbalance is a common issue in credit risk scorecards and datasets like the Lending Club's. This imbalance arises when the number of defaulting loans (negative class) is significantly smaller than the number of loans that are paid off (positive class). Such imbalance can lead to biased models that favor the majority class, thus affecting predictive performance. 

Special techniques like oversampling, undersampling, or cost-sensitive learning are often needed to ensure that the minority class is appropriately represented during model training.

In [14]:
from validmind.tests.data_validation.ClassImbalance import ClassImbalance

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

metric = ClassImbalance(test_context)
metric.run()
metric.result.show()

2023-07-05 13:20:00,831 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:00,832 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='\n            <h2>Class Imbalance ❌</h2>\n            <p>The class imbalance test m…

## Univariate Analysis 

#### Histograms of Numerical Features

In [15]:
from validmind.tests.data_validation.TabularNumericalHistograms import TabularNumericalHistograms

vm_df_train = vm.init_dataset(dataset=df_train)
test_context = TestContext(dataset=vm_df_train)

metric = TabularNumericalHistograms(test_context)
metric.run()
metric.result.show()

2023-07-05 13:20:01,476 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:01,476 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Generates a visual analysis of numerical data by plotting the histogram. The inp…

#### High Cardinality of Categorical Features

In [16]:
from validmind.tests.data_validation.HighCardinality import HighCardinality
metric = HighCardinality(test_context)
metric.run()
metric.result.show()

VBox(children=(HTML(value='\n            <h2>Cardinality ✅</h2>\n            <p>The high cardinality test meas…

#### Bar Plots of Categorical Features

In [17]:
from validmind.tests.data_validation.TabularCategoricalBarPlots import TabularCategoricalBarPlots
metric = TabularCategoricalBarPlots(test_context)
metric.run()
metric.result.show()

VBox(children=(HTML(value='<p>Generates a visual analysis of categorical data by plotting bar plots. The input…

#### Default Rate by Categorical Feature

In [18]:
from validmind.tests.data_validation.DefaultRateBarPlots import DefaultRateBarPlots

# Configure the metric
params = {
    "default_column": target_column,
    "columns": None
}

metric = DefaultRateBarPlots(test_context, params=params)
metric.run()
metric.result.show()

The column default is correct and contains only 1 and 0.


VBox(children=(HTML(value='<p>Generates a visual analysis of loan default ratios by plotting bar plots. The in…

#### Chi-Squared Test on Categorical Features

In [19]:
from validmind.tests.data_validation.ChiSquaredFeaturesTable import ChiSquaredFeaturesTable

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

cat_features = get_categorical_columns(df_train)
params = {"cat_features": cat_features,
          "p_threshold": 0.05}

metric = ChiSquaredFeaturesTable(test_context, params)
metric.run()
metric.result.show()

2023-07-05 13:20:05,048 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:05,048 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Perform a Chi-Squared test of independence for each categorical variable with th…

#### ANOVA Test on Numerical Features

In [20]:
from validmind.tests.data_validation.ANOVAOneWayTable import ANOVAOneWayTable

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

num_features = get_numerical_columns(df_train)
params = {"num_features": num_features,
          "p_threshold": 0.05}

metric = ANOVAOneWayTable(test_context, params)
metric.run()
metric.result.show()

2023-07-05 13:20:05,813 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:05,814 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Perform an ANOVA F-test for each numerical variable with the target. The input d…

## Multivariate Analysis

#### Heatmap Correlation of Numerical Features

In [21]:
from validmind.tests.data_validation.PearsonCorrelationMatrix import PearsonCorrelationMatrix

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

params = {"declutter": False,
          "features": None,
          "fontsize": 13}

metric = PearsonCorrelationMatrix(test_context, params)
metric.run()
metric.result.show()

2023-07-05 13:20:06,607 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:06,608 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Extracts the Pearson correlation coefficient for all pairs of numerical variable…

#### Correlations of Numerical Features with Target Variable

In [22]:
from validmind.tests.data_validation.FeatureTargetCorrelationPlot import FeatureTargetCorrelationPlot

vm_df = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
test_context = TestContext(dataset=vm_df)

params = {"features": None,
          "fig_height": 600}

metric = FeatureTargetCorrelationPlot(test_context, params)
metric.run()
metric.result.show()

2023-07-05 13:20:07,369 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:07,370 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Generates a visual analysis of correlations between features and target by plott…

#### Feature Selection

In [23]:
drop_categorical_features = ['addr_state']
drop_numerical_features = ['total_rec_int', 'loan_amnt',
                           'funded_amnt_inv', 'dti', 'revol_util', 'total_pymnt', 
                           'total_pymnt_inv', 'last_pymnt_amnt',]

df_train.drop(columns = drop_categorical_features + drop_numerical_features, inplace=True)

# Update df_test 
df_test.drop(columns = drop_categorical_features + drop_numerical_features, inplace=True)

## Feature Engineering

#### Encoding of Numerical Features

In [24]:
import pandas as pd
import numpy as np

def encode_numerical_features(df):
    
    # term
    df['term'] = df['term'].replace({' 36': '36M', ' 60': '60M'})

    # emp_length_int
    df['emp_length'] = df['emp_length'].replace('10+', '10')  # Replace '10+' with '10'
    df['emp_length'] = pd.to_numeric(df['emp_length'], errors='coerce')  # Convert to numeric
    df['emp_length'].fillna(-1, inplace=True)
    bins = [0,1,2,3,5,8,10,999]
    df['emp_length_bucket'] = pd.cut(df['emp_length'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='emp_length', inplace=True)

    # inq_last_6mths
    df['inq_last_6mths'].fillna(-1, inplace=True)
    bins = [-1, 0, 1, 2, 3, 4, 5, 10, 25, 50]
    df['inq_last_6mths_bucket'] = pd.cut(df['inq_last_6mths'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='inq_last_6mths', inplace=True)
    
    # total_acc
    df['total_acc'].fillna(-1, inplace=True)
    bins = [-1, 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 999]
    df['total_acc_bucket'] = pd.cut(df['total_acc'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='total_acc', inplace=True)

    # annual_inc
    df['annual_inc'].fillna(-1, inplace=True)
    df['annual_inc_1000'] = df['annual_inc']/1000
    bins = [-1, 0, 10, 20, 30, 40, 50, 75, 100, 150, 250, 1000, 10000]
    df['annual_inc_bucket'] = pd.cut(df['annual_inc_1000'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='annual_inc', inplace=True)
    df.drop(columns='annual_inc_1000', inplace=True)
    
    # int_rate
    df['int_rate'].fillna(-1, inplace=True)
    bins = [-1, 0, 1, 2, 3, 4, 5, 10, 25, 50]
    df['int_rate_bucket'] = pd.cut(df['int_rate'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='int_rate', inplace=True)

    # installment
    df['installment'].fillna(-1, inplace=True)
    bins = [-1, 0, 100, 200, 300, 400, 500, 750, 1000, 1500]
    df['installment_bucket'] = pd.cut(df['installment'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='installment', inplace=True)

    # open_acc
    df['open_acc'].replace("N/A", 1, inplace=True)
    df['open_acc'].fillna(-1, inplace=True)
    bins = [-1, 0, 1, 2, 3, 4, 5, 8, 10, 100]
    df['open_acc_bucket'] = pd.cut(df['open_acc'], bins=bins, right=False, include_lowest=True)
    df.drop(columns='open_acc', inplace=True)

def find_categorical_features(df):
    # Get the column names of features with the data type "category"
    categorical_features = df.select_dtypes(include='category').columns.tolist()

    return categorical_features


def convert_categorical_to_object(df):
    # Find the categorical features
    categorical_features = find_categorical_features(df)

    # Convert the categorical features to object type
    df[categorical_features] = df[categorical_features].astype(str)


encode_numerical_features(df_train)
convert_categorical_to_object(df_train)

# Update df_test
encode_numerical_features(df_test)
convert_categorical_to_object(df_test)

#### Weight of Evidence (WoE) and Infomation Value (IV)

In [25]:
from validmind.tests.data_validation.WOEIVTable import WOEIVTable

# Update vm dataset and test context
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Configure test parameters

params = {
    "features": None,
    "order_by": ["Feature", "WoE"]
}

# Run test
metric = WOEIVTable(test_context, params=params)
metric.run()
woe_iv_df = metric.result.metric.value['woe_iv']
metric.result.show()

2023-07-05 13:20:08,401 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:08,402 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Calculate the Weight of Evidence (WoE) and Information Value (IV) of categorical…

#### Class Coarsing

In [26]:
import pandas as pd

def coarse_classing(df, mappings):
    # Create a copy of the DataFrame to avoid modifying the original
    df_new = df.copy()

    # Loop through each feature and merge set
    for feature, merge_sets in mappings.items():
        for merge_set in merge_sets:
            # Merge the specified categories into a new category
            df_new[feature] = df_new[feature].apply(lambda x: f"[{','.join(merge_set)}]" if x in merge_set else x)

    return df_new

def shorten_category_names(df, max_length=20, suffix="..."):
    # Create a copy of the DataFrame to avoid modifying the original
    df_new = df.copy()
    
    # Iterate over each column in the DataFrame
    for feature in df_new.columns:
        # Check if the column has the "object" data type
        if df_new[feature].dtype.name == 'object':
            # Shorten long category names
            df_new[feature] = df_new[feature].apply(lambda x: x[:max_length] + suffix if len(x) > max_length else x)
    
    return df_new

# Create a dictionary of features and the sets to merge
mappings = {
    'sub_grade': [['B2','B3','B4','B5','C3','D1'], ['C1','C2','C4','C5'], ['D3','D4','D5','E3','G4'], ['E1','E2','E4','E5','F1','F2','F3','F4','G1','G2','G3','G5','F5']],
    'grade': [['F','G']],
    'purpose': [['wedding','major_purchase'], ['credit_card','car'], ['debt_consolidation','other','vacation'], ['medical','moving','house','educational'], ['renewable_energy','small_business']],
    'home_ownership': [['MORTGAGE','OWN','RENT']],
    'annual_inc_bucket': [['[250, 1000)','[100, 150)','[150, 250)','[1000, 10000)'], ['[50, 75)','[40, 50)'], ['[10, 20)','[0, 10)']],
    'emp_length_bucket': [['[2, 3)','[40, 50)','[3, 5)','[1, 2)','[0, 1)','[5, 8)','[8, 10)']],
    'inq_last_6mths_bucket': [['[4, 5)','[1, 2)'], ['[5, 10)','[3, 4)']],
    'installment_bucket': [['[300, 400)','[200, 300)','[0, 100)'], ['[400, 500)', '[500, 750)']],
    'total_acc_bucket': [['[20, 25)','[30, 35)','[15, 20)','[45, 50)','[40, 45)','[35, 40)','[10, 15)','[5, 10)']],
    'open_acc_bucket': [['[5, 8)','[8, 10)','[10, 100)','[4, 5)'], ['[1, 2)','[2, 3)']]
}

df_train = coarse_classing(df_train, mappings)
df_train = df_train[~df_train['home_ownership'].isin(['OTHER', 'NONE'])]
df_train.drop(columns="home_ownership", inplace=True)
df_train = shorten_category_names(df_train, max_length=15, suffix="...")

# Update df_test
df_test = coarse_classing(df_test, mappings)
df_test = df_test[~df_test['home_ownership'].isin(['OTHER', 'NONE'])]
df_test.drop(columns="home_ownership", inplace=True)
df_test = shorten_category_names(df_test, max_length=15, suffix="...")

In [27]:
from validmind.tests.data_validation.WOEIVPlots import WOEIVPlots

# Update vm dataset and test context
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

params = {
    "features": None,
    "fig_height": 500,
    "fig_height": 500,
}

# Run test
metric = WOEIVPlots(test_context, params=params)
metric.run()
metric.result.show()

2023-07-05 13:20:16,296 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:16,296 - INFO - dataset - Inferring dataset types...


VBox(children=(HTML(value='<p>Generates a visual analysis of the WoE and IV values distribution for categorica…

#### Encode Features with WoE

In [28]:
# Update vm dataset and test context
vm_df_train = vm.init_dataset(dataset=df_train, 
                              target_column=target_column)
test_context = TestContext(dataset=vm_df_train)

# Configure test parameters

params = {
    "features": None,
    "order_by": ["Feature", "WoE"]
}

# Run test
metric = WOEIVTable(test_context, params=params)
metric.run()
woe_iv_df = metric.result.metric.value['woe_iv']

2023-07-05 13:20:21,465 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:21,465 - INFO - dataset - Inferring dataset types...


In [29]:
def check_categories(woe_df, original_df):
    for feature in woe_df['Feature'].unique():
        woe_categories = woe_df[woe_df['Feature'] == feature]['Category'].unique()
        original_categories = original_df[feature].unique()
        
        # Check categories in WoE table that are not in original DataFrame
        for category in woe_categories:
            if category not in original_categories:
                print(f"Category '{category}' not found in feature '{feature}' in original DataFrame.")
                
        # Check categories in original DataFrame that are not in WoE table
        for category in original_categories:
            if category not in woe_categories:
                print(f"Category '{category}' in feature '{feature}' not found in WoE table.")

                
check_categories(woe_iv_df, df_train)

In [30]:
woe_iv_df

Unnamed: 0,Feature,Category,All,Good,Bad,Distr_Good,Distr_Bad,WoE,IV
19,verification_status,Not Verified,58436,49886,8550,0.444664,0.338574,0.272578,0.02891799
20,verification_status,Source Verified,38235,30399,7836,0.270965,0.3103,-0.13555,0.005331848
21,verification_status,Verified,40770,31903,8867,0.284371,0.351127,-0.210868,0.01407662
37,total_acc_bucket,"[50, 999)",753,639,114,0.005696,0.004514,0.232475,0.0002746648
35,total_acc_bucket,"[25, 30)",20878,17146,3732,0.152833,0.147784,0.03359,0.0001695709
34,total_acc_bucket,"[[20, 25),[30, ...",114925,93722,21203,0.835401,0.839623,-0.005041,2.128088e-05
36,total_acc_bucket,"[0, 5)",885,681,204,0.00607,0.008078,-0.285789,0.0005738868
0,term,36M,117433,99441,17992,0.886378,0.71247,0.218406,0.03798268
1,term,60M,20008,12747,7261,0.113622,0.28753,-0.928453,0.1614657
10,sub_grade,A1,3623,3508,115,0.031269,0.004554,1.926638,0.05147019


In [31]:
import pandas as pd
import numpy as np

def woe_encoder(woe_df, original_df, target):
    # Create a new DataFrame with the same columns as original_df
    woe_encoded_df = pd.DataFrame(columns=original_df.columns, index=original_df.index)

    # Loop through each feature-category and assign the corresponding WoE value as float
    for feature in woe_df['Feature'].unique():
        # Check that the feature exists in the original DataFrame
        if feature not in original_df.columns:
            print(f"Feature {feature} not found in original DataFrame. Skipping...")
            continue

        feature_woe = woe_df[woe_df['Feature'] == feature]
        woe_dict = dict(zip(feature_woe['Category'], feature_woe['WoE']))

        # Check that the categories exist in the original DataFrame
        # Converting both to strings to avoid datatype issues
        original_categories = original_df[feature].astype(str).unique()
        woe_categories = feature_woe['Category'].astype(str).unique()
        
        # Two-way check:
        # 1. For each category in the original DataFrame, check if it exists in the WoE DataFrame
        missing_from_woe = [category for category in original_categories if category not in woe_categories]
        if missing_from_woe:
            print(f"Categories {missing_from_woe} from original DataFrame not found in WoE DataFrame for feature {feature}.")
            
        # 2. For each category in the WoE DataFrame, check if it exists in the original DataFrame
        missing_from_original = [category for category in woe_categories if category not in original_categories]
        if missing_from_original:
            print(f"Categories {missing_from_original} from WoE DataFrame not found in original DataFrame for feature {feature}.")
        
        # Also converting original dataframe feature to string before replacement
        woe_encoded_df[feature] = original_df[feature].astype(str).replace(woe_dict).astype(float)

    # Check that the target exists in the original DataFrame
    if target not in original_df.columns:
        print(f"Target {target} not found in original DataFrame. Returning None...")
        return None

    # Add the target column to the new DataFrame
    woe_encoded_df[target] = original_df[target]

    return woe_encoded_df


df_train = woe_encoder(woe_iv_df, df_train, target='default')

# Update df_test
df_test = woe_encoder(woe_iv_df, df_test, target='default')

## Model Training

#### Fit GLM Logistic Regression Model

In [32]:
import statsmodels.api as sm

# Create X_train, y_train and X_test, y_test
df_test = df_test.reindex(labels=df_train.columns, axis=1, fill_value=0)
y_train = df_train[target_column]
X_train = df_train.drop(target_column, axis=1)

# Add constant to X_train for intercept term
X_train = sm.add_constant(X_train)
df_train = pd.concat([X_train, y_train], axis=1)

# Update df_test
y_test = df_test[target_column]
X_test = df_test.drop(target_column, axis=1)
X_test = sm.add_constant(X_test)
df_test = pd.concat([X_test, y_test], axis=1)
df_test = df_test.reindex(labels=df_train.columns, axis=1, fill_value=0)

# Define the model
model = sm.GLM(y_train, X_train, family=sm.families.Binomial())

# Fit the model
model_fit_glm = model.fit()

# Print out the statistics
print(model_fit_glm.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                default   No. Observations:               137441
Model:                            GLM   Df Residuals:                   137428
Model Family:                Binomial   Df Model:                           12
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -59834.
Date:                Wed, 05 Jul 2023   Deviance:                   1.1967e+05
Time:                        13:20:27   Pearson chi2:                 1.38e+05
No. Iterations:                     6   Pseudo R-squ. (CS):            0.07997
Covariance Type:            nonrobust                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -1.49

#### Create ValidMind Model

In [33]:
# Cerate VM dataset
vm_train_ds = vm.init_dataset(dataset=df_train,
                        target_column=target_column)
vm_test_ds = vm.init_dataset(dataset=df_test,
                        target_column=target_column)

# Create VM model
vm_model_glm = vm.init_model(
    model = model_fit_glm, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)

2023-07-05 13:20:27,472 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:27,473 - INFO - dataset - Inferring dataset types...
2023-07-05 13:20:28,648 - INFO - client - Pandas dataset detected. Initializing VM Dataset instance...
2023-07-05 13:20:28,648 - INFO - dataset - Inferring dataset types...


## Model Validation

#### Confusion Matrix

In [34]:
from validmind.tests.model_validation.sklearn.ConfusionMatrix import ConfusionMatrix

test_context = TestContext(model= vm_model_glm)
metric = ConfusionMatrix(test_context)
metric.run()
metric.result.show()

VBox(children=(HTML(value='<p>A confusion matrix is a table that is used to describe the performance of a clas…

#### ROC Curve

In [35]:
from validmind.tests.model_validation.statsmodels.RegressionROCCurve import RegressionROCCurve

test_context = TestContext(model= vm_model_glm)
metric = RegressionROCCurve(test_context)
metric.run()
metric.result.show()

VBox(children=(HTML(value='<p>A receiver operating characteristic (ROC), or simply ROC curve, is a graphical p…

#### GINI and Kolmogorov-Smirnov (KS)

In [36]:
from validmind.tests.model_validation.statsmodels.GINITable import GINITable

test_context = TestContext(model= vm_model_glm)
metric = GINITable(test_context)
metric.run()
metric.result.show()

VBox(children=(HTML(value='<p>Compute and display the AUC, GINI, and KS for train and test sets.</p>'), HTML(v…

#### Uncertainty Analysis

## Scorecard Development

#### Compute Probability of Default

In [None]:
def compute_pd(model_fit, X_train):

    # Predict probabilities
    probabilities = model_fit.predict(X_train)

    # The probabilities are a 2D array with probabilities for the two classes.
    # We are interested in the probability of default, which is the second column.
    pd = probabilities

    # Add PD as a new column in X_train
    X_train['PD'] = pd

    return X_train

X_train_pd = compute_pd(model_fit_glm, X_train)
df_train_pd = pd.concat([X_train_pd, y_train], axis=1)

# Update df_test
X_test_pd = compute_pd(model_fit_glm, X_test)
df_test_pd = pd.concat([X_test_pd, y_test], axis=1)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_pd_histogram(df_train, df_test, pd_col, target_col):
    # Separate PD based on target column for training data
    pd_train_0 = df_train[df_train[target_col] == 0][pd_col]
    pd_train_1 = df_train[df_train[target_col] == 1][pd_col]

    # Separate PD based on target column for testing data
    pd_test_0 = df_test[df_test[target_col] == 0][pd_col]
    pd_test_1 = df_test[df_test[target_col] == 1][pd_col]

    # Create subplot
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"))

    # Create histograms for training data
    trace_train_0 = go.Histogram(x=pd_train_0, opacity=0.75, name=f'Train {target_col} = 0')
    trace_train_1 = go.Histogram(x=pd_train_1, opacity=0.75, name=f'Train {target_col} = 1')

    # Create histograms for testing data
    trace_test_0 = go.Histogram(x=pd_test_0, opacity=0.75, name=f'Test {target_col} = 0')
    trace_test_1 = go.Histogram(x=pd_test_1, opacity=0.75, name=f'Test {target_col} = 1')

    # Add traces to the subplots
    fig.add_trace(trace_train_0, row=1, col=1)
    fig.add_trace(trace_train_1, row=1, col=1)
    fig.add_trace(trace_test_0, row=1, col=2)
    fig.add_trace(trace_test_1, row=1, col=2)

    # Update layout to overlay the histograms in each subplot
    fig.update_layout(barmode='overlay', title_text='Histogram of Probability of Default')

    # Show the figure
    fig.show()

plot_pd_histogram(df_train_pd,
                  df_test_pd, 
                  pd_col='PD', 
                  target_col=target_column)

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_cumulative_pd(df_train, df_test, pd_col, target_col):
    # Separate PD based on target column for training data
    pd_train_0 = np.sort(df_train[df_train[target_col] == 0][pd_col])
    pd_train_1 = np.sort(df_train[df_train[target_col] == 1][pd_col])

    # Separate PD based on target column for testing data
    pd_test_0 = np.sort(df_test[df_test[target_col] == 0][pd_col])
    pd_test_1 = np.sort(df_test[df_test[target_col] == 1][pd_col])

    # Calculate cumulative distributions
    cumulative_pd_train_0 = np.cumsum(pd_train_0) / np.sum(pd_train_0)
    cumulative_pd_train_1 = np.cumsum(pd_train_1) / np.sum(pd_train_1)
    cumulative_pd_test_0 = np.cumsum(pd_test_0) / np.sum(pd_test_0)
    cumulative_pd_test_1 = np.cumsum(pd_test_1) / np.sum(pd_test_1)

    # Create subplot
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"))

    # Create line plots for training data
    trace_train_0 = go.Scatter(x=pd_train_0, y=cumulative_pd_train_0, mode='lines', name=f'Train {target_col} = 0')
    trace_train_1 = go.Scatter(x=pd_train_1, y=cumulative_pd_train_1, mode='lines', name=f'Train {target_col} = 1')

    # Create line plots for testing data
    trace_test_0 = go.Scatter(x=pd_test_0, y=cumulative_pd_test_0, mode='lines', name=f'Test {target_col} = 0')
    trace_test_1 = go.Scatter(x=pd_test_1, y=cumulative_pd_test_1, mode='lines', name=f'Test {target_col} = 1')

    # Add traces to the subplots
    fig.add_trace(trace_train_0, row=1, col=1)
    fig.add_trace(trace_train_1, row=1, col=1)
    fig.add_trace(trace_test_0, row=1, col=2)
    fig.add_trace(trace_test_1, row=1, col=2)

    # Update layout
    fig.update_layout(title_text='Cumulative Probability of Default')

    # Show the figure
    fig.show()

plot_cumulative_pd(df_train_pd,
                  df_test_pd, 
                  pd_col='PD', 
                  target_col=target_column)

#### Compute Credit Scores

In [None]:
def compute_credit_score(model_fit, X, target_score, target_odds, pdo):
    # Create a copy of X
    X_copy = X.copy()
    
    # Get logistic regression coefficients
    beta = model_fit.params.values

    # Get intercept (alpha)
    alpha = model_fit.params[0]  # Intercept is the first parameter in statsmodels

    # Calculate factor
    factor = pdo / np.log(2)

    # Calculate offset
    offset = target_score - (factor * np.log(target_odds))

    # Loop over each row in the copied data
    for _, row in X_copy.iterrows():
        # Initialize score for current row
        score_i = 0

        # Add contribution of each feature to the score
        for i in range(1, len(beta)):  # Starting from 1 to skip the intercept
            WoE_i = row[i]  # WoE for feature i, assuming intercept is in the first column
            score_i += (beta[i] * WoE_i) * factor

        # Add intercept's contribution to the score
        score_i += alpha * factor

        # Adjust the score scale using the offset
        score_i += offset

        # Add score to the new column in X_copy
        X_copy.loc[row.name, 'score'] = score_i

    return X_copy


# Set target_score, target_odds, and pdo
target_score = 600
target_odds = 50
pdo = 20

# Compute credit scores and add to df_train
X_train_scores = compute_credit_score(model_fit_glm, X_train_pd, target_score, target_odds, pdo)
df_train_scores = pd.concat([X_train_scores, y_train], axis=1)

# Update df_test 
X_test_scores = compute_credit_score(model_fit_glm, X_test_pd, target_score, target_odds, pdo)
df_test_scores = pd.concat([X_test_scores, y_test], axis=1)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_score_histogram(df_train, df_test, score_col, target_col):
    # Separate scores based on target column for training data
    scores_train_0 = df_train[df_train[target_col] == 0][score_col]
    scores_train_1 = df_train[df_train[target_col] == 1][score_col]

    # Separate scores based on target column for testing data
    scores_test_0 = df_test[df_test[target_col] == 0][score_col]
    scores_test_1 = df_test[df_test[target_col] == 1][score_col]

    # Create subplot
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"))

    # Create histograms for training data
    trace_train_0 = go.Histogram(x=scores_train_0, opacity=0.75, name=f'Train {target_col} = 0')
    trace_train_1 = go.Histogram(x=scores_train_1, opacity=0.75, name=f'Train {target_col} = 1')

    # Create histograms for testing data
    trace_test_0 = go.Histogram(x=scores_test_0, opacity=0.75, name=f'Test {target_col} = 0')
    trace_test_1 = go.Histogram(x=scores_test_1, opacity=0.75, name=f'Test {target_col} = 1')

    # Add traces to the subplots
    fig.add_trace(trace_train_0, row=1, col=1)
    fig.add_trace(trace_train_1, row=1, col=1)
    fig.add_trace(trace_test_0, row=1, col=2)
    fig.add_trace(trace_test_1, row=1, col=2)

    # Update layout to overlay the histograms in each subplot
    fig.update_layout(barmode='overlay', title_text='Histogram of Scores')

    # Show the figure
    fig.show()

plot_score_histogram(df_train_scores, 
                     df_test_scores, 
                     score_col='score', 
                     target_col=target_column)