In [4]:
import warnings
import datetime as dt
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import re
import string 
import copy
import seaborn as sns 
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Scorecard Modelling:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from optbinning import Scorecard
from optbinning import BinningProcess

# Extras:
import pickle
from scipy import stats
from typing import Tuple
from typing import Union

In [5]:
def missing_zero_values_table(df):
    """
    Data-audit helper: summarise zero and missing values for every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to audit.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns 'No of 0s', '% of 0s',
        'No of Missing Values', '% of Missing Values', 'Rows', 'No of Unique'
        and 'Data Type'. Both '%' columns are percentages (0-100) rounded to
        one decimal place.

    Side effects
    ------------
    Prints the frame's shape and how many columns contain missing values.
    """
    n_rows = len(df)
    # Guard the denominator so an empty frame reports 0.0 instead of NaN.
    denominator = max(n_rows, 1)

    zero_val = (df == 0).sum(axis=0)
    mis_val = df.isnull().sum()

    # Percentages are scaled to 0-100 *before* rounding; rounding the raw
    # fraction to one decimal would collapse almost every value to 0.0.
    zero_val_percent = (100 * zero_val / denominator).round(1)
    mis_val_percent = (100 * mis_val / denominator).round(1)

    mz_table = pd.concat(
        [zero_val, zero_val_percent, mis_val, mis_val_percent],
        axis=1,
        keys=['No of 0s', '% of 0s', 'No of Missing Values', '% of Missing Values'],
    )

    mz_table['Rows'] = n_rows
    mz_table['No of Unique'] = df.nunique()
    mz_table['Data Type'] = df.dtypes

    # Count only the columns that really contain missing values
    # (the table itself has one row per column, missing values or not).
    columns_with_missing = int((mis_val > 0).sum())

    print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"
        "There are " + str(columns_with_missing) +
            " columns that have missing values.")

    return mz_table


In [33]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from optbinning.scorecard import plot_auc_roc, plot_cap, plot_ks
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def plot_all_metrics(y_train, y_test, train_pred, test_pred, model, X_train, X_test, threshold=0.5):
    """
    Draw a 5x2 grid of diagnostic plots: training set in the left column,
    test set in the right column.

    Rows, top to bottom: AUC-ROC, CAP, KS, Precision-Recall curve,
    Confusion Matrix.

    Parameters:
    - y_train (array-like): True labels for the training set.
    - y_test (array-like): True labels for the test set.
    - train_pred (array-like): Predicted scores for the training set
      (used for the ROC, CAP, KS and Precision-Recall panels).
    - test_pred (array-like): Predicted scores for the test set.
    - model: Fitted model exposing ``predict_proba``; used only for the
      confusion matrices.
    - X_train (array-like): Training features (confusion matrix only).
    - X_test (array-like): Test features (confusion matrix only).
    - threshold (float): Probability cut-off at or above which an observation
      is classified as 'Default' in the confusion matrices. Defaults to 0.5.

    Returns:
    - None. The figure is shown with ``plt.show()``.
    """
    fig, axs = plt.subplots(5, 2, figsize=(15, 25))

    # One entry per grid column: (title label, true labels, scores, features).
    datasets = [
        ('Training', y_train, train_pred, X_train),
        ('Test', y_test, test_pred, X_test),
    ]
    # One entry per grid row for the optbinning helpers.
    optbinning_plots = [
        ('AUC-ROC', plot_auc_roc),
        ('CAP', plot_cap),
        ('KS', plot_ks),
    ]

    for col, (label, y_true, y_score, features) in enumerate(datasets):
        # Rows 0-2: the optbinning helpers take no axes argument here, so the
        # target axes is made current first (presumably they draw on the
        # current pyplot axes -- confirm against the optbinning docs).
        for row, (metric, plot_fn) in enumerate(optbinning_plots):
            plt.sca(axs[row, col])
            plot_fn(y_true, y_score)
            axs[row, col].set_title(f'{metric} for {label} Set')

        # Row 3: Precision-Recall curve with its area under the curve.
        pr_ax = axs[3, col]
        precision, recall, _ = precision_recall_curve(y_true, y_score)
        pr_area = auc(recall, precision)
        pr_ax.plot(recall, precision, label=f"PR AUC = {pr_area:.2f}")
        pr_ax.set_title(f'Precision-Recall Curve for {label} Set')
        pr_ax.set_xlabel('Recall')
        pr_ax.set_ylabel('Precision')
        pr_ax.legend()

        # Row 4: confusion matrix from the model's own positive-class
        # probabilities, binarised at `threshold`.
        cm_ax = axs[4, col]
        predicted_proba = model.predict_proba(features)[:, 1]
        predicted_label = (predicted_proba >= threshold).astype(int)
        matrix = confusion_matrix(y_true, predicted_label)
        ConfusionMatrixDisplay(matrix, display_labels=['Non-Default', 'Default']).plot(cmap='Blues', ax=cm_ax)
        cm_ax.set_title(f'Confusion Matrix for {label} Set')

    # Adjust layout to prevent overlap, then display.
    plt.tight_layout()
    plt.show()


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
# from sklearn.metrics import brier_score_loss, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import category_encoders as ce

def calibration_analysis(X_train, X_test, y_train, y_test, scorecard_model):
    """
    Compare the calibration of a fitted scorecard against Platt-scaled and
    isotonic-calibrated logistic-regression pipelines.

    Parameters:
    - X_train (DataFrame): Training feature set.
    - X_test (DataFrame): Test feature set.
    - y_train (Series): Training target variable.
    - y_test (Series): Test target variable.
    - scorecard_model: A fitted model exposing ``predict_proba``; its raw
      (non-calibrated) test predictions are the baseline.

    Returns:
    - None. Prints the three test-set Brier scores and shows one figure with
      the three calibration curves.

    Notes:
    - The calibrated models are NOT calibrations of ``scorecard_model``: they
      are separate logistic-regression pipelines fitted on ``X_train``.
    - Only 'float64'/'int64' columns are scaled and only 'object'/'category'
      columns are target-encoded; ColumnTransformer is built without a
      ``remainder`` argument, so other dtypes are not passed to the classifier.
    """
    # Imported locally: the only visible file-level import of
    # brier_score_loss is commented out, so the name would be undefined here.
    from sklearn.metrics import brier_score_loss

    # Identify categorical and numerical features
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Preprocessing + logistic regression pipeline used as the base estimator
    estimator = LogisticRegression(solver="lbfgs", class_weight='balanced')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', ce.TargetEncoder(cols=categorical_features), categorical_features)
        ])
    logreg_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator)
    ])

    # Baseline: non-calibrated probabilities from the scorecard (last column)
    y_test_pred_proba = scorecard_model.predict_proba(X_test)[:, -1]
    non_calibrated_brier = brier_score_loss(y_test, y_test_pred_proba)

    # The estimator is passed positionally on purpose: scikit-learn renamed
    # the keyword from `base_estimator` to `estimator`, and the positional
    # form works with both spellings.
    platt_model = CalibratedClassifierCV(logreg_pipeline, method='sigmoid')
    platt_model.fit(X_train, y_train)
    platt_pred_calibrated = platt_model.predict_proba(X_test)[:, 1]

    iso_model = CalibratedClassifierCV(logreg_pipeline, method='isotonic')
    iso_model.fit(X_train, y_train)
    iso_pred_calibrated = iso_model.predict_proba(X_test)[:, 1]

    # Brier scores for calibrated predictions
    platt_brier = brier_score_loss(y_test, platt_pred_calibrated)
    iso_brier = brier_score_loss(y_test, iso_pred_calibrated)

    print(f'Non-Calibrated Brier Score (Scorecard): {non_calibrated_brier:.4f}')
    print(f'Platt Scaling Brier Score: {platt_brier:.4f}')
    print(f'Isotonic Regression Brier Score: {iso_brier:.4f}')

    # Calibration curves on the test labels
    prob_true, prob_pred_non_calibrated = calibration_curve(y_test, y_test_pred_proba, n_bins=10)
    prob_true_calibrated_platt, prob_pred_calibrated_platt = calibration_curve(y_test, platt_pred_calibrated, n_bins=10)
    prob_true_calibrated_isotonic, prob_pred_calibrated_isotonic = calibration_curve(y_test, iso_pred_calibrated, n_bins=10)

    # Plot the three curves with their Brier scores in the legend
    plt.figure(figsize=(10, 6))
    plt.plot(prob_pred_non_calibrated, prob_true, marker='o', label=f'Non-calibrated (Scorecard, Brier: {non_calibrated_brier:.3f})', color='blue')
    plt.plot(prob_pred_calibrated_platt, prob_true_calibrated_platt, marker='o', label=f'Platt Calibrated (Brier: {platt_brier:.3f})', color='green')
    plt.plot(prob_pred_calibrated_isotonic, prob_true_calibrated_isotonic, marker='o', label=f'Isotonic Calibrated (Brier: {iso_brier:.3f})', color='red')

    plt.plot([0, 1], [0, 1], linestyle='--', color='black')  # Perfect calibration line

    plt.xlabel('Predicted Probability')
    plt.ylabel('True Probability')
    plt.title('Calibration Plot: Scorecard vs Platt & Isotonic Calibration')
    plt.legend()
    plt.grid()
    plt.show()

In [8]:
# Human-readable description for each credit level, from 1 ("Very Poor") to
# 9 ("Exceptional"). The keys are the same integers 1-9 that
# get_credit_levels writes into its "credit_level" column.
credit_levels_decriptions = {
    1: "Very Poor",
    2: "Poor",
    3: "Below Average",
    4: "Average",
    5: "Above Average",
    6: "Good",
    7: "Very Good",
    8: "Excellent",
    9: "Exceptional",
}

def get_credit_levels(
    df: pd.DataFrame,
    target_col: str = "credit_score",
    left_bound = -np.inf,
    level_1 = 350,
    level_2 = 400,
    level_3 = 450,
    level_4 = 500,
    level_5 = 550,
    level_6 = 600,
    level_7 = 650,
    level_8 = 700,
    right_bound = np.inf
) -> pd.DataFrame:
    """
    Assign every credit score to one of nine credit levels.

    Level k covers the half-open interval (lower edge, upper edge], where the
    edges are ``left_bound, level_1, ..., level_8, right_bound`` in order.

    Args:
        df (pd.DataFrame): Frame containing the credit score. It is modified
            in place (three columns are added) and also returned.
        target_col (str): Column holding the credit score.
        left_bound: Exclusive lower edge of level 1.
        level_1 .. level_8: Inclusive upper edge of levels 1 to 8.
        right_bound: Inclusive upper edge of level 9.

    Returns:
        pd.DataFrame: The same ``df`` with the extra columns "credit_level",
        "credit_lower_bound" and "credit_upper_bound". Scores that match no
        interval (for example NaN) get 0 in all three columns.
    """
    edges = [
        left_bound, level_1, level_2, level_3, level_4,
        level_5, level_6, level_7, level_8, right_bound,
    ]
    lower_edges = edges[:-1]
    upper_edges = edges[1:]

    scores = df[target_col]
    # One boolean mask per level: lower edge exclusive, upper edge inclusive.
    in_level = [
        (scores > low) & (scores <= high)
        for low, high in zip(lower_edges, upper_edges)
    ]

    df["credit_level"] = np.select(in_level, list(range(1, 10)))
    df["credit_lower_bound"] = np.select(in_level, lower_edges)
    df["credit_upper_bound"] = np.select(in_level, upper_edges)
    return df

In [28]:
def roc_auc(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Area under the ROC curve, computed with ``roc_curve`` followed by ``auc``.

    Args:
        y_true (Union[list, np.array]): True labels.
        y_pred_proba (Union[list, np.array]): Predicted probability of the target class `1`.
    Returns:
        float: ROC AUC score.
    """
    curve = roc_curve(y_true, y_pred_proba)
    false_positive_rate, true_positive_rate = curve[0], curve[1]
    return auc(false_positive_rate, true_positive_rate)

def pr_auc(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Area under the precision-recall curve (recall on the x axis).

    Args:
        y_true (Union[list, np.array]): True labels.
        y_pred_proba (Union[list, np.array]): Predicted probability of the target class `1`.
    Returns:
        float: PR AUC score.
    """
    curve = precision_recall_curve(y_true, y_pred_proba)
    precision_values, recall_values = curve[0], curve[1]
    return auc(recall_values, precision_values)

def gini(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Gini coefficient, derived from the ROC AUC as ``2 * AUC - 1``.

    Args:
        y_true (Union[list, np.array]): True labels.
        y_pred_proba (Union[list, np.array]): Predicted probability of the target class `1`.
    Returns:
        float: Gini coefficient.
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(y_true, y_pred_proba)
    # Local name chosen so it does not shadow the module-level roc_auc function.
    area_under_roc = auc(false_positive_rate, true_positive_rate)
    return 2 * area_under_roc - 1

def ks(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Kolmogorov-Smirnov (KS) statistic between the predicted probabilities of
    the two classes (label 0 vs label 1), via ``scipy.stats.ks_2samp``.

    Args:
        y_true (Union[list, np.array]): True labels (0 or 1).
        y_pred_proba (Union[list, np.array]): Predicted probability of the target class `1`.
    Returns:
        float: KS statistic.
    """
    # Coerce to arrays first: boolean-mask indexing only works on arrays, and
    # the signature explicitly accepts plain lists.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)

    scores_not_default = scores[labels == 0]
    scores_default = scores[labels == 1]
    ks_stat, _ = stats.ks_2samp(scores_not_default, scores_default)
    return ks_stat

def plot_calibration_curve(y_true: np.array, y_pred_proba: np.array, model_name: str, figsize: Tuple[int, int], n_bins=10) -> plt.Figure:
    """
    Plot a calibration (reliability) curve against the perfect-calibration diagonal.

    Args:
        y_true (np.array): True binary labels (0 for not default, 1 for default).
        y_pred_proba (np.array): Predicted probabilities for the positive class (default).
        model_name (str): Legend label for the model's curve.
        figsize (Tuple[int, int]): Size of the figure.
        n_bins (int): Number of bins passed to ``calibration_curve``.
    Return:
        plt.Figure: The Matplotlib figure holding the plot.
    """
    prob_true, prob_pred = calibration_curve(y_true, y_pred_proba, n_bins=n_bins)

    # Apply the style only while this figure is built, so the global pyplot
    # style of the rest of the notebook is left untouched.
    with plt.style.context("fivethirtyeight"):
        fig, ax = plt.subplots(figsize=figsize)
        ax.plot([0, 1], [0, 1], linestyle="--", label="Perfectly calibrated")
        ax.plot(prob_pred, prob_true, marker="o", label=model_name)

        ax.set_xlabel("Mean predicted probability")
        ax.set_ylabel("Fraction of positives")
        ax.set_title("Calibration plot")
        ax.legend()
        ax.grid(True)

    return fig

def print_side_by_side(dict1: dict, dict2: dict) -> None:
    """
    Print two metric dictionaries as a three-column table: Metric | Train | Test.

    Args:
        dict1 (dict): Values shown in the "Train" column.
        dict2 (dict): Values shown in the "Test" column.

    Returns:
        None

    Notes:
        - Rows follow the key order of ``dict1``, then any keys found only in
          ``dict2``.
        - A key missing from one dictionary is shown as "-" in that column.
        - ``float`` values are rounded to 2 decimal places.
    """
    placeholder = "-"

    # dict1's keys first (in their order), then the keys only dict2 has.
    keys = list(dict1) + [key for key in dict2 if key not in dict1]

    # Including the header text keeps max() safe for empty dictionaries and
    # keeps the header aligned when every key is shorter than "Metric".
    key_width = max(len(str(label)) for label in ["Metric", *keys])

    row_format = "{:<{key_len}}: {:<10} | {:<10}"

    print(row_format.format("Metric", "Train", "Test", key_len=key_width))
    # A row is key_width + len(": ") + 10 + len(" | ") + 10 characters wide.
    print("-" * (key_width + 25))

    for key in keys:
        cells = []
        for source in (dict1, dict2):
            value = source.get(key, placeholder)
            if isinstance(value, float):
                value = round(value, 2)
            # str() so values without format-spec support (e.g. None) still print.
            cells.append(str(value))
        print(row_format.format(str(key), cells[0], cells[1], key_len=key_width))


In [10]:
# A function to calculate WOE and IV for numerical features
def woe_num(data, feature, gbflag):
    """
    Build a WOE / IV table for one (binned) feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``feature`` and ``gbflag``.
    feature : str
        Column whose distinct values form the rows of the table.
    gbflag : str
        Good/bad flag column; it must contain the labels 'Good' and 'Bad'.

    Returns
    -------
    pandas.DataFrame
        One row per value of ``feature``, sorted by ascending WOE, with the
        crosstab counts plus 'Freq', 'Percentage', '% Good', '% Bad',
        'Bad Rate', 'GoodBaddOdds', 'WOE', 'Class IV' and 'Variable IV'.

    Notes
    -----
    - WOE is ``ln(share of Bad / share of Good)``.
    - WOE and IV are computed from the exact class shares (fractions). The
      '% Good', '% Bad', 'Percentage' and 'Bad Rate' columns are rounded
      percentages kept for display only; computing WOE from them would lose
      precision, and IV would come out 100 times too large.
    - A class with zero Good or zero Bad observations yields an infinite WOE.
    """
    table = pd.crosstab(index=data[feature], columns=data[gbflag])
    good = table['Good']
    bad = table['Bad']
    freq = table.sum(axis=1)

    # Exact shares of all Good / all Bad observations that fall in each class.
    good_share = good / good.sum()
    bad_share = bad / bad.sum()

    woe = np.log(bad_share / good_share)
    class_iv = (bad_share - good_share) * woe

    table['Freq'] = freq
    table['Percentage'] = round(freq / freq.sum() * 100, 1)
    table['% Good'] = round(good_share * 100, 1)
    table['% Bad'] = round(bad_share * 100, 1)
    table['Bad Rate'] = round(bad / freq * 100, 1)
    table['GoodBaddOdds'] = round(good / bad, 2)
    table['WOE'] = woe
    table['Class IV'] = class_iv
    table['Variable IV'] = class_iv.sum()  # IV for the whole variable

    return table.sort_values(['WOE'])
# Function to calculate WOE and IV for categorical variables
def woe_cat(data, feature, gbflag):
    """
    Build a WOE / IV table for one categorical feature.

    ``gbflag`` must contain the labels 'Good' and 'Bad'. WOE is
    ``ln(share of Bad / share of Good)``; shares are unrounded fractions.

    Returns a tuple ``(table, iv)``: the table has one row per category,
    sorted by ascending WOE, and ``iv`` is the variable's total IV, read from
    the first row of the 'Variable IV' column.
    """
    table = pd.crosstab(index=data[feature], columns=data[gbflag])

    # Row totals are taken before any derived column is added.
    row_totals = table.sum(axis=1)
    good_share = table['Good'] / table['Good'].sum()
    bad_share = table['Bad'] / table['Bad'].sum()

    table['Freq'] = row_totals
    table['Proptn'] = table['Freq'] / table['Freq'].sum()
    table['% Good'] = good_share
    table['% Bad'] = bad_share
    table['Bad Rate'] = table['Bad'] / table['Freq']
    table['GoodBaddOdds'] = round(table['Good'] / table['Bad'], 2)
    table['WOE'] = np.log(table['% Bad'] / table['% Good'])
    table['Class IV'] = (table['% Bad'] - table['% Good']) * table['WOE']
    table['Variable IV'] = table['Class IV'].sum()  # IV for the variable

    table = table.sort_values(['WOE'])
    variable_iv = table['Variable IV'].iloc[0]
    return table, variable_iv


# A function to get all variables WOE and IV
def woe_iv(df, variablie_list, GBFlag):
    """
    Collect the WOE of every category and the IV of every variable into one
    long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the variables and the good/bad flag.
    variablie_list : iterable of str
        Columns to analyse with ``woe_cat``.
    GBFlag : str
        Name of the good/bad flag column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variables', 'Categories', 'WOE' and 'IV', one row per
        (variable, category). 'IV' repeats the variable-level IV on each of
        the variable's rows.

    Notes
    -----
    ``woe_cat`` returns a ``(table, iv)`` tuple, so it is unpacked here.
    Errors from ``woe_cat`` are allowed to propagate: silently skipping a
    failed variable would return a result that looks complete but is not.
    """
    output = {'Variables': [], 'Categories': [], 'WOE': [], 'IV': []}
    for variable in variablie_list:
        woe_table, variable_iv = woe_cat(df, variable, GBFlag)

        for position in range(len(woe_table.index)):
            output['Variables'].append(variable)
            output['Categories'].append(woe_table.index[position])
            output['WOE'].append(woe_table['WOE'].iloc[position])
            output['IV'].append(variable_iv)

    return pd.DataFrame(output)
# woe_iv(data_universe_fai, data_universe_fai_cat.columns, 'GBFlag')
# A function that automates and return CA report
def Export_CA_Report(df, variablie_list, GBFlag, fileName='CA_Report'):
    """
    Write the WOE/IV table of every listed variable to an Excel workbook,
    one sheet per variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the variables and the good/bad flag.
    variablie_list : iterable of str
        Columns to analyse with ``woe_cat``.
    GBFlag : str
        Name of the good/bad flag column.
    fileName : str
        Output path without extension; '.xlsx' is appended.

    Returns
    -------
    None. The workbook is written with the openpyxl engine.
    """
    # woe_cat returns (table, iv); only the table goes into the workbook.
    ca_result = {
        variable: woe_cat(df, variable, GBFlag)[0]
        for variable in variablie_list
    }

    # save CA report to Excel
    with pd.ExcelWriter(fileName + '.xlsx', engine="openpyxl") as writer:
        for variable_name, data in ca_result.items():
            # Excel limits sheet names to 31 characters. Names that become
            # identical after truncation would end up on the same sheet.
            data.to_excel(writer, sheet_name=str(variable_name)[:31])
    
    
def fill_binned_nan_values(df, variablie_list, GBFlag):
    """
    Fill missing values of binned variables, in place, with the bin class
    that has the highest WOE according to ``woe_num``.

    Columns without missing values are left untouched. Returns None; ``df``
    is modified directly.
    """
    for variable in variablie_list:
        if not df[variable].isna().sum() > 0:
            continue

        # Run CA (WoE & IV) on the variable
        woe_table = woe_num(df, variable, GBFlag)

        # Index value (bin class) of the first row carrying the maximum WOE
        highest_woe = woe_table['WOE'].max()
        matching_rows = woe_table.loc[woe_table['WOE'] == highest_woe]
        bin_class = matching_rows.index.values[0]

        df[variable] = df[variable].fillna(bin_class)

In [11]:
def replace_with_woe(df_with_selected_variables, variables_woe_iv):
    '''
    Replace each category in the selected columns with its WOE.

    df_with_selected_variables :: Dataframe containing grouped/binned selected
    variables. It is modified in place and also returned.

    variables_woe_iv :: Dataframe with the columns 'Variables', 'Categories'
    and 'WOE' (the layout returned by woe_iv).

    Note::
    - Categories that have no WOE entry are left unchanged.
    - If a (variable, category) pair appears more than once, the first WOE
      is used.
    - All categories of a column are replaced in ONE step. Replacing them one
      after another could re-replace a value whose WOE happens to equal a
      category that is processed later.
    '''
    for variable in df_with_selected_variables.columns:
        lookup = variables_woe_iv.loc[
            variables_woe_iv['Variables'] == variable, ['Categories', 'WOE']
        ].drop_duplicates('Categories')
        if lookup.empty:
            continue  # no WOE known for this column

        category_to_woe = dict(zip(lookup['Categories'], lookup['WOE']))

        # Work on an object view so categorical columns can hold WOE floats.
        column = df_with_selected_variables[variable].astype(object)
        mapped = column.map(category_to_woe)
        has_woe = column.isin(list(category_to_woe))

        if has_woe.all():
            # Every value was mapped: keep the numeric dtype inferred by map().
            df_with_selected_variables[variable] = mapped
        else:
            # Keep the original value wherever no WOE is available.
            df_with_selected_variables[variable] = mapped.where(has_woe, column)

    return df_with_selected_variables

In [12]:
def woe_iv(df, features, target):
    """
    Collect per-category WOE and per-variable IV for every feature.

    Calls ``woe_cat`` for each feature and flattens the results into one
    frame with the columns 'Variables', 'Categories', 'WOE' and 'IV'
    (the variable-level IV repeated on each of the variable's rows).
    """
    collected = {}
    for variable in features:
        # woe_cat returns (table, iv): the WOE table and the variable's IV.
        woe_table, variable_iv = woe_cat(df, variable, target)
        row_count = len(woe_table.index)

        position = 0
        while position < row_count:
            collected.setdefault('Variables', []).append(variable)
            collected.setdefault('Categories', []).append(woe_table.index[position])
            collected.setdefault('WOE', []).append(woe_table['WOE'].iloc[position])
            collected.setdefault('IV', []).append(variable_iv)
            position += 1

    # Convert the dictionary of column lists to a DataFrame
    return pd.DataFrame(collected)

In [13]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
def compute_gini(y_test, y_train, test_preds, train_preds):
    """
    Compute the Gini coefficient (``2 * ROC AUC - 1``) for test and train sets.

    Parameters
    ----------
    y_test, y_train : array-like
        True labels of the test / training set.
    test_preds, train_preds : array-like
        Predicted scores of the test / training set.

    Returns
    -------
    tuple
        ``(Test_Gini, Train_Gini)`` -- note that the test value comes first.
    """
    # Imported locally: the imports visible next to this function bring in
    # only precision_recall_curve and auc, so roc_auc_score would be undefined.
    from sklearn.metrics import roc_auc_score

    Test_AUC = roc_auc_score(y_test, test_preds)
    Train_AUC = roc_auc_score(y_train, train_preds)

    # Compute Gini from AUC
    Test_Gini = (2*Test_AUC) - 1
    Train_Gini = (2*Train_AUC) - 1
    return Test_Gini, Train_Gini

In [14]:
def plot_by_woe(df_WoE, feature_name, rotation_of_x_axis_labels=0, xlabel_fontsize=12, ylabel_fontsize=12):
    """
    Line plot of Weight of Evidence per category, using Matplotlib.

    ``df_WoE`` must have a 'WOE' column; its index supplies the category
    labels (converted to strings). The figure is shown with ``plt.show()``.
    """
    category_labels = np.array(df_WoE.index).astype(str)
    woe_values = df_WoE['WOE']

    plt.figure(figsize=(10, 6))
    plt.plot(category_labels, woe_values, color='k', linestyle='--', marker='o')

    plt.title(f'Weight of Evidence by {feature_name}', fontsize=14)
    plt.xlabel('Categories', fontsize=xlabel_fontsize)
    plt.ylabel('Weight of Evidence', fontsize=ylabel_fontsize)
    plt.xticks(rotation=rotation_of_x_axis_labels)
    plt.grid(True)  # gridlines make the WOE trend easier to read

    plt.show()

In [15]:
import plotly.graph_objects as go
import numpy as np

def plot_by_woe_plotly(df_WoE, feature_name):
    """
    Build a Plotly line-and-marker figure of Weight of Evidence per category.

    ``df_WoE`` must have a 'WOE' column; its index supplies the category
    labels (converted to strings). The figure is returned, not shown.
    """
    category_labels = np.array(df_WoE.index).astype(str)
    woe_values = df_WoE['WOE']

    woe_trace = go.Scatter(
        x=category_labels,
        y=woe_values,
        mode='lines+markers',
        name='WoE',
        line=dict(color='black'),
    )

    fig = go.Figure()
    fig.add_trace(woe_trace)

    # One tick per category position, labelled with the category text.
    tick_positions = list(range(len(category_labels)))
    fig.update_layout(
        title=f'Weight of Evidence by {feature_name}',
        xaxis_title='Categories',
        yaxis_title='Weight of Evidence',
        xaxis=dict(tickmode='array', tickvals=tick_positions, ticktext=category_labels),
        template='plotly_white'
    )

    return fig


In [16]:
from plotly.subplots import make_subplots

def calculate_woe_and_iv_for_all_features_plotly(df_train, df_test, features, target):
    """
    For every feature, compute WOE/IV on the train and test frames, show the
    two WOE curves side by side, and summarise the IVs.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Training and test frames; both must contain ``features`` and ``target``.
    features : iterable of str
        Columns to analyse with ``woe_cat``.
    target : str
        Name of the good/bad flag column passed to ``woe_cat``.

    Returns
    -------
    tuple
        ``(woe_results, iv_summary)`` where ``woe_results`` maps each feature
        to ``{'train': <WOE table>, 'test': <WOE table>}`` and ``iv_summary``
        is a DataFrame with the columns 'Feature', 'Train IV' and 'Test IV'.

    Side effects
    ------------
    Calls ``fig.show()`` once per feature.
    """
    # Filled per feature so the returned name is actually defined.
    woe_results = {}
    iv_summary = {'Feature': [], 'Train IV': [], 'Test IV': []}

    for feature in features:
        # Calculate WoE and IV for training and testing datasets
        woe_df_train, iv_train = woe_cat(df_train, feature, target)
        woe_df_test, iv_test = woe_cat(df_test, feature, target)
        woe_results[feature] = {'train': woe_df_train, 'test': woe_df_test}

        # Individual figures are built only to reuse their traces below.
        fig_train = plot_by_woe_plotly(woe_df_train, f'Train {feature}')
        fig_test = plot_by_woe_plotly(woe_df_test, f'Test {feature}')

        # Side-by-side subplots: train on the left, test on the right
        fig = make_subplots(rows=1, cols=2, subplot_titles=(f'Train {feature}', f'Test {feature}'))

        for trace in fig_train['data']:
            fig.add_trace(trace, row=1, col=1)

        for trace in fig_test['data']:
            fig.add_trace(trace, row=1, col=2)

        fig.show()

        # Append IV values to the summary
        iv_summary['Feature'].append(feature)
        iv_summary['Train IV'].append(iv_train)
        iv_summary['Test IV'].append(iv_test)

    return woe_results, pd.DataFrame(iv_summary)


In [17]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

def calculate_vif(X):
    """
    Compute the variance inflation factor (VIF) of every column of ``X``.

    A constant column is added with ``sm.add_constant`` before the VIFs are
    computed, and it gets its own row in the result. Returns a DataFrame with
    the columns 'Variable' and 'VIF'.
    """
    design = sm.add_constant(X)
    design_values = design.values

    vif_values = []
    for column_position in range(design.shape[1]):
        vif_values.append(variance_inflation_factor(design_values, column_position))

    return pd.DataFrame({'Variable': design.columns, 'VIF': vif_values})

def remove_high_vif_variables(X, threshold=5.0):
    """
    Iteratively drop the variable with the highest VIF until no remaining
    variable has a VIF above ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate explanatory variables. The caller's frame is not modified;
        ``DataFrame.drop`` returns a new frame on every iteration.
    threshold : float
        Highest VIF a variable may have and still be kept.

    Returns
    -------
    tuple
        ``(X_reduced, vif_df)``: the reduced frame and the VIF table of the
        remaining variables, without the 'const' row.
    """
    while True:
        vif_df = calculate_vif(X)
        vif_df = vif_df[vif_df['Variable'] != 'const']  # Exclude the constant term
        high_vif = vif_df[vif_df['VIF'] > threshold]

        if high_vif.empty:
            # vif_df already describes the final X, so it is returned as is
            # instead of being recomputed. The constant is excluded by name
            # above, not by assuming it sits in the first row.
            return X, vif_df

        # Remove the variable with the highest VIF
        highest_vif_var = high_vif.sort_values('VIF', ascending=False).iloc[0]['Variable']
        X = X.drop(columns=[highest_vif_var])



In [18]:
def plot_feature_distribution(df, feature, target='IsDefault'):
    """
    Count plot of ``feature``, with bars split by the target class.

    Parameters:
    - df: DataFrame containing ``feature`` and ``target``.
    - feature: Column plotted on the x axis.
    - target: Column used as hue. Defaults to 'IsDefault'.

    The figure is created but ``plt.show()`` is not called here.
    """
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=feature, hue=target, palette='coolwarm')
    plt.title(f'Distribution by {feature}')
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

In [19]:
def plot_credit_score_distributions(data, features):
    """
    Box plots of 'credit_score' by each of the given categorical features,
    laid out two per row (a single feature gets one full-width panel).

    Parameters:
    - data: DataFrame containing 'credit_score' and every column in ``features``.
    - features: List of strings naming the categorical features to plot.

    Returns:
    - None. The figure is shown with ``plt.show()``; nothing is drawn when
      ``features`` is empty.
    """
    num_features = len(features)
    if num_features == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    ncols = 2 if num_features > 1 else 1
    nrows = (num_features + ncols - 1) // ncols  # ceiling division

    # squeeze=False always yields a 2-D array, so one code path covers
    # both the single-panel and the multi-panel layout.
    f, axes = plt.subplots(nrows, ncols, figsize=(18, 6 * nrows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, features):
        sns.boxplot(x=feature, y='credit_score', data=data, ax=ax)
        ax.set_title(f'Credit Score by {feature.capitalize()}')
        # Rotate the labels via tick_params instead of re-setting the label
        # texts on top of the automatic locator.
        ax.tick_params(axis='x', labelrotation=45)

    # An odd number of features leaves the last grid cell unused: hide it.
    for unused_ax in axes[num_features:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [20]:
def select_features(data, target='IsDefault'):
    '''
    This function estimates the statistical significance of categorical
    variables with respect to predicting the target variable, using a
    chi-square test of independence on the crosstab of target vs. variable.

    parameters ::
    - data :: DataFrame Object. Every column except `target` is tested, so
      pass only the categorical/binned columns plus the target.
    - target :: name of the target column (default 'IsDefault'). It may sit
      at any position in the frame.

    output ::
    - Dataframe with the columns 'Features' and 'p-values' (rounded to 3
      decimals), sorted by ascending p-value, i.e. most significant first.
    '''
    tested_features = []
    p_values = []

    # Exclude the target by name rather than by position: dropping "the last
    # column" would test the target against itself whenever it is not last.
    for column in data.columns.tolist():
        if column == target:
            continue

        chi, p, dof, ex = chi2_contingency(pd.crosstab(data[target], data[column]))
        tested_features.append(column)
        p_values.append(round(p, 3))

    # Both columns are always created, so sorting also works when there is
    # nothing to test.
    chi2_result = pd.DataFrame({'Features': tested_features, 'p-values': p_values})
    return chi2_result.sort_values('p-values', ascending=True, ignore_index=True)

In [21]:
def bin_months_loan_duration(value):
    """
    Assigns a value to its corresponding bin for months_loan_duration.

    Parameters:
    - value: The raw input value for months_loan_duration.

    Returns:
    - A string representing the binned category. Values at or below the
      lowest edge fall in the first bin, values above the highest edge fall
      in the last bin, and NaN is assigned to the first bin.
    """
    upper_edges = (12.0, 18.0, 24.0)
    bin_labels = ('(3.999, 12.0]', '(12.0, 18.0]', '(18.0, 24.0]', '(24.0, 72.0]')

    # Bins are right-closed: take the first bin whose upper edge covers value.
    for upper_edge, label in zip(upper_edges, bin_labels):
        if value <= upper_edge:
            return label

    # Above every inner edge: clamp to the top bin instead of the bottom one.
    if value > upper_edges[-1]:
        return bin_labels[-1]

    # Only NaN reaches this point (every comparison with NaN is False).
    return bin_labels[0]




In [22]:
def bin_amount(value):
    """
    Assigns an amount value to its corresponding bin.

    Parameters:
    - value: The raw input value for the amount.

    Returns:
    - A string representing the binned category. Values at or below the
      lowest edge fall in the first bin, values above the highest edge fall
      in the last bin, and NaN is assigned to the first bin.
    """
    upper_edges = (1365.5, 2319.5, 3972.25)
    bin_labels = ('(249.999, 1365.5]', '(1365.5, 2319.5]', '(2319.5, 3972.25]', '(3972.25, 18424.0]')

    # Bins are right-closed: take the first bin whose upper edge covers value.
    for upper_edge, label in zip(upper_edges, bin_labels):
        if value <= upper_edge:
            return label

    # Above every inner edge: clamp to the top bin instead of the bottom one.
    if value > upper_edges[-1]:
        return bin_labels[-1]

    # Only NaN reaches this point (every comparison with NaN is False).
    return bin_labels[0]



In [23]:
def bin_savings_balance(value):
    """
    Assigns a savings balance value to its corresponding bin.

    Parameters:
    - value: The raw input value for the savings balance.

    Returns:
    - A string representing the binned category. Values at or below the
      lowest edge fall in the first bin, values above the highest edge fall
      in the last bin, and NaN is assigned to the first bin. Every returned
      string is one of the four bin labels.
    """
    upper_edges = (10.75, 49.5, 91.25)
    bin_labels = ('(-0.001, 10.75]', '(10.75, 49.5]', '(49.5, 91.25]', '(91.25, 19972.0]')

    # Bins are right-closed: take the first bin whose upper edge covers value.
    for upper_edge, label in zip(upper_edges, bin_labels):
        if value <= upper_edge:
            return label

    # Above every inner edge: clamp to the top bin instead of the bottom one.
    if value > upper_edges[-1]:
        return bin_labels[-1]

    # Only NaN reaches this point (every comparison with NaN is False).
    return bin_labels[0]



In [24]:
def bin_age(value):
    """
    Assigns an age value to its corresponding bin.

    Bins are right-closed. Inputs outside the labelled range are clamped to the
    nearest bin: anything at or below 27 lands in the first bin and anything
    above 42 lands in the last bin (including values above 75). NaN compares
    false against every edge and falls back to the first bin.

    Parameters:
    - value: The raw input value for age.

    Returns:
    - A string representing the binned category.
    """
    bin_labels = ['(18.999, 27.0]', '(27.0, 33.0]', '(33.0, 42.0]', '(42.0, 75.0]']
    # Right-closed upper edge of every bin except the last one.
    upper_edges = [27.0, 33.0, 42.0]

    for upper, label in zip(upper_edges, bin_labels):
        if value <= upper:
            return label
    if value > upper_edges[-1]:
        # Covers (42, 75] and also ages beyond 75, which must not be reported
        # as the youngest bin.
        return bin_labels[-1]
    # Only reached when every comparison above is false (e.g. NaN).
    return bin_labels[0]


In [25]:
def extract_years(employment_length):
    """
    Extracts the numeric part from the employment length input, which might be a string or a number.

    Parameters:
    - employment_length: Input describing the employment length, e.g. a string like '2 years',
      a numeric value (Python or NumPy scalar), a missing value, or the literal 'Unknown'.

    Returns:
    - The input unchanged when it is already numeric.
    - The first whitespace-separated token converted with int() when the input is a string.
    - 'Unknown' when the input is missing, equals 'Unknown', is empty/blank, or does not
      start with an integer token.
    """
    if pd.isnull(employment_length) or employment_length == 'Unknown':
        return 'Unknown'

    # np.integer is listed explicitly: NumPy integer scalars are not subclasses of int,
    # so without it they would be treated as strings below.
    if isinstance(employment_length, (int, float, np.integer, np.floating)):
        return employment_length  # Return directly if already a number

    try:
        # NOTE: only the first token is read; the unit word is not inspected,
        # so an input such as '7 months' yields 7.
        return int(employment_length.split()[0])
    except (ValueError, IndexError, AttributeError):
        # ValueError: first token is not an integer; IndexError: empty or blank string;
        # AttributeError: input is neither numeric nor string-like.
        return 'Unknown'

def bin_employment_length(employment_length):
    """
    Maps a raw employment length onto a tenure band.

    The raw input is first normalised with extract_years(); the resulting number
    of years (or the 'Unknown' sentinel) then selects the band.

    Parameters:
    - employment_length: The raw input value for employment length (string or numeric).

    Returns:
    - One of '0 years', '< 5 years', '5-10 years', '10-15 years', '> 15 years',
      or 'Unknown' when the length cannot be determined or is negative.
    """
    tenure = extract_years(employment_length)

    if tenure == 'Unknown':
        return 'Unknown'
    if tenure == 0:
        return '0 years'
    if not tenure > 0:
        # Negative tenures match no band.
        return 'Unknown'

    # Exclusive upper bound of each bounded band, in ascending order.
    banded_limits = ((5, '< 5 years'), (10, '5-10 years'), (15, '10-15 years'))
    for limit, band in banded_limits:
        if tenure < limit:
            return band

    # Everything from 15 years upwards.
    return '> 15 years'


In [26]:

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from itertools import cycle

def calculate_store_and_plot_bad_rates(df, grouping_columns, outcome_col='IsDefault'):
    """
    Draws one bar chart per grouping column showing the bad rate (%) of each category.

    For every grouping column the outcome is cross-tabulated against the column
    (cast to string), the bad rate is computed as Bad / (Bad + Good) * 100, and
    the categories are plotted in descending bad-rate order. All charts share a
    single one-row plotly figure, which is displayed with fig.show().

    Parameters:
    - df: DataFrame containing the grouping columns and the outcome column.
    - grouping_columns: Non-empty sequence of column names to break the bad rate down by.
    - outcome_col: Name of the outcome column; its values must include the labels
      'Good' and 'Bad' for a chart to be drawn.

    Returns:
    - None. The figure is shown; nothing is stored or returned.

    Raises:
    - ValueError: if outcome_col or any grouping column is missing from df, or if
      grouping_columns is empty.
    """
    if outcome_col not in df.columns:
        raise ValueError(f"Error: '{outcome_col}' column does not exist.")

    if not grouping_columns:
        raise ValueError("No grouping columns provided.")

    # One subplot per grouping column, laid out side by side.
    n_panels = len(grouping_columns)
    palette = cycle(px.colors.qualitative.Plotly)
    panel_titles = [f'Bad Rate(%) by {col}' for col in grouping_columns]
    fig = make_subplots(rows=1, cols=n_panels, subplot_titles=panel_titles)

    for panel_no, group_col in enumerate(grouping_columns, start=1):
        if group_col not in df.columns:
            raise ValueError(f"Error: '{group_col}' column does not exist.")

        pair = df[[group_col, outcome_col]].copy()
        # Cast to string so every category (including missing values) gets a label.
        pair[group_col] = pair[group_col].astype(str)

        counts = pd.crosstab(pair[group_col], pair[outcome_col])

        has_both_labels = {'Bad', 'Good'} <= set(counts.columns)
        if not has_both_labels:
            # The panel for this column is left empty.
            print(f"Warning: 'Good' and 'Bad' labels not found in '{outcome_col}' for group '{group_col}'.")
            continue

        bad_pct = (counts['Bad'] / (counts['Bad'] + counts['Good'])) * 100
        ranked = bad_pct.sort_values(ascending=False)

        bars = go.Bar(
            x=ranked.index,
            y=ranked.values,
            name=group_col,
            marker_color=next(palette),
        )
        fig.add_trace(bars, row=1, col=panel_no)

    fig.update_layout(
        height=600,
        width=400 * n_panels,
        title_text="Comparison of Bad Rates Across Categories",
    )
    fig.show()


In [27]:
def calculate_woe_and_iv_for_all_features(df_train, df_test, features, target):
    """
    Computes WoE tables and IV for each feature on the train and test sets.

    For every feature, woe_cat() is called on both datasets, each resulting table
    is plotted with plot_by_woe(), and the two tables are placed side by side with
    a spacer column between them. Both helpers are expected to be defined
    elsewhere in the notebook.

    Parameters:
    - df_train: Training dataset passed to woe_cat().
    - df_test: Testing dataset passed to woe_cat().
    - features: Iterable of feature names to process.
    - target: Target column name passed to woe_cat().

    Returns:
    - A dict mapping each feature to its combined train | spacer | test DataFrame.
    - A DataFrame with columns 'Feature', 'Train IV' and 'Test IV', one row per feature.
    """
    combined_by_feature = {}
    feature_names, train_ivs, test_ivs = [], [], []

    for feature in features:
        # WoE table and IV for each dataset
        train_woe, train_iv = woe_cat(df_train, feature, target)
        test_woe, test_iv = woe_cat(df_test, feature, target)

        # Train plot first, then test
        for woe_table in (train_woe, test_woe):
            plot_by_woe(woe_table, feature)

        # A single all-None spacer column, keyed by a 3-tuple, separates the
        # train and test tables in the combined view.
        spacer_key = (' ', ' ', f'Spacer_{feature}')
        spacer = pd.DataFrame({spacer_key: [None] * len(train_woe)})
        combined_by_feature[feature] = pd.concat([train_woe, spacer, test_woe], axis=1)

        feature_names.append(feature)
        train_ivs.append(train_iv)
        test_ivs.append(test_iv)

    iv_table = pd.DataFrame({
        'Feature': feature_names,
        'Train IV': train_ivs,
        'Test IV': test_ivs,
    })
    return combined_by_feature, iv_table