In [4]:
import warnings
import datetime as dt
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import re
import string 
import copy
import seaborn as sns 
import numpy as np
import pandas as pd

In [2]:
# A function to calculate WOE and IV for numerical features
def woe_num(data, feature, gbflag):
    """Build a Weight of Evidence / Information Value table for a binned feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``feature`` and ``gbflag``.
    feature : str
        Column whose distinct values (bins) become the rows of the table.
    gbflag : str
        Column holding the outcome labels; the labels ``'Good'`` and ``'Bad'``
        must both occur, otherwise a ``KeyError`` is raised.

    Returns
    -------
    pandas.DataFrame
        One row per bin, sorted by ascending WOE, with the crosstab counts plus
        ``Freq``, ``Percentage``, ``% Good``, ``% Bad``, ``Bad Rate``,
        ``GoodBaddOdds``, ``WOE``, ``Class IV`` and ``Variable IV``.

    Notes
    -----
    ``Percentage``, ``% Good``, ``% Bad`` and ``Bad Rate`` are percentages
    rounded to one decimal for display.  WOE and IV are computed from the
    *unrounded* shares (proportions), i.e. ``WOE = ln(bad share / good share)``,
    so they are not distorted by display rounding and the IV is on the same
    scale as the one returned by ``woe_cat``.  A bin with zero Good or zero Bad
    rows yields an infinite WOE (and therefore an infinite IV).
    """
    table = pd.crosstab(index=data[feature], columns=data[gbflag])
    table['Freq'] = table.sum(axis=1)

    # Exact shares of all Good / all Bad rows that fall in each bin.
    good_share = table['Good'] / table['Good'].sum()
    bad_share = table['Bad'] / table['Bad'].sum()

    # Display columns: percentages rounded to one decimal place.
    table['Percentage'] = round((table['Freq'] / table['Freq'].sum() * 100), 1)
    table['% Good'] = round(good_share * 100, 1)
    table['% Bad'] = round(bad_share * 100, 1)
    table['Bad Rate'] = round((table['Bad'] / table['Freq']) * 100, 1)
    table['GoodBaddOdds'] = round(table['Good'] / table['Bad'], 2)

    # Statistics: always derived from the exact shares, never the rounded ones.
    table['WOE'] = np.log(bad_share / good_share)
    table['Class IV'] = (bad_share - good_share) * table['WOE']
    table['Variable IV'] = table['Class IV'].sum()  # IV for the variable

    return table.sort_values(['WOE'])
# Function to calculate WOE and IV for categorical variables
def woe_cat(data, feature, gbflag):
    """Tabulate Weight of Evidence and Information Value per category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``feature`` and ``gbflag``.
    feature : str
        Column whose distinct values become the rows of the table.
    gbflag : str
        Column holding the outcome labels; the table reads the ``'Good'`` and
        ``'Bad'`` crosstab columns.

    Returns
    -------
    tuple
        ``(table, variable_iv)`` where ``table`` is sorted by ascending WOE and
        ``variable_iv`` is the scalar taken from the first row of the
        ``Variable IV`` column (the same value is repeated on every row).
    """
    table = pd.crosstab(index=data[feature], columns=data[gbflag])
    table['Freq'] = table.sum(axis=1)
    table['Proptn'] = table['Freq'] / table['Freq'].sum()

    # Share of all Good rows / all Bad rows that fall in each category.
    good_share = table['Good'] / table['Good'].sum()
    bad_share = table['Bad'] / table['Bad'].sum()
    table['% Good'] = good_share
    table['% Bad'] = bad_share

    table['Bad Rate'] = table['Bad'] / table['Freq']
    table['GoodBaddOdds'] = round(table['Good'] / table['Bad'], 2)

    woe = np.log(bad_share / good_share)
    class_iv = (bad_share - good_share) * woe
    table['WOE'] = woe
    table['Class IV'] = class_iv
    table['Variable IV'] = class_iv.sum()  # IV for the variable

    ordered = table.sort_values(by='WOE')
    variable_iv = ordered['Variable IV'].iloc[0]
    return ordered, variable_iv


# A function to get all variables WOE and IV
def woe_iv(df, variablie_list, GBFlag):
    """Collect the WOE of every category and the IV of every variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the variables and the outcome column.
    variablie_list : iterable of str
        Columns of ``df`` to analyse.
    GBFlag : str
        Name of the outcome column passed through to ``woe_cat``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns ``Variables``, ``Categories``,
        ``WOE`` and ``IV`` (one row per variable/category pair; ``IV`` repeats
        the variable-level value on each of its rows).  The columns are present
        even when no variable could be processed.

    Notes
    -----
    ``woe_cat`` returns a ``(table, variable_iv)`` tuple, so it is unpacked
    here and the scalar IV is used directly.  A variable for which ``woe_cat``
    raises ``KeyError`` or ``IndexError`` is skipped with a warning instead of
    being dropped silently; any other error propagates.
    """
    rows = []
    for variable in variablie_list:
        try:
            table, variable_iv = woe_cat(df, variable, GBFlag)
        except (KeyError, IndexError) as err:
            # Best-effort: keep going with the remaining variables, but say so.
            warnings.warn(f"woe_iv: skipping {variable!r} ({type(err).__name__}: {err})")
            continue

        for category, woe in zip(table.index, table['WOE']):
            rows.append({
                'Variables': variable,
                'Categories': category,
                'WOE': woe,
                'IV': variable_iv,
            })

    return pd.DataFrame(rows, columns=['Variables', 'Categories', 'WOE', 'IV'])
# woe_iv(data_universe_fai, data_universe_fai_cat.columns, 'GBFlag')
# A function that automates the CA report and exports it to an Excel workbook
def Export_CA_Report(df, variablie_list, GBFlag, fileName='CA_Report'):
    """Write one WOE/IV table per variable to an Excel workbook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the variables and the outcome column.
    variablie_list : iterable
        Columns of ``df`` to report on; each becomes one worksheet.
    GBFlag : str
        Name of the outcome column passed through to ``woe_cat``.
    fileName : str, default 'CA_Report'
        Output path without extension; ``'.xlsx'`` is appended.

    Returns
    -------
    None
        The result is the workbook written to ``fileName + '.xlsx'``.
    """
    # Build every table before opening the writer, so a failure in woe_cat
    # happens before any file is created.
    ca_result = {}
    for variable in variablie_list:
        # woe_cat returns (table, variable_iv); only the table is exported.
        table, _variable_iv = woe_cat(df, variable, GBFlag)
        ca_result[variable] = table

    # Save the CA report to Excel, one sheet per variable.
    with pd.ExcelWriter(fileName + '.xlsx', engine="openpyxl") as writer:
        for variable_name, table in ca_result.items():
            table.to_excel(writer, sheet_name=str(variable_name))
    
    
def fill_binned_nan_values(df, variablie_list, GBFlag):
    """Fill missing values of binned variables with the highest-WOE bin.

    For every variable that contains missing values, the WOE table from
    ``woe_num`` is computed and the missing entries are replaced by the bin
    whose WOE is largest.  With ``WOE = ln(% Bad / % Good)`` as used by
    ``woe_num``, that is the bin with the largest Bad share relative to its
    Good share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill.  It is modified in place (columns are reassigned).
    variablie_list : iterable of str
        Columns of ``df`` to inspect.
    GBFlag : str
        Name of the outcome column passed through to ``woe_num``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for variable in variablie_list:
        if not df[variable].isna().any():
            continue

        # Run CA (WoE & IV) on the variable.
        woe_table = woe_num(df, variable, GBFlag)
        # Index label (bin class) of the row with the largest WOE; idxmax
        # ignores NaN WOE values instead of failing the lookup on them.
        bin_class = woe_table['WOE'].idxmax()
        df[variable] = df[variable].fillna(bin_class)

    return df

In [2]:
def replace_with_woe(df_with_selected_variables, variables_woe_iv):
    '''
    Replace each category in the selected columns with its WoE.

    df_with_selected_variables :: Dataframe containing grouped/binned selected
    variables. It is modified in place and also returned.

    variables_woe_iv :: Dataframe containing the WoE of the grouped/binned
    variables, with the columns 'Variables', 'Categories' and 'WOE' (the
    layout produced by the woe_iv function).

    Returns :: the same dataframe object with matched categories replaced.

    Notes::
    - Categories (or whole columns) without a matching row in
      variables_woe_iv are left unchanged; if variables_woe_iv lacks the
      required columns nothing is replaced.
    - When a variable/category pair occurs more than once, the first row wins.
    - All categories of a column are replaced in a single pass, so a WoE value
      that happens to equal another category is never replaced a second time.
    '''
    required_columns = {'Variables', 'Categories', 'WOE'}
    if not required_columns.issubset(variables_woe_iv.columns):
        # No usable lookup table: nothing can be matched.
        return df_with_selected_variables

    for variable in df_with_selected_variables.columns:
        woe_rows = variables_woe_iv[variables_woe_iv['Variables'] == variable]
        if woe_rows.empty:
            continue

        # category -> WoE; setdefault keeps the first occurrence of a category.
        lookup = {}
        for category, woe in zip(woe_rows['Categories'], woe_rows['WOE']):
            lookup.setdefault(category, woe)

        column = df_with_selected_variables[variable]
        # Each value is looked up against the ORIGINAL category, exactly once.
        df_with_selected_variables[variable] = column.map(
            lambda value: lookup.get(value, value)
        )

    return df_with_selected_variables

In [4]:
def woe_iv(df, features, target):
    """Stack the per-category WOE and per-variable IV of several features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the target column.
    features : iterable of str
        Columns of ``df`` to analyse with ``woe_cat``.
    target : str
        Name of the outcome column passed through to ``woe_cat``.

    Returns
    -------
    pandas.DataFrame
        One row per feature/category pair with the columns ``Variables``,
        ``Categories``, ``WOE`` and ``IV``; ``IV`` repeats the scalar returned
        by ``woe_cat`` for that feature.
    """
    collected = {}
    for variable in features:
        # woe_cat hands back the WOE table and the variable-level IV scalar.
        table, iv = woe_cat(df, variable, target)

        n_categories = len(table.index)
        if n_categories == 0:
            continue

        positions = range(n_categories)
        collected.setdefault('Variables', []).extend(variable for _ in positions)
        collected.setdefault('Categories', []).extend(table.index[pos] for pos in positions)
        collected.setdefault('WOE', []).extend(table['WOE'].iloc[pos] for pos in positions)
        collected.setdefault('IV', []).extend(iv for _ in positions)

    # Convert the column-wise lists into a DataFrame.
    return pd.DataFrame(collected)

In [5]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
def compute_gini(y_test, y_train, test_preds, train_preds):
    """Compute the Gini coefficient on the test and train sets.

    Gini is derived from the ROC AUC as ``Gini = 2 * AUC - 1``.

    Parameters
    ----------
    y_test, y_train
        True labels for the test and train sets.
    test_preds, train_preds
        Predicted scores for the test and train sets, passed to
        ``roc_auc_score`` as its second argument.

    Returns
    -------
    tuple
        ``(Test_Gini, Train_Gini)`` -- note that the test value comes first.
    """
    # roc_auc_score is not among the notebook's visible imports, so bring it
    # into scope here to keep the function runnable on a fresh kernel.
    from sklearn.metrics import roc_auc_score

    test_auc = roc_auc_score(y_test, test_preds)
    train_auc = roc_auc_score(y_train, train_preds)

    # Compute Gini from AUC.
    test_gini = (2 * test_auc) - 1
    train_gini = (2 * train_auc) - 1
    return test_gini, train_gini

In [13]:
def plot_by_woe(df_WoE, feature_name, rotation_of_x_axis_labels=0, xlabel_fontsize=12, ylabel_fontsize=12):
    """Draw the Weight of Evidence of each category as a dashed line chart.

    Parameters
    ----------
    df_WoE : pandas.DataFrame
        Table with a ``'WOE'`` column; its index supplies the x-axis labels.
    feature_name : str
        Name shown in the figure title.
    rotation_of_x_axis_labels : float, default 0
        Rotation applied to the x tick labels.
    xlabel_fontsize, ylabel_fontsize : float, default 12
        Font sizes of the axis labels.
    """
    categories = np.array(df_WoE.index).astype(str)  # labels are plotted as text
    woe_values = df_WoE['WOE']

    _fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(categories, woe_values, marker='o', linestyle='--', color='k')
    ax.set_xlabel('Categories', fontsize=xlabel_fontsize)
    ax.set_ylabel('Weight of Evidence', fontsize=ylabel_fontsize)
    ax.set_title(f'Weight of Evidence by {feature_name}', fontsize=14)
    plt.xticks(rotation=rotation_of_x_axis_labels)  # acts on the current axes, i.e. ax
    ax.grid(True)  # gridlines for easier reading
    plt.show()

In [None]:
def plot_feature_distribution(df, feature, title, hue='GBFlag_cus'):
    """Plot the count of each value of ``feature``, split by a grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``feature`` and the ``hue`` column.
    feature : str
        Column plotted on the x-axis.
    title : str
        Figure title.
    hue : str, default 'GBFlag_cus'
        Column used to split the bars by colour.  The default keeps the
        previously hard-coded column, so existing calls behave the same.
    """
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=feature, hue=hue, palette='coolwarm')
    plt.title(title)
    # Slanted, right-aligned labels so long category names do not overlap.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()