### **3. Initial Characteristic Analysis**

#### **3.1 Characteristic Binning**

In [1]:
#import library
import pandas as pd
import numpy as np
import sys
sys.path.append("../src")
#load configuration
import utils

In [2]:
config_data = utils.config_load()
config_data

{'raw_dataset_path': '../data/raw/Loan_default.csv',
 'dataset_path': '../data/output/data.pkl',
 'predictors_set_path': '../data/output/predictors.pkl',
 'response_set_path': '../data/output/response.pkl',
 'train_path': ['../data/output/X_train.pkl', '../data/output/y_train.pkl'],
 'test_path': ['../data/output/X_test.pkl', '../data/output/y_test.pkl'],
 'data_train_path': '../data/output/training_data.pkl',
 'data_train_binned_path': '../data/output/bin_training_data.pkl',
 'crosstab_list_path': '../data/output/list_crosstab.pkl',
 'WOE_table_path': '../data/output/WOE_table.pkl',
 'IV_table_path': '../data/output/IV_table.pkl',
 'WOE_map_dict_path': '../data/output/WOE_map_dict.pkl',
 'X_train_woe_path': '../data/output/X_train_woe.pkl',
 'response_variable': 'Default',
 'test_size': 0.2,
 'numeric_col': ['Age',
  'Income',
  'LoanAmount',
  'MonthsEmployed',
  'NumCreditLines',
  'InterestRate',
  'LoanTerm',
  'DTIRatio'],
 'categoric_col': ['Education',
  'EmploymentType',
  'Ma

In [3]:
#load the training data from a pickled file using the configuration data
training_data = utils.pickle_load(config_data['data_train_path'])

In [4]:
#display information about the training data
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204277 entries, 15826 to 7493
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Age             204277 non-null  int64  
 1   Income          204277 non-null  int64  
 2   LoanAmount      204277 non-null  int64  
 3   MonthsEmployed  204277 non-null  int64  
 4   NumCreditLines  204277 non-null  int64  
 5   InterestRate    204277 non-null  float64
 6   LoanTerm        204277 non-null  int64  
 7   DTIRatio        204277 non-null  float64
 8   Education       204277 non-null  object 
 9   EmploymentType  204277 non-null  object 
 10  MaritalStatus   204277 non-null  object 
 11  HasMortgage     204277 non-null  object 
 12  HasDependents   204277 non-null  object 
 13  LoanPurpose     204277 non-null  object 
 14  HasCoSigner     204277 non-null  object 
 15  Default         204277 non-null  int64  
dtypes: float64(2), int64(7), object(7)
memory usage: 26.5+

Create a function for binning the numerical predictors (Age, Income, LoanAmount, MonthsEmployed, NumCreditLines, InterestRate, LoanTerm, and DTIRatio)

In [5]:
def create_binning(data, predictor_label, num_of_bins):
    """
    Bin a numerical predictor into the specified number of bins

    Parameters
    ----------
    data : DataFrame
        The DataFrame containing the data
    predictor_label : Str
        The label of the numerical predictor column
    num_of_bins : Int
        The number of bins to create
    
    Returns
    -------
    pd.DataFrame : The DataFrame with a bew column containing the binned predictor values
    """
    #create a new column containing the binned predictor
    data[predictor_label + "_binned"] = pd.qcut(data[predictor_label],
                                             q = num_of_bins,
                                             duplicates='drop')

    return data

In [6]:
def binned_data(type):
    """
    Bin the numerical column in the concatenated data
    
    Parameters
    ----------
    type : Str
        The type of data

    Returns
    -------
    pd.DataFrame : The DataFrame with binned numerical columns    
    """
    #load the concatenated data
    data = utils.pickle_load(config_data[f'data_{type}_path'])

    #bin the numerical columns
    numeric_col = config_data['numeric_col']
    num_of_bins = config_data['num_of_bins']

    for column in numeric_col:
        bin_data = create_binning(data = data,
                                         predictor_label = column,
                                         num_of_bins = num_of_bins)

    #validate
    print(f"Original data shape : ", data.shape)
    print(f"Binned data shape  : ", bin_data.shape)

    #dump binned data
    utils.pickle_dump(bin_data, config_data[f'data_{type}_binned_path'])
        
    return bin_data

In [7]:
#check the function
bin_training_data = binned_data(type='train')
bin_training_data.T

Original data shape :  (204277, 24)
Binned data shape  :  (204277, 24)


Unnamed: 0,15826,147371,178180,126915,163930,54422,180728,198671,50465,68947,...,132667,73318,182558,121355,34697,59108,71610,85645,21010,7493
Age,45,48,47,42,20,42,37,54,26,43,...,52,36,68,36,40,18,67,60,37,44
Income,37039,133963,100204,36078,99464,61602,106031,148090,23724,143641,...,108360,22468,73103,32799,46265,115282,39547,48470,56697,115809
LoanAmount,247916,66275,6967,25966,248557,78867,219866,102452,74526,64944,...,133462,58993,108073,226588,148883,142016,94145,29331,76419,114482
MonthsEmployed,19,119,108,2,74,113,51,26,42,39,...,107,48,43,82,115,50,82,42,15,22
NumCreditLines,2,3,3,3,3,3,2,4,4,4,...,3,2,4,3,3,3,4,1,2,4
InterestRate,4.62,14.72,5.51,18.29,19.45,22.24,3.8,16.13,20.31,22.38,...,3.38,16.44,6.58,23.6,8.7,3.43,3.32,12.67,19.61,5.23
LoanTerm,60,48,24,36,60,24,24,60,12,24,...,48,24,60,24,48,36,48,12,60,36
DTIRatio,0.85,0.49,0.76,0.76,0.45,0.43,0.28,0.53,0.55,0.67,...,0.85,0.51,0.1,0.74,0.58,0.36,0.8,0.83,0.24,0.16
Education,High School,PhD,High School,Bachelor's,Bachelor's,High School,PhD,Master's,Bachelor's,High School,...,Master's,High School,Bachelor's,Bachelor's,High School,Master's,PhD,High School,High School,Master's
EmploymentType,Unemployed,Self-employed,Self-employed,Self-employed,Part-time,Unemployed,Unemployed,Self-employed,Part-time,Unemployed,...,Part-time,Part-time,Unemployed,Part-time,Unemployed,Unemployed,Unemployed,Self-employed,Part-time,Self-employed


#### **3.2 WoE and IV**

To assess the strength of each characteristic individually as a predictor of the credit performance. Create a contingency table/crosstab for all predictors: numerical and categorical predictors.

In [8]:
def create_list_crosstab():
    """
    Generate the crosstab list (contingency table) for WOE and IV calculation. Only in training data
    
    Returns
    -------
    list : A list of contingency tables for each numerical and categorical predictor
    """
    #load the binned train data
    bin_training_data = utils.pickle_load(config_data['data_train_binned_path'])

    #load the response variable (we will summarize based on the response variable)
    response_variable = config_data['response_variable']

    #iterate over numercial columns
    numeric_crosstab = []
    numeric_col = config_data['numeric_col']
    for column in numeric_col:
        #create a contingency table
        crosstab = pd.crosstab(bin_training_data[column + "_binned"],
                               bin_training_data[response_variable],
                               margins = True)

        #append to the list
        numeric_crosstab.append(crosstab)

    #iterate over categorical columns
    categoric_crosstab = []
    categoric_col = config_data['categoric_col']
    for column in categoric_col:
        #create a contingency table
        crosstab = pd.crosstab(bin_training_data[column],
                               bin_training_data[response_variable],
                               margins = True)

        #append to the list
        categoric_crosstab.append(crosstab)

    #put all two in a crosstab_list
    list_crosstab = numeric_crosstab + categoric_crosstab

    #validate the crosstab_list
    print('number of num bin : ', [bin.shape for bin in numeric_crosstab])
    print('number of cat bin : ', [bin.shape for bin in categoric_crosstab])

    #dump the result
    utils.pickle_dump(list_crosstab, config_data['crosstab_list_path'])

    return list_crosstab


In [9]:
#check the function
list_crosstab = create_list_crosstab()
list_crosstab[0]

number of num bin :  [(6, 3), (6, 3), (6, 3), (6, 3), (4, 3), (6, 3), (4, 3), (6, 3)]
number of cat bin :  [(5, 3), (5, 3), (4, 3), (3, 3), (3, 3), (6, 3), (3, 3)]


Default,0,1,All
Age_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(17.999, 28.0]",34379,8577,42956
"(28.0, 38.0]",33751,5775,39526
"(38.0, 49.0]",38708,4433,43141
"(49.0, 59.0]",36590,2918,39508
"(59.0, 69.0]",37127,2019,39146
All,180555,23722,204277


In [10]:
list_crosstab[14]

Default,0,1,All
HasCoSigner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,89030,13084,102114
Yes,91525,10638,102163
All,180555,23722,204277


Calculate the WOE and IV for each crosstab.

- Weight of Evidence (WoE) — measures the strength of each attribute.

$$
\begin{align*}
W_i &= \ln ( \% \text{Good} / \% \text{Bad} ) \\
W_i &= \ln \left ( \left ( \frac{N_i}{\sum N} \right ) / \left ( \frac{P_i}{\sum P} \right )\right ) \\
\end{align*}
$$

- Information Value (IV) — measures the total strength of the characteristic.

$$
\begin{align*}
IV &= \sum_{i=1}^{n} ( \% \text{Good} - \% \text{Bad} ) \times W_i \\
IV &= \sum_{i=1}^{n}\left [ \left ( \frac{N_i}{\sum N} - \frac{P_i}{\sum P} \right ) \times W_i \right ] \\
\end{align*}
$$

In [11]:
def WOE_and_IV():
    """
    Get the WoE and IV for each characteristic
    Calculates WOE and IV for each characteristic based on the provided contingency tables

    Returns
    -------
    tuple : two DataFrames, the first containing WOE values, and the second containing IV values
    """
    #load the crosstab list
    list_crosstab = utils.pickle_load(config_data['crosstab_list_path'])

    #create initial storage for WoE and IV
    WOE_list, IV_list = [], []
    
    #perform the calculation for all crosstab list
    for crosstab in list_crosstab:
        #calculate the WoE and IV
        #------------------------
        #calculate % Good
        crosstab['p_good'] = crosstab[0]/crosstab[0]['All']    
        #calculate % Bad                             
        crosstab['p_bad'] = crosstab[1]/crosstab[1]['All']      
        #calculate the WOE                            
        crosstab['WOE'] = np.log(crosstab['p_good']/crosstab['p_bad'])       
        #calculate the contribution value for IV               
        crosstab['contribution'] = (crosstab['p_good']-crosstab['p_bad'])*crosstab['WOE']   
                
        #append to list
        IV = crosstab['contribution'][:-1].sum()
        add_IV = {'Characteristic': crosstab.index.name, 
                  'Information Value': IV}

        WOE_list.append(crosstab)
        IV_list.append(add_IV)


    #create WOE tabke
    #create initial table to summarize the WOE values
    WOE_table = pd.DataFrame({'Characteristic': [],
                              'Attribute': [],
                              'WOE': []})
    for i in range(len(list_crosstab)):
        #define crosstab and reset index
        crosstab = list_crosstab[i].reset_index()

        #save the characteristic name
        char_name = crosstab.columns[0]

        #only use two columns (Attribute name and its WOE value)
        #drop the last row (average/total WOE)
        crosstab = crosstab.iloc[:-1, [0,-2]]
        crosstab.columns = ['Attribute', 'WOE']

        #add the characteristic name in a column
        crosstab['Characteristic'] = char_name

        WOE_table = pd.concat((WOE_table, crosstab), 
                                axis = 0)

        #reorder the column
        WOE_table.columns = ['Characteristic',
                            'Attribute',
                            'WOE']
    

    #crate IV table
    #create the initial table for IV
    IV_table = pd.DataFrame({'Characteristic': [],
                             'Information Value' : []})
    IV_table = pd.DataFrame(IV_list)

    #define the predictive power of each characteristic
    strength = []

    #assign the rule of thumb regarding IV
    for iv in IV_table['Information Value']:
        if iv < 0.02:
            strength.append('Unpredictive')
        elif iv >= 0.02 and iv < 0.1:
            strength.append('Weak')
        elif iv >= 0.1 and iv < 0.3:
            strength.append('Medium')
        else:
            strength.append('Strong')

    #assign the strength to each characteristic
    IV_table = IV_table.assign(Strength = strength)

    #sort the table by the IV values
    IV_table = IV_table.sort_values(by='Information Value')
    
    #validate
    print('WOE table shape : ', WOE_table.shape)
    print('IV table shape  : ', IV_table.shape)

    #dump data
    utils.pickle_dump(WOE_table, config_data['WOE_table_path'])
    utils.pickle_dump(IV_table, config_data['IV_table_path']) 

    return WOE_table, IV_table

In [12]:
#check the function
WOE_table, IV_table = WOE_and_IV()

WOE table shape :  (58, 3)
IV table shape  :  (15, 3)


In [13]:
#display WOE table
WOE_table

Unnamed: 0,Characteristic,Attribute,WOE
0,Age_binned,"(17.999, 28.0]",-0.641271
1,Age_binned,"(28.0, 38.0]",-0.264161
2,Age_binned,"(38.0, 49.0]",0.137337
3,Age_binned,"(49.0, 59.0]",0.499244
4,Age_binned,"(59.0, 69.0]",0.88211
0,Income_binned,"(14999.999, 42062.0]",-0.553031
1,Income_binned,"(42062.0, 69093.0]",0.031519
2,Income_binned,"(69093.0, 96027.6]",0.182972
3,Income_binned,"(96027.6, 123007.0]",0.245767
4,Income_binned,"(123007.0, 149999.0]",0.284585


In [14]:
#display IV table
IV_table

Unnamed: 0,Characteristic,Information Value,Strength
6,LoanTerm_binned,2.8e-05,Unpredictive
7,DTIRatio_binned,0.004507,Unpredictive
11,HasMortgage,0.00487,Unpredictive
13,LoanPurpose,0.005125,Unpredictive
10,MaritalStatus,0.006637,Unpredictive
4,NumCreditLines_binned,0.007504,Unpredictive
8,Education,0.007595,Unpredictive
12,HasDependents,0.01251,Unpredictive
14,HasCoSigner,0.013716,Unpredictive
9,EmploymentType,0.02051,Weak
