### **4. Design Scorecards**

#### **4.1 Pre-processing Training Set**

In [1]:
#import library
import pandas as pd
import numpy as np
import sys
sys.path.append("../src")
#load configuration
import utils

In [2]:
#load the project configuration through the local utils helper
config_data = utils.config_load()
#echo the loaded configuration as the cell output
config_data

{'raw_dataset_path': '../data/raw/Loan_default.csv',
 'dataset_path': '../data/output/data.pkl',
 'predictors_set_path': '../data/output/predictors.pkl',
 'response_set_path': '../data/output/response.pkl',
 'train_path': ['../data/output/X_train.pkl', '../data/output/y_train.pkl'],
 'test_path': ['../data/output/X_test.pkl', '../data/output/y_test.pkl'],
 'data_train_path': '../data/output/training_data.pkl',
 'data_train_binned_path': '../data/output/bin_training_data.pkl',
 'crosstab_list_path': '../data/output/list_crosstab.pkl',
 'WOE_table_path': '../data/output/WOE_table.pkl',
 'IV_table_path': '../data/output/IV_table.pkl',
 'WOE_map_dict_path': '../data/output/WOE_map_dict.pkl',
 'X_train_woe_path': '../data/output/X_train_woe.pkl',
 'response_variable': 'Default',
 'test_size': 0.2,
 'numeric_col': ['Age',
  'Income',
  'LoanAmount',
  'MonthsEmployed',
  'NumCreditLines',
  'InterestRate',
  'LoanTerm',
  'DTIRatio'],
 'categoric_col': ['Education',
  'EmploymentType',
  'Ma

In [3]:
def get_woe_map_dict():
    """
    Build, persist, and return the WOE mapping dictionary

    The WOE table is loaded from config_data['WOE_table_path'] (module-level config_data)
    and is read through its 'Characteristic', 'Attribute', and 'WOE' columns.

    Returns
    -------
    dict : A dictionary with
        - one key per characteristic, holding {attribute: WOE} for every attribute
          that is not labelled 'Missing'
        - a 'Missing' key, holding {characteristic: WOE of its 'Missing' attribute},
          or np.nan when the characteristic has no 'Missing' attribute

    The dictionary is also dumped to config_data['WOE_map_dict_path'].
    """
    #load the WOE table
    WOE_table = utils.pickle_load(config_data['WOE_table_path'])

    #initialize the dictionary
    WOE_map_dict = {'Missing': {}}

    #unique() keeps first-appearance order, so the key order is reproducible between runs
    for char in WOE_table['Characteristic'].unique():
        #get the Attribute & WOE info for the current characteristic
        current_data = WOE_table.loc[WOE_table['Characteristic'] == char,
                                     ['Attribute', 'WOE']]

        WOE_map_dict[char] = {}

        #default once per characteristic: no dedicated 'Missing' attribute.
        #Setting it here (and not per attribute) keeps a real 'Missing' WOE from being
        #overwritten by attributes that come later in the table
        WOE_map_dict['Missing'][char] = np.nan

        #iterate over the values pairwise; this does not depend on unique index labels
        for attribute, woe in zip(current_data['Attribute'], current_data['WOE']):
            if attribute == 'Missing':
                WOE_map_dict['Missing'][char] = woe
            else:
                WOE_map_dict[char][attribute] = woe

    #validate data
    print('Number of key : ', len(WOE_map_dict.keys()))

    #dump
    utils.pickle_dump(WOE_map_dict, config_data['WOE_map_dict_path'])

    return WOE_map_dict
    

In [4]:
#build the WOE mapping dictionary (get_woe_map_dict also dumps it to config_data['WOE_map_dict_path'])
WOE_map_dict = get_woe_map_dict()
#display the mapping as the cell output
WOE_map_dict

Number of key :  16


{'Missing': {'Education': nan,
  'NumCreditLines_binned': nan,
  'HasCoSigner': nan,
  'EmploymentType': nan,
  'InterestRate_binned': nan,
  'DTIRatio_binned': nan,
  'LoanAmount_binned': nan,
  'MaritalStatus': nan,
  'MonthsEmployed_binned': nan,
  'HasMortgage': nan,
  'Income_binned': nan,
  'HasDependents': nan,
  'LoanTerm_binned': nan,
  'LoanPurpose': nan,
  'Age_binned': nan},
 'Education': {"Bachelor's": -0.04317159202011873,
  'High School': -0.11548933910187546,
  "Master's": 0.07233014390140881,
  'PhD': 0.09928936450959838},
 'NumCreditLines_binned': {Interval(0.999, 2.0, closed='right'): 0.08527532380344567,
  Interval(2.0, 3.0, closed='right'): -0.04270951886823329,
  Interval(3.0, 4.0, closed='right'): -0.11609555621351765},
 'HasCoSigner': {'No': -0.1120492709832814, 'Yes': 0.12254710979345039},
 'EmploymentType': {'Full-time': 0.229125134449018,
  'Part-time': -0.028895475054852786,
  'Self-employed': 0.00847586071301078,
  'Unemployed': -0.17626982581844736},
 'Int

In [5]:
def transform_woe(raw_data=None, type=None, config_data=None):
    """
    Replace data value with WOE scores

    Args
    ----
    raw_data : DataFrame
        Raw data to be transformed with WOE scores
        Ignored when `type` is given (the saved data is loaded instead)
    type : Str
        Name of the saved data set to transform, e.g. "train"
        If provided, the raw data is loaded from config_data[f'{type}_path'][0]
        and the result is dumped to config_data[f'X_{type}_woe_path']
        (the name shadows the builtin; it is kept for backward compatibility)
    config_data : dict
        Configuration data; must contain 'numeric_col' and 'WOE_map_dict_path'
        (plus the two `type`-dependent paths above when `type` is given)

    Returns
    -------
    pandas.DataFrame: Transformed data with WOE scores.

    Raises
    ------
    ValueError : if config_data is missing, or if neither raw_data nor type is given
    KeyError : if a column has no entry in the WOE map

    Every column is looked up in the WOE map: columns listed in 'numeric_col' under
    '<column>_binned', all other columns under their own name. Values that are not
    found in the map (missing or out of range) receive the characteristic's
    'Missing' WOE, which may itself be NaN.
    """
    #fail early with a clear message instead of an opaque error on None
    if config_data is None:
        raise ValueError("config_data is required (needs 'numeric_col' and 'WOE_map_dict_path')")
    if raw_data is None and type is None:
        raise ValueError("Provide either raw_data or type")

    #load the numerical columns
    numeric_col = set(config_data['numeric_col'])

    #load the WOE_map_dict
    WOE_map_dict = utils.pickle_load(config_data['WOE_map_dict_path'])
    missing_woe = WOE_map_dict['Missing']

    #load the saved data if type is not None (takes precedence over raw_data)
    if type is not None:
        raw_data = utils.pickle_load(config_data[f'{type}_path'][0])

    #map the data on a copy so the caller's frame is left untouched
    woe_data = raw_data.copy()
    for col in woe_data.columns:
        #numeric columns are stored in the WOE map under their binned name
        map_col = col + '_binned' if col in numeric_col else col

        if map_col not in WOE_map_dict:
            raise KeyError(f"No WOE mapping for column '{col}' (expected key '{map_col}')")

        #values absent from the map become NaN, then get the 'Missing' WOE
        woe_data[col] = (woe_data[col]
                            .map(WOE_map_dict[map_col])
                            .fillna(value=missing_woe[map_col]))

    #validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    #dump data
    if type is not None:
        utils.pickle_dump(woe_data, config_data[f'X_{type}_woe_path'])

    return woe_data

In [9]:
#transform the train set: type='train' makes transform_woe load config_data['train_path'][0]
#and dump the result to config_data['X_train_woe_path']
X_train_woe = transform_woe(type='train', config_data=config_data)
#display the WOE-encoded training data as the cell output
X_train_woe

Raw data shape :  (204277, 15)
WOE data shape :  (204277, 15)


Unnamed: 0,Age,Income,LoanAmount,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
15826,0.137337,-0.553031,-0.358424,-0.382546,0.085275,0.678131,-0.005742,-0.077760,-0.115489,-0.176270,0.113294,-0.067980,0.116667,-0.021700,-0.112049
147371,0.137337,0.284585,0.208702,0.466979,-0.042710,0.053646,-0.005742,-0.016666,0.099289,0.008476,0.113294,0.071661,0.116667,-0.010732,0.122547
178180,0.137337,0.245767,0.398000,0.466979,-0.042710,0.678131,0.006070,-0.077760,-0.115489,0.008476,-0.023328,0.071661,0.116667,-0.075896,0.122547
126915,0.137337,-0.553031,0.398000,-0.382546,-0.042710,-0.229457,-0.000626,-0.077760,-0.043172,0.008476,0.113294,-0.067980,0.116667,-0.021859,-0.112049
163930,-0.641271,0.245767,-0.358424,0.244616,-0.042710,-0.229457,-0.005742,-0.016666,-0.043172,-0.028895,-0.082647,-0.067980,-0.107344,-0.021700,0.122547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59108,-0.641271,0.245767,0.043436,0.047652,-0.042710,0.678131,-0.000626,0.040058,0.072330,-0.176270,-0.082647,-0.067980,-0.107344,-0.075896,0.122547
71610,0.882110,-0.553031,0.208702,0.244616,-0.116096,0.678131,-0.005742,-0.077760,0.099289,-0.176270,0.113294,-0.067980,-0.107344,-0.021700,0.122547
85645,0.882110,0.031519,0.398000,-0.173849,0.085275,0.053646,0.006070,-0.077760,-0.115489,0.008476,-0.023328,-0.067980,-0.107344,-0.021700,-0.112049
21010,-0.264161,0.031519,0.208702,-0.382546,0.085275,-0.229457,-0.005742,0.109887,-0.115489,-0.028895,-0.082647,0.071661,0.116667,-0.010732,-0.112049
