# **Design Scorecards**
---

## **Pre-processing Training Set**

In [1]:
#import libraries
import pandas as pd
import numpy as np
import sys
#extend the module search path so the `import utils` below can resolve
#(presumably utils lives in ../src - confirm against the repo layout)
sys.path.append("../src")
#import the project helper module (used in this notebook for config_load, pickle_load and pickle_dump)
import utils

In [2]:
#load the project configuration (file paths, column lists, split settings) through the utils helper
config_data = utils.config_load()
#show the loaded configuration for a visual check
config_data

{'raw_dataset_path': '../dataset/raw/credit_risk.csv',
 'dataset_path': '../dataset/output/data.pkl',
 'predictors_set_path': '../dataset/output/predictors.pkl',
 'response_set_path': '../dataset/output/response.pkl',
 'train_path': ['../dataset/output/X_train.pkl',
  '../dataset/output/y_train.pkl'],
 'test_path': ['../dataset/output/X_test.pkl', '../dataset/output/y_test.pkl'],
 'data_train_path': '../dataset/output/training_data.pkl',
 'data_train_binned_path': '../dataset/output/bin_training_data.pkl',
 'crosstab_list_path': '../dataset/output/list_crosstab.pkl',
 'WOE_table_path': '../dataset/output/WOE_table.pkl',
 'IV_table_path': '../dataset/output/IV_table.pkl',
 'WOE_map_dict_path': '../dataset/output/WOE_map_dict.pkl',
 'X_train_woe_path': '../dataset/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.2,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_c

In [3]:
def get_woe_map_dict():
    """
    Build the WOE mapping dictionary from the saved WOE table

    The WOE table is loaded from config_data['WOE_table_path'] (module-level
    config_data) and must contain the columns 'Characteristic', 'Attribute'
    and 'WOE'. The resulting dictionary is printed (number of keys only) and
    pickled to config_data['WOE_map_dict_path'].

    Returns
    -------
    dict : A nested dictionary with
        - one key per characteristic, holding {attribute: WOE} for every
          attribute other than 'Missing'
        - a 'Missing' key, holding {characteristic: WOE of its 'Missing'
          attribute}, or NaN when the characteristic has no 'Missing' row
    """
    #load the WOE table
    WOE_table = utils.pickle_load(config_data['WOE_table_path'])

    #initialize the dictionary
    WOE_map_dict = {'Missing': {}}

    #unique() keeps first-appearance order, so the key order is reproducible between runs
    for char in WOE_table['Characteristic'].unique():
        #get the Attribute & WOE info for the current characteristic
        current_data = WOE_table.loc[WOE_table['Characteristic'] == char,
                                     ['Attribute', 'WOE']]

        WOE_map_dict[char] = {}

        #default once per characteristic, BEFORE scanning its rows, so a real
        #'Missing' WOE is never overwritten by rows that come after it
        WOE_map_dict['Missing'][char] = np.nan

        #get the mapping
        for attribute, woe in zip(current_data['Attribute'], current_data['WOE']):
            if attribute == 'Missing':
                WOE_map_dict['Missing'][char] = woe
            else:
                WOE_map_dict[char][attribute] = woe

    #validate data
    print('Number of key : ', len(WOE_map_dict.keys()))

    #dump
    utils.pickle_dump(WOE_map_dict, config_data['WOE_map_dict_path'])

    return WOE_map_dict

In [4]:
#build the WOE mapping dictionary (the function also pickles it to config_data['WOE_map_dict_path'])
WOE_map_dict = get_woe_map_dict()
#show the mapping for a visual check
WOE_map_dict

Number of key :  11


{'Missing': {'person_age_bin': nan,
  'person_income_bin': nan,
  'cb_person_cred_hist_length_bin': nan,
  'cb_person_default_on_file': nan,
  'person_home_ownership': nan,
  'loan_int_rate_bin': 0.034577926166809825,
  'person_emp_length_bin': -0.5216805596571665,
  'loan_intent': nan,
  'loan_amnt_bin': nan,
  'loan_percent_income_bin': nan},
 'person_age_bin': {Interval(19.999, 23.0, closed='right'): -0.1215242745737818,
  Interval(23.0, 25.0, closed='right'): 0.006518968643743249,
  Interval(25.0, 27.0, closed='right'): 0.049261238842137456,
  Interval(27.0, 32.0, closed='right'): 0.07891633325666864,
  Interval(32.0, 144.0, closed='right'): 0.05456465518304823},
 'person_income_bin': {Interval(3999.999, 35000.0, closed='right'): -1.0072369839339057,
  Interval(35000.0, 48996.0, closed='right'): -0.052875086095511434,
  Interval(48996.0, 63000.0, closed='right'): 0.1754188638522987,
  Interval(63000.0, 86000.0, closed='right'): 0.4920460029432934,
  Interval(86000.0, 6000000.0, clo

In [5]:
def transform_woe(raw_data=None, type=None, config_data=None):
    """
    Replace data values with WOE scores

    Args
    ----
    raw_data : DataFrame
        Raw data to be transformed with WOE scores
        Ignored when `type` is given, because the data is then loaded from disk
    type : Str
        Name of a saved data set, e.g. "train"
        When given, the raw data is loaded from config_data[f'{type}_path'][0]
        and the transformed data is pickled to config_data[f'X_{type}_woe_path'],
        so both keys must exist in the configuration
    config_data : dict
        Configuration data; must contain 'num_columns' and 'WOE_map_dict_path'

    Returns
    -------
    pandas.DataFrame: Transformed data with WOE scores, same shape as the raw data

    Raises
    ------
    ValueError
        If config_data is missing, or if neither raw_data nor type is given

    Every column is looked up in the saved WOE map: numerical columns (those
    listed in config_data['num_columns']) under the key '<column>_bin',
    all other columns under their own name. Values with no match in the map
    (missing, unseen or out of range) receive the characteristic's 'Missing'
    WOE, which may itself be NaN.
    """
    #fail early with a clear message instead of an opaque TypeError / AttributeError
    if config_data is None:
        raise ValueError("config_data must be provided")
    if raw_data is None and type is None:
        raise ValueError("either raw_data or type must be provided")

    #load the numerical columns
    numeric_cols = config_data['num_columns']

    #load the WOE_map_dict
    WOE_map_dict = utils.pickle_load(config_data['WOE_map_dict_path'])
    missing_woe = WOE_map_dict['Missing']

    #load the saved data if type is not None
    if type is not None:
        raw_data = utils.pickle_load(config_data[f'{type}_path'][0])

    #work on a copy so the caller's data stays untouched
    woe_data = raw_data.copy()
    for col in woe_data.columns:
        #numerical columns are stored in the map under their binned name
        map_col = col + '_bin' if col in numeric_cols else col

        #map the values, then fill whatever could not be mapped
        #(missing value or out of range value) with the 'Missing' WOE
        woe_data[col] = (woe_data[col]
                            .map(WOE_map_dict[map_col])
                            .fillna(value=missing_woe[map_col]))

    #validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    #dump data
    if type is not None:
        utils.pickle_dump(woe_data, config_data[f'X_{type}_woe_path'])

    return woe_data


In [6]:
#transform the train set: with type='train' the function loads the data from
#config_data['train_path'][0] and pickles the result to config_data['X_train_woe_path']
X_train_woe = transform_woe(type='train', config_data=config_data)
#show the transformed training data
X_train_woe

Raw data shape :  (26064, 10)
WOE data shape :  (26064, 10)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
15884,0.006519,1.030836,0.679659,0.115487,0.277027,-0.483470,1.032338,0.769938,0.212829,-0.004326
15138,-0.121524,-1.007237,-0.507538,0.115487,0.097897,0.043609,0.080689,0.769938,0.212829,-0.004326
7474,0.006519,0.175419,0.679659,0.321916,-0.272080,-0.483470,0.080689,-1.401235,0.212829,-0.072840
18212,0.078916,-1.007237,1.262009,-0.521681,-0.272080,0.278812,-1.283691,-1.401235,0.212829,0.064258
6493,0.006519,0.175419,0.679659,-0.086865,0.469865,0.207943,0.600096,0.162202,0.212829,-0.072840
...,...,...,...,...,...,...,...,...,...,...
14621,0.006519,1.030836,0.679659,0.321916,-0.230444,-0.483470,-1.283691,-1.401235,0.212829,-0.004326
18736,0.078916,0.492046,-0.507538,-0.309268,-0.272080,-0.483470,0.080689,0.162202,0.212829,0.064258
1663,-0.121524,-1.007237,-0.507538,0.235009,0.277027,0.043609,0.600096,0.769938,0.212829,-0.072840
18257,0.078916,1.030836,-0.507538,0.115487,-0.230444,-0.483470,-1.283691,0.162202,-0.773827,0.064258
