# expand shifu binning info

- Shifu WOE calculation: good distribution / bad distribution

In [None]:
def _bin_distribution(values):
    """Return each bin's share of the total of ``values`` as a float array.

    A total of zero yields an all-zero array instead of a division error.
    """
    total = sum(values)
    if total == 0:
        return np.zeros(len(values))
    return np.asarray(values, dtype=float) / total


def expand_bin_info(x):
    """expand shifu bin info as columns in dataframe.

    Derives the positive / negative distributions, the weighted positive
    rate and the per-bin IV from the counts and WOE values found in ``x``,
    copies the remaining keys straight from ``x`` and rounds every numeric
    list to 8 decimals (``binBoundary`` is passed through untouched).

    All per-bin lists in ``x`` are expected to have the same length.

    :param x: row of shifu binning result.

    :return: a pandas series.
    """
    keys = [
        'binBoundary',
        'binCountPos',
        'binPosRate',
        'binWeightedPos',
        'binWeightedPosRate',
        'binCntPosDis',
        'binWgtPosDis',
        'binCountWoe',
        'binWeightedWoe',
        'binCountIV',
        'binWeightedIV',
        'binCountNeg',
        'binWeightedNeg',
        'binCntNegDis',
        'binWgtNegDis',
    ]

    # Share of all positives / negatives that falls into each bin.
    cnt_pos_dis = _bin_distribution(x['binCountPos'])
    wgt_pos_dis = _bin_distribution(x['binWeightedPos'])
    cnt_neg_dis = _bin_distribution(x['binCountNeg'])
    wgt_neg_dis = _bin_distribution(x['binWeightedNeg'])

    # Weighted positive rate per bin; bins whose total weight is (almost)
    # zero get a rate of 0 instead of dividing by zero.
    wgt_pos = np.asarray(x['binWeightedPos'], dtype=float)
    wgt_neg = np.asarray(x['binWeightedNeg'], dtype=float)
    wgt_total = wgt_pos + wgt_neg
    wgt_pos_rate = np.divide(
        wgt_pos,
        wgt_total,
        out=np.zeros_like(wgt_total),
        where=~(np.abs(wgt_total) < 1e-8),
    )

    ret = {
        'binCntPosDis': cnt_pos_dis,
        'binWgtPosDis': wgt_pos_dis,
        'binCntNegDis': cnt_neg_dis,
        'binWgtNegDis': wgt_neg_dis,
        'binWeightedPosRate': wgt_pos_rate,
        # Per-bin IV: (negative share - positive share) * WOE.
        'binCountIV': (cnt_neg_dis - cnt_pos_dis)
        * np.asarray(x['binCountWoe'], dtype=float),
        'binWeightedIV': (wgt_neg_dis - wgt_pos_dis)
        * np.asarray(x['binWeightedWoe'], dtype=float),
    }

    for k in keys:
        # Anything not derived above is copied from the input row.
        if k not in ret:
            ret[k] = x[k]
        if k != 'binBoundary':
            ret[k] = np.round(ret[k], 8).tolist()

    return pd.Series(data=ret, index=keys)
 

In [None]:
from pyshifu.ShifuEngine import Shifu

shifu_local_dir = ''
shifu_model_name = ''

# Load the model set and turn its column config into a DataFrame.
shifu_model = Shifu.load(os.path.join(shifu_local_dir, shifu_model_name))
config = pd.DataFrame(shifu_model.column_config.get_dict())

# Lift selected entries of each row's `columnStats` dict into their own columns.
for stat_name in ('missingPercentage', 'mean', 'min', 'max', 'iv', 'totalCount'):
    config[stat_name] = config['columnStats'].map(
        lambda stats, name=stat_name: stats[name]
    )

# A missing column flag is reported as 'Meta'.
config['columnFlag'] = config['columnFlag'].map(
    lambda flag: flag if flag is not None else 'Meta'
)
config['model_name'] = shifu_model_name

# Expand every row's `columnBinning` dict into per-bin columns and attach them.
r = config['columnBinning'].apply(expand_bin_info)
config = config.join(r)

In [None]:
def _seg_num(v):
    import re
    m = re.search(r'_seg([0-9]+)$', v)
    if not m:
        return 0
    return int(m.group(1))

# Create the column at position 1 on first use so it keeps that position,
# then fill it from the suffix of each column name.
if 'seg_num' not in config.columns:
    config.insert(1, 'seg_num', '0')

config['seg_num'] = config['columnName'].map(_seg_num)


# load shifu SE ranking result

- high rms means high importance

In [None]:
# Read `<shifu_folder>/varsel/se.0` as a tab-separated table; the column names are
# supplied explicitly through `names`.
# NOTE(review): `shifu_folder` is not defined in the cells shown here — confirm it is set earlier.
se_ranking = pd.read_csv(os.path.join(shifu_folder, 'varsel', 'se.0'),
                                      sep='\t', 
                                      names=['column_index', 'column_name', 'mean', 'rms', 'variance'])

# load shifu ColumnConfig as dataframe

In [None]:
# Build a DataFrame from the dict returned by the loaded model's `column_config`.
column_config = pd.DataFrame(shifu_model.column_config.get_dict())

# shifu test filter

In [None]:
# Run the pyshifu `test` command with the `-filter` option.
# NOTE(review): `-n 1000000` presumably caps the number of records checked — confirm against the shifu docs.
shifu_model.test('-filter', '-n', '1000000')

# shifu rebin

- bic: minimum number of instances in each bucket
- n: expected bin num

In [None]:
# Rebin through pyshifu `stats -rebin`: 6 expected bins (-n) and at least 2000 instances per bucket (-bic).
# NOTE(review): the meaning of `-ivr 0.98` is not described in this notebook — confirm against the shifu docs.
shifu_model.stats('-rebin', '-n', '6', '-bic', '2000', '-ivr', '0.98')

# shifu GBT config

In [None]:
model_config = shifu_model.model_config

# GBT config: attributes set directly on the train section.
gbt_train_settings = {
    'baggingNum': 1,
    'validSetRate': 0.2,
    'numTrainEpochs': 500,
    'algorithm': 'GBT',
}
for attr_name, attr_value in gbt_train_settings.items():
    setattr(model_config.train, attr_name, attr_value)

# GBT config: entries of the train params mapping.
gbt_tree_params = {
    'TreeNum': 500,
    'FeatureSubsetStrategy': 'ALL',
    'MaxDepth': 5,
    'Impurity': 'variance',
    'LearningRate': 0.05,
    'MinInstancesPerNode': 200,
    'Loss': 'squared',
    'MinInfoGain': 0.0,
}
for param_name, param_value in gbt_tree_params.items():
    model_config.train.params[param_name] = param_value


# shifu CLI

In [None]:
%%bash
Usage: shifu COMMAND
where COMMAND is one of:
    new <ModelSetName> [-t <NN|LR|SVM|DT>]  Create a new model set.
    init                                    Create initial ColumnConfig.json and upload to HDFS.
    stats                                   Calculate statistics on HDFS and update local ColumnConfig.json.
    stats -c                                Calculate feature correlation.
    varselect/varsel [-reset]               Variable selection, will update finalSelect in ColumnConfig.json.
    normalize/norm                          Normalize the columns with finalSelect as true.
    train [-dry]                            Train the model with the normalized data.
    posttrain                               Post-process data after training models.
    eval                                    Run all eval sets.
    eval -list                              List all eval sets.
    eval -new     <EvalSetName>             Create a new eval set.
    eval -delete  <EvalSetName>             Delete an eval set.
    eval -run     <EvalSetName>             Run eval set evaluation.
    eval -score   <EvalSetName>             Scoring evaluation dataset.
    eval -norm    <EvalSetName>             Normalize evaluation dataset.
    eval -confmat <EvalSetName>             Compute the TP/FP/TN/FN based on scoring
    eval -perf    <EvalSetName>             Calculate the model performance based on confmat
    export [-t pmml|columnstats] [-c]       Export model to PMML format or export ColumnConfig.
    version|v|-v|-version                   Print version of current package.
    help|h|-h|-help                         Help message.

## useful shifu config

In [None]:
# default shifu config
import os
from pyshifu.ShifuConf import shifuConf

# Give the local shifu JVM a larger heap; loading the calculated correlation
# matrix may fail when the heap is too small.
os.environ['SHIFU_OPTS'] = '-Xms4G -Xmx16G'

# Hadoop map / reducer memory settings (hadoop jobs may fail when these are too
# small) together with the norm shuffle size, applied in this order.
hadoop_settings = (
    ('mapreduce.map.java.opts', '-Xms8000m -Xmx16000m -server -XX:MaxPermSize=64m -XX:PermSize=64m -XX:+UseParallelGC -XX:+UseParallelOldGC -XX:ParallelGCThreads=8 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'),
    ('mapreduce.reduce.java.opts', '-Xms8000m -Xmx16000m -server -XX:MaxPermSize=64m -XX:PermSize=64m'),
    ('shifu.norm.shuffle.size', '200'),
    ('mapreduce.map.memory.mb', '16000'),
    ('mapreduce.reduce.memory.mb', '16000'),
)
for conf_key, conf_value in hadoop_settings:
    shifuConf.set(conf_key, conf_value)

# The job queue comes from the surrounding notebook state.
shifuConf.set('hadoopJobQueue', queue)
# to avoid norm parquet failure
shifuConf.set('parquet.enable.summary-metadata', 'false')
shifuConf.save()




# load shifu column config

In [None]:
# Decode the JSON file explicitly as UTF-8 instead of relying on the
# platform's locale-dependent default text encoding.
with open('path_to_column_config.json', encoding='utf-8') as f:
    df = pd.DataFrame(json.load(f))

In [None]:
# WOE_ZSCORE = (WOE of Xi - WOE_MEAN) / WOE_STD
#
# With COUNT_i = pos_i + neg_i for each bin and COUNT = SUM(COUNT_i):
# WOE_MEAN = SUM(WOE_i * COUNT_i) / COUNT
# WOE_STD = sqrt((SUM(WOE_i * WOE_i * COUNT_i) - COUNT * WOE_MEAN * WOE_MEAN) / (COUNT - 1))

# `v` must hold one column entry with a 'columnBinning' dict; it is provided
# by the surrounding notebook state.
pos = v['columnBinning']['binCountPos']
neg = v['columnBinning']['binCountNeg']
woe = v['columnBinning']['binCountWoe']

count = 0
woeSum = 0
squaredSum = 0

# The loop variable is deliberately not called `v`, so the entry read above
# is not overwritten and the cell can be run again.
for k, bin_pos in enumerate(pos):
    bin_count = bin_pos + neg[k]
    count += bin_count
    woeSum += bin_count * woe[k]
    squaredSum += woe[k] * woe[k] * bin_count

woe_mean = woeSum / count
print(woe_mean)
print(np.sqrt((squaredSum - count * woe_mean * woe_mean) / (count - 1)))


# shifu filter expr

notes:
- can not use number values in a list: `col in [1, 2]` will yield an empty set, no matter the data type of col
- can not use an integer value if the col value is float: `col >= 1` is evaluated as false if col holds float values, use `col >= 1.0` instead

In [None]:
# equality comparison
expr = "col = 1"

# membership in a list of string values
expr = "col =~ ['O', 'P', 'Y']"

# column value is empty
expr = "empty(col)"


# load and parse correlation

In [None]:
# Placeholder: replace with a DataFrame that contains each variable's IV.
iv_df = None

# Read the correlation file (skipping its first line), drop the unnamed
# leading column and call the name column 'variable'.
corr_df = (
    pd.read_csv(corr_file_path, skiprows=1)
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'ColumnName': 'variable'})
)

In [None]:
# For every pair of variables whose absolute correlation reaches `corr_thres`,
# mark the one with the lower IV for removal (on a tie the second one is
# marked). `corr_rm` lists each marked variable once.
corr_thres = 0.9
corr_rm = []

variables = corr_df['variable'].to_list()

# Position of the first row of each variable, so a pair lookup does not have
# to filter the whole DataFrame.
row_of = {}
for row_idx, var_name in enumerate(variables):
    row_of.setdefault(var_name, row_idx)

# IV of each variable (first occurrence wins), looked up only for pairs that
# exceed the threshold.
iv_of = {}
for var_name, iv_value in zip(iv_df['variable'], iv_df['iv']):
    iv_of.setdefault(var_name, iv_value)

for i, a in enumerate(variables):
    a_row = row_of[a]
    for b in variables[i+1:]:
        corr = float(corr_df[b].iloc[a_row])
        if abs(corr) >= corr_thres:
            a_iv = iv_of[a]
            b_iv = iv_of[b]
            to_rm = a if a_iv < b_iv else b
            print(f'{a} iv: {a_iv}, {b} iv: {b_iv}, correlation: {corr}, rm: {to_rm}')
            # A variable can lose against several partners; record it once so
            # the count below is the number of distinct variables.
            if to_rm not in corr_rm:
                corr_rm.append(to_rm)

print(f'correlation removed variable num: {len(corr_rm)}')

# reduce normed file size

shifu.norm.shuffle.size: total file num

In [None]:
# Run norm with `-shuffle`, setting `shifu.norm.shuffle.size` (the total number of output files) to 100.
shifu_model.norm('-shuffle', '-Dshifu.norm.shuffle.size=100')

# shifu correlation memory config

In [None]:
# config pyshifu java opts
shifu.stats('-c', '-Dshifu.stats.corr.reuse=true') # reuse previous calculated stats



## shifu norm as parquet

In [None]:
# Placeholder: set this to the eval set name in your shifu config.
data_set = 'xxx' # eval set name in your shifu config

# Run `eval -norm` for that eval set with `shifu.output.data.format` set to parquet.
shifu_model.eval('-norm', data_set, '-Dshifu.output.data.format=parquet')



# load shifu PSI

# parse Shifu SE error from log

In [None]:
def parse_SE_error(log_path):
    """Collect the training / validation errors printed in a log file.

    Every ``Training Error:<number>`` and ``Validation Error:<number>``
    occurrence is picked up (at most one of each per line). Numbers may be
    plain integers, decimals or use scientific notation such as ``9.5E-5``.

    :param log_path: path of the log file to scan.

    :return: a pandas DataFrame with the columns 'traininng error' and
        'validation error', one row per parsed value in file order.
    """
    import re

    # The optional exponent must be part of the match; without it a value
    # such as 9.5E-5 would be read as 9.5.
    number = r'([0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?)'
    train_pattern = re.compile(r'Training Error:' + number)
    validation_pattern = re.compile(r'Validation Error:' + number)

    train_err = []
    validation_err = []
    with open(log_path) as f:
        # Scan line by line instead of loading the whole log into memory.
        for line in f:
            m = train_pattern.search(line)
            if m:
                train_err.append(float(m.group(1)))
            m = validation_pattern.search(line)
            if m:
                validation_err.append(float(m.group(1)))

    # The misspelled column name is kept as is: existing code may select it.
    return pd.DataFrame(data={'traininng error': train_err, 'validation error': validation_err})
            
