# training config

In [None]:
import copy
import os
import shutil
import sys

import pandas as pd

from model_automation.utils.rmr import run_cmd
from pyshifu.ShifuConf import shifuConf
from pyshifu.ShifuEngine import Shifu


# --- input data locations (empty by default) ---
dev_data_dir = ""
oot_data_dir = ""

# --- shifu model name, local/HDFS folders and job queue ---
shifu_model_name = "shifu_training"
shifu_local_dir = "shifu_model"
shifu_hdfs_dir = ""
shifu_job_queue = "risk_gds_focus"

# --- weight / target columns and the target tag values ---
weight_column = "raw_dol_amt"
target_column = "is_unauth_bad"
pos_tag, neg_tag = "1", "0"

# --- row filters: an always-true expression for both data sets ---
dev_filter_expr = "1 == 1"
oot_filter_expr = "1 == 1"

# --- field delimiter of the data files: ASCII BEL (0x07) ---
data_delimiter = "\x07"

# --- text files listing column names (both meta lists point at the same file) ---
dev_meta_columns_path = "driver_meta_columns.txt"
oot_meta_columns_path = "driver_meta_columns.txt"
candidate_vars_path = "candidate_vars.txt"
categorical_vars_path = "categorical_vars.txt"
force_rm_vars_path = "force_rm_vars.txt"

# --- optional segment expression; None means no segment file is written ---
seg_expr = None


In [None]:
def _read_column_names(path):
    """Read a one-column text file of column names into a list of strings.

    Args:
        path: path of a headerless text file with one column name per line.

    Returns:
        list[str]: the names in file order; ``[]`` when the file is empty.
    """
    try:
        # dtype=str / keep_default_na=False keep every name as a plain string
        # (no int conversion, no "NA"/"null" -> NaN), so the lists can later
        # be written back with '\n'.join(...).
        frame = pd.read_csv(path, names=['col'], dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # An empty list file (e.g. nothing to force-remove) means "no columns".
        return []
    return frame['col'].to_list()


dev_meta_columns = _read_column_names(dev_meta_columns_path)
oot_meta_columns = _read_column_names(oot_meta_columns_path)

candidate_vars = _read_column_names(candidate_vars_path)
categorical_vars = _read_column_names(categorical_vars_path)
force_remove_vars = _read_column_names(force_rm_vars_path)


In [None]:
def init_or_load_shifu_instance(shifu_model_name, shifu_local_dir, shifu_hdfs_dir, reuse=True):
    """Return a Shifu model handle, either loaded from disk or freshly created.

    Args:
        shifu_model_name: name of the model (also the model sub-folder name).
        shifu_local_dir: local folder that contains the model sub-folder.
        shifu_hdfs_dir: HDFS folder that contains the model sub-folder.
        reuse: when truthy, load the existing model with ``Shifu.load``;
            otherwise delete the local model folder (if present), remove the
            model folder on HDFS and create a new model with ``Shifu.new``.

    Returns:
        The object returned by ``Shifu.load`` or ``Shifu.new``.
    """
    if not reuse:
        print('initializing new shifu model')
        model_dir = os.path.join(shifu_local_dir, shifu_model_name)
        # Drop any previous local copy, then make sure the parent folder exists.
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        parent_dir = os.path.dirname(model_dir)
        os.makedirs(parent_dir, exist_ok=True)

        # Remove the model folder on HDFS as well (-f: no error if it is absent).
        hdfs_model_dir = os.path.join(shifu_hdfs_dir, shifu_model_name)
        run_cmd(f"hadoop fs -rm -r -f -skipTrash {hdfs_model_dir}")
        return Shifu.new(folder=shifu_local_dir, name=shifu_model_name)

    print(f'reusing shifu model {shifu_model_name}')
    return Shifu.load(folder=shifu_local_dir, name=shifu_model_name)

In [None]:
def _write_column_file(model_dir, relative_path, names):
    """Write ``names`` (one per line) to ``relative_path`` under ``model_dir``."""
    file_path = os.path.join(model_dir, relative_path)
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # Create the sub-folder if it is missing so open() cannot fail on it.
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as f:
        f.write('\n'.join(names))
    return file_path


def config_shifu(shifu_model,
                 shifu_local_dir,
                 shifu_hdfs_dir,
                 shifu_job_queue,
                 data_delimiter,
                 dev_data_dir,
                 oot_data_dir,
                 weight_column,
                 target_column,
                 pos_tag,
                 neg_tag,
                 dev_filter_expr,
                 oot_filter_expr,
                 candidate_vars,
                 categorical_vars,
                 force_remove_vars,
                 dev_meta_columns,
                 oot_meta_columns,
                 seg_expr=None
                ):
    """Fill in the model config of ``shifu_model`` and write its column files.

    Sets the basic / dataSet / stats / normalize / varSelect / train sections,
    appends a second eval set named ``eval2`` that points at the OOT data,
    saves the model and finally shows the resulting config.

    Args:
        shifu_model: object exposing ``name``, ``model_config``, ``save()``.
        shifu_local_dir: local folder containing the model sub-folder.
        shifu_hdfs_dir: stored as ``hdfsModelSetPath`` in ``basic.customPaths``.
        shifu_job_queue: accepted for interface compatibility; not used here.
        data_delimiter: data and header delimiter for the dev and eval2 sets.
        dev_data_dir: training data folder; its ``.pig_header`` is the header.
        oot_data_dir: eval2 data folder; its ``.pig_header`` is the header.
        weight_column: weight column name for training and eval2.
        target_column: target column name for training and eval2.
        pos_tag: single positive tag value.
        neg_tag: single negative tag value.
        dev_filter_expr: filter expression for the training data.
        oot_filter_expr: filter expression for the eval2 data.
        candidate_vars: names written to the candidate column file.
        categorical_vars: names written to the categorical column file.
        force_remove_vars: names written to the force-remove column file.
        dev_meta_columns: names written to the training meta column file.
        oot_meta_columns: names written to the eval2 meta column file(s).
        seg_expr: optional text written to ``columns/segments.file``.

    Raises:
        AssertionError: if the model does not have exactly one eval set before
            ``eval2`` is appended (e.g. the function already ran on this model).
    """
    model_config = shifu_model.model_config
    local_shifu_model_dir = os.path.join(shifu_local_dir, shifu_model.name)

    # basic
    model_config.basic.customPaths = {'hdfsModelSetPath': shifu_hdfs_dir}

    # dataset
    model_config.dataSet.dataPath = dev_data_dir
    model_config.dataSet.dataDelimiter = data_delimiter
    model_config.dataSet.headerPath = os.path.join(dev_data_dir, '.pig_header')
    model_config.dataSet.headerDelimiter = data_delimiter
    model_config.dataSet.weightColumnName = weight_column
    model_config.dataSet.targetColumnName = target_column
    model_config.dataSet.metaColumnNameFile = 'columns/meta.column.names'
    model_config.dataSet.posTags = [pos_tag]
    model_config.dataSet.negTags = [neg_tag]
    model_config.dataSet.categoricalColumnNameFile = 'columns/categorical.column.names'
    model_config.dataSet.missingOrInvalidValues = ['', '*', '#', '?', 'null', '~', '.', 'NULL', 'NONE', 'None', 'none', '-999', 'NaN']
    model_config.dataSet.filterExpressions = dev_filter_expr

    if seg_expr is not None:
        model_config.dataSet.segExpressionFile = "columns/segments.file"
        # write segment expression
        seg_expr_file_path = os.path.join(local_shifu_model_dir, model_config.dataSet.segExpressionFile)
        os.makedirs(os.path.dirname(seg_expr_file_path), exist_ok=True)
        with open(seg_expr_file_path, 'w') as f:
            f.write(seg_expr)
            print(f'segment file written to {seg_expr_file_path}')

    _write_column_file(local_shifu_model_dir, model_config.dataSet.categoricalColumnNameFile, categorical_vars)
    _write_column_file(local_shifu_model_dir, model_config.dataSet.metaColumnNameFile, dev_meta_columns)

    # stats
    model_config.stats.maxNumBin = 10
    model_config.stats.cateMaxNumBin = 10
    model_config.stats.binningMethod = 'EqualTotal'
    model_config.stats.binningAlgorithm = 'SPDTI'

    # norm
    model_config.normalize.normType = "WOE_ZSCORE"
    model_config.normalize.stdDevCutOff = 6.0
    model_config.normalize.sampleNegOnly = False

    # varsel
    model_config.varSelect.forceEnable = True
    model_config.varSelect.candidateColumnNameFile = 'columns/candidate.column.names'
    model_config.varSelect.filterEnable = True
    model_config.varSelect.filterBy = 'IV'
    model_config.varSelect.filterNum = 1000
    model_config.varSelect.minIvThreshold = 0.001

    # forceRemoveColumnNameFile is never set here; the value already present
    # in the model config is used as the target file.
    _write_column_file(local_shifu_model_dir, model_config.varSelect.forceRemoveColumnNameFile, force_remove_vars)
    _write_column_file(local_shifu_model_dir, model_config.varSelect.candidateColumnNameFile, candidate_vars)

    # GBT config
    model_config.train.baggingNum = 1
    model_config.train.validSetRate = 0.2
    model_config.train.numTrainEpochs = 10
    model_config.train.algorithm = 'GBT'
    model_config.train.params['TreeNum'] = 10
    model_config.train.params['FeatureSubsetStrategy'] = 'ALL'
    model_config.train.params['MaxDepth'] = 3
    model_config.train.params['Impurity'] = 'variance'
    model_config.train.params['LearningRate'] = 0.05
    model_config.train.params['MinInstancesPerNode'] = 200
    model_config.train.params['Loss'] = 'squared'
    model_config.train.params['MinInfoGain'] = 0.0

    # NN config
    # model_config.train.baggingNum = 4
    # model_config.train.baggingWithReplacement = False
    # model_config.train.baggingSampleRate = 0.9
    # model_config.train.sampleNegOnly = False
    # model_config.train.numTrainEpochs = 200
    # model_config.train.workerThreadCount = 4
    # model_config.train.algorithm = "NN"
    # model_config.train.params['Propagation'] = "R"
    # model_config.train.params['LearningRate'] = 0.05
    # model_config.train.params['DropoutRate'] = 0.1
    # model_config.train.params['NumHiddenNodes'] = [128, 64]
    # model_config.train.params['NumHiddenLayers'] = 2
    # model_config.train.params['L1orL2'] = 'L2'

    # eval: clone the single existing eval set and point the clone at the OOT data
    assert len(model_config.evals) == 1
    eval2 = copy.deepcopy(model_config.evals[0])
    eval2.name = 'eval2'
    eval2.dataSet.dataPath = oot_data_dir
    eval2.dataSet.dataDelimiter = data_delimiter
    eval2.dataSet.headerPath = os.path.join(oot_data_dir, '.pig_header')
    eval2.dataSet.headerDelimiter = data_delimiter
    eval2.dataSet.filterExpressions = oot_filter_expr
    eval2.dataSet.weightColumnName = weight_column
    eval2.dataSet.posTags = [pos_tag]
    eval2.dataSet.negTags = [neg_tag]
    eval2.dataSet.targetColumnName = target_column
    eval2.dataSet.metaColumnNameFile = 'columns/eval.meta.column.names'
    eval2.scoreMetaColumnNameFile = 'columns/eval.meta.column.names'
    eval2.normAllColumns = False

    model_config.evals.append(eval2)

    shifu_model.save()

    # eval2 reads the OOT data, so its meta column files must list the OOT
    # meta columns (previously the dev list was written here by mistake).
    _write_column_file(local_shifu_model_dir, eval2.dataSet.metaColumnNameFile, oot_meta_columns)
    _write_column_file(local_shifu_model_dir, eval2.scoreMetaColumnNameFile, oot_meta_columns)

    shifu_model.model_config.show()

In [None]:
# JVM options exported through the SHIFU_OPTS environment variable.
os.environ['SHIFU_OPTS'] = '-Xms4G -Xmx16G'

# (key, value) pairs pushed into shifuConf, applied in this order.
shifu_conf_entries = (
    ('hadoopJobQueue', shifu_job_queue),
    ('mapreduce.map.memory.mb', '16000'),
    ('mapreduce.reduce.memory.mb', '16000'),
    ('mapreduce.map.java.opts', "-Xms4G -Xmx16G -server -XX:MaxPermSize=64m -XX:PermSize=64m -XX:+UseParallelGC -XX:+UseParallelOldGC -XX:ParallelGCThreads=8 -verbose:gc -XX:+PrintGC"),
    ('mapreduce.reduce.java.opts', "-Xms4G -Xmx16G -server -XX:MaxPermSize=64m -XX:PermSize=64m"),
    ('parquet.enable.summary-metadata', 'false'),
)
for conf_key, conf_value in shifu_conf_entries:
    shifuConf.set(conf_key, conf_value)

# Persist the settings, then print them for a visual check.
shifuConf.save()
shifuConf.print_envs()

In [None]:
# reuse=False: drop any existing local/HDFS model folder and start a new model.
shifu_model = init_or_load_shifu_instance(
    shifu_model_name,
    shifu_local_dir,
    shifu_hdfs_dir,
    reuse=False,
)

# Collect every notebook-level setting and hand it to config_shifu by keyword.
config_kwargs = dict(
    shifu_job_queue=shifu_job_queue,
    shifu_local_dir=shifu_local_dir,
    shifu_hdfs_dir=shifu_hdfs_dir,
    data_delimiter=data_delimiter,
    dev_data_dir=dev_data_dir,
    oot_data_dir=oot_data_dir,
    weight_column=weight_column,
    target_column=target_column,
    pos_tag=pos_tag,
    neg_tag=neg_tag,
    dev_filter_expr=dev_filter_expr,
    oot_filter_expr=oot_filter_expr,
    candidate_vars=candidate_vars,
    categorical_vars=categorical_vars,
    force_remove_vars=force_remove_vars,
    dev_meta_columns=dev_meta_columns,
    oot_meta_columns=oot_meta_columns,
    seg_expr=seg_expr,
)
config_shifu(shifu_model, **config_kwargs)


In [None]:
# change config

# Training section of the current model config; edited in place below.
train_config = shifu_model.model_config.train

train_config.baggingNum = 1
train_config.validSetRate = 0.2
train_config.numTrainEpochs = 10
train_config.algorithm = 'GBT'

# GBT parameters, applied in this order.
gbt_params = (
    ('TreeNum', 10),
    ('FeatureSubsetStrategy', 'ALL'),
    ('MaxDepth', 3),
    ('Impurity', 'variance'),
    ('LearningRate', 0.05),
    ('MinInstancesPerNode', 200),
    ('Loss', 'squared'),
    ('MinInfoGain', 0.0),
)
for param_name, param_value in gbt_params:
    train_config.params[param_name] = param_value

# Persist the edited config and display it.
shifu_model.save()

shifu_model.model_config.show()

In [None]:
# Run init and stats back to back, reporting after each one.
for stage in ('init', 'stats'):
    getattr(shifu_model, stage)()
    print(f'finish {stage}')

# Export the column statistics.
shifu_model.export('-t', 'columnstats')
print('finish exporting columnstats')



In [None]:
# Run norm, then varsel, reporting after each one.
for stage in ('norm', 'varsel'):
    getattr(shifu_model, stage)()
    print(f'finish {stage}')

In [None]:
# Train the model, then score the 'eval2' eval set.
for stage, stage_args in (('train', ()), ('eval', ('-score', 'eval2'))):
    getattr(shifu_model, stage)(*stage_args)
    print(f'finish {stage}')


## way to run shifu FI
- set filterBy = 'FI' in shifu varsel config

In [None]:
# run shifu FI
fi_rounds = 5

# Reset the variable selection once, then run varsel fi_rounds times.
shifu_model.varsel('-reset')
for fi_round in range(fi_rounds):
    shifu_model.varsel()
    print(f'finish FI iteration {fi_round}')

## way to run shifu correlation
- when exporting the correlation file, you may encounter an OOM error (java exit code: 137); you can either: 1) increase map reduce memory, or 2) increase notebook memory.
- need to run stats first
- correlation file is saved to shifu_folder/shifu_model_name/correlation.csv

In [None]:
# init and stats have to run before the correlation step.
for stage in ('init', 'stats'):
    getattr(shifu_model, stage)()
    print(f'finish {stage}')

shifu_model.export('-t', 'columnstats')
print('finish exporting columnstats')

# Correlation stats, with the corr.reuse flag switched on.
correlation_args = ('-c', '-Dshifu.stats.corr.reuse=true')
shifu_model.stats(*correlation_args)
print('finish shifu correlation stats')


## way to run shifu SE
- set filterBy = 'SE' in shifu varsel config
- SE file is saved to shifu_folder/shifu_model_name/varsel/se.0
- you need to manually save the SE file, or it will be overwritten
- running `varsel -reset` before each round of `varsel` may hit a race condition when shifu has not yet generated the `se.0` file.

In [None]:
# set filterBy = 'SE' in shifu varsel config

# Pipeline up to norm: init -> stats -> column-stats export -> norm.
for stage in ('init', 'stats'):
    getattr(shifu_model, stage)()
    print(f'finish {stage}')

shifu_model.export('-t', 'columnstats')
print('finish exporting columnstats')

shifu_model.norm()
print('finish norm')

# Reset once, then run three varsel rounds.
shifu_model.varsel('-reset')

se_rounds = 3
for se_round in range(se_rounds):
    shifu_model.varsel()
    print(f'finish running SE iteration {se_round}')
    



# way to run shifu PSI

In [None]:
# change config
month_col = 'driver_monthly'
shifu_model.model_config.stats.psiColumnName = month_col
shifu_model.save()
shifu_model.model_config.show()

# need to run stats first, then the PSI stats
for stats_args, done_message in (
    ((), 'finish stats'),
    (('-psi',), 'finish PSI stats'),
):
    shifu_model.stats(*stats_args)
    print(done_message)

shifu_model.export('-t', 'columnstats')
print('finish exporting columnstats')

# eval

In [None]:
%%time
# download to local

from automation_utils.common.hdfs import get_hdfs_to_local_csv

# Put the extra library folder on sys.path (once) before the import below;
# presumably this is where fast_perf_v5 lives -- confirm.
lib_path = "/projects/gds-focus/data/wzhao5/Python3"
if lib_path not in sys.path:
    sys.path.append(lib_path)

from fast_perf_v5 import fast_perf_v5


# Source: the eval2 score folder of the model on HDFS, plus its header file.
eval_score_data_dir = os.path.join(shifu_hdfs_dir, shifu_model_name, 'evals/eval2/EvalScore')
eval_score_header_path = os.path.join(eval_score_data_dir, '.pig_header')

# Destination: a CSV file under data/shifu_eval/<model name>/.
local_data_path = f'data/shifu_eval/{shifu_model_name}/eval2_eval_score_tmp.csv'
print(f'downloading data from {eval_score_data_dir}')

local_data_dir = os.path.dirname(local_data_path)
os.makedirs(local_data_dir, exist_ok=True)

get_hdfs_to_local_csv(
    eval_score_data_dir,
    local_data_path,
    eval_score_header_path,
)
print(f'data download to {local_data_path}')

In [None]:
%%time

# Score column(s) to evaluate, the eval set name and the experiment label.
model_score_names = ['model0']
eval_set, expt_name = 'eval2', 'model_101'

# Evaluate the downloaded score file; results go into the same folder.
eval_data_path = local_data_path
eval_result_dir = os.path.dirname(eval_data_path)

print('model score names', model_score_names)
print('eval data path', eval_data_path)

In [None]:


# Excel output of the gain chart, named after the experiment and eval set.
gainchart_file_name = f"gainchart_{expt_name}_{eval_set}.xlsx"
excl_eval_result_path = os.path.join(eval_result_dir, gainchart_file_name)
print(f"gainchart result path: {excl_eval_result_path}")

# Single dimensions first, then driver_is_oot crossed with each of the others.
base_dims = ['driver_is_oot', 'driver_is_cbp', 'driver_is_fp', 'driver_product']
dim_list = base_dims + [f'driver_is_oot*{dim}' for dim in base_dims[1:]]

# The same weight columns are used on both axes (separate list objects).
weight_columns = ['driver_dol_usd_amt', 'driver_dol_usd_amt_cap1k', 'driver_unit_wgt']

args = dict(
    dataPath=eval_data_path,
    delimiter='\x07',
    badList=['driver_is_cc_bad_v1'],
    scoreList=model_score_names,
    dimList=dim_list,
    xWeight=list(weight_columns),
    yWeight=list(weight_columns),
    weightAlias=['dol', 'dol_cap1k', 'unit'],
    filterExpr={
        '$AND': {
            'driver_txn_status': 'Approved',
        },
    },
    OP=100,
    TopOP=100,
    outputFile=excl_eval_result_path,
)

print('start running gainchart')
perf = fast_perf_v5(**args)
perf.run()

print(f'finish running model performance evaluation: {expt_name} on {eval_set}')


In [None]:
# Save a copy of the gain-chart table next to the Excel file, as CSV without the index.
csv_file_name = f"gainchart_{expt_name}_{eval_set}.csv"
csv_eval_result_csv_path = os.path.join(eval_result_dir, csv_file_name)

df_out = perf.output.copy()
df_out.to_csv(csv_eval_result_csv_path, index=False)
print(f'eval result save to {csv_eval_result_csv_path}')

In [None]:
# The gain-chart CSV is written with DataFrame.to_csv defaults, i.e. comma
# separated, so it must be read back with the default separator as well.
perf_df = pd.read_csv(csv_eval_result_csv_path)

print('overall performance')
op_num = 5
op_columns = [f'OP{i+1}' for i in range(op_num)]

# Keep the dollar catch-rate rows whose dim_name / dim_value equal 1
# (presumably the un-segmented "overall" rows -- confirm). Both the int and
# the str form are accepted because the parsed column type can be either.
overall_rows = perf_df[
    (perf_df['Metric'] == 'dol Catch_Rate')
    & (perf_df['dim_name'].isin([1, '1']))
    & (perf_df['dim_value'].isin([1, '1']))
]

ret = pd.pivot_table(overall_rows,
                     index=['score'],
                     values=op_columns)

# pivot_table sorts the value columns by name; restore OP1..OPn order.
ret = ret[op_columns]

print(ret)