# Use the automation FS (feature selection) pipeline

In [None]:
import os
import pandas as pd
import json

from pyshifu.ShifuConf import shifuConf
from model_automation.feature_selection import Data
from model_automation.feature_selection import FS_Pipeline
from model_automation.feature_selection.methods import *
from model_automation.utils.rmr import run_cmd



# --- User inputs: fill these in before running the cells below. ---

# Dataset location; the header file is read from '<data_path>/.pig_header'.
data_path = ''
# Name of the target column (passed to Data as target_col).
target_column = ''
# Meta columns, passed to Data.load_data as meta_var_list.
# Defined here because the loading cell reads it; without a definition the
# notebook fails with NameError.
meta_columns = []
# Categorical columns, passed to Data.load_data as cat_var_list.
categorical = []
# Variables handed to the SetCandidate step.
candidate = []
# Variables handed to the ForceRemove step.
force_remove_list = []
# Shifu filter expression, stored under "filterExpressions" in the data config.
data_filter_expr = ''

# Session object passed as the first argument of Data.load_data.
spark = None
# Value written to the 'hadoopJobQueue' Shifu setting.
job_queue = ''
# Pipeline name; also used as the prefix of the saved dashboard CSV.
job_name = ''
# Passed to FS_Pipeline as hdfsModelSetPath.
shifu_hdfs_folder = ''
# Passed to FS_Pipeline as folder.
local_output_path = ''


In [None]:
# Configure Shifu: export the SHIFU_OPTS JVM heap flags, then apply the
# job settings below in the listed order.
os.environ['SHIFU_OPTS'] = '-Xms8G -Xmx32G'

_shifu_settings = (
    ('mapreduce.map.java.opts', '-Xms4000m -Xmx8000m -server -XX:MaxPermSize=64m -XX:PermSize=64m -XX:+UseParallelGC -XX:+UseParallelOldGC -XX:ParallelGCThreads=8 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'),
    ('mapreduce.reduce.java.opts', '-Xms4000m -Xmx8000m -server -XX:MaxPermSize=64m -XX:PermSize=64m'),
    ('shifu.norm.shuffle.size', '200'),
    ('mapreduce.map.memory.mb', '8192'),
    ('mapreduce.reduce.memory.mb', '8192'),
    ('hadoopJobQueue', job_queue),
)
for _shifu_key, _shifu_value in _shifu_settings:
    shifuConf.set(_shifu_key, _shifu_value)

# Save the settings, then print the resulting environment for inspection.
shifuConf.save()
shifuConf.print_envs()


# Describe the dataset; the header file is looked up inside data_path.
data = Data(data_path=data_path,
            header_path=os.path.join(data_path, '.pig_header'),
            target_col=target_column,
            gcp_mode=False)

# Load the data; meta_columns and categorical must be defined alongside the
# other user inputs before this cell runs.
data.load_data(spark,
               load_from_VC=False,
               meta_var_list=meta_columns,
               cat_var_list=categorical)

# Update the dataset config: filter expression, delimiters ('\x07' is the BEL
# control character) and the tokens treated as missing/invalid values.
fs_config = dict(
    filterExpressions=data_filter_expr,
    dataDelimiter='\x07',
    headerDelimiter='\x07',
    missingOrInvalidValues=["", "*", "#", "?", "null", "~", "NaN", ".", "NULL"],
)
data.update_data_config(fs_config)


# Print .count() of the dashboard rows whose column_type is 'META'.
dashboard = data.data_dashboard
print('meta column num')
_is_meta = dashboard['column_type'] == 'META'
print(dashboard[_is_meta].count())

# Dump the current dataset config as indented JSON.
print('data config')
print(json.dumps(data.data_config, indent=4))


# Run the FS pipeline on the loaded dataset.
pipeline = FS_Pipeline(data,
                       pipeline_name=job_name,
                       hdfsModelSetPath=shifu_hdfs_folder,
                       folder=local_output_path,
                       gcp_mode=False)

# Steps are instantiated and registered one at a time, in this order.
_fs_steps = (
    (SetCandidate, {'varlist': candidate}),
    (ForceRemove, {'varlist': force_remove_list}),
    (SelectByMetric, {'iv_cut': 0.001,
                      'missing_rate_cut': 0.98,
                      'categorical_distinct_cut': 500}),
    (SelectByCorrelation, {'corr_thr': 0.99}),
)
for _step_cls, _step_kwargs in _fs_steps:
    pipeline.add_step(_step_cls(**_step_kwargs))

# run() returns the feature dashboard used by the cells that follow.
feature_dashboard = pipeline.run()


# Save the feature selection result.
# `== True` builds an elementwise mask and is kept as-is so the selection does
# not depend on the column having a strict bool dtype.
_selected_rows = feature_dashboard[feature_dashboard['Final Select'] == True]  # noqa: E712
final_list = _selected_rows.index.to_list()
print(f'final selected variable num: {len(final_list)}')

# Write the full dashboard into local_output_path, the same folder given to
# FS_Pipeline. The previous target, `vars_path`, was never defined and raised
# NameError here.
_dashboard_csv = os.path.join(local_output_path, f'{job_name}_feature_dashboard.csv')
feature_dashboard.to_csv(_dashboard_csv)
