# pymetry-automl

#### Imports

In [None]:
%matplotlib inline

import sys
import os.path

sys.path.append("/opt/symetry/python")

import pandas as pd
import numpy as np
import json

import time

import pymetry as pym
import pymetry.ds as pym_ds
import pymetry.project as pym_prj
import pymetry.utilities.jobs as pym_job
import pymetry.utilities.utils as pym_ut

import sklearn.metrics as skmetrics


# hide shap/numpy deprecation warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import shap
shap.initjs()

In [None]:
# Connection settings are read from the environment so the secret key is never
# stored in the notebook. SERVER and SYM_KEY_ID fall back to this notebook's
# default server and key id; the secret key deliberately has no fallback.
sym_secret_key = os.environ.get("SYM_SECRET_KEY")
if not sym_secret_key:
    raise RuntimeError(
        "SYM_SECRET_KEY is not set; export it before running this notebook."
    )

pym.init_conf({
    "SERVER" : os.environ.get("SYM_SERVER", 'http://charm:8080'),
    "SYM_KEY_ID" : os.environ.get("SYM_KEY_ID", "c1"),
    "SYM_SECRET_KEY" : sym_secret_key,
})

#### Test Parameters

In [None]:
# --- Project and data locations ---
PROJECT_TYPE = 'cpu'
TRAIN_FILE = "../../../data/train_bank_BinaryHotEncode.csv"
TEST_FILE = "../../../data/test_bank_BinaryHotEncode.csv"

# --- Classifier settings ---
CLASS_MODEL_TYPE = 'lda'
CLASS_MODEL_TARGET_ATTRIBUTES = ['target']

# Column names, laid out one name-prefix group per row.
ATTRIBUTE_NAMES = [
    "target",
    "age", "balance", "day", "duration", "campaign", "pdays", "previous",
    "job_admin.", "job_blue-collar", "job_entrepreneur", "job_housemaid",
    "job_management", "job_retired", "job_self-employed", "job_services",
    "job_student", "job_technician", "job_unemployed", "job_unknown",
    "marital_divorced", "marital_married", "marital_single",
    "education_primary", "education_secondary", "education_tertiary", "education_unknown",
    "default_no", "default_yes",
    "housing_no", "housing_yes",
    "loan_no", "loan_yes",
    "contact_cellular", "contact_telephone", "contact_unknown",
    "month_apr", "month_aug", "month_dec", "month_feb", "month_jan", "month_jul",
    "month_jun", "month_mar", "month_may", "month_nov", "month_oct", "month_sep",
    "poutcome_failure", "poutcome_other", "poutcome_success", "poutcome_unknown",
]

# One type code per attribute, in the same order as ATTRIBUTE_NAMES.
# NOTE(review): presumably "B" = binary and "C" = continuous -- confirm against
# the SymetryML attribute-type documentation.
ATTRIBUTE_TYPES = [
    "B",
    "C","C","C","C","C","C","C",
    "B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B"
]

# Model inputs are every attribute except the (single) target column.
CLASS_MODEL_INPUT_ATTRIBUTES = [
    name for name in ATTRIBUTE_NAMES if name != CLASS_MODEL_TARGET_ATTRIBUTES[0]
]

### Load Data

In [None]:
# Read the train and test splits from the configured CSV files.
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)

# Show the shape of each split, one per line.
print(train_df.shape, test_df.shape, sep="\n")

# Convert both frames (train first, then test) to the SML JSON form that is
# passed to the project calls later in the notebook.
smldf_train, smldf_test = map(pym_ut.pandas_df_to_sml_json, (train_df, test_df))

# Create Base Project on SymetryML Server

In [None]:
prj_plain = 'bankPlain'

# Parameters for the baseline project; the target name comes from the shared
# CLASS_MODEL_TARGET_ATTRIBUTES constant.
project_params = dict(
    rf_type="rf_classifier",
    rf_target_name=CLASS_MODEL_TARGET_ATTRIBUTES[0],
    rf_features="*",
)

rsp = pym_prj.create_project(
    prj_plain,
    project_type=PROJECT_TYPE,
    project_params=project_params,
)
print(rsp.content)

### Update Base Project

In [None]:
%%time

client_id = "pymetry-ipynb"

rsp = pym_prj.stream_data_to_project(prj_plain, smldf_train, client_id='pymetry')
print(rsp.content)


### Auto Select Model on Base Project

In [None]:
class_model_name = 'clr'

# Request context for auto-selection on the baseline project: target and input
# attribute names plus the grid / selector options.
ml_context = dict(
    targetAttributeNames=CLASS_MODEL_TARGET_ATTRIBUTES,
    inputAttributeNames=CLASS_MODEL_INPUT_ATTRIBUTES,
    extraParameters=dict(
        autoselect_grid_type="autoselect_grid_type_normal",
        selector_type="selector_type_fw_bw",
    ),
)

rsp = pym_prj.auto_select_df(ml_context, prj_plain, class_model_name, smldf_test)
print(rsp.content)

# Block until the server-side job reports completion.
pym_job.wait_for_job_finish(rsp)

### Assess model of Base Project

In [None]:
%%time
plain_predict_proba = pym_prj.make_predict_proba(test_df, prj_plain, class_model_name)
predict_result = plain_predict_proba(test_df)

base_auc = skmetrics.roc_auc_score(test_df[CLASS_MODEL_TARGET_ATTRIBUTES[0]], predict_result)
print("Base AUC : ", str(base_auc))

# Create AutoML Project

In [None]:
prj_auto = 'bankAuto'

# Same rf_* settings as the baseline project, plus the automl_* options.
project_params = dict(
    rf_type="rf_classifier",
    rf_target_name=CLASS_MODEL_TARGET_ATTRIBUTES[0],
    rf_features="*",
    automl_project_is_automl="true",
    automl_target_name=CLASS_MODEL_TARGET_ATTRIBUTES[0],
    automl_warmup_size="5000",
    automl_use_feature_importance="false",
    automl_use_svd="false",
    automl_add_feature_interaction="true",
    automl_feature_interaction_threshold="0.1",
)

rsp = pym_prj.create_project(
    prj_auto,
    project_type=PROJECT_TYPE,
    project_params=project_params,
)
print(rsp.content)

### Update AutoML Project

In [None]:
%%time

client_id = "pymetry-ipynb"
rsp = pym_prj.stream_data_to_project(prj_auto, smldf_train, client_id='pymetry')
print(rsp.content)

### Auto Select Model on AutoML Project

In [None]:
# Read the attribute names the AutoML project reports for itself.
res = pym_prj.get_project_info(prj_auto)
automl_model_attributes = json.loads(res.content)['values']['smlInfo']['attributeNames']

# Drop the target so only model inputs remain. The name comes from the shared
# constant (as in the other cells) instead of a hard-coded 'target' literal;
# list.remove still raises ValueError if the target is unexpectedly missing.
automl_model_attributes.remove(CLASS_MODEL_TARGET_ATTRIBUTES[0])

In [None]:
%%time

class_model_name = 'clr'
ml_context = {
    "targetAttributeNames": CLASS_MODEL_TARGET_ATTRIBUTES,
    "inputAttributeNames": automl_model_attributes,
    "extraParameters" : {
        "autoselect_grid_type": "autoselect_grid_type_normal",
        "selector_type": "selector_type_fw_bw"
    }
    
}
rsp = pym_prj.auto_select_df(ml_context,
                             prj_auto,
                             class_model_name,
                             smldf_test)
print(rsp.content)
pym_job.wait_for_job_finish(rsp)

In [None]:
%%time
auto_predict_proba = pym_prj.make_predict_proba(test_df, prj_auto, class_model_name)
predict_result = auto_predict_proba(test_df)
auto_auc = skmetrics.roc_auc_score(test_df[CLASS_MODEL_TARGET_ATTRIBUTES[0]], predict_result)
print("AutoML AUC : ", str(auto_auc))

# Create a project for SHAP w/ Transformed Data

### Transform Train and Test Data Frames

In [None]:
%%time

res = pym_prj.transform_dataframe(prj_auto, smldf_train)
df_json = json.loads(res.content)['values']['dataframe']
train_x_df = pym_ut.sml_json_to_pandas_df(df_json)

res = pym_prj.transform_dataframe(prj_auto, smldf_test)
df_json = json.loads(res.content)['values']['dataframe']
test_x_df = pym_ut.sml_json_to_pandas_df(df_json)


In [None]:
# Shapes of the transformed train and test frames, one per line.
print(train_x_df.shape, test_x_df.shape, sep="\n")

In [None]:
prj_shap = 'bankShap'

# No extra project parameters are passed for the SHAP project.
project_params = dict()

rsp = pym_prj.create_project(
    prj_shap,
    project_type=PROJECT_TYPE,
    project_params=project_params,
)
print(rsp.content)

### Update SHAP Project

In [None]:
%%time
smldf_train_x = pym_ut.pandas_df_to_sml_json(train_x_df)

rsp = pym_prj.stream_data_to_project(prj_shap, smldf_train_x, client_id='pymetry')
print(rsp.content)

### Create a Model on SHAP that's identical to Model on AutoML

In [None]:
# Fetch the AutoML model's description and keep only its 'modelInfo' section.
rsp = pym_prj.get_model_info(prj_auto, class_model_name)
model_info = json.loads(rsp.content)['values']['modelInfo']

In [None]:
# Reuse the AutoML model's target names, input attribute names and model type
# so the model built on the SHAP project mirrors it.
ml_context = dict(
    targetAttributeNames=model_info['targetNames'],
    inputAttributeNames=model_info['attributeNames'],
)

rsp = pym_prj.build_model(
    ml_context,
    prj_shap,
    class_model_name,
    model_info['modelType'],
)
print(rsp.content)

# Wait for the build job and keep its result for inspection.
job_result = pym_job.wait_for_job_finish(rsp)

In [None]:
# Predict-proba callable for the SHAP project's model, built from the
# transformed test frame.
shap_predict_proba = pym_prj.make_predict_proba(
    test_x_df,
    prj_shap,
    class_model_name,
)

### Validate Predictions of the SHAP and AutoML Models

In [None]:
%%time

res = auto_predict_proba(test_df)
res[:10]

In [None]:
%%time

res = shap_predict_proba(test_x_df)
res[:10]

### Create SHAP Explainer

In [None]:
# Summarise the transformed training frame with k-means (k = 10) and use that
# summary as the background data for the kernel explainer.
x_train_summary = shap.kmeans(X=train_x_df, k=10)
explainer = shap.KernelExplainer(
    model=shap_predict_proba,
    data=x_train_summary,
)

#### Explain First Row

In [None]:
# Cast the transformed test frame to float64 before passing rows to SHAP.
test_x_df_np = test_x_df.astype("float64")

# SHAP values for the first test row, rendered as a force plot.
shap_values = explainer.shap_values(test_x_df_np.iloc[0])
shap.force_plot(
    explainer.expected_value,
    shap_values,
    test_x_df_np.iloc[0],
)

#### Explain a sample of predictions

In [None]:
# Draw a fixed-seed sample of 50 rows from the float64 transformed test frame,
# compute SHAP values for the sample, and summarise them in one plot.
test_x_df_np_sample = test_x_df_np.sample(
    n=50,
    random_state=42,
)
shap_values = explainer.shap_values(test_x_df_np_sample)
shap.summary_plot(shap_values, test_x_df_np_sample)

### Clear All Projects

In [None]:
# Set to False to keep the projects on the server after the run.
clear = True

if clear:
    # Delete the baseline, AutoML and SHAP projects in that order, printing each
    # response as it arrives (map is lazy, so delete/print stay interleaved).
    for rsp in map(pym_prj.delete_project, (prj_plain, prj_auto, prj_shap)):
        print(rsp.content)