# env config

In [4]:
import os
import sys
import json
import re
from datetime import datetime

# Per-user locations: the project checkout and a per-user directory (contents not shown in
# this notebook) that is added to the import path below.
username = 'guxia'
repo_dir = f'/projects/gds-focus/data/{username}/UCC_LATAM23/'
secret_path = f'/projects/{username}/secret'
# Work from the repo root: later cells derive their paths from os.getcwd() and use
# relative folders such as `shifu_model`.
os.chdir(repo_dir)
# Make both directories importable. The membership test keeps a re-run of this cell from
# appending duplicate entries.
# NOTE: repo_dir already ends with '/', so the second entry contains a double slash.
for p in [secret_path, f"{repo_dir}/utils"]:
    if p not in sys.path:
        sys.path.append(p)

import pandas as pd
import numpy as np
import tensorflow as tf

import aml.cloud_v1 as cloud
cloud.notebook.authenticate_user()

from automation_utils.common.file import read_by_line

%reload_ext cloudmagics.bigquery
%config PPMagics.domain="ccg24-hrzana-gds-focus"
%config PPMagics.autolimit=0
%url -c horton


Initializing Google Cloud Authentication with your @paypalinc account.
	 Please ensure that you have applied the following roles:
	 PP_GCP_PROD_CONSOLE_SSO_ACCOUNT
	 PP_GCP_PROD_CONSOLE_USER_ACCOUNT
	 For new users, you may be redirected to a new URL to complete the authentication.


# export shifu norm

In [5]:
from automation_utils.zonkey import run_cmd

from pyScoring.graph import Graph
from pyScoring.shifu import ShifuTransformer
from pyScoring.model.model import merge_model_specs
from pyScoring.onnx.support.tf2.tf2_to_onnx import tf_model_to_onnx_as_spec
from pyScoring import UMEModel, InputBuilder, NormalizeBuilder, ConstantBuilder, ReNameBuilder, ModeltoNodeBuilder



Using jnius as backend.
Loaded scoring jar from /opt/conda/jars/scoring-all-in-one-0.7.0.1-jar-with-dependencies.jar
The installed tensorflow version is 2.x, the ModelCreator won't provided compatible for this version. Please use pyScoring.graph.Graph and pyScoring.tensorflow.TensorflowBuilder or pyScoring.keras.KerasTransformer to build your model

In [6]:
def update_shifu_column_config(shifu_folder, features):
    """
    Set the finalSelect flag of every column in a shifu ColumnConfig.json.

    Columns whose ``columnName`` is in ``features`` get ``finalSelect = True``; every
    other column gets ``finalSelect = False``. ``ColumnConfig.json`` is rewritten in place.
    Before the first modification a copy named ``ColumnConfig_export_bkp.json`` is made in
    the same folder; an existing backup is never overwritten.

    Args:
        shifu_folder: folder that contains ``ColumnConfig.json``.
        features: iterable of column names that should stay selected.

    Returns:
        None. Prints the number of selected columns.
    """
    cc_path = os.path.join(shifu_folder, "ColumnConfig.json")
    backup_path = os.path.join(shifu_folder, "ColumnConfig_export_bkp.json")

    # Keep a one-time backup, because the finalSelect values are about to be overwritten.
    if not os.path.exists(backup_path):
        run_cmd(f"cp {cc_path} {backup_path}")
        print(f"backup shifu column config for {shifu_folder}")

    # Read through a context manager so the file handle is closed before the same
    # path is reopened for writing below.
    with open(cc_path, "r") as f:
        cc = json.load(f)

    # Columns with finalSelect == False are ignored by the following norm export step.
    features_set = set(features)
    for elem in cc:
        elem["finalSelect"] = elem["columnName"] in features_set
    print("{} variables selected".format(sum(i["finalSelect"] for i in cc)))

    # Store the updated column config.
    with open(cc_path, "w") as f:
        json.dump(cc, f)
        

In [7]:
def save_one_spec(tf_model_path,
                  model_name,
                  norm_nodes,
                  tf_features,
                  tf_output_names,
                  model_spec_folder,
                  score_scale=False,
                  lower_bound=0,
                  upper_bound=1000):
    """
    Merge a saved tf model and the shifu norm nodes into one UME spec and save it.

    Args:
        tf_model_path: path handed to ``tf.keras.models.load_model``.
        model_name: prefix of the spec name (``<model_name>_spec.m``).
        norm_nodes: list of shifu normalization nodes appended to the merged spec.
        tf_features: input names fed to the tf model; they must carry the segment suffix
            and match the outputs of the shifu norm nodes.
        tf_output_names: final output names of the merged spec.
        model_spec_folder: folder the merged spec is saved to (created when missing).
        score_scale: when True, each tf output is routed through a NormalizeBuilder node
            configured with ``interval(0, 1)`` and ``map_to(lower_bound, upper_bound)``.
        lower_bound, upper_bound: bounds passed to ``map_to`` when ``score_scale`` is set.

    Returns:
        Tuple of (merged spec, lower-cased spec file name).
    """
    print("tf mmoe model path: ", tf_model_path)
    keras_model = tf.keras.models.load_model(tf_model_path)

    # With scaling enabled the converted model emits temporary "_tmp" names, so that the
    # scaling nodes can publish the final output names.
    tmp_output_names = [name + "_tmp" for name in tf_output_names]
    onnx_output_names = tmp_output_names if score_scale else tf_output_names
    tf_spec = tf_model_to_onnx_as_spec(tf_model=keras_model,
                                       output_mappings=onnx_output_names,
                                       debug=False)

    # Bridge between the norm outputs and the first input of the converted tf model.
    input_nodes = InputBuilder(inputs=tf_features, output=tf_spec.inputs[0]).build()

    # Optional scaling nodes, one per tf output.
    if score_scale:
        scale_nodes = [
            NormalizeBuilder(final_name + "_norm", tmp_name, final_name)
            .interval(0, 1)
            .map_to(lower_bound, upper_bound)
            .build()
            for tmp_name, final_name in zip(tmp_output_names, tf_output_names)
        ]
    else:
        scale_nodes = []

    # Assemble every node into the final model spec.
    spec_file_name = f"{model_name}_spec.m"
    extra_nodes = [input_nodes] + scale_nodes + norm_nodes
    merged_spec = merge_model_specs(specs=[tf_spec],
                                    extra_nodes=extra_nodes,
                                    model_name=spec_file_name)

    os.makedirs(model_spec_folder, exist_ok=True)
    merged_spec.save(model_spec_folder)

    spec_file_name_lower = spec_file_name.lower()
    print(spec_file_name_lower)

    return merged_spec, spec_file_name_lower

Some context about MMoE:
In MMoE we use shifu to run variable selection and variable normalization for each sub-segment. But when it comes
to deployment, we need to concatenate the variable normalization layer together with the tensorflow layer to make one single
UME. Thus we need to export the shifu norm layer as a UME, then concatenate it with the tensorflow model UME.

In [8]:
# parameters

# every folder below is derived from the current working directory
working_dir = os.getcwd()

model_name = 'mmoe_debug'
# checkpoint number; formatted as a two-digit value when the checkpoint path is built
model_ck_num = 2
# in this case, the tf model produces two outputs; these are the output names given to the exported spec
tf_output_names = [f'{model_name}_is_cc_bad', f'{model_name}_loss_reg'] 
weight_columns = ["driver_dol_wgt"]
target_columns = ["driver_is_cc_bad"]

# tensorflow model checkpoint folder
model_ckpt_dir = os.path.join(working_dir, "model_ckpt") 
# folder holding the feature names used by the tensorflow model. The feature order must be the
# same as the order of the tensor fed into the tf model during training.
model_asset_dir = os.path.join(working_dir, 'model_asset') 
# folder the exported tensorflow model spec is saved to
model_spec_folder = os.path.join(working_dir, 'model_export', model_name) 

# segments and features
# local shifu model root folder (relative to the working directory)
local_shifu_folder = 'shifu_model'

# one shifu model per segment; the list position is the segment index
shifu_models = [
    "varsel_ucc_latam_seg0",    # overall
    "varsel_ucc_latam_seg1",    # usdamt_500
    "varsel_ucc_latam_seg2",    # usdamt_200
    "varsel_ucc_latam_seg3",    # usdamt_100
    "varsel_ucc_latam_seg4",    # cbp, without stc/cc
    "varsel_ucc_latam_seg5",    # ucc21_400
    "varsel_ucc_latam_seg6",    # ccbinbadrt_0.008
    "varsel_ucc_latam_seg7",    # cbp, with stc/cc
]


# selected segment indexes (positions in shifu_models)
seg_idx = [0,3,5,6,7]
# one suffix per entry of shifu_models (not only the selected ones), so suffix[idx] can be
# looked up directly by segment index
suffix = [f"__seg{idx}" for idx in range(len(shifu_models))]

In [9]:
# Split the model's feature list by segment.
# In features.txt the overall segment (seg 0) has no `__seg0` suffix; every other segment
# carries its `__seg<N>` suffix.
features = read_by_line(f"{model_asset_dir}/{model_name}/features.txt")
feat_df = pd.DataFrame(features, columns=['name'])
feat_df['seg_idx'] = [
    int(name.rpartition('__seg')[2]) if '__seg' in name else 0
    for name in feat_df['name']
]

# segment index -> feature names with the segment suffix stripped
seg_features = {
    idx: [name.partition('__seg')[0]
          for name in feat_df.loc[feat_df['seg_idx'] == idx, 'name']]
    for idx in seg_idx
}


# For each selected segment: rewrite its shifu column config, then collect its norm nodes.
norm_nodes = []
for idx in seg_idx:
    shifu_model = shifu_models[idx]
    shifu_model_folder = f"{local_shifu_folder}/{shifu_model}"
    print(f'processing shifu model {shifu_model}')
    update_shifu_column_config(shifu_model_folder, seg_features[idx])

    # with the postfix parameter, outputs of the shifu norm are named xxx`{suffix}`
    shifu_norm = ShifuTransformer(shifu_model_folder)
    norm_nodes += shifu_norm.create_shifu_transformation_nodes(postfix=suffix[idx])


# tf input names: overall-segment features get an explicit `__seg0` suffix
tf_features = [name if '__seg' in name else name + '__seg0' for name in features]


# stand-alone spec that contains only the norm nodes
graph = Graph()
graph.add_nodes(norm_nodes)
norm_spec = graph.generate_model_by_graph(model_name=model_name+"_norm_spec", optimization=False)

processing shifu model varsel_ucc_latam_seg0
776 variables selected
processing shifu model varsel_ucc_latam_seg3
368 variables selected
processing shifu model varsel_ucc_latam_seg5
370 variables selected
processing shifu model varsel_ucc_latam_seg6
362 variables selected
processing shifu model varsel_ucc_latam_seg7
380 variables selected


This will apply various optimizations to reduce model size or improve performance. You can also use following code to do the optimization later.
	>>> from pyScoring.model.optimizer import optimize 
	>>> optimize(model)



In [10]:
# checkpoint path: <model_ckpt_dir>/<model_name>/model-<two-digit checkpoint number>.ckpt
tf_model_path = f"{model_ckpt_dir}/{model_name}/model-{model_ck_num:02d}.ckpt"

# Export without output scaling; the returned spec file name is not needed here.
model_spec, _ = save_one_spec(
    tf_model_path=tf_model_path,
    model_name=model_name,
    norm_nodes=norm_nodes,
    tf_features=tf_features,
    tf_output_names=tf_output_names,
    model_spec_folder=model_spec_folder,
    score_scale=False,
)


tf mmoe model path:  /projects/gds-focus/data/guxia/UCC_LATAM23/model_ckpt/mmoe_debug/model-02.ckpt
checking onnx model spec...
mmoe_debug_spec.m


# validate 

## validate norm layer

In [239]:
# Sample files for the norm-layer check, both under <working_dir>/model_export:
#   raw_sample_data_path    - original raw data
#   normed_sample_data_path - the normed original data
raw_sample_data_path = os.path.join(working_dir, 'model_export', 'validation_raw_data_500_stc_cc.csv')
normed_sample_data_path = os.path.join(working_dir, 'model_export', 'norm_sample_rows.csv') 

# Every column is read as text (dtype=str); raw frame first, then the normed frame.
raw_df, norm_df = (
    pd.read_csv(sample_path, dtype=str)
    for sample_path in (raw_sample_data_path, normed_sample_data_path)
)

In [240]:
raw_df.head()

Unnamed: 0,driver_trans_id,driver_pmt_start_date,driver_pmt_start_ts,driver_usd_amt,driver_cg_type_code,driver_cg_3pc_weight,driver_ucc_sf_static_rmr_score,driver_ucc21_model_score1,driver_ucc_latam_cbp_model_score1,driver_sndr_id,...,stc_pp_addr_city_match_score,stc_pp_addr_state_match_score,stc_pp_name_match_score,stc_pp_email_match_score,stc_pp_ip_match_score,stc_customer_dof,ucc_cc_engagement,ucc_trust_variable,ucc_trust_combine,ucc_cc_segment_crime
0,2.1128560877329136e+16,2022-08-12,2022-08-12 13:29:10.0,2.5,0,1.0,216.8049,195.1117,,5593627300869059724,...,,,,,,,,,,
1,2.114925079895421e+16,2022-08-31,2022-08-31 15:13:47.0,7.47,0,1.0,859.3881,550.1766,,1259348676648328678,...,,,,,,,,,,
2,2.114395072030918e+16,2022-08-26,2022-08-26 06:20:44.0,0.5,0,1.0,317.9399,62.3186,,5616219136527822036,...,,,,,,,,,,
3,2.114925297435199e+16,2022-08-31,2022-08-31 18:56:32.0,2.48,0,1.0,359.0957,110.2269,,1933795179558290733,...,,,,,,,,,,
4,2.1137367341078196e+16,2022-08-20,2022-08-20 11:12:32.0,44.03,0,1.0,700.7126,331.4491,,6090514464929197745,...,,,,,,,,,,


In [232]:
norm_df.head()

Unnamed: 0,driver_trans_id,driver_is_cc_bad,driver_dol_wgt,driver_pmt_start_date,driver_pmt_start_ts,driver_usd_amt,driver_cg_type_code,driver_cg_3pc_weight,driver_ucc_sf_static_rmr_score,driver_ucc21_model_score1,...,stc_pp_addr_city_match_score__seg7,stc_pp_addr_state_match_score__seg7,stc_pp_name_match_score__seg7,stc_pp_email_match_score__seg7,stc_pp_ip_match_score__seg7,stc_customer_dof__seg7,ucc_cc_engagement__seg7,ucc_trust_variable__seg7,ucc_trust_combine__seg7,ucc_cc_segment_crime__seg7
0,2.112396468094745e+16,0,1.0,2022-08-08,2022-08-08 05:56:08.0,53.72,0,1.0,974.3114,682.3746,...,0.21330915,2.2596936,1.397657,1.6443917,1.9279009,0.5118835,-0.21281977,-0.448634,-0.6465488,-0.3358863
1,2.114285489639743e+16,0,7.52,2022-08-25,2022-08-25 11:38:18.0,7.52,0,1.0,395.6728,67.6076,...,0.21330915,2.2596936,1.397657,1.6443917,1.9279009,0.5118835,-0.21281977,-0.448634,-0.6465488,-0.3358863
2,2.1129668413384884e+16,0,1.0,2022-08-13,2022-08-13 12:48:07.0,7.54,0,1.0,895.3738,499.1982,...,0.21330915,2.2596936,1.397657,1.6443917,1.9279009,0.5118835,-0.21281977,-0.448634,-0.6465488,-0.3358863
3,2.1138256452693504e+16,0,2.47,2022-08-21,2022-08-21 08:38:23.0,2.47,0,1.0,515.7795,153.0184,...,0.21330915,2.2596936,1.397657,1.6443917,1.9279009,0.5118835,-0.21281977,-0.448634,-0.6465488,-0.3358863
4,2.114485786934201e+16,0,2.5,2022-08-27,2022-08-27 16:21:19.0,2.5,0,1.0,344.549,67.9198,...,0.21330915,2.2596936,1.397657,1.6443917,1.9279009,0.5118835,-0.21281977,-0.448634,-0.6465488,-0.3358863


In [300]:
def validate_norm_layer(ume_normed, original_normed, features, tol=1e-5):
    """
    Compare UME-normalized values against reference normalized values, feature by feature.

    For each name in ``features`` the reference value is read from ``original_normed[name]``.
    The UME value is read from ``ume_normed[name]`` when the name already contains
    ``__seg``, otherwise from ``ume_normed[name + '__seg0']``. Both values are converted
    with ``float`` before comparison.

    Two values match when both are NaN or when they differ by at most ``tol``. A NaN on
    only one side is reported as a mismatch (a plain ``abs(diff) > tol`` test is False for
    NaN and would hide it).

    Args:
        ume_normed: mapping of suffixed feature name -> value produced by the UME.
        original_normed: mapping of feature name -> reference normalized value.
        features: iterable of feature names to check.
        tol: absolute tolerance; defaults to 1e-5.

    Returns:
        List of feature names that mismatched, in iteration order. Each mismatch is
        also printed.
    """
    import math  # local import: `math` is not part of the notebook's import cell

    mismatched = []
    for c in features:
        ume_key = c if '__seg' in c else c + '__seg0'
        ume = ume_normed[ume_key]
        original = original_normed[c]

        ume_val = float(ume)
        original_val = float(original)
        if math.isnan(ume_val) and math.isnan(original_val):
            # missing on both sides counts as agreement
            continue
        # isclose with rel_tol=0 is a pure absolute check and is False when exactly one side is NaN
        if not math.isclose(ume_val, original_val, rel_tol=0.0, abs_tol=tol):
            print(f"{c} mismatch, origianl: {original}, ume normed: {ume}")
            mismatched.append(c)
    return mismatched
        

In [270]:
# driver_trans_id values to spot-check. They are kept as strings because both frames are
# read with dtype=str and the lookup in the validation loop is a string comparison.
trans_ids = ['21123964680947451.000000000000000000',
             '21142854896397431.000000000000000000',
             '21129668413384885.000000000000000000',
             '21138256452693504.000000000000000000',
             '21144857869342011.000000000000000000',
             '21133859058991650.000000000000000000',
             '21149250798954206.000000000000000000',
             '21121965966476916.000000000000000000',
             '21135155945282571.000000000000000000',
             '21133857158987217.000000000000000000',
             '21134957035247967.000000000000000000',
             '21123057465095772.000000000000000000']

# Batch run of the norm-only spec over the whole raw sample.
# NOTE(review): ume_norm_df is not read by any cell visible in this notebook; the per-row
# check uses predict_row instead.
ume_norm_df = norm_spec.predict_pandas(raw_df)

start prediction for 500 rows


In [298]:
# For every spot-check transaction: take its row from the normed reference frame and from
# the raw frame, run the raw row through the norm-only spec, and compare feature by feature.
for trans_id in trans_ids:
    print(trans_id)

    # first matching row of each frame as a plain {column: value} dict
    original_normed = norm_df.loc[norm_df['driver_trans_id'].eq(trans_id)].iloc[0].to_dict()
    original_data = raw_df.loc[raw_df['driver_trans_id'].eq(trans_id)].iloc[0].to_dict()

    ume_normed = norm_spec.predict_row(original_data)
    validate_norm_layer(ume_normed, original_normed, features)

21123964680947451.000000000000000000
21142854896397431.000000000000000000
21129668413384885.000000000000000000
21138256452693504.000000000000000000
21144857869342011.000000000000000000
21133859058991650.000000000000000000
21149250798954206.000000000000000000
21121965966476916.000000000000000000
21135155945282571.000000000000000000
21133857158987217.000000000000000000
21134957035247967.000000000000000000
21123057465095772.000000000000000000


## validate final scoring

In [345]:
# Sample files for the final-score check (the same two files as in the norm-layer section):
#   raw_sample_data_path    - original raw data
#   normed_sample_data_path - the normed original data
raw_sample_data_path = os.path.join(working_dir, 'model_export', 'validation_raw_data_500_stc_cc.csv') 
normed_sample_data_path = os.path.join(working_dir, 'model_export', 'norm_sample_rows.csv') 

# Every column is read as text (dtype=str); raw frame first, then the normed frame.
raw_df, norm_df = (
    pd.read_csv(sample_path, dtype=str)
    for sample_path in (raw_sample_data_path, normed_sample_data_path)
)

In [332]:
# checkpoint path: <model_ckpt_dir>/<model_name>/model-<two-digit checkpoint number>.ckpt
tf_model_path = f"{model_ckpt_dir}/{model_name}/model-{model_ck_num:02d}.ckpt"

# load the saved tf model so its scores can be compared with the exported spec
tf_model = tf.keras.models.load_model(filepath=tf_model_path)


In [334]:
# use tf model to predict normed data
# norm_df was read with dtype=str, so the feature columns are cast to float first; the
# columns are taken in the order given by `features`.
original_normed_matrix = norm_df[features].astype(float).to_numpy()

# reference scores straight from the tf model (the name `orignal_score` is used as-is by the next cell)
orignal_score = tf_model.predict(original_normed_matrix)

# NOTE(review): index 0 is presumably the first of the two outputs named in
# tf_output_names (the is_cc_bad head) — confirm the output order.
orignal_score[0]

array([[0.03939489],
       [0.08837193],
       [0.03855717],
       [0.03417599],
       [0.0083147 ],
       [0.06309408],
       [0.04143983],
       [0.040133  ],
       [0.04914853],
       [0.03512523],
       [0.02677456],
       [0.07090282]], dtype=float32)

In [343]:
# use UME to predict original raw data and compare each score with the tf model score

# Output to compare: the first configured output name. With the current parameters this is
# 'mmoe_debug_is_cc_bad'; deriving it keeps the check valid when model_name changes.
score_name = tf_output_names[0]

for i, trans_id in enumerate(norm_df['driver_trans_id'].to_list()):
    # raw row that belongs to the i-th normed row
    row = raw_df[raw_df['driver_trans_id'] == trans_id].iloc[0].to_dict()
    score = model_spec.predict_row(row)
    ume_score = score[score_name]
    tf_score = orignal_score[0][i]
    print(ume_score)
    # `not (diff <= tol)` instead of `diff > tol`: both comparisons are False for NaN, so
    # the second form would let a NaN score pass as a match.
    if not (abs(ume_score - tf_score) <= 1e-5):
        print(f"mismatch, tf score: {tf_score}, ume score: {ume_score}")

0.03939485549926758
0.08837199211120605
0.03855717182159424
0.034175992012023926
0.008314669132232666
0.06309407949447632
0.041439831256866455
0.040132999420166016
0.049148499965667725
0.03512522578239441
0.026774555444717407
0.07090282440185547
