# env config

In [None]:

import os
import sys
import json
import re
from datetime import datetime

# Per-user locations of the project checkout and of the "secret" directory.
username = 'guxia'
repo_dir = f'/projects/gds-focus/data/{username}/UCC_LATAM23/'
secret_path = f'/projects/{username}/secret'

# Relative paths used later in the notebook resolve against the checkout.
os.chdir(repo_dir)

# Make both directories importable; the membership test keeps re-runs from adding duplicates.
extra_import_dirs = (secret_path, f"{repo_dir}/utils")
sys.path.extend(entry for entry in extra_import_dirs if entry not in sys.path)

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import ModelCheckpoint

import aml.cloud_v1 as cloud
cloud.notebook.authenticate_user()

from automation_utils.common.file import read_by_line
from model_automation.deep_learning_pipeline import data_prepare
from model_automation.utils.rmr import run_cmd

from MMoE import MMoE
from decorators import timeit

# (Re)load the cloudmagics.bigquery IPython extension.
%reload_ext cloudmagics.bigquery
# Set two PPMagics configuration values.
# NOTE(review): presumably `domain` selects the target project and autolimit=0 disables the row limit — confirm.
%config PPMagics.domain="ccg24-hrzana-gds-focus"
%config PPMagics.autolimit=0
# NOTE(review): presumably selects the "horton" connection for later magics — confirm against the magics' docs.
%url -c horton


# config

In [None]:
# Every path below is anchored at the current working directory.
working_dir = os.getcwd()
tfrecord_data_dir = "data/tfrecord"

# Dataset names; they are joined onto tfrecord_data_dir when the TFRecord files are listed.
dev_data = "UCC_LATAM_DEV_MULTI_SEG_0330_stc_cc"
oot_data = "UCC_LATAM_OOT_MULTI_SEG_0330_stc_cc"

# Column-list files (one name per line) kept under <working_dir>/assets.
assets_dir = f"{working_dir}/assets"
ucc_meta_cols_path = f"{assets_dir}/ucc_meta_columns.txt"
candidate_vars_path = f"{assets_dir}/candidate_variables.txt"
categorical_vars_path = f"{assets_dir}/categorical_variables.txt"

# Sample-weight and label columns handed to the dataset builder.
weight_columns = ["driver_dol_wgt"]
target_columns = ["driver_is_cc_bad"]

# Output roots; each later gets a <model_name> sub-directory.
model_ckpt_dir, model_eval_dir, model_asset_dir, model_log_dir = (
    os.path.join(working_dir, leaf)
    for leaf in ("model_ckpt", "model_eval", "model_asset", "model_log")
)


# segments and features
# One Shifu variable-selection model per segment; the position in the list is the segment index.
#   seg0: overall
#   seg1: usdamt_500
#   seg2: usdamt_200
#   seg3: usdamt_100
#   seg4: iscbp_1, without stc/cc
#   seg5: ucc21_400
#   seg6: ccbinbadrt_0.008
#   seg7: iscbp_1, with stc/cc
shifu_models = [f"varsel_ucc_latam_seg{seg}" for seg in range(8)]

# STC/CC variables that are added on top of the ranked features for segment 7.
stc_cc_vars = [
    "stc_pp_addr_line_match_score",
    "stc_pp_addr_zip_match_score",
    "stc_pp_addr_city_match_score",
    "stc_pp_addr_state_match_score",
    "stc_pp_name_match_score",
    "stc_pp_email_match_score",
    "stc_pp_ip_match_score",
    "stc_customer_dof",
    "ucc_cc_engagement",
    "ucc_trust_variable",
    "ucc_trust_combine",
    "ucc_cc_segment_crime",
]

# hyper parameter

In [None]:
# Run identifier: used as the sub-directory name for checkpoints, logs, assets and
# evaluation output, and as the prefix of the per-checkpoint score columns.
model_name = 'mmoe_debug'

# Expert names handed to MMoE; one name per entry of seg_idx, in the same order.
multi_seg_name = ["overall", "usdamt_100", "ucc21_400", "bad_cc", "iscbp",]
# Number of features per segment; placeholder here, recomputed once the feature list is built.
seg_feat_num = []
# Indices into shifu_models / seg_dfs of the segments that feed the model.
seg_idx = [0,3,5,6,7]

# Training batch size (the OOT pipeline uses multiples of it), dropout passed to MMoE,
# and the Adam learning rate.
batch_size = 1024
dropout = 0.5
learning_rate = 5e-4

# Layer-size lists passed to MMoE as expert_structure / gate_structure.
expert_structure = [128, 128]
gate_structure = [64]

# Rank cut-offs for feature selection: overall_top applies to segment 0, seg_top to the others.
overall_top = 550
seg_top = 200

# column preparation

In [None]:
def _read_column_names(path):
    """Return the newline-separated entries of `path`, ignoring leading/trailing newlines."""
    with open(path) as handle:
        return handle.read().strip('\n').split('\n')


ucc_meta_columns = _read_column_names(ucc_meta_cols_path)
# One extra meta column that is appended on top of the list read from the assets file.
ucc_meta_columns += ['CcbinhmcWAXRtBadCntTxn90d30_03']

categorical_columns = _read_column_names(categorical_vars_path)

candidate_columns = _read_column_names(candidate_vars_path)

In [None]:
# load clean features
# The cache file name carries today's date, so a file written earlier today is reused
# and a new day triggers a fresh SolCat query.
today_tag = datetime.today().strftime("%Y%m%d")
clean_features_file = "{}/assets/clean_features_{}.txt".format(working_dir, today_tag)

if not os.path.exists(clean_features_file):
    print(f"{clean_features_file} does not exist, load from SolCat")
    # Imported lazily: PySolCat is only needed on a cache miss.
    from PySolCat import SolCatOnlineVarLoader

    onlineVarLoader = SolCatOnlineVarLoader('PAZ')
    available_vars = onlineVarLoader.get_online_var_by_cp_status(
        checkpoint='ConsolidatedFunding',
        status=['audit clean', 'implemented'],
    )
    clean_features = [entry.__dict__['_variable_name'] for entry in available_vars]

    # Persist one variable name per line for later runs on the same day.
    with open(clean_features_file, 'w') as out:
        out.write("\n".join(clean_features))
else:
    print(f"load clean features from {clean_features_file}")
    clean_features = read_by_line(clean_features_file)


In [None]:
# load selected features
def _read_column_config(model_dir):
    """Load the Shifu column config of `model_dir`, preferring the exported backup if present."""
    backup_path = f"{model_dir}/ColumnConfig_export_bkp.json"
    config_path = backup_path if os.path.exists(backup_path) else f"{model_dir}/ColumnConfig.json"
    with open(config_path) as handle:
        return json.load(handle)


def _rank_selected_features(shifu_model):
    """Build the ranked table of final-selected, clean variables for one Shifu model.

    Rows come from the model's `varsel/se.0` file joined with the column config.
    `se_rank` is the 0-based row position among the final-selected variables, while
    `iv_rank` / `miss_rank` are pandas ranks (starting at 1). All three are computed
    before the non-clean variables are dropped.
    """
    model_dir = f"shifu_model/{shifu_model}"

    config = pd.DataFrame(_read_column_config(model_dir))
    column_stats = config['columnStats']
    config['iv'] = column_stats.map(lambda stats: stats['iv'])
    config['missingPercentage'] = column_stats.map(lambda stats: stats['missingPercentage'])

    sensitivity = pd.read_csv(
        f"{model_dir}/varsel/se.0",
        sep='\t',
        names=['No.', 'Name', 'Mean', 'RMS', 'Variance'],
    )

    merged = sensitivity.merge(
        right=config[['columnName', 'iv', 'missingPercentage', 'finalSelect']],
        how='left',
        left_on='Name',
        right_on='columnName',
    )
    merged['isClean'] = merged['Name'].isin(clean_features)

    selected = merged[merged['finalSelect']]
    selected = selected.reset_index(drop=True).reset_index().rename(columns={'index': 'se_rank'})
    selected['iv_rank'] = selected['iv'].rank(ascending=False)
    selected['miss_rank'] = selected['missingPercentage'].rank()
    return selected[selected['isClean']]


# One table per entry of shifu_models, in the same order.
seg_dfs = [_rank_selected_features(shifu_model) for shifu_model in shifu_models]
    


In [None]:
def _segment_feature_names(seg):
    """Return the (suffixed) feature names selected for segment index `seg`.

    A variable is taken when its SE rank or its IV rank is below the cut-off
    (overall_top for segment 0, seg_top otherwise). Names of every segment other
    than 0 get a `__seg<idx>` suffix.
    """
    # specific to this notebook. cbp with(seg7)/without(seg4) STC/CC variables.
    ranked = seg_dfs[4] if seg == 7 else seg_dfs[seg]
    cutoff = overall_top if seg == 0 else seg_top

    # top ranking SE & top ranking IV
    by_se = ranked.loc[ranked['se_rank'] < cutoff, 'Name'].tolist()
    by_iv = ranked.loc[ranked['iv_rank'] < cutoff, 'Name'].tolist()
    names = set(by_se) | set(by_iv)
    if seg == 7:
        names = names | set(stc_cc_vars)

    if seg > 0:
        return [name + f"__seg{seg}" for name in names]
    return list(names)


feature_columns = []

assert max(seg_idx) < len(seg_dfs)

for seg in seg_idx:
    seg_features = _segment_feature_names(seg)
    print("seg", seg, len(seg_features))
    feature_columns.extend(seg_features)

# Columns in the Shifu ColumnConfig do not carry a segment suffix. When exporting the Shifu
# norm layer, the suffix has to be appended to each output variable of the norm layer.
feature_table = pd.DataFrame(feature_columns, columns=['col_name'])
feature_table['seg_idx'] = feature_table['col_name'].map(
    lambda col: int(col.split("__seg")[-1]) if "__seg" in col else 0
)
# Sorting makes the final order deterministic (segment first, then name).
feature_table = feature_table.sort_values(by=['seg_idx', 'col_name'], ascending=True)

seg_feat_num = feature_table.groupby(['seg_idx']).size().tolist()
feature_columns = feature_table['col_name'].tolist()

assert len(seg_feat_num) == len(multi_seg_name)
print(len(feature_columns))
print(seg_feat_num)

# save features to file
feature_save_path = os.path.join(model_asset_dir, model_name, 'features.txt')
os.makedirs(os.path.dirname(feature_save_path), exist_ok=True)

with open(feature_save_path, 'w') as out:
    out.write("\n".join(feature_columns))
print(f'feature columns written to {feature_save_path}')

In [None]:
# feature stats
print("feature dim = ", len(feature_columns))
# Strip the segment suffix to count how many distinct underlying variables are used.
var_names = {col.split("__seg")[0] for col in feature_columns}
print("distinct feature name", len(var_names))


# Weight and target columns are requested from the dataset separately, so they are
# dropped from the meta column list (in place; only when present, so re-runs are safe).
for reserved in set(weight_columns + target_columns):
    if reserved in ucc_meta_columns:
        ucc_meta_columns.remove(reserved)

# data preparation

In [None]:
@tf.function
def filter_other_bad(feat, target, wgt):
    """Filter predicate for (features, target, weight) records: keep targets below 2.0.

    `feat` and `wgt` are accepted only to match the record layout; they are not inspected.
    Returns False for records whose target is 2.0 or above, True otherwise.
    """
    # NOTE(review): presumably targets >= 2 encode "other bad" labels (per the function
    # name) — confirm against the label definition.
    if target >= 2.0:
        return False
    return True

In [None]:
@tf.function
def update_target_vec(feat, target, wgt):
    """Expand the single target into a (classification, regression) target pair.

    The first element is the target cast to float32. The second is
    `is_bad * log1p(wgt) - (1 - is_bad) * log1p(wgt)`, i.e. +log1p(wgt) when the
    target is 1 and -log1p(wgt) when it is 0. Features and weight pass through unchanged.
    """
    bad_flag = tf.cast(target, tf.float32)
    good_flag = tf.math.subtract(1.0, bad_flag)
    log_amount = tf.math.log1p(wgt)
    signed_log_amount = tf.math.subtract(
        tf.math.multiply_no_nan(bad_flag, log_amount),
        tf.math.multiply_no_nan(good_flag, log_amount),
    )
    # for target column, use tuple if multiple target, otherwise tf will fail to calculate loss
    return (feat, (bad_flag, signed_log_amount), wgt)

In [None]:
@tf.function
def update_wgt(feat, target, wgt):
    """Replace the sample weight by log1p(weight); features and target pass through unchanged."""
    return (feat, target, tf.math.log1p(wgt))

In [None]:
@tf.function
def filter_oot(fvec, target, wgt, meta):
    """Filter predicate: keep records whose meta[38] is 2022-11-01, 2022-12-01 or 2023-01-01.

    Only `meta` is inspected; the other arguments exist to match the record layout.
    """
    # NOTE(review): position 38 is presumably the 'driver_monthly' entry of the meta column
    # list — confirm; the index silently shifts if that list changes.
    monthly = meta[38]
    is_nov_2022 = tf.math.equal(monthly, '2022-11-01')
    is_dec_2022 = tf.math.equal(monthly, '2022-12-01')
    is_jan_2023 = tf.math.equal(monthly, '2023-01-01')
    return tf.math.logical_or(is_nov_2022, tf.math.logical_or(is_dec_2022, is_jan_2023))


In [None]:
# Helper object used below to list the TFRecord files and to build the datasets.
prepare = data_prepare()

In [None]:
# DEV
dev_tfrecord_dir = os.path.join(tfrecord_data_dir, dev_data)
devdata_file_list = prepare.get_filenames(dev_tfrecord_dir)

# No meta columns are requested for the training data.
dev_dataset_kwargs = {
    "filenames": devdata_file_list,
    "feature_list": feature_columns,
    "target_list": target_columns,
    "weight_list": weight_columns,
    "batch_size": batch_size,
}
dev_dataset = prepare.make_dataset(**dev_dataset_kwargs)

In [None]:
# OOT
oot_tfrecord_dir = os.path.join(tfrecord_data_dir, oot_data)
ootdata_file_list = prepare.get_filenames(oot_tfrecord_dir)

# The evaluation data additionally carries the meta columns, uses a 10x larger batch
# and is read without shuffling.
oot_dataset_kwargs = {
    "filenames": ootdata_file_list,
    "feature_list": feature_columns,
    "target_list": target_columns,
    "weight_list": weight_columns,
    "meta_list": ucc_meta_columns,
    "batch_size": batch_size * 10,
    "shuffle": False,
}
oot_dataset = prepare.make_dataset(**oot_dataset_kwargs)


In [None]:
# process and cache dataset

def _clear_tf_cache(cache_prefix):
    """Remove whatever an earlier `Dataset.cache(cache_prefix)` left on disk.

    tf.data uses the given path as a file prefix and writes `<prefix>.index`,
    `<prefix>.data-XXXXX-of-XXXXX` and, while writing, `<prefix>_N.lockfile` next to it.
    Deleting only the path itself therefore keeps a stale cache (which would be read back
    instead of the freshly built pipeline) and any lockfile left by an interrupted run.
    The original `rm -r` is kept, and the companion files are removed as well.
    """
    run_cmd(f"rm -r {cache_prefix}")

    cache_root, stem = os.path.split(cache_prefix)
    os.makedirs(cache_root, exist_ok=True)
    # Match only this prefix's own files so caches of other models in the same folder survive.
    companion = re.compile(rf"{re.escape(stem)}(?:\.index|\.data-\d+-of-\d+|_\d+\.lockfile)")
    for entry in os.listdir(cache_root):
        if companion.fullmatch(entry):
            os.remove(os.path.join(cache_root, entry))


filtered_oot_dataset = oot_dataset.unbatch().filter(filter_oot).batch(batch_size * 20)

processed_dev_dataset = (
    dev_dataset.unbatch()
    .filter(filter_other_bad)
    .map(update_target_vec)
    .map(update_wgt)
    .batch(batch_size)
)


# Every transformation whose result should be stored must be applied before the cache() call:
# once the cache is complete, the upstream pipeline is no longer executed. Anything applied
# after cache() (such as the shuffle below) is re-evaluated on each iteration.
tf_cache_dir = os.path.join(working_dir, 'tf_cache_dir', f'{model_name}_oot')
print(f"caching dataset to {tf_cache_dir}")
_clear_tf_cache(tf_cache_dir)
filtered_oot_dataset = filtered_oot_dataset.cache(tf_cache_dir)

tf_cache_dir = os.path.join(working_dir, 'tf_cache_dir', f'{model_name}_dev')
print(f"caching dataset to {tf_cache_dir}")
_clear_tf_cache(tf_cache_dir)
# NOTE(review): the shuffle runs on already-batched elements, so it permutes whole batches
# (buffer of batch_size * 100 batches), not individual rows — confirm this is intended.
processed_dev_dataset = processed_dev_dataset.cache(tf_cache_dir).shuffle(batch_size * 100, reshuffle_each_iteration=True)

# model training

In [None]:
@timeit
def model_prediction(models, ds, meta_list):
    """
    Run prediction for several models over a dataset and collect the results in one frame.

    models: list of tuple, first element is model name, second element is tf model
    ds: iterable of batches; element 0 holds the model inputs, element 1 the target and
        the last element the meta columns (as byte strings)
    meta_list: list of meta column names, in the order of the meta element's columns

    Returns a DataFrame with the meta columns (decoded as UTF-8), 'driver_is_cc_bad' and one
    '<name>_score' column per model. An empty DataFrame is returned when `ds` yields nothing.
    """
    columns = meta_list + ['driver_is_cc_bad'] + [f'{name}_score' for name, _ in models]

    # Collect the per-batch frames and concatenate once at the end; growing a frame with
    # pd.concat inside the loop copies all previous rows on every batch.
    batch_frames = []
    for batch_cnt, r in enumerate(ds, start=1):
        if batch_cnt % 100 == 0:
            print(batch_cnt)
        # model specific, we produce two outputs and first one is is_cc_bad binary classification output
        scores = [model.predict(r[0])[0] for _, model in models]
        metas = r[-1].numpy()
        target_vec = r[1]
        data = np.concatenate([metas] + [target_vec] + scores, axis=1)
        df = pd.DataFrame(data, columns=columns)
        for c in meta_list:
            df[c] = df[c].str.decode('UTF-8')
        batch_frames.append(df)

    if not batch_frames:
        return pd.DataFrame()
    return pd.concat(batch_frames)


In [None]:
def append_segments(df):
    """Add segment flags and a capped amount column to `df` (modified in place) and return it.

    Expects the columns 'driver_capture_usd_amt' and 'driver_monthly'. Adds:
      asp_1000     - 1 when the capture amount parses to a number >= 1000, else 0
      is_oot       - 1 when 'driver_monthly' >= '2022-11-01' (string comparison), else 0
      dol_cap_1000 - 1000.0 when the capture amount is >= 1000, otherwise the original value
    """
    def _to_float(x, default_val=0):
        """Convert `x` to float, falling back to `default_val` when it cannot be parsed."""
        try:
            return float(x)
        # Only conversion failures are absorbed; KeyboardInterrupt and friends propagate.
        except (TypeError, ValueError, OverflowError):
            return default_val

    # append segments flag
    df['asp_1000'] = df['driver_capture_usd_amt'].map(lambda x: 1 if _to_float(x, 0) >= 1000 else 0)
    df['is_oot'] = df['driver_monthly'].map(lambda x: 1 if x >= '2022-11-01' else 0)
    # cap amt
    df['dol_cap_1000'] = df['driver_capture_usd_amt'].map(lambda x: 1000.0 if _to_float(x, 0) >= 1000.0 else x)

    print(f'eval result record num: {df.shape[0]}')
    return df


In [None]:
# Two tasks are trained jointly: the cc_bad classification and the loss regression.
task_names = ['cc_bad', 'loss_reg']

model = MMoE(
    expert_dim=seg_feat_num,
    expert_names=multi_seg_name,
    expert_structure=expert_structure,
    expert_activation=tf.nn.relu,
    gate_structure=gate_structure,
    gate_activation=tf.nn.relu,
    task_num=2,
    task_names=task_names,
    dropout=dropout,
)

# NOTE(review): the 'Task_<name>' keys are presumably the output names MMoE derives from
# task_names — confirm against the MMoE implementation.
task_losses = {
    'Task_cc_bad': 'binary_crossentropy',
    'Task_loss_reg': 'mse',
}
task_metrics = {
    'Task_cc_bad': ['accuracy', tf.keras.metrics.AUC()],
    'Task_loss_reg': 'mse',
}

# No per-task loss weights are passed, so both losses use the compile() defaults.
model.compile(
    loss=task_losses,
    optimizer=Adam(learning_rate=learning_rate),
    metrics=task_metrics,
)

model.summary()


In [None]:
%%time

epoch_num = 10
ckdir = os.path.join(model_ckpt_dir, model_name)
initial_epoch = 0

os.makedirs(ckdir, exist_ok=True)

# One checkpoint entry is written per finished epoch, named after the epoch number.
ck_filepath = os.path.join(ckdir, "model-{epoch:02d}.ckpt")
log_dir = os.path.join(model_log_dir, model_name)

# training process
# The number of entries in ckdir doubles as "epochs already done": re-running the cell
# after an interruption resumes from the latest checkpoint instead of starting over.
print(f'begin model training')
while len(os.listdir(ckdir)) < epoch_num:
    ck_num = len(os.listdir(ckdir))
    print(f'existing ck num: {ck_num}')
    initial_epoch = ck_num
    if ck_num > 0:
        ckpt_file = sorted(os.listdir(ckdir))[-1]
        print(f'loading from {ckpt_file}')
        model = tf.keras.models.load_model(os.path.join(ckdir, ckpt_file))

    # Use the public tf.keras callback, like the TensorBoard callback below; callbacks from
    # the private tensorflow.python.keras package should not be mixed with tf.keras models.
    ck_callback = tf.keras.callbacks.ModelCheckpoint(
        ck_filepath,
        monitor='loss',
        verbose=1,
        save_best_only=False,
        mode='min',
        save_weights_only=False,
        save_freq='epoch',
    )

    model_log_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

    # NOTE(review): per the Keras docs, use_multiprocessing / workers / max_queue_size only
    # apply to generator or Sequence inputs and shuffle is ignored for datasets;
    # processed_dev_dataset looks like a tf.data pipeline, so these are likely no-ops — confirm.
    history = model.fit(
        processed_dev_dataset,
        use_multiprocessing=True,
        workers=32,
        initial_epoch=initial_epoch,
        epochs=epoch_num,
        verbose=1,
        max_queue_size=64,
        shuffle=True,
        callbacks=[ck_callback, model_log_callback],
    )

    print(history.history)

    # Without new checkpoints the loop condition can never become false; stop loudly
    # instead of retraining forever.
    if len(os.listdir(ckdir)) <= ck_num:
        raise RuntimeError(f'training finished but no new checkpoint was written to {ckdir}')



In [None]:
%%time

# run prediction
print(f'begin model prediction')
os.makedirs(os.path.join(model_eval_dir, model_name), exist_ok=True)
# os.listdir returns entries in arbitrary order; sorting keeps the score columns in
# checkpoint order and makes the output layout reproducible between runs.
model_files = sorted(os.listdir(os.path.join(model_ckpt_dir, model_name)))

# Load every checkpoint as its own model, named '<model_name>_ck<epoch>'.
ckpt_models = []
for f in model_files:
    print(f'loading {f}')
    # The first run of digits in the entry name is taken as the epoch number.
    m = re.search(r'(\d+)', f)
    if not m:
        raise ValueError(f'cannot find epoch in {f}')
    ck = m[1]
    name = f'{model_name}_ck{ck}'
    print(f'adding model {name}')
    ckpt_model = tf.keras.models.load_model(os.path.join(model_ckpt_dir, model_name, f))
    ckpt_models.append((name, ckpt_model))

eval_df = model_prediction(ckpt_models,
                           filtered_oot_dataset,
                           ucc_meta_columns)

eval_df = append_segments(eval_df)

# Keep only the rows flagged as out-of-time by append_segments.
eval_df = eval_df[eval_df.is_oot == 1]

eval_result_path = os.path.join(model_eval_dir, model_name, f'eval_data.csv')
eval_df.to_csv(eval_result_path, index=False)
print(f"model prediction result saved to {eval_result_path}")


# model evaluation

In [None]:
# Extra import directory; NOTE(review): presumably where the fast_perf_v5 module used for
# the gain chart lives — confirm.
perf_lib_dir = '/projects/gds-focus/data/wzhao5/Python3'
if perf_lib_dir not in sys.path:
    sys.path.append(perf_lib_dir)


predict_data_path = os.path.join(model_eval_dir, model_name, 'eval_data.csv')
print(f'prediction data path: {predict_data_path}')

# A few rows are enough to discover the score columns (those containing the model name).
preview = pd.read_csv(predict_data_path, nrows=10)
score_names = list(filter(lambda col: model_name in col, preview.columns))
print('loaded score names: \n{}'.format('\n'.join(score_names)))


In [None]:
%time

from fast_perf_v5 import fast_perf_v5

args = {
    'dataPath': predict_data_path,
    'delimiter': ',',
    'badList': ['driver_is_cc_bad'],
    'scoreList': score_names,
    'dimList': [
        'driver_monthly',
        'driver_monthly*driver_is_cbp',
    ],
    
    'xWeight': ['dol_cap_1000'],
    'yWeight': ['dol_cap_1000'],
    'weightAlias': ['dol_cap_1000'],
    'filterExpr': {
        '$AND': {
            'driver_decline_type': 'Approved',
            'driver_capture_usd_amt': {'$notnull': ''},
        },
    },
    
    'OP' : 100,
    'TopOP': 20,
    'outputFile': os.path.join(model_eval_dir, model_name, f'{model_name}_gainchart.xlsx'),
}

perf = fast_perf_v5(**args)
perf.run()

df_out = perf.output.copy()
df_out.to_csv(os.path.join(model_eval_dir, model_name, f'{model_name}_gainchart.csv'))

In [None]:
gain_chart_data_path = os.path.join(model_eval_dir, model_name, f'{model_name}_gainchart.csv')
gain_chart = pd.read_csv(gain_chart_data_path)

# 'OP4' value of the 'dol_cap_1000 Catch_Rate' metric per score (rows) and month (columns).
is_catch_rate = gain_chart['Metric'] == 'dol_cap_1000 Catch_Rate'
is_monthly_dim = gain_chart['dim_name'] == 'driver_monthly'
ret = gain_chart[is_catch_rate & is_monthly_dim].pivot_table(
    columns='dim_value',
    index=['score'],
    values=['OP4'],
)

# Row-wise mean over the months, for a quick comparison of the scores.
ret['avg'] = ret.mean(axis=1)

ret

In [None]:
# Same view as the monthly table, restricted to the month x driver_is_cbp breakdown
# rows where dim_2 equals 1.
is_catch_rate = gain_chart['Metric'] == 'dol_cap_1000 Catch_Rate'
is_monthly_cbp_dim = gain_chart['dim_name'] == 'driver_monthly*driver_is_cbp'
is_cbp = gain_chart['dim_2'] == 1
ret = gain_chart[is_catch_rate & is_monthly_cbp_dim & is_cbp].pivot_table(
    columns='dim_1',
    index=['score'],
    values=['OP4'],
)

# Row-wise mean over the months.
ret['avg'] = ret.mean(axis=1)

ret