# Use pyScoring and UME to run prediction

In [None]:
# ID of the model to fetch; passed as `modelId` to `guanjia.download_model` further down
model_id = ''

# directory that contains the transaction data (with variables) to be scored
data_dir = ''

# directory where the scoring result should be saved
score_data_save_dir = ''

In [None]:
from automation_utils.spark.session import get_spark
from py_dpu import load_pig, save_pig
from PyGuanjia import GuanjiaClient
from pyScoring import ModelScorer
from pyScoring import UMEModel

from credentials import GUANJIA_PASSWORD, PAZ_PASSWORD


# Spark session settings for the scoring job, gathered in one place so they
# are easy to adjust before the session is requested.
spark_options = dict(
    app_name='pyscoring_prediction',
    executor_instances=16,
    executor_instances_min=16,
    executor_instances_max=128,
    queue='risk_gds_focus',
    load_all_jars=True,
)
spark = get_spark(**spark_options)



- Prepare the UME file: either download it from Guanjia or use one exported locally.

In [None]:
# `os` and `shutil` are used throughout this cell, so import them here rather
# than relying on a later cell having been run first.
import os
import shutil

# Log in to Guanjia as the current OS user.
guanjia = GuanjiaClient(username=os.environ['USER'], isQA=False, isPAZ=True)
guanjia.login(passwd=PAZ_PASSWORD)


# Start from a clean local folder so files left by an earlier download cannot
# be picked up by the file search below.
model_dir = 'local_model_dir'
if os.path.exists(model_dir):
    shutil.rmtree(model_dir)

ret_model_dir = guanjia.download_model(modelId=model_id,
                                       folderName=model_dir)

print(f"model ume downloaded to {ret_model_dir}")

# Candidate UME files: every directory entry whose name contains '.m'.
# Sorted so the choice is deterministic when several entries match.
# NOTE(review): the substring test also matches names such as 'x.md'; tighten
# it once the exact UME file extension is confirmed.
model_files = sorted(p for p in os.listdir(ret_model_dir) if '.m' in p)
if not model_files:
    # Fail with a clear message instead of a bare IndexError.
    raise FileNotFoundError(
        f"no UME model file (name containing '.m') found in {ret_model_dir}")
model_path = model_files[0]
print(f"model_path: {model_path}")



# data meta columns
data_meta_cols = []
print(f'loading data from {data_dir}')

In [None]:
# Scorer bound to the Spark session created earlier in the notebook.
scorer = ModelScorer(spark=spark)

# Load the input data from `data_dir` as a Spark dataframe.
data_df = load_pig(spark, data_dir)

# Load the downloaded UME file; its `outputs` are used by the scoring cell
# to name and select the score columns.
ume = UMEModel(os.path.join(ret_model_dir, model_path))

- You need to specify `col_names_for_default_outputs` here; otherwise pyScoring will generate the output column names for you.
- Multiple UME model paths can be passed here.

In [None]:
# Score the input data with the UME file. Only the first model output is
# given an explicit column name here.
ume_file_path = os.path.join(ret_model_dir, model_path)
scoring_df = scorer.create_score_df(
    input_df=data_df,
    mfile_paths=[ume_file_path],
    col_names_for_default_outputs=[ume.outputs[0]],
)

# Keep only the meta columns plus the model score columns
# (skip this select if you want to keep every input column as well).
scoring_df = scoring_df.select(data_meta_cols + ume.outputs)
scoring_df.printSchema()

# `score_data_save_dir` is the output location set in the configuration cell.
save_pig(spark, scoring_df.coalesce(32), score_data_save_dir)
print(f"finish saving data to {score_data_save_dir}")


# VarLangExpressionBuilder

In [None]:
# Passed as the first argument to VarLangExpressionBuilder in the next cell
# (presumably the name of the clipped-score output; confirm against pyScoring).
clip_layer_score_name = 'clip_score'


In [None]:
from pyScoring import VarLangExpressionBuilder

# Build an expression node that limits `numeric_layer_score_name` to the
# range [0.0, 1000.0], then add it to `model`.
# NOTE(review): `numeric_layer_score_name`, `DataType` and `model` are not
# defined in this snippet; they must already exist in the session.
builder = VarLangExpressionBuilder(clip_layer_score_name,
                                   f"""if ({numeric_layer_score_name} > 1000.0) 1000.0
                                    else if ({numeric_layer_score_name} < 0.0) 0.0
                                    else {numeric_layer_score_name}""", 
                                   {f'{numeric_layer_score_name}': DataType.DOUBLE.getId()})
model.add_node(builder.build())

# Langbuilder

In [None]:
# Same clipping idea written in the LangBuilder expression syntax: the `case`
# expression appears to return 0.0 when numeric_score < 0.0, 1000.0 when
# numeric_score > 1000.0, and numeric_score otherwise (confirm the `case`
# semantics against the pyScoring docs). The result is exposed as
# 'output_score'.
builder = LangBuilder()


node2 = builder.build("""
                        main:
                        (case
                            (<
                                ref:numeric_score
                                con:0.0
                            )
                        
                            con:0.0
                            (>
                                ref:numeric_score
                                con:1000.0
                            )
                            con:1000.0
                            ref:numeric_score
                        )
                        """, 
                        output='output_score')


# convert UME as graph node

In [None]:
from pyScoring import ModeltoNodeBuilder

# Placeholder: replace None with a loaded UMEModel before running this cell.
# The statements below dereference `ume.model` and `ume.outputs`.
ume = None # UME model spec
# Wrap the UME's compute layers as one graph node named 'ems_score';
# `resultMap` pairs the name 'model_score1' with the UME's first output.
builder = ModeltoNodeBuilder(name='ems_score', 
                             computeLayers=ume.model.getComputeLayers(),
                             resultMap={'model_score1': ume.outputs[0]}
                            )
node = builder.build()

# check node binning config in UME

In [None]:
# Print the model graph for the listed node or variable names; replace the
# placeholder name with the node you want to inspect.
# NOTE(review): `ume_model` is not defined in the visible cells; presumably it
# is a loaded UMEModel.
ume_model.print_graph(level='all', node_names=['node_name_or_variable_name'])

# ensemble sub-models and the final model together

In [None]:
# assume final model's input format is: {sub_model_name}_model_score1
final_model_name = 'UCC21'

# names of the sub-models to ensemble with the final model
sub_model_name = [
    'UCC21_LOW_RISK',
    'UCC21_UCC',
    'UCC21_APPR',
    'UCC21_OVERALL',
]

# prefix of each UME model directory; the full directory is f'{model_dir}_{model_name}'
model_dir = 'model_dir'

# directory where the ensembled UME is saved
final_ume_save_dir = 'final_ume'


In [None]:
from pyScoring.graph import Graph
from pyScoring import ModeltoNodeBuilder


# Make sure the output directory for the ensembled UME exists.
os.makedirs(final_ume_save_dir, exist_ok=True)


# model_name -> loaded UMEModel; the final model is loaded first, then each sub-model
model_umes = {}

for model_name in [final_model_name, *sub_model_name]:
    ret_model_dir = f'{model_dir}_{model_name}'

    # the first directory entry whose name contains '.m' is taken as the UME file
    ume_candidates = [entry for entry in os.listdir(ret_model_dir) if '.m' in entry]
    model_ume_path = os.path.join(ret_model_dir, ume_candidates[0])

    print(f'loading ume from {model_ume_path}')
    ume = UMEModel(model_ume_path)
    model_umes[model_name] = ume


In [None]:
# Turn every loaded UME into one graph node and remember the output that each
# node exposes.
nodes = []
outputs = []
for model_name, ume in model_umes.items():
    print(f'process model {model_name}')
    output_count = len(ume.outputs)
    if output_count > 1:
        print(f'multi outputs detected in model {model_name}, will only take first one: {ume.outputs[0]}')

    # only the first UME output is mapped, under the name '<model_name>_model_score1'
    compute_layers = ume.model.getComputeLayers()
    result_map = {f'{model_name}_model_score1': ume.outputs[0]}
    builder = ModeltoNodeBuilder(name=f'{model_name}_intermediate',
                                 computeLayers=compute_layers,
                                 resultMap=result_map)
    node = builder.build()
    print(f'node name: {node.name}')

    exposed_output = node.outputs[0]
    print(f'node output name: {exposed_output}')
    outputs.append(exposed_output)
    nodes.append(node)


# final UME: one graph holding all nodes, exposing every collected output
graph = Graph()
graph.add_nodes(nodes)
final_ume = graph.generate_model_by_graph(
    model_name=f"ensembled_{final_model_name}",
    model_outputs=outputs,
    optimization=False,
)

print('final model outputs', final_ume.outputs)
final_ume.save(final_ume_save_dir)
print(f'final model ume saved to {final_ume_save_dir}')

# expose output of intermediate node in UME

In [None]:
# Convert a loaded UME back into a graph, then regenerate a model whose
# outputs are the listed node outputs. Replace 'model_name_x' and
# 'Your_new_outputs' with real values.
# NOTE(review): `ume` is whatever UMEModel was last bound to that name in the
# session; make sure it is the model you intend to modify.
graph = ume.convert_to_graph()
model_obj = graph.generate_model_by_graph(model_name='model_name_x', model_outputs=['Your_new_outputs'])

# model score normalization

In [None]:
source score: UCC24

target score: normalized target: normed UCC24 / UCC23_v2_normed_score


In [None]:
# Placeholders for the score-normalization cells below; fill them in before running.
score_df = None # pandas df which contains source score & target score
project_map_save_path = '' # score projection save path

# align ume output
# name of the new output added to the aligned UME (passed as `new_output_name`)
aligned_score_name = 'aligned_score'
# NOTE(review): not referenced by any cell visible in this notebook section
cliped_score_name = 'final_score'


merged_ume_path = '' # original ume path

# number of bins for the projection map (passed as `binnumber` to SAP.SAP_GEN_MAP)
bucket_num = 100

In [None]:
input_score_df: bin table: min_x , max_x

target_score_df: bin table: min_y , max_y



In [None]:

# build score mapping
# Source and target scores are both read from the same `score_df`, with the
# same weight column used on both sides.
# NOTE(review): `SAP`, `ckpt`, `target_score_col` and `cap_amt_col` are not
# defined in the visible cells; they must already exist in the session.
pjmp = SAP.SAP_GEN_MAP(
    input_score_df=score_df,
    target_score_df=score_df,
    binnumber=bucket_num,
    input_score_name=ckpt,
    target_score_name=target_score_col,
    input_weight_name=cap_amt_col,
    target_weight_name=cap_amt_col,
)

# Persist the projection map as a '\x07'-delimited file without the index column.
pjmp.to_csv(project_map_save_path, sep='\x07', index=False)
print(f'projection map saved to {project_map_save_path}')


print('projection map')
print(pjmp)
print('=' * 120)


print(f'loading origin UME from {merged_ume_path}')
original_ume = UMEModel(merged_ume_path)

# Only the first output of the original UME is reported here; the alignment
# below also targets output position 0.
original_ume_output_name = original_ume.outputs[0]
print(f'original ume output name: {original_ume_output_name}')


# map original score to target score, using pjmp
# The new spec adds a layer named 'align_score_layer' whose output is
# `aligned_score_name`.
aligned_ume =SAP.SAP_GEN_SPEC(
    origin_model_spec=original_ume,
    map_table=pjmp,
    new_spec_name=f"aligned_ume",
    new_layer_name='align_score_layer',
    new_output_name=aligned_score_name,
    orginal_output_position=0
)


print(f'aligned model ume outputs')
print(aligned_ume.outputs)


In [None]:
# norm source score: binary search xmin

def norm_by_pjmp_v2(df, source_score_col, normed_score_col, pjmp):
    """Project a source score onto the target scale using a projection map.

    Each row of ``pjmp`` is one bin mapping the source interval
    ``[xmin, xmax]`` onto the target interval ``[ymin, ymax]``. A score is
    assigned to the last bin whose ``xmin`` is not greater than it, linearly
    interpolated inside that bin, clipped to ``[0, 1000]`` and floored to an
    int. Scores at or below the smallest ``xmin`` map to the smallest
    ``ymin``; scores at or above the largest ``xmax`` map to the largest
    ``ymax``. Both boundary values get the same clip-and-floor treatment.

    Args:
        df: pandas DataFrame holding the source score. It is modified in place.
        source_score_col: name of the column in ``df`` with the source score.
        normed_score_col: name of the column that receives the projected score.
        pjmp: pandas DataFrame with columns ``xmin``, ``xmax``, ``ymin`` and
            ``ymax``. Any row order and any index are accepted.

    Returns:
        The same ``df`` object, with ``normed_score_col`` added or overwritten.

    Note:
        A NaN source score matches no boundary test, falls through to the
        last bin and comes out as 0, because ``max(0.0, nan)`` is ``0.0``.
    """
    # Work on plain arrays in xmin order so the binary-search position can be
    # used directly as an index, whatever index labels ``pjmp`` carries.
    bins = pjmp.sort_values(by='xmin')
    x_lows = bins['xmin'].values
    x_highs = bins['xmax'].values
    y_lows = bins['ymin'].values
    y_highs = bins['ymax'].values

    min_x, min_y = x_lows.min(), y_lows.min()
    max_x, max_y = x_highs.max(), y_highs.max()

    def to_score(y):
        # Final score convention: clip to [0, 1000], then floor to an int.
        return int(np.floor(min(1000.0, max(0.0, y))))

    def proj(x):
        if x <= min_x:
            return to_score(min_y)

        if x >= max_x:
            return to_score(max_y)

        # side='right' puts a score equal to a bin's xmin into that bin
        # rather than the previous one. pos >= 0 because x > min_x here.
        pos = int(np.searchsorted(x_lows, x, side='right')) - 1
        x_lo, x_hi = x_lows[pos], x_highs[pos]
        y_lo, y_hi = y_lows[pos], y_highs[pos]

        width = x_hi - x_lo
        if width == 0:
            # Zero-width bin: there is nothing to interpolate over.
            return to_score(y_lo)

        return to_score(y_lo + (y_hi - y_lo) * (x - x_lo) / width)

    df[normed_score_col] = df[source_score_col].map(proj)

    return df

# UME prediction pandas dataframe

- When reading data with variables, specify `dtype=str`; otherwise unexpected score results may occur.
- Remember to set `keep_default_na=False`, or empty strings will be replaced with NaN.

In [None]:
# Read the '\x07'-delimited input file with every column kept as a string
# (dtype=str) and empty fields kept as empty strings rather than NaN
# (keep_default_na=False), as the notes above require.
df = pd.read_csv('data/wzhao5_1700634531.csv', 
                 sep='\x07', 
                 dtype=str, 
                 keep_default_na=False)


# package shifu norm layer & tensorflow model

- all shifu columns that are marked finalSelect=True will create a norm node. Mark column as finalSelect=False if you want to exclude it.

In [None]:
import os

from pyScoring.shifu import ShifuTransformer
from pyScoring.onnx.support.tf2.tf2_to_onnx import tf_model_to_onnx_as_spec
from pyScoring import UMEModel, InputBuilder, NormalizeBuilder, ConstantBuilder, ReNameBuilder, ModeltoNodeBuilder, RegressionBuilder, NumericizeBuilder, LangBuilder, ClipBuilder


# Placeholders for packaging a Shifu norm layer with a TensorFlow model;
# fill them in before running the next cell.
shifu_mode_path = '' # path to shifu folder
tf_export = None # loaded tensorflow model checkpoint
tf_output_names = ['tf_score'] # output name for tensorflow model
tf_inputs = ['var_name'] # variable names feed to tensorflow model
model_ume_name = '' # packaged ume output name


In [None]:
# Norm nodes created from the Shifu folder (one per finalSelect column,
# per the note above).
shifu_norm = ShifuTransformer(shifu_mode_path)
norm_nodes = shifu_norm.create_shifu_transformation_nodes()

# Convert the TensorFlow model to a spec whose outputs are named by `tf_output_names`.
tf_spec = tf_model_to_onnx_as_spec(
    tf_model=tf_export,
    output_mappings=tf_output_names,
    debug=False,
)

# Input node: gathers the listed variables into the first input of the TF spec.
input_names = list(tf_inputs)
tf_input_name = tf_spec.inputs[0]
input_nodes = InputBuilder(inputs=input_names, output=tf_input_name).build()


# Merge the TF spec with the input node and the Shifu norm nodes into one
# spec that exposes the TF outputs.
# NOTE(review): `merge_model_specs` is not imported in the visible cells;
# import it from the appropriate pyScoring module before running.
merged_spec = merge_model_specs(specs=[tf_spec],
                                extra_nodes=[input_nodes] + norm_nodes,
                                model_outputs=tf_output_names,
                                model_name=model_ume_name + '_tf')