In [0]:
# Configure OAuth client-credentials access to Azure storage (ABFS) for this
# Spark session. The client id, client secret and token endpoint are never
# written in the notebook; they are read from a Databricks secret scope.
_KV_SCOPE = "dbs-scope-prod-kv-CDH"

spark.conf.set("fs.azure.account.auth.type", "OAuth")
spark.conf.set(
    "fs.azure.account.oauth.provider.type",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)

# (Spark conf key, secret name) pairs, applied in this order.
for _conf_key, _secret_name in (
    ("fs.azure.account.oauth2.client.id", "cdh-adb-client-id"),
    ("fs.azure.account.oauth2.client.secret", "cdh-adb-client-secret"),
    ("fs.azure.account.oauth2.client.endpoint", "cdh-adb-tenant-id-endpoint"),
):
    spark.conf.set(_conf_key, dbutils.secrets.get(scope=_KV_SCOPE, key=_secret_name))



In [0]:
import os

# PyTensor reads PYTENSOR_FLAGS when it is first imported, so these environment
# variables must be set before the pytensor / pymc imports below.
os.environ["CUDA_ROOT"]='/usr/local/cuda'
os.environ["PYTENSOR_FLAGS"]='allow_gc=False' #,floatX=float64'

from pytensor.configdefaults import config
import numpy as np
import scipy.stats as sp
import pymc as pm
import pymc_bart as pmb
import pyspark.pandas as pd
import mlflow
import pyspark

# NOTE(review): `plt` (presumably matplotlib.pyplot) and `ssf` (project survival
# helpers) are used in later cells but are not imported anywhere in this
# notebook - confirm where they come from (e.g. a %run / cluster init) and
# import them here explicitly.

In [0]:
# Run parameters.
# Each value is read from the task values published by an upstream job task;
# the `debugValue` is the fallback used when the notebook is run on its own.
def _task_value(task, key, debug):
    """Return task value `key` published by `task`, with `debug` as the debugValue fallback."""
    return dbutils.jobs.taskValues.get(task, key, debugValue=debug)


_INIT_TASK = "cdh-ml-init"

# MLflow bookkeeping: experiment and run name come from the init task, the
# run id to resume comes from the "cdh-ml-run" task.
experiment_id = _task_value(_INIT_TASK, "experiment_id", 'ccb9d87b45ca41f286e3c33cc5f40b68')
run_name = _task_value(_INIT_TASK, "run_name", "test2")
run_id = _task_value("cdh-ml-run", "run_id_main", "5c4b0bab2668466ea9ac022e482adc35")

# BART / sampler settings.
M = _task_value(_INIT_TASK, "M", 200)  # number of trees
DRAWS = _task_value(_INIT_TASK, "DRAWS", 1000)
TUNE = _task_value(_INIT_TASK, "TUNE", 1000)
# CORES is deliberately not read here; the sampler call does not pass `cores`.
# SPLIT_RULES is a *string* holding a Python list expression; it is evaluated
# where the BART model is built.
SPLIT_RULES = _task_value(
    _INIT_TASK,
    "SPLIT_RULES",
    "[pmb.ContinuousSplitRule(), pmb.ContinuousSplitRule(), pmb.OneHotSplitRule(), pmb.OneHotSplitRule()]",
)
# Passed as `alpha` to pmb.BART.
ALPHA = 0.95

In [0]:
# Make the experiment resolved from the task values the active MLflow experiment.
mlflow.set_experiment(experiment_id=experiment_id)

In [0]:
from pyspark.sql.functions import coalesce  # only used by the disabled karno fallback noted below

time = "days"
event = "event"

# spark.table() returns a plain Spark DataFrame, which has neither the
# pandas-style .rename(columns=...) nor len() that this cell relies on.
# Convert it to a pandas-on-Spark frame so those pandas idioms work.
# Disabled fallback for the karno column (apply before .pandas_api() if needed):
#   .withColumn('karno', coalesce('ph_karno', 'pat_karno'))
lung = spark.table("cdh_reference_data.ml_lung_cancer").pandas_api()

print(lung)

# NOTE(review): the columns used below (female, expired, weeks, karno) are
# presumably precomputed in the table. The derivations previously done in this
# notebook were:
#   months  = ceil(time / 30)
#   weeks   = ceil(time / 7)
#   sex2    = sex - 1
#   expired = status - 1

# Covariates and outcome.
x_sk = lung[["age","female","karno"]]
# NOTE(review): the survival time column holds *weeks* but is renamed to
# "Survival_in_days"; presumably these are the column names the ssf helpers
# expect - confirm before relying on the unit implied by the name.
y_sk = lung[["expired","weeks"]].rename(columns={"expired": "Status", "weeks": "Survival_in_days"})
print(x_sk)
print(y_sk)
print(len(lung))

In [0]:
# Build the BART training inputs (times, event indicator, design matrix) from
# the covariates and outcome.
# NOTE: this step is slow; the original author's note suggests replacing it
# with a Spark explode-based implementation.
b_tr_t, b_tr_expired, b_tr_x = ssf.surv_pre_train2(
    data_x_n=x_sk,
    data_y=y_sk,
    X_TIME=True,
)

# Quick size check of each output, one length per line.
for _arr in (b_tr_expired, b_tr_t, b_tr_x):
    print(len(_arr))

In [0]:
# Create the counterfactual test dataset.
# Two copies are stacked further down: the first half has column 2 forced to 0
# and the second half has it forced to 1; the final plot labels these halves
# "male" and "female" respectively.
b_te_x = ssf.get_bart_test(x_out = x_sk, T = np.unique(b_tr_t))
print(b_te_x[:,2])
# First half: force column 2 to 0 for every row.
# NOTE(review): column 2 is presumably the "female" indicator, so 0 means male,
# which matches the "male" label used for this half in the plot. An earlier
# comment here said "assume all female" - confirm the column order returned by
# ssf.get_bart_test.
b_te_x[:,2] = 0



In [0]:
# Second half of the counterfactual set: a copy of the same rows with column 2
# forced to 1.
# NOTE(review): column 2 is presumably the "female" indicator, so 1 means
# female, which matches the "female" label used for this half in the plot. The
# earlier comment here said "assume all male" - confirm.
b_te_x2 = b_te_x.copy()
b_te_x2[:,2] = 1

In [0]:
# Stack the two counterfactual halves row-wise: the rows with column 2 == 0
# come first, followed by the rows with column 2 == 1.
b_te_x3 = np.concatenate((b_te_x, b_te_x2))

# Tabular view of the stacked array, for inspection only.
df_b_te_x3 = pd.DataFrame(b_te_x3)
print(df_b_te_x3)

In [0]:
# BART model fit with PyMC.
# M, DRAWS, TUNE, SPLIT_RULES and ALPHA are set in the run-parameter cell.

# Offset added to the BART output: normal quantile (ppf) of the mean of the
# training event indicator.
# NOTE(review): `sp` is presumably scipy.stats - confirm.
off = sp.norm.ppf(np.mean(b_tr_expired))
with pm.Model() as bart:
    # Design matrix held as mutable data ("x") so it can be replaced with the
    # test rows before posterior prediction.
    x_data = pm.MutableData("x", b_tr_x)
    # SECURITY: eval() executes SPLIT_RULES as Python code. The string comes
    # from a job task value, so only trusted upstream tasks may set it.
    f = pmb.BART("f", X=x_data, Y=b_tr_expired, m=M, alpha = ALPHA, split_rules=eval(SPLIT_RULES))
    # Latent value = BART output shifted by the offset.
    z = pm.Deterministic("z", f + off)
    # Inverse-probit link turns the latent value into an event probability.
    mu = pm.Deterministic("mu", pm.math.invprobit(z))
    # Bernoulli likelihood; shape follows x_data so it resizes with new data.
    y_pred = pm.Bernoulli("y_pred", p=mu, observed=b_tr_expired, shape=x_data.shape[0])
    bdata = pm.sample(random_seed=2, draws=DRAWS, tune = TUNE) # `cores` is not passed (cores=CORES disabled), so the PyMC default is used


In [0]:
# Replace the training design matrix with the stacked counterfactual rows and
# draw posterior predictive samples for them.
with bart:
    # Pass the NumPy array directly. `pd` in this notebook is pyspark.pandas,
    # so wrapping the array in pd.DataFrame(...) would send a local array
    # through Spark only to hand it back to PyMC, and would make the
    # first-half / second-half split used later depend on row order surviving
    # that round trip.
    pm.set_data({"x": b_te_x3})
    pp = pm.sample_posterior_predictive(bdata, var_names=["y_pred", "f", "z", "mu"])


In [0]:
# Resume the MLflow run identified by run_id and attach the survival-curve
# figure for the two counterfactual groups to it.
with mlflow.start_run(experiment_id=experiment_id, run_id=run_id) as run:

    # Covariates repeated once per counterfactual half, then converted to
    # survival estimates by the project helper.
    x_out = np.concatenate((x_sk.to_numpy(), x_sk.to_numpy()))
    bart_sv_fx = ssf.get_sv_fx(pp, x_out)

    # Split the rows back into the two halves: the first og_shp rows are the
    # first half, the rest are the second half.
    # NOTE(review): assumes get_sv_fx keeps the row order of the stacked test
    # set - confirm.
    og_shp = x_sk.shape[0]
    or_bart_sv_fx = bart_sv_fx[:og_shp, :]
    cf_bart_sv_fx = bart_sv_fx[og_shp:, :]

    # Per-column mean and 5% / 95% quantile band for each half.
    _band_q = [0.05, 0.95]
    or1 = or_bart_sv_fx.mean(axis=0)
    orp = np.quantile(or_bart_sv_fx, q=_band_q, axis=0)
    cf1 = cf_bart_sv_fx.mean(axis=0)
    cfp = np.quantile(cf_bart_sv_fx, q=_band_q, axis=0)

    # x axis: the distinct training times.
    plt_time = np.unique(b_tr_t)

    # One labelled step curve per group, plus its lower and upper band drawn
    # in the same colour with reduced opacity.
    fig = plt.figure()
    for _mean, _band, _label, _colour in (
        (or1, orp, "male", "darkblue"),
        (cf1, cfp, "female", "darkorange"),
    ):
        plt.step(plt_time, _mean, label=_label, color=_colour)
        plt.step(plt_time, _band[0], color=_colour, alpha=.4)
        plt.step(plt_time, _band[1], color=_colour, alpha=.4)
    plt.legend()
    mlflow.log_figure(fig, "male_female.png")