In [1]:
import src.data.breathe_data as breathe_data
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.models.var_builders as var_builders

import pandas as pd
import numpy as np

## Breathe data processing
Applied `get_bin_for_value` to all inputs and updated the Excel file.

## Slicing algorithm handling convergence (loops)

In [2]:
# Load the O2 saturation / FEV1 observations from the Excel export.
df = breathe_data.load_o2_fev1_df_from_excel()
# NOTE: this is not the last expression of the cell, so the preview is not displayed.
df.head()
# Create the factor graph variables (built here for height 160, age 40, "Male")
# so that the shared variables can be initialised.
(
    HFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
) = var_builders.o2sat_fev1_point_in_time_model_shared_healthy_vars(160, 40, "Male")

# Register a factor node key on each shared variable. The key is a string of the
# form "['<var>', '<var>', '<var>'] -> <shared var>".
# NOTE(review): presumably this must match the node naming used by the inference
# graph -- confirm against src.models.
key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
HFEV1.set_factor_node_key(key_hfev1)
HO2Sat.set_factor_node_key(key_ho2sat)

# Variable groups handed to the slicing algorithm in the cells below.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because later cells reference it.
vars = [AR, IA]
shared_vars = [HFEV1, HO2Sat]
obs_vars = [ecFEV1.name, O2Sat.name]


def infer_and_plot_for_id(
    df_for_ID,
    shared_vars,
    vars,
    obs_vars,
    diff_threshold=1e-8,
    save=True,
    colorscale=None,
):
    """Run the longitudinal slicing inference for one individual and plot the results.

    Args:
        df_for_ID: DataFrame holding the rows of a single individual. The columns
            ``ID``, ``Height``, ``Age`` and ``Sex`` are read from its first row.
        shared_vars: Shared variable nodes. Each one is ``reset()`` before the
            inference starts, then passed to ``slicing.query_across_days``.
        vars: Variables passed through to ``slicing.query_across_days``.
        obs_vars: Names of the observed variables, passed through to
            ``slicing.query_across_days``.
        diff_threshold: Threshold forwarded to ``slicing.query_across_days``.
        save: Forwarded as ``save=`` to both plotting helpers. Defaults to
            ``True``, which is what the function previously hard-coded.
        colorscale: Colorscale forwarded to both plotting helpers. ``None``
            selects the white/yellow/green/blue scale that was previously
            hard-coded.

    Returns:
        The second value returned by ``slicing.query_across_days``
        (``df_res_before_convergence``).
    """
    if colorscale is None:
        colorscale = [[0, "white"], [0.33, "yellow"], [0.66, "green"], [1, "blue"]]

    # Start from a clean state so that results from a previous individual do not leak in.
    for shared_var in shared_vars:
        shared_var.reset()
    df_for_ID = df_for_ID.reset_index(drop=True)
    print(f"\nID: {df_for_ID.ID.iloc[0]}")
    print(f"Amount of data: {len(df_for_ID)}")

    # Build the model from this individual's own demographics.
    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]
    # NOTE(review): the HFEV1 / HO2Sat created here are only used for plotting;
    # the inference itself receives the caller-supplied `shared_vars`. Confirm
    # that this split is intended.
    _, inf_alg, HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA, IA, UO2Sat, O2Sat = (
        mb.o2sat_fev1_point_in_time_model_shared_healthy_vars(height, age, sex)
    )

    df_query_res, df_res_before_convergence = slicing.query_across_days(
        df_for_ID, inf_alg, shared_vars, vars, obs_vars, diff_threshold
    )

    slicing.plot_posterior_validation(
        df_res_before_convergence,
        HFEV1,
        HO2Sat,
        df_for_ID,
        ecFEV1,
        O2Sat,
        colorscale=colorscale,
        save=save,
    )

    slicing.plot_query_res(
        df_for_ID,
        ecFEV1,
        O2Sat,
        df_query_res,
        AR,
        IA,
        HFEV1,
        HO2Sat,
        colorscale=colorscale,
        save=save,
    )
    return df_res_before_convergence


# Restrict the data to the individual with ID "101" (IDs are compared as strings).
df_for_ID = df[df["ID"] == "101"]
# After the filter above, the groupby yields at most one group, so the function
# runs at most once.
# NOTE(review): infer_and_plot_for_id reads the grouping column `ID` from the
# frame it receives; check how the installed pandas version treats grouping
# columns inside `groupby(...).apply` -- confirm.
a = df_for_ID.groupby("ID").apply(
    lambda df_for_ID: infer_and_plot_for_id(
        df_for_ID, shared_vars, vars, obs_vars, diff_threshold=1e-8
    )
)


ID: 101
Amount of data: 816
epoch 0
Epoch 0 - Posteriors' diff for Healthy FEV1 (L): 1.97999999930535
Epoch 0 - Posteriors' diff for Healthy O2 saturation (%): 1.8932202988527096
epoch 1
Epoch 1 - Posteriors' diff for Healthy FEV1 (L): 4.9587853955288224e-14
Epoch 1 - Posteriors' diff for Healthy O2 saturation (%): 0.002437753740841535
epoch 2
Epoch 2 - Posteriors' diff for Healthy FEV1 (L): 5.674749992731154e-26
Epoch 2 - Posteriors' diff for Healthy O2 saturation (%): 3.470987982972452e-29
All diffs are below 1e-08, running another epoch to get all posteriors
epoch 3
Epoch 3 - Posteriors' diff for Healthy FEV1 (L): 0.0
Epoch 3 - Posteriors' diff for Healthy O2 saturation (%): 0.0


# Selecting the stopping criteria

In [None]:
# Algorithm used to select the stopping criteria
# Moved to src.inference.long_inf_slicing and updated to be general to any shared variables input

# def query_across_days(
#     df,
#     belief_propagation,
#     shared_variables: List[SharedVariableNode],
#     variables: List[str],
#     n_epochs,
# ):
#     epoch = 0
#     df_res_hfev1 = pd.DataFrame(index=HFEV1.get_bins_str())

#     df_res = pd.DataFrame(
#         columns=["Epoch"] + list(map(lambda v: v.name, shared_variables))
#     )

#     post_hfev1_old_epoch = HFEV1._uniform_prior()
#     post_ho2sat_old_epoch = HO2Sat._uniform_prior()
#     while True:
#         print(f"epoch {epoch}")

#         post_hfev1_old_day = HFEV1._uniform_prior()
#         post_ho2sat_old_day = HO2Sat._uniform_prior()
#         diffs_hfev1_day = np.array([])
#         diffs_ho2sat_day = np.array([])
#         for i in range(len(df)):
#             day = df["Date Recorded"].iloc[i].strftime("%Y-%m-%d")

#             def build_evidence(variables):
#                 evidence = {}
#                 for variable in variables:
#                     idx_obs = df[variable].iloc[i]
#                     evidence[variable] = idx_obs
#                 return evidence

#             evidence = build_evidence(variables)

#             def build_virtual_evidence(shared_variables):
#                 virtual_evidence = {}
#                 for shared_var in shared_variables:
#                     virtual_message = shared_var.get_virtual_message(day)
#                     if virtual_message is not None:
#                         virtual_evidence[shared_var.name] = virtual_message
#                 return virtual_evidence

#             virtual_evidence = build_virtual_evidence(shared_variables)

#             var_to_infer = list(map(lambda v: v.name, shared_variables))

#             # Query the graph
#             res, messages = belief_propagation.query(
#                 var_to_infer, evidence, virtual_evidence, get_messages=True
#             )

#             # Save message for current day
#             for shared_var in shared_variables:
#                 shared_var.add_message(day, messages[shared_var.graph_key])

#             post_hfev1_old_day, diff_hfev1_day = get_diff(
#                 res, post_hfev1_old_day, HFEV1
#             )
#             post_ho2sat_old_day, diff_ho2sat_day = get_diff(
#                 res, post_ho2sat_old_day, HO2Sat
#             )
#             # print(
#             #     f"Epoch {epoch}, day {i} - Diff hfev1 {diff_hfev1_day}, diff ho2sat {diff_ho2sat_day}"
#             # )
#             diffs_hfev1_day = np.append(diffs_hfev1_day, post_ho2sat_old_day)
#             diffs_ho2sat_day = np.append(diffs_ho2sat_day, diff_ho2sat_day)

#         # print(
#         #     f"Epoch {epoch} - Sum daily diffs for HFEV1: {diffs_hfev1_day.sum()}, and HO2Sat: {diffs_ho2sat_day.sum()}"
#         # )
#         post_hfev1_old_epoch, diff_hfev1_epoch = get_diff(
#             res, post_hfev1_old_epoch, HFEV1
#         )
#         post_ho2sat_old_epoch, diff_ho2sat_epoch = get_diff(
#             res, post_ho2sat_old_epoch, HO2Sat
#         )
#         print(
#             f"Epoch {epoch} - Posteriors' diff for HFEV1: {diff_hfev1_epoch}, and HO2Sat: {diff_ho2sat_epoch}"
#         )

#         # Create a new row with the epoch, and one shared variable's array per row cell
#         new_row = [epoch] + list(map(lambda v: res[v.name].values, shared_variables))
#         # Same but as df
#         new_row = pd.DataFrame(
#             [new_row], columns=["Epoch"] + list(map(lambda v: v.name, shared_variables))
#         )

#         df_res = pd.concat([df_res, new_row], ignore_index=True)
#         df_res_hfev1[f"{epoch}"] = res[HFEV1.name].values

#         if epoch >= n_epochs:
#             return df_res, df_res_hfev1
#         epoch += 1


# def get_diff(res, old, var):
#     new = res[var.name].values
#     diff = np.abs(new - old).sum()
#     return new, diff

# Check that using the aggregated message and multiplying the individual messages give the same result

In [17]:
import src.data.helpers as dh

In [19]:
# Load the two saved result workbooks for ID 101. Going by the file names, one
# was produced with message multiplication and the other with the aggregated message.
# HFEV1 and HO2Sat are the notebook-level variables created in the setup cell.
# NOTE(review): the second argument is presumably the list of columns that
# `load_excel` has to parse into arrays -- confirm against src.data.helpers.
df_mult = dh.load_excel(
    dh.get_path_to_main()
    + "/ExcelFiles/BR/long_inf_res_ID101_messages_multiplication.xlsx",
    [HFEV1.name, HO2Sat.name],
)
df_agg = dh.load_excel(
    dh.get_path_to_main() + "/ExcelFiles/BR/long_inf_res_ID101_using_agg_m.xlsx",
    [HFEV1.name, HO2Sat.name],
)

In [22]:
# The loop only walks the rows of df_mult, so first make sure both result sets
# have the same number of rows; otherwise extra rows in df_agg would go unchecked.
assert len(df_mult) == len(df_agg), (
    f"Row count mismatch: {len(df_mult)} (multiplication) vs {len(df_agg)} (aggregate)"
)
# Row-by-row comparison of the HFEV1 results from both methods.
for i in range(len(df_mult)):
    assert np.allclose(
        df_mult.loc[i, HFEV1.name], df_agg.loc[i, HFEV1.name], atol=1e-8
    ), f"{HFEV1.name}: results differ at row {i}"

In [23]:
# The loop only walks the rows of df_mult, so first make sure both result sets
# have the same number of rows; otherwise extra rows in df_agg would go unchecked.
assert len(df_mult) == len(df_agg), (
    f"Row count mismatch: {len(df_mult)} (multiplication) vs {len(df_agg)} (aggregate)"
)
# Row-by-row comparison of the HO2Sat results from both methods.
for i in range(len(df_mult)):
    assert np.allclose(
        df_mult.loc[i, HO2Sat.name], df_agg.loc[i, HO2Sat.name], atol=1e-8
    ), f"{HO2Sat.name}: results differ at row {i}"