In [1]:
import src.data.breathe_data as bd
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.models.var_builders as var_builders
import src.inference.helpers as ih
from plotly.subplots import make_subplots
import src.data.helpers as dh
import src.models.helpers as mh
import plotly.graph_objects as go
import logging

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)


import pandas as pd
import numpy as np

### Run this to bypass the two next sections (get unblocked fev1 and preprocessd data)

In [2]:
# df = bd.load_meas_from_excel(
#     "BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id_light"
# )
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id")

KeyboardInterrupt: 

# Get unblocked FEV1

In [2]:
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan")

In [3]:
df = df.drop(columns=["PEF", "ecPEF (L/s)", "PEF (L/s)"])
df = df.dropna(subset=["FEV1", "O2 Saturation", "FEF2575"])
df["ecFEF2575%ecFEV1"] = df["FEF2575"] / df["FEV1"] * 100
print(df.shape)
print("IDs: ", df["ID"].nunique())
df.head()

(41260, 16)
IDs:  352


Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,99.845492,41.221374
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,100.874827,43.51145
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,98.816157,51.145038
3,101,2019-01-28,1.3,96.0,0.69,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.01047,98.816157,53.076923
4,101,2019-01-29,1.28,98.0,0.6,1.3,0.69,Male,173.0,53,3.610061,97.150104,36.01047,35.456463,100.874827,46.875


In [4]:
# Add unblocked FEV1
def get_unblocked_fev1_for_ID(df):
    # Find index where ecFEV1 is max
    idx = df["ecFEV1"].idxmax()
    df["Max ecFEV1"] = df.loc[idx, "ecFEV1"]
    df["Max ecFEF2575"] = df.loc[idx, "ecFEF2575"]
    return df


df = (
    df.groupby("ID")
    .apply(get_unblocked_fev1_for_ID)
    .drop(columns=["ID"])
    .reset_index()
    .drop(columns=["level_1"])
)

In [5]:
# How often is max ecFEV1 obtained at the same time as max ecFEF2575?
# Filter idx where max ecFEV1 = ecFEV1
idx_max = df["Max ecFEV1"] == df["ecFEV1"]
df_tmp = df[idx_max].copy()
df_tmp["Drop from max ecFEF2575"] = np.abs(
    df_tmp["Max ecFEF2575"] - df_tmp["ecFEF2575"]
)

(df_tmp["Drop from max ecFEF2575"] == 0).value_counts()
# Most of the time, max ecFEV1 is obtained at the same time as max ecFEF2575

Drop from max ecFEF2575
True     663
False    268
Name: count, dtype: int64

# Preprocess breathe Data

In [6]:
# Process the data for the usual variable resolution
# (HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA, IA, UO2Sat, O2Sat, ecFEF2575prctecFEV1) = (
#     var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
#         160, 40, "Male"
#     )
# )

# Process the data for the light model
(HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA, IA, UO2Sat, O2Sat) = (
    var_builders.o2sat_fev1_point_in_time_model_shared_healthy_vars_light(
        160, 40, "Male"
    )
)

In [7]:
# Applied get_bin_for_value to all inputs and updated excel file
df[f"idx {ecFEV1.name}"] = df.apply(
    lambda row: ih.get_bin_for_value(row["ecFEV1"], ecFEV1)[1], axis=1
)
# df[f"idx {ecFEF2575prctecFEV1.name}"] = df.apply(
#     lambda row: ih.get_bin_for_value(row["ecFEF2575%ecFEV1"], ecFEF2575prctecFEV1)[1],
#     axis=1,
# )
df[f"idx {O2Sat.name}"] = df.apply(
    lambda row: ih.get_bin_for_value(row["O2 Saturation"], O2Sat)[1], axis=1
)

In [8]:
# df.to_excel(
#     dh.get_path_to_main()
#     + "/ExcelFiles/BR/BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id_light.xlsx",
#     index=False,
# )

# Use approximate inference (slicing)

In [94]:
# Plot function
def plot_one_and_two_days_models_res(
    df_query_res_one_day,
    df_query_res_one_day_fef,
    df_query_res_two_days,
    df_query_res_two_days_fef,
    HFEV1,
    HO2Sat,
    AR,
    IA,
    df,
    save=False,
):

    lay_row = [{}, {}, {}, {}]
    lay_full_row = [{"colspan": 4}, None, None, None]
    # 4 rows: prior, posterior for healthy var, then ar day 1, ar day 3
    fig = make_subplots(
        rows=8,
        cols=4,
        specs=[
            lay_row,
            lay_row,
            lay_row,
            lay_row,
            lay_row,
            lay_row,
            lay_full_row,
            lay_full_row,
        ],
    )

    # Day
    title_day = f"ecFEV1={df_query_res_two_days_fef.loc[0, 'ecFEV1']:.2g}, O2 Saturation={df_query_res_two_days_fef.loc[0, 'O2 Saturation']:.2g}%, ecFEF2575%ecFEV1={df_query_res_two_days_fef.loc[0, 'ecFEF2575%ecFEV1']:.2f}%"
    # Max
    title_max_FEV1_day = f"ecFEV1={df_query_res_two_days_fef.loc[1, 'ecFEV1']:.2g}, O2 Saturation={df_query_res_two_days_fef.loc[1, 'O2 Saturation']:.2g}%, ecFEF2575%ecFEV1={df_query_res_two_days_fef.loc[1, 'ecFEF2575%ecFEV1']:.2f}%"

    title_prefix = f"Comparing one vs two days model ID={df_query_res_two_days_fef.loc[0, 'ID']}, {df_query_res_two_days_fef.loc[0, 'Age']}y, {df_query_res_two_days_fef.loc[0, 'Height']}cm, {df_query_res_two_days_fef.loc[0, 'Sex']}"

    # One day
    hfev1_one_day = df_query_res_one_day.loc[0, HFEV1.name]
    ho2sat_one_day = df_query_res_one_day.loc[0, HO2Sat.name]
    ar_one_day = df_query_res_one_day.loc[0, AR.name]
    ia_one_day = df_query_res_one_day.loc[0, IA.name]

    # One day fef
    hfev1_one_day_fef = df_query_res_one_day_fef.loc[0, HFEV1.name]
    ho2sat_one_day_fef = df_query_res_one_day_fef.loc[0, HO2Sat.name]
    ar_one_day_fef = df_query_res_one_day_fef.loc[0, AR.name]
    ia_one_day_fef = df_query_res_one_day_fef.loc[0, IA.name]

    # Two days
    ar_day_1 = df_query_res_two_days.loc[0, AR.name]
    ar_max = df_query_res_two_days.loc[1, AR.name]
    ia_day_1 = df_query_res_two_days.loc[0, IA.name]
    ia_max = df_query_res_two_days.loc[1, IA.name]
    hfev1_two_days = df_query_res_two_days.loc[0, HFEV1.name]
    ho2sat_two_days = df_query_res_two_days.loc[0, HO2Sat.name]

    # Two days fef
    ar_day_1_fef = df_query_res_two_days_fef.loc[0, AR.name]
    ar_max_fef = df_query_res_two_days_fef.loc[1, AR.name]
    ia_day_1_fef = df_query_res_two_days_fef.loc[0, IA.name]
    ia_max_fef = df_query_res_two_days_fef.loc[1, IA.name]
    hfev1_two_days_fef = df_query_res_two_days_fef.loc[0, HFEV1.name]
    ho2sat_two_days_fef = df_query_res_two_days_fef.loc[0, HO2Sat.name]

    # Priors
    ih.plot_histogram(
        fig, HFEV1, HFEV1.cpt, HFEV1.a, HFEV1.b, 1, 1, HFEV1.name + " prior", "#009e73"
    )
    ih.plot_histogram(
        fig,
        HO2Sat,
        HO2Sat.cpt,
        HO2Sat.a,
        HO2Sat.b,
        1,
        2,
        HO2Sat.name + " prior",
        "#0072b2",
    )
    # Posteriors
    # One days HFEV1
    ih.plot_histogram(
        fig,
        HFEV1,
        hfev1_one_day,
        HFEV1.a,
        HFEV1.b,
        2,
        1,
        "HFEV1" + " one day",
        "#009e73",
    )
    ih.plot_histogram(
        fig,
        HFEV1,
        hfev1_one_day_fef,
        HFEV1.a,
        HFEV1.b,
        2,
        3,
        "HFEV1" + " one day with FEF25-75",
        "#009e73",
    )
    # One days HO2Sat
    ih.plot_histogram(
        fig,
        HO2Sat,
        ho2sat_one_day,
        HO2Sat.a,
        HO2Sat.b,
        2,
        2,
        "HO2Sat" + " one day",
        "#0072b2",
    )
    ih.plot_histogram(
        fig,
        HO2Sat,
        ho2sat_one_day_fef,
        HO2Sat.a,
        HO2Sat.b,
        2,
        4,
        "HO2Sat" + " one day with FEF25-75",
        "#0072b2",
    )
    # One days AR
    ih.plot_histogram(
        fig,
        AR,
        ar_one_day,
        AR.a,
        AR.b,
        3,
        1,
        "AR" + " one day",
        colour="#d55e00",
    )
    ih.plot_histogram(
        fig,
        AR,
        ar_one_day_fef,
        AR.a,
        AR.b,
        3,
        3,
        "AR" + " one day with FEF25-75",
        colour="#d55e00",
    )
    # One days IA
    ih.plot_histogram(
        fig,
        IA,
        ia_one_day,
        IA.a,
        IA.b,
        3,
        2,
        IA.name + " one day",
        "#cc79a7",
    )
    ih.plot_histogram(
        fig,
        IA,
        ia_one_day_fef,
        IA.a,
        IA.b,
        3,
        4,
        IA.name + " one day with FEF25-75",
        "#cc79a7",
    )
    # Two days HFEV1
    ih.plot_histogram(
        fig,
        HFEV1,
        hfev1_two_days,
        HFEV1.a,
        HFEV1.b,
        4,
        1,
        "HFEV1" + " shared posterior",
        "#009e73",
    )
    ih.plot_histogram(
        fig,
        HFEV1,
        hfev1_two_days_fef,
        HFEV1.a,
        HFEV1.b,
        4,
        3,
        "HFEV1" + " shared posterior with FEF25-75",
        "#009e73",
    )
    # Two days HO2Sat
    ih.plot_histogram(
        fig,
        HO2Sat,
        ho2sat_two_days,
        HO2Sat.a,
        HO2Sat.b,
        4,
        2,
        "HO2Sat" + " shared posterior",
        "#0072b2",
    )
    ih.plot_histogram(
        fig,
        HO2Sat,
        ho2sat_two_days_fef,
        HO2Sat.a,
        HO2Sat.b,
        4,
        4,
        "HO2Sat" + " shared posterior with FEF25-75",
        "#0072b2",
    )
    # Two days AR
    ih.plot_histogram(
        fig,
        AR,
        ar_day_1,
        AR.a,
        AR.b,
        5,
        1,
        "AR" + " day 1",
        colour="#d55e00",
    )
    ih.plot_histogram(
        fig,
        AR,
        ar_max,
        AR.a,
        AR.b,
        6,
        1,
        "AR" + " max",
        colour="#d55e00",
    )
    ih.plot_histogram(
        fig,
        AR,
        ar_day_1_fef,
        AR.a,
        AR.b,
        5,
        3,
        "AR" + " day 1 with FEF25-75",
        colour="#d55e00",
    )
    ih.plot_histogram(
        fig,
        AR,
        ar_max_fef,
        AR.a,
        AR.b,
        6,
        3,
        "AR" + " max with FEF25-75",
        colour="#d55e00",
    )
    # Two days IA
    ih.plot_histogram(
        fig,
        IA,
        ia_day_1,
        IA.a,
        IA.b,
        5,
        2,
        "IA" + " day 1",
        "#cc79a7",
    )
    ih.plot_histogram(
        fig,
        IA,
        ia_max,
        IA.a,
        IA.b,
        6,
        2,
        "IA" + " max",
        "#cc79a7",
    )
    ih.plot_histogram(
        fig,
        IA,
        ia_day_1_fef,
        IA.a,
        IA.b,
        5,
        4,
        "IA" + " day 1 with FEF25-75",
        "#cc79a7",
    )
    ih.plot_histogram(
        fig,
        IA,
        ia_max_fef,
        IA.a,
        IA.b,
        6,
        4,
        "IA" + " max with FEF25-75",
        "#cc79a7",
    )
    # Plot ecFEV1 profile
    fig.add_trace(
        go.Scatter(y=df["ecFEV1"], x=df["Date Recorded"], mode="markers"),
        row=7,
        col=1,
    )
    fig.update_yaxes(
        # nticks=20,
        title="ecFEV1 (L)",
        row=7,
        col=1,
    )
    fig.add_trace(
        go.Scatter(y=df["O2 Saturation"], x=df["Date Recorded"], mode="markers"),
        row=8,
        col=1,
    )
    fig.update_yaxes(
        # nticks=20,
        title="O2 saturation (%)",
        row=8,
        col=1,
    )
    # Add drug therapy
    drug_df = bd.load_drug_therapies()
    drug_df = drug_df[drug_df["DrugTherapyType"] != "Unknown"]
    drug_df_for_ID = drug_df[drug_df.ID == df_query_res_one_day.loc[0, "ID"]]

    def drug_therapy_color_dict():
        return {
            "Trikafta": "green",
            "Ivacaftor": "purple",
            "Symkevi": "purple",
            "Orkambi": "purple",
        }

    for _, row in drug_df_for_ID.iterrows():
        start_date = row.DrugTherapyStartDate
        stop_date = row.DrugTherapyStopDate
        if pd.isnull(stop_date):
            stop_date = df["Date Recorded"].max()
        fig.add_vrect(
            x0=start_date,
            # y0=0,
            x1=stop_date,
            # y1=100,
            fillcolor=drug_therapy_color_dict()[row.DrugTherapyType],
            opacity=0.08,
            layer="below",
            line_width=0,
            name=row.DrugTherapyType,
            row=7,
            col=1,
        )
        fig.add_vrect(
            x0=start_date,
            # y0=0,
            x1=stop_date,
            # y1=100,
            fillcolor=drug_therapy_color_dict()[row.DrugTherapyType],
            opacity=0.08,
            layer="below",
            line_width=0,
            name=row.DrugTherapyType,
            row=8,
            col=1,
        )
        # Add annotation
        fig.add_annotation(
            x=start_date,
            y=df["ecFEV1"].max() * 1.02,
            # xref="x",
            # yref="paper",
            text=row.DrugTherapyType,
            showarrow=False,
            font=dict(size=8),
            row=7,
            col=1,
        )

    # Change marker size for plot row 7
    fig.update_traces(marker=dict(size=3), selector=dict(mode="markers"))

    fig.update_xaxes(title_standoff=5)

    fig.update_layout(
        title=f"{title_prefix}<br><br>Current day:   {title_day}<br>Max FEV1 day:{title_max_FEV1_day}",
        width=1000,
        height=1000,
        font=dict(size=8),
        showlegend=False,
    )

    if save:
        fig.write_image(
            dh.get_path_to_main()
            + f"PlotsBreathe/Two days model/{title_prefix}, {title_day}.pdf"
        )
    else:
        fig.show()
    return -1

In [150]:
def infer_and_plot_for_id(df_for_ID, debug, diff_threshold=1e-8):
    df_for_ID = df_for_ID.reset_index(drop=True)
    print(f"\nID: {df_for_ID.ID.iloc[0]}")
    print(f"#datapoints: {len(df_for_ID)}")

    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]
    (
        _,
        inf_alg,
        HFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEF2575prctecFEV1,
    ) = mb.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(height, age, sex)

    # Set variables parametrisation
    key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
    key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
    HFEV1.set_factor_node_key(key_hfev1)
    HO2Sat.set_factor_node_key(key_ho2sat)

    vars = [AR, IA]
    shared_vars = [HFEV1, HO2Sat]
    obs_vars = [ecFEV1.name, O2Sat.name]
    obs_vars_fef = [ecFEV1.name, O2Sat.name, ecFEF2575prctecFEV1.name]

    # Find the max FEV1 values
    # Given an ID, get the data which maximises ecFEV1, then ecFEF2575, then O2 Saturation
    idx_max_FEV1 = df_for_ID.sort_values(
        by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
    ).index[0]

    # Randomly select 4 entries for this ID
    # Select 1 entry's index in the bottom 10 percentile of ecFEV1
    bottom_10 = (df_for_ID["ecFEV1"] == df_for_ID["ecFEV1"].quantile(0.1)).argmax()
    # Select 1 entry's index in the 50% percentile
    median = (df_for_ID["ecFEV1"] == df_for_ID["ecFEV1"].quantile(0.5)).argmax()
    # Select 1 entry's index in the top 10% percentile
    top_10 = (df_for_ID["ecFEV1"] == df_for_ID["ecFEV1"].quantile(0.8)).argmax()
    idx_list = [bottom_10, median, top_10]
    # idx_list = list(np.random.choice(df_for_ID.index, 4))

    for i, _ in df_for_ID.iloc[idx_list].iterrows():
        df_one_day = df_for_ID.iloc[[i]]
        df_two_days = df_for_ID.iloc[[i, idx_max_FEV1]]

        df_query_res_one_day, _, _ = slicing.query_across_days(
            df_one_day, inf_alg, shared_vars, vars, obs_vars, diff_threshold, debug
        )
        # reset shared vars
        for var in shared_vars:
            var.reset()
        df_query_res_one_day_fef, _, _ = slicing.query_across_days(
            df_one_day, inf_alg, shared_vars, vars, obs_vars_fef, diff_threshold, debug
        )
        for var in shared_vars:
            var.reset()
        df_query_res_two_days, _, _ = slicing.query_across_days(
            df_two_days, inf_alg, shared_vars, vars, obs_vars, diff_threshold, debug
        )
        for var in shared_vars:
            var.reset()
        df_query_res_two_days_fef, _, _ = slicing.query_across_days(
            df_two_days, inf_alg, shared_vars, vars, obs_vars_fef, diff_threshold, debug
        )
        for var in shared_vars:
            var.reset()

        plot_one_and_two_days_models_res(
            df_query_res_one_day,
            df_query_res_one_day_fef,
            df_query_res_two_days,
            df_query_res_two_days_fef,
            HFEV1,
            HO2Sat,
            AR,
            IA,
            df_for_ID,
            save=True,
        )

    return (
        df_query_res_one_day,
        df_query_res_one_day_fef,
        df_query_res_two_days,
        df_query_res_two_days_fef,
    )


# df_for_ID = df[df["ID"] == "113"]
# (
#     df_query_res_one_day,
#     df_query_res_one_day_fef,
#     df_query_res_two_days,
#     df_query_res_two_days_fef,
# ) = infer_and_plot_for_id(df_for_ID, debug=False, diff_threshold=1e-2)


df.groupby("ID").apply(
    lambda df_for_ID: infer_and_plot_for_id(df_for_ID, debug=False, diff_threshold=1e-6)
)


ID: 101
#datapoints: 1680

ID: 102
#datapoints: 263

ID: 103
#datapoints: 375

ID: 104
#datapoints: 222

ID: 105
#datapoints: 6

ID: 106
#datapoints: 335

ID: 107
#datapoints: 126

ID: 108
#datapoints: 289

ID: 109
#datapoints: 94

ID: 110
#datapoints: 8

ID: 111
#datapoints: 124

ID: 112
#datapoints: 114

ID: 113
#datapoints: 714

ID: 114
#datapoints: 133

ID: 115
#datapoints: 43

ID: 116
#datapoints: 619

ID: 117
#datapoints: 270

ID: 118
#datapoints: 108

ID: 119
#datapoints: 88

ID: 120
#datapoints: 388

ID: 121
#datapoints: 87

ID: 122
#datapoints: 257

ID: 123
#datapoints: 1128

ID: 124
#datapoints: 33

ID: 125
#datapoints: 334

ID: 126
#datapoints: 60

ID: 127
#datapoints: 165

ID: 128
#datapoints: 39

ID: 129
#datapoints: 5

ID: 130
#datapoints: 282

ID: 131
#datapoints: 29

ID: 132
#datapoints: 27

ID: 133
#datapoints: 1066

ID: 134
#datapoints: 97

ID: 135
#datapoints: 26

ID: 138
#datapoints: 195

ID: 139
#datapoints: 276

ID: 140
#datapoints: 116

ID: 141
#datapoints: 201


ID
101    ([ID], [ID], [ID, Day], [ID, Day])
102    ([ID], [ID], [ID, Day], [ID, Day])
103    ([ID], [ID], [ID, Day], [ID, Day])
104    ([ID], [ID], [ID, Day], [ID, Day])
105    ([ID], [ID], [ID, Day], [ID, Day])
                      ...                
549    ([ID], [ID], [ID, Day], [ID, Day])
550    ([ID], [ID], [ID, Day], [ID, Day])
552    ([ID], [ID], [ID, Day], [ID, Day])
553    ([ID], [ID], [ID, Day], [ID, Day])
554    ([ID], [ID], [ID, Day], [ID, Day])
Length: 353, dtype: object

In [95]:
(
    _,
    inf_alg,
    HFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = mb.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(170, 30, "Female")

plot_one_and_two_days_models_res(
    df_query_res_one_day,
    df_query_res_one_day_fef,
    df_query_res_two_days,
    df_query_res_two_days_fef,
    HFEV1,
    HO2Sat,
    AR,
    IA,
    df_for_ID,
)

-1

In [None]:
print(
    f"HFEV1 max diff between 2 days: {max(df_query_res_two_days.loc[0, HFEV1.name] - df_query_res_two_days.loc[1, HFEV1.name]):.2g}"
)
print(
    f"HO2Sat max diff between 2 days: {max(df_query_res_two_days.loc[0, HO2Sat.name] - df_query_res_two_days.loc[1, HO2Sat.name]):.2g}"
)
# Stop criteria: difference of the last HFEV1 inferred between two epochs is les than a threshold (1e-6)

# Use exact inference

In [4]:
import src.data.breathe_data as bd
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.models.var_builders as var_builders
import src.inference.helpers as ih
from plotly.subplots import make_subplots

import pandas as pd
import numpy as np

# df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id")
df_for_ID = df[df["ID"] == "101"]
idx_max_FEV1 = df_for_ID.sort_values(
    by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
).index[0]

df_for_ID.iloc[[0, idx_max_FEV1]]

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,Max ecFEV1,Max ecFEF2575,idx ecFEV1 (L),idx O2 saturation (%)
0,101,2019-01-25,1.31,97,0.54,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,99.845492,41.221374,1.79,1.14,1,24
1576,101,2023-07-31,1.79,98,1.14,1.79,1.15,Male,173.0,53,3.610061,97.150104,49.583647,49.583647,100.874827,63.687151,1.79,1.14,1,24


In [5]:
height = df_for_ID.Height.iloc[0]
age = df_for_ID.Age.iloc[0]
sex = df_for_ID.Sex.iloc[0]
(
    model,
    inf_alg,
    HFEV1,
    HO2Sat,
    ecFEV1,
    AR,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    # ecFEF2575prctecFEV1,
    ecFEV1_2,
    AR_2,
    O2SatFFA_2,
    IA_2,
    UO2Sat_2,
    O2Sat_2,
    # ecFEF2575prctecFEV1_2,
) = mb.o2_sat_fev1_two_days_model_light(height, age, sex)

In [11]:
# vars = [HFEV1, HO2Sat, AR, IA, AR_2, IA_2]
vars = [AR, IA]
obs_vars = [[ecFEV1, 1.31], [O2Sat, 97], [ecFEV1_2, 1.79], [O2Sat_2, 98]]

res = ih.infer(inference_alg=inf_alg, variables=vars, evidences=obs_vars)

  phi.values = phi.values / phi1.values


# Exact vs approximate inference

In [3]:
df1 = df.copy()

In [10]:
for id in ['421']: #df1.ID.unique():
    df_for_ID = df1[df1["ID"] == id].reset_index()
    idx_init = df_for_ID.index[0]

    # Set values
    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]

    idx_max_FEV1 = df_for_ID.sort_values(
        by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
    ).index[0]

    (
        _,
        inf_alg_approx,
        HFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
    ) = mb.o2sat_fev1_point_in_time_model_shared_healthy_vars_light(height, age, sex)

    # Set variables parametrisation
    key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
    key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
    HFEV1.set_factor_node_key(key_hfev1)
    HO2Sat.set_factor_node_key(key_ho2sat)

    # Set inputs for approximate inference
    shared_vars_approx = [HFEV1, HO2Sat]
    vars_approx = []
    obs_var_names_approx = [ecFEV1.name, O2Sat.name]

    (
        model,
        inf_alg_exact,
        HFEV1_exact,
        HO2Sat,
        ecFEV1,
        AR,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEV1_2,
        AR_2,
        O2SatFFA_2,
        IA_2,
        UO2Sat_2,
        O2Sat_2,
    ) = mb.o2_sat_fev1_two_days_model_light(height, age, sex)
    shared_vars_exact = [HFEV1, HO2Sat]

    # Take random element from df_for_ID.index
    chosen_idx_list = np.random.choice(df_for_ID.index, 1)

    for i in chosen_idx_list:  # df_for_ID.iterrows():
        if i == idx_max_FEV1:
            print(f"Max FEV1 entry found (index {i}), skipping to avo")
            continue
        df_two_days = df_for_ID.iloc[[i, idx_max_FEV1]]

        # Approximate inference
        df_query_res, df_res_before_convergence, shared_vars_final = (
            slicing.query_across_days(
                df_two_days,
                inf_alg_approx,
                shared_vars_approx,
                vars_approx,
                obs_var_names_approx,
                1e-8,
                debug=False,
                auto_reset_shared_vars=True,
            )
        )
        hfev1_approx = df_query_res.loc[0, HFEV1.name]
        ho2sat_approx = df_query_res.loc[0, HO2Sat.name]

        # Exact inference
        fev1 = df_two_days.reset_index().loc[0, "ecFEV1"]
        fev1_max = df_two_days.reset_index().loc[1, "ecFEV1"]
        o2sat = df_two_days.reset_index().loc[0, "O2 Saturation"]
        o2sat_max = df_two_days.reset_index().loc[1, "O2 Saturation"]
        obs_vars = [
            [ecFEV1, fev1],
            [O2Sat, o2sat],
            [ecFEV1_2, fev1_max],
            [O2Sat_2, o2sat_max],
        ]

        res_exact = ih.infer(
            inference_alg=inf_alg_exact,
            variables=shared_vars_exact,
            evidences=obs_vars,
            debug=False,
        )
        hfev1_exact = res_exact[HFEV1.name].values
        ho2sat_exact = res_exact[HO2Sat.name].values

        # Save values
        # Kullback–Leibler divergence
        # it is the expectation of the logarithmic difference between the probabilities P and Q, where the expectation is taken using the probabilities P.
        def calc_kullback_leibler_divergence(p, q):
            # Replace 0s in q with 1e-10
            q[q == 0] = 1e-20
            p[p == 0] = 1e-20
            return sum(np.log(p / q) * p)

        kl_div_hfev1 = calc_kullback_leibler_divergence(hfev1_exact, hfev1_approx)
        kl_div_ho2sat = calc_kullback_leibler_divergence(ho2sat_exact, ho2sat_approx)

        def calc_relative_diff(exact, approx):
            exact_bis = exact.copy()
            exact_bis[np.abs(exact) < 0.0001] = 0
            approx_bis = approx.copy()
            approx_bis[np.abs(approx) < 0.0001] = 0
            # To perform a safe division, if hfev1_exact_bis is 0, then we divide by 1
            denominator_exact = exact_bis.copy()
            denominator_exact[np.abs(exact) < 0.0001] = 1
            return np.sum(
                np.abs((np.divide(exact_bis - approx_bis, denominator_exact) * 100))
            )

        def max_diff(exact, approx):
            """
            In the end I will just apply the max diff because 1/ it's easiest to interpret,
            and 2/ it reveals the changes in the highest values, which is what we are interested in
            """
            return max(np.abs(exact - approx))

        hfev1_relative_diff = calc_relative_diff(hfev1_exact, hfev1_approx)
        ho2sat_relative_diff = calc_relative_diff(ho2sat_exact, ho2sat_approx)

        df1.at[df_for_ID.loc[i, "index"], "HFEV1 exact inf"] = hfev1_exact
        df1.loc[df_for_ID.loc[i, "index"], "HFEV1 approx inf"] = hfev1_approx

        df1.loc[df_for_ID.loc[i, "index"], "HFEV1 max diff"] = hfev1_relative_diff
        df1.loc[df_for_ID.loc[i, "index"], "HO2Sat max diff"] = ho2sat_relative_diff

        df1.loc[df_for_ID.loc[i, "index"], "HFEV1 relative diff"] = hfev1_relative_diff
        df1.loc[df_for_ID.loc[i, "index"], "HO2Sat relative diff"] = (
            ho2sat_relative_diff
        )

        df1.loc[df_for_ID.loc[i, "index"], "HFEV1 KL div"] = kl_div_hfev1
        df1.loc[df_for_ID.loc[i, "index"], "HO2Sat KL div"] = kl_div_ho2sat

  phi.values = phi.values / phi1.values


ValueError: setting an array element with a sequence.

In [5]:
df1.sort_values(by="HFEV1 max diff", ascending=False).head(10)
# df1.sort_values(by="HFEV1 relative diff", ascending=False).head(10)
# df1.sort_values(by="HFEV1 KL div", ascending=False).head(10)
# The same 8 entries are in the top 10. Meaning that both are similar indicators of the difference between the two models
# The difference is small

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,...,Max ecFEV1,Max ecFEF2575,idx ecFEV1 (L),idx O2 saturation (%),HFEV1 max diff,HO2Sat max diff,HFEV1 relative diff,HO2Sat relative diff,HFEV1 KL div,HO2Sat KL div
35689,421,2022-02-21,1.47,97,0.74,1.47,0.74,Male,170.0,29,...,1.47,0.67,1,24,9.669759,2.113878,9.669759,2.113878,4.208741e-06,1.41911e-05
41194,550,2022-11-30,0.64,95,0.49,0.93,0.49,Male,170.0,59,...,1.25,0.52,0,23,8.434901,2.545717,8.434901,2.545717,1.912907e-06,1.291251e-05
36823,452,2023-05-08,1.39,98,0.79,1.5,0.81,Male,173.0,26,...,1.78,0.78,1,24,6.576789,2.86792,6.576789,2.86792,4.384312e-06,2.183457e-05
32309,345,2021-03-14,1.58,97,0.74,1.58,0.76,Male,175.0,25,...,1.66,0.84,1,24,5.056835,3.142864,5.056835,3.142864,5.458545e-06,2.289108e-05
40703,529,2022-06-03,1.68,97,2.33,1.95,2.33,Male,159.5,33,...,1.95,2.33,1,24,3.585763,0.909947,3.585763,0.909947,2.989799e-06,5.6924e-06
35008,409,2022-05-20,1.36,99,0.58,1.45,0.58,Male,177.0,35,...,1.45,0.58,1,25,2.55881,0.238676,2.55881,0.238676,6.500061e-07,4.242843e-07
32790,361,2023-04-24,1.36,99,0.75,1.43,0.75,Male,182.0,45,...,1.78,0.78,1,25,1.942583,0.289388,1.942583,0.289388,3.013924e-07,4.0213e-07
37567,484,2023-02-18,1.63,97,0.61,1.63,0.62,Male,174.0,35,...,1.64,0.64,1,24,1.564747,1.878069,1.564747,1.878069,2.219758e-07,3.866661e-06
30414,309,2021-01-18,1.48,96,0.79,1.58,0.88,Female,166.0,37,...,1.78,0.95,1,23,1.51448,0.663886,1.51448,0.663886,2.862195e-07,3.084474e-06
11488,149,2021-03-03,1.23,97,0.63,1.24,0.63,Female,171.0,43,...,1.73,0.77,1,24,1.344787,1.306372,1.344787,1.306372,1.495597e-07,6.76627e-06


In [None]:
# Spot cases where the error is high
# The lower the value, the higher the error -> remove the lowest probabilities