In [1]:
import src.data.breathe_data as bd
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.models.var_builders as var_builders
import src.inference.helpers as ih
from plotly.subplots import make_subplots
import src.data.helpers as dh
import src.models.helpers as mh
import plotly.graph_objects as go
import logging

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
from pgmpy.inference.ExactInference import VariableElimination

import pandas as pd
import numpy as np

# Data
## Run this to bypass the next two sections (get unblocked FEV1 and preprocessed data)

In [2]:
# Shortcut cell: load an already preprocessed measurements workbook so that the
# "Get max FEV1" and "Preprocess breathe Data" sections can be skipped.
# The active file name matches the workbook written by the `df.to_excel(...)`
# cell at the end of the preprocessing section; the commented-out lines are
# alternative input workbooks.
# df = bd.load_meas_from_excel(
#     "BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id_light"
# )
# df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id")
# df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx")
# df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx_light")
# df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id_light")

## Get max FEV1

In [2]:
# Load the measurements workbook that still holds the PEF columns (they are
# dropped in the next cell). This rebinds `df`, replacing any frame loaded by
# the shortcut cell above.
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan_conservative_smoothing")

In [3]:
# Remove the PEF columns, keep only rows where FEV1, O2 saturation and FEF25-75
# are all present, and add FEF25-75 expressed as a percentage of FEV1.
df = (
    df.drop(columns=["PEF", "ecPEF (L/s)", "PEF (L/s)"])
    .dropna(subset=["FEV1", "O2 Saturation", "FEF2575"])
    .assign(**{"ecFEF2575%ecFEV1": lambda d: d["FEF2575"] / d["FEV1"] * 100})
)
print(df.shape)
print("IDs: ", df["ID"].nunique())
df.head()

(41260, 16)
IDs:  352


Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.54,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,99.845492,41.221374
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.57,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,100.874827,43.51145
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,98.816157,51.145038
3,101,2019-01-28,1.3,96.0,0.69,1.3,0.69,Male,173.0,53,3.610061,97.150104,36.01047,36.01047,98.816157,53.076923
4,101,2019-01-29,1.28,98.0,0.6,1.28,0.6,Male,173.0,53,3.610061,97.150104,35.456463,35.456463,100.874827,46.875


In [4]:
# Add unblocked FEV1
def get_unblocked_fev1_for_ID(df):
    """
    Annotate one individual's measurements with the values at their best ecFEV1.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements of a single ID; must contain "ecFEV1" and "ecFEF2575".

    Returns
    -------
    pandas.DataFrame
        A new frame with two constant columns added:
        "Max ecFEV1" (the highest ecFEV1) and "Max ecFEF2575" (the ecFEF2575
        recorded on the row where ecFEV1 is highest, which is not necessarily
        the highest ecFEF2575).

    The input frame is left untouched: this function is used as a
    `groupby(...).apply` UDF, and pandas does not support UDFs that mutate the
    group they receive.
    """
    # Row label of the first occurrence of the maximum ecFEV1
    idx = df["ecFEV1"].idxmax()
    return df.assign(
        **{
            "Max ecFEV1": df.loc[idx, "ecFEV1"],
            "Max ecFEF2575": df.loc[idx, "ecFEF2575"],
        }
    )


# Apply the per-ID helper, then flatten the result back into a plain frame:
# the "ID" column is dropped before `reset_index()` re-creates it from the
# index, and the "level_1" column produced by that reset is discarded.
# NOTE(review): this relies on groupby.apply keeping the grouping column inside
# each group and prepending the group key to the index, which depends on the
# pandas version — confirm when upgrading.
df = (
    df.groupby("ID")
    .apply(get_unblocked_fev1_for_ID)
    .drop(columns=["ID"])
    .reset_index()
    .drop(columns=["level_1"])
)

In [5]:
# Sanity check: on the rows where ecFEV1 equals the individual's "Max ecFEV1",
# how far is ecFEF2575 from the stored "Max ecFEF2575"?
idx_max = df["ecFEV1"].eq(df["Max ecFEV1"])
df_tmp = df.loc[idx_max].copy()
df_tmp["Drop from max ecFEF2575"] = (
    df_tmp["Max ecFEF2575"] - df_tmp["ecFEF2575"]
).abs()

# True = no difference on that row
df_tmp["Drop from max ecFEF2575"].eq(0).value_counts()
# Most of the time, max ecFEV1 is obtained at the same time as max ecFEF2575

Drop from max ecFEF2575
True     373
False     49
Name: count, dtype: int64

## Preprocess breathe Data

In [5]:
# Build the model variables whose bin lookups are used in the next cell to
# discretise the measurements.
# NOTE(review): the arguments (160, 40, "Male") are presumably height, age and
# sex placeholders — confirm against the builder's signature.

# Process the data for the usual variable resolution
# (HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA, IA, UO2Sat, O2Sat, ecFEF2575prctecFEV1) = (
#     var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
#         160, 40, "Male"
#     )
# )

# Process the data for the light model
(
    HFEV1,
    uecFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = var_builders.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars_light(
    160, 40, "Male", ecfev1_noise_model_cpt_suffix="_std_0.7"
)

In [6]:
# Discretise each observed measurement with the bin-lookup method of the
# corresponding model variable and store it in an "idx <variable name>" column;
# the frame is written to the Excel file in the next cell.
# NOTE(review): an earlier warning here said FEV1 was used instead of ecFEV1.
# The code below reads the "ecFEV1" column, so that warning looks stale — confirm.
df[f"idx {ecFEV1.name}"] = df.apply(
    lambda row: ecFEV1.get_bin_idx_for_value(row["ecFEV1"]), axis=1
)
df[f"idx {ecFEF2575prctecFEV1.name}"] = df.apply(
    lambda row: ecFEF2575prctecFEV1.get_bin_idx_for_value(row["ecFEF2575%ecFEV1"]),
    axis=1,
)
# NOTE(review): this column is named "idx ..." like the two above but calls
# `get_bin_for_value` rather than `get_bin_idx_for_value` — confirm this is intended.
df[f"idx {O2Sat.name}"] = df.apply(
    lambda row: O2Sat.get_bin_for_value(row["O2 Saturation"]), axis=1
)

In [8]:
# Persist the preprocessed frame (with the added "idx ..." columns). The file
# name is the one loaded by the shortcut cell at the top of the Data section.
# NOTE(review): this path inserts "/" after get_path_to_main(), while the other
# save paths in this notebook append directly to it — confirm which is right.
df.to_excel(
    dh.get_path_to_main()
    + "/ExcelFiles/BR/BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx_light.xlsx",
    index=False,
)

# Two days model with approximate inference (slicing)

## Plot 3 examples of the two days inference for each ID

### Detailed plot

In [8]:
# Plot function
def plot_one_and_two_days_models_res(
    df_query_res_one_day,
    df_query_res_one_day_fef,
    df_query_res_two_days,
    df_query_res_two_days_fef,
    HFEV1,
    HO2Sat,
    AR,
    IA,
    df,
    save=False,
    plot_type_title="",
):
    """
    Detailed figure comparing the one-day and two-days inference results.

    Layout (8 rows x 4 columns):
      * row 1: priors of HFEV1 and HO2Sat (taken from their `.cpt`);
      * rows 2-3: one-day results for HFEV1/HO2Sat and AR/IA
        (columns 1-2 without, columns 3-4 with FEF25-75 as evidence);
      * row 4: two-days results for the shared variables HFEV1/HO2Sat;
      * rows 5-6: two-days AR/IA for result row 0 ("day 1") and row 1 ("max");
      * rows 7-8: full-width ecFEV1 and O2 saturation time series of `df`,
        shaded with the individual's drug therapy periods.

    Parameters
    ----------
    df_query_res_one_day, df_query_res_one_day_fef : pandas.DataFrame
        One-day query results; row 0 is read for HFEV1, HO2Sat, AR and IA.
    df_query_res_two_days, df_query_res_two_days_fef : pandas.DataFrame
        Two-days query results; rows 0 and 1 are read. The "_fef" frame also
        provides the observation and demographics columns used in the titles.
    HFEV1, HO2Sat, AR, IA
        Model variables; `.name`, `.a`, `.b` (and `.cpt` for the priors) are used.
    df : pandas.DataFrame
        The individual's measurements ("ecFEV1", "O2 Saturation", "Date Recorded").
    save : bool
        Write the figure to a PDF when True, otherwise show it.
    plot_type_title : str
        Optional file-name prefix. Accepted so that this signature matches the
        summary plot defined later in the notebook under the same name; the
        file name is unchanged when it is left empty.

    Returns
    -------
    int
        Always -1.

    Note: a later cell redefines this function name with a different figure;
    whichever cell ran last is the one that gets called.
    """
    lay_row = [{}, {}, {}, {}]
    lay_full_row = [{"colspan": 4}, None, None, None]
    # 6 rows of four histogram panels, then 2 full-width time-series rows
    fig = make_subplots(
        rows=8,
        cols=4,
        specs=[lay_row] * 6 + [lay_full_row] * 2,
    )

    def _obs_summary(day):
        # Observations of result row `day`, formatted for the figure title
        res = df_query_res_two_days_fef
        return (
            f"ecFEV1={res.loc[day, 'ecFEV1']:.2g}, "
            f"O2 Saturation={res.loc[day, 'O2 Saturation']:.2g}%, "
            f"ecFEF2575%ecFEV1={res.loc[day, 'ecFEF2575%ecFEV1']:.2f}%"
        )

    title_day = _obs_summary(0)
    title_max_FEV1_day = _obs_summary(1)
    title_prefix = (
        f"Comparing one vs two days model ID={df_query_res_two_days_fef.loc[0, 'ID']}, "
        f"{df_query_res_two_days_fef.loc[0, 'Age']}y, "
        f"{df_query_res_two_days_fef.loc[0, 'Height']}cm, "
        f"{df_query_res_two_days_fef.loc[0, 'Sex']}"
    )

    hfev1_col, ho2sat_col, ar_col, ia_col = "#009e73", "#0072b2", "#d55e00", "#cc79a7"
    one, one_fef = df_query_res_one_day, df_query_res_one_day_fef
    two, two_fef = df_query_res_two_days, df_query_res_two_days_fef

    # (variable, distribution, subplot row, subplot col, title, colour)
    # Listed in drawing order; every distribution is looked up here, before any
    # panel is drawn, so a missing column fails early.
    panels = [
        # Priors
        (HFEV1, HFEV1.cpt, 1, 1, HFEV1.name + " prior", hfev1_col),
        (HO2Sat, HO2Sat.cpt, 1, 2, HO2Sat.name + " prior", ho2sat_col),
        # One day: shared variables
        (HFEV1, one.loc[0, HFEV1.name], 2, 1, "HFEV1 one day", hfev1_col),
        (HFEV1, one_fef.loc[0, HFEV1.name], 2, 3, "HFEV1 one day with FEF25-75", hfev1_col),
        (HO2Sat, one.loc[0, HO2Sat.name], 2, 2, "HO2Sat one day", ho2sat_col),
        (HO2Sat, one_fef.loc[0, HO2Sat.name], 2, 4, "HO2Sat one day with FEF25-75", ho2sat_col),
        # One day: AR and IA
        (AR, one.loc[0, AR.name], 3, 1, "AR one day", ar_col),
        (AR, one_fef.loc[0, AR.name], 3, 3, "AR one day with FEF25-75", ar_col),
        (IA, one.loc[0, IA.name], 3, 2, IA.name + " one day", ia_col),
        (IA, one_fef.loc[0, IA.name], 3, 4, IA.name + " one day with FEF25-75", ia_col),
        # Two days: shared variables (read from result row 0)
        (HFEV1, two.loc[0, HFEV1.name], 4, 1, "HFEV1 shared posterior", hfev1_col),
        (HFEV1, two_fef.loc[0, HFEV1.name], 4, 3, "HFEV1 shared posterior with FEF25-75", hfev1_col),
        (HO2Sat, two.loc[0, HO2Sat.name], 4, 2, "HO2Sat shared posterior", ho2sat_col),
        (HO2Sat, two_fef.loc[0, HO2Sat.name], 4, 4, "HO2Sat shared posterior with FEF25-75", ho2sat_col),
        # Two days: AR per day (row 0 = current day, row 1 = max FEV1 day)
        (AR, two.loc[0, AR.name], 5, 1, "AR day 1", ar_col),
        (AR, two.loc[1, AR.name], 6, 1, "AR max", ar_col),
        (AR, two_fef.loc[0, AR.name], 5, 3, "AR day 1 with FEF25-75", ar_col),
        (AR, two_fef.loc[1, AR.name], 6, 3, "AR max with FEF25-75", ar_col),
        # Two days: IA per day
        (IA, two.loc[0, IA.name], 5, 2, "IA day 1", ia_col),
        (IA, two.loc[1, IA.name], 6, 2, "IA max", ia_col),
        (IA, two_fef.loc[0, IA.name], 5, 4, "IA day 1 with FEF25-75", ia_col),
        (IA, two_fef.loc[1, IA.name], 6, 4, "IA max with FEF25-75", ia_col),
    ]
    for var, dist, row, col, title, colour in panels:
        ih.plot_histogram(fig, var, dist, var.a, var.b, row, col, title, colour=colour)

    # Time-series profiles (rows 7 and 8)
    fig.add_trace(
        go.Scatter(y=df["ecFEV1"], x=df["Date Recorded"], mode="markers"),
        row=7,
        col=1,
    )
    fig.update_yaxes(title="ecFEV1 (L)", row=7, col=1)
    fig.add_trace(
        go.Scatter(y=df["O2 Saturation"], x=df["Date Recorded"], mode="markers"),
        row=8,
        col=1,
    )
    fig.update_yaxes(title="O2 saturation (%)", row=8, col=1)

    # Drug therapy periods of this individual, shaded on both profiles
    drug_df = bd.load_drug_therapies()
    drug_df = drug_df[drug_df["DrugTherapyType"] != "Unknown"]
    drug_df_for_ID = drug_df[drug_df.ID == df_query_res_one_day.loc[0, "ID"]]

    therapy_colours = {
        "Trikafta": "green",
        "Ivacaftor": "purple",
        "Symkevi": "purple",
        "Orkambi": "purple",
    }
    # Therapy types missing from the mapping are shaded grey instead of
    # aborting the whole figure with a KeyError.
    fallback_colour = "grey"
    annotation_y = df["ecFEV1"].max() * 1.02

    for _, therapy in drug_df_for_ID.iterrows():
        start_date = therapy.DrugTherapyStartDate
        stop_date = therapy.DrugTherapyStopDate
        if pd.isnull(stop_date):
            # Open-ended therapy: shade up to the last recorded measurement
            stop_date = df["Date Recorded"].max()
        fill = therapy_colours.get(therapy.DrugTherapyType, fallback_colour)
        for profile_row in (7, 8):
            fig.add_vrect(
                x0=start_date,
                x1=stop_date,
                fillcolor=fill,
                opacity=0.08,
                layer="below",
                line_width=0,
                name=therapy.DrugTherapyType,
                row=profile_row,
                col=1,
            )
        # Label the therapy just above the ecFEV1 profile
        fig.add_annotation(
            x=start_date,
            y=annotation_y,
            text=therapy.DrugTherapyType,
            showarrow=False,
            font=dict(size=8),
            row=7,
            col=1,
        )

    # Shrink the markers of the scatter traces (rows 7 and 8)
    fig.update_traces(marker=dict(size=3), selector=dict(mode="markers"))

    fig.update_xaxes(title_standoff=5)

    fig.update_layout(
        title=f"{title_prefix}<br><br>Current day:   {title_day}<br>Max FEV1 day:{title_max_FEV1_day}",
        width=1000,
        height=1000,
        font=dict(size=8),
        showlegend=False,
    )

    file_stem = f"{title_prefix}, {title_day}"
    if plot_type_title:
        file_stem = f"{plot_type_title} - {file_stem}"
    path = f"{dh.get_path_to_main()}PlotsBreathe/Two days model/{file_stem}.pdf"
    if save:
        fig.write_image(path)
    else:
        fig.show()

    return -1

### AR, HFEV1 specific plots

In [22]:
# Plot function
def plot_one_and_two_days_models_res(
    df_query_res_one_day,
    df_query_res_one_day_fef,
    df_query_res_two_days,
    df_query_res_two_days_fef,
    HFEV1,
    HO2Sat,
    AR,
    IA,
    df,
    save=False,
    plot_type_title="",
):
    """
    Compact HFEV1 / AR summary figure for the one-day and two-days results.

    Layout (2 rows x 4 columns): row 1 shows HFEV1, row 2 shows AR.
    Column 1 is the prior (the variable's `.cpt`), column 2 the one-day result,
    column 3 the one-day result with FEF25-75 as evidence, and column 4 the
    two-days result with FEF25-75 (result row 0).

    Parameters
    ----------
    df_query_res_one_day, df_query_res_one_day_fef : pandas.DataFrame
        One-day query results; row 0 is read for HFEV1 and AR.
    df_query_res_two_days : pandas.DataFrame
        Not used by this figure; kept for signature compatibility.
    df_query_res_two_days_fef : pandas.DataFrame
        Two-days query result; row 0 is plotted, rows 0 and 1 also provide the
        observation and demographics columns used in the titles.
    HFEV1, AR
        Model variables; `.name`, `.a`, `.b` and `.cpt` are used.
    HO2Sat, IA, df
        Not used by this figure; kept for signature compatibility.
    save : bool
        Write the figure to a PDF when True, otherwise show it.
    plot_type_title : str
        Prefix of the saved file name.

    Returns
    -------
    int
        Always -1.

    Note: this cell reuses the name of the detailed plot defined earlier in
    the notebook and therefore replaces it once executed.
    """
    fig = make_subplots(rows=2, cols=4, vertical_spacing=0.25)

    def _obs_summary(day):
        # Observations of result row `day`, formatted for the figure title
        res = df_query_res_two_days_fef
        return (
            f"ecFEV1={res.loc[day, 'ecFEV1']:.2g}, "
            f"O2 Saturation={res.loc[day, 'O2 Saturation']:.2g}%, "
            f"ecFEF2575%ecFEV1={res.loc[day, 'ecFEF2575%ecFEV1']:.2f}%"
        )

    title_day = _obs_summary(0)
    title_max_FEV1_day = _obs_summary(1)
    title_prefix = (
        f"Comparing one vs two days model ID={df_query_res_two_days_fef.loc[0, 'ID']}, "
        f"{df_query_res_two_days_fef.loc[0, 'Age']}y, "
        f"{df_query_res_two_days_fef.loc[0, 'Height']}cm, "
        f"{df_query_res_two_days_fef.loc[0, 'Sex']}"
    )

    hfev1_col, ar_col = "#009e73", "#d55e00"
    # AR panels ask the histogram helper for clean ticks; HFEV1 panels keep
    # the helper's default.
    hfev1_opts = {}
    ar_opts = {"clean_ticks": True}

    # (variable, distribution, subplot row, subplot col, title, colour, extra kwargs)
    # Listed in drawing order.
    panels = [
        # Priors
        (HFEV1, HFEV1.cpt, 1, 1, HFEV1.name + " prior", hfev1_col, hfev1_opts),
        (AR, AR.cpt, 2, 1, AR.name + " prior", ar_col, ar_opts),
        # One day HFEV1
        (HFEV1, df_query_res_one_day.loc[0, HFEV1.name], 1, 2, "HFEV1", hfev1_col, hfev1_opts),
        (
            HFEV1,
            df_query_res_one_day_fef.loc[0, HFEV1.name],
            1,
            3,
            "HFEV1 with FEF25-75",
            hfev1_col,
            hfev1_opts,
        ),
        # One day AR
        (AR, df_query_res_one_day.loc[0, AR.name], 2, 2, "AR", ar_col, ar_opts),
        (
            AR,
            df_query_res_one_day_fef.loc[0, AR.name],
            2,
            3,
            "AR with FEF25-75",
            ar_col,
            ar_opts,
        ),
        # Two days, with FEF25-75
        (
            HFEV1,
            df_query_res_two_days_fef.loc[0, HFEV1.name],
            1,
            4,
            "HFEV1 with FEF25-75",
            hfev1_col,
            hfev1_opts,
        ),
        (
            AR,
            df_query_res_two_days_fef.loc[0, AR.name],
            2,
            4,
            "AR day 1 with FEF25-75",
            ar_col,
            ar_opts,
        ),
    ]
    for var, dist, row, col, title, colour, extra in panels:
        ih.plot_histogram(
            fig, var, dist, var.a, var.b, row, col, title, colour=colour, **extra
        )

    # Only affects traces drawn in "markers" mode, if the histogram helper adds any
    fig.update_traces(marker=dict(size=3), selector=dict(mode="markers"))

    fig.update_xaxes(title_standoff=5)

    fig.update_layout(
        title=f"{title_prefix}<br><br>Current day:   {title_day}<br>Max FEV1 day:{title_max_FEV1_day}",
        width=800,
        height=350,
        font=dict(size=8),
        showlegend=False,
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False),
    )

    path = f"{dh.get_path_to_main()}PlotsBreathe/Two days model/{plot_type_title} - {title_prefix}, {title_day}.pdf"
    if save:
        fig.write_image(path)
    else:
        fig.show()

    return -1

### Run plotting fn

In [24]:
# Code to plot 3 examples of the two days inference for each ID
def infer_and_plot_for_id(df_for_ID, debug, diff_threshold=1e-8):
    """
    Run the one-day and two-days queries for one individual and save the plot.

    One day is selected: the row whose ecFEV1 is closest to the individual's
    10% ecFEV1 quantile. It is queried alone ("one day") and together with the
    individual's max-ecFEV1 day ("two days"), each time with and without
    FEF25-75 as evidence. The four results are passed to
    `plot_one_and_two_days_models_res` with `save=True`.

    Parameters
    ----------
    df_for_ID : pandas.DataFrame
        Measurements of a single ID (ID, Height, Age, Sex, ecFEV1, ecFEF2575,
        "O2 Saturation", plus whatever the query helper reads).
    debug : bool
        Forwarded to `slicing.query_forwardly_across_days`.
    diff_threshold : float
        Forwarded to `slicing.query_forwardly_across_days`.

    Returns
    -------
    tuple of pandas.DataFrame
        (one day, one day with FEF25-75, two days, two days with FEF25-75).
    """
    df_for_ID = df_for_ID.reset_index(drop=True)
    print(f"\nID: {df_for_ID.ID.iloc[0]}")
    print(f"#datapoints: {len(df_for_ID)}")

    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]
    # The noise variant of this builder
    # (mb.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars)
    # additionally returns uecFEV1, which then replaces ecFEV1 in key_hfev1.
    (
        _,
        inf_alg,
        HFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEF2575prctecFEV1,
    ) = mb.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(height, age, sex)

    # Set variables parametrisation
    key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
    key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
    HFEV1.set_factor_node_key(key_hfev1)
    HO2Sat.set_factor_node_key(key_ho2sat)

    day_vars = [AR, IA]
    shared_vars = [HFEV1, HO2Sat]
    obs_vars = [ecFEV1.name, O2Sat.name]
    obs_vars_fef = [ecFEV1.name, O2Sat.name, ecFEF2575prctecFEV1.name]

    # Day which maximises ecFEV1, then ecFEF2575, then O2 Saturation.
    # The index was reset above, so this label is also the row position.
    idx_max_FEV1 = df_for_ID.sort_values(
        by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
    ).index[0]

    # Row closest to the 10% ecFEV1 quantile. An exact-equality test is not
    # enough here: the quantile is interpolated and often matches no recorded
    # value, in which case `(x == q).argmax()` silently returns row 0.
    ecfev1_values = df_for_ID["ecFEV1"]
    bottom_10 = (ecfev1_values - ecfev1_values.quantile(0.1)).abs().idxmin()
    idx_list = [bottom_10]

    def _query(df_days, evidence_names):
        # NOTE(review): another cell of this notebook passes a precomputed
        # messages dict between diff_threshold and debug — confirm that `debug`
        # lands on the intended parameter of query_forwardly_across_days.
        df_query_res, _, _ = slicing.query_forwardly_across_days(
            df_days,
            inf_alg,
            shared_vars,
            day_vars,
            evidence_names,
            diff_threshold,
            debug,
        )
        return df_query_res

    for i in idx_list:
        df_one_day = df_for_ID.iloc[[i]]
        df_two_days = df_for_ID.iloc[[i, idx_max_FEV1]]

        # Query order is kept as-is: the shared variables are reused across calls
        df_query_res_one_day = _query(df_one_day, obs_vars)
        df_query_res_one_day_fef = _query(df_one_day, obs_vars_fef)
        df_query_res_two_days = _query(df_two_days, obs_vars)
        df_query_res_two_days_fef = _query(df_two_days, obs_vars_fef)

        plot_one_and_two_days_models_res(
            df_query_res_one_day,
            df_query_res_one_day_fef,
            df_query_res_two_days,
            df_query_res_two_days_fef,
            HFEV1,
            HO2Sat,
            AR,
            IA,
            df_for_ID,
            save=True,
            plot_type_title="Summary",
        )

    # Results of the last (currently the only) plotted day
    return (
        df_query_res_one_day,
        df_query_res_one_day_fef,
        df_query_res_two_days,
        df_query_res_two_days_fef,
    )


# Single-ID variant, handy for debugging:
# df_for_ID = df[df["ID"] == "113"]
# (
#     df_query_res_one_day,
#     df_query_res_one_day_fef,
#     df_query_res_two_days,
#     df_query_res_two_days_fef,
# ) = infer_and_plot_for_id(df_for_ID, debug=False, diff_threshold=1e-2)


# Run the inference and save one summary plot for each of the selected IDs
res = (
    df.loc[
        df.ID.isin(
            ["132", "146", "177", "180", "202", "527", "101"]
            + ["117", "131", "134", "191", "139", "253"]
        )
    ]
    .groupby("ID")
    .apply(
        lambda df_for_ID: infer_and_plot_for_id(
            df_for_ID, diff_threshold=1e-6, debug=False
        )
    )
)


ID: 101
#datapoints: 1680

ID: 117
#datapoints: 270

ID: 131
#datapoints: 29

ID: 132
#datapoints: 27

ID: 134
#datapoints: 97

ID: 139
#datapoints: 276

ID: 146
#datapoints: 278

ID: 177
#datapoints: 251

ID: 180
#datapoints: 262

ID: 191
#datapoints: 213

ID: 202
#datapoints: 130

ID: 253
#datapoints: 60

ID: 527
#datapoints: 5


In [None]:
# Compare the shared variables stored on the two rows of a two-days result:
# prints the largest element-wise (signed) difference for HFEV1 and HO2Sat.
# NOTE(review): `df_query_res_two_days` is not assigned at top level in the
# cells visible here (only inside functions and in a commented-out call), so
# this cell depends on leftover kernel state.
print(
    f"HFEV1 max diff between 2 days: {max(df_query_res_two_days.loc[0, HFEV1.name] - df_query_res_two_days.loc[1, HFEV1.name]):.2g}"
)
print(
    f"HO2Sat max diff between 2 days: {max(df_query_res_two_days.loc[0, HO2Sat.name] - df_query_res_two_days.loc[1, HO2Sat.name]):.2g}"
)
# Stop criterion: the difference of the last inferred HFEV1 between two epochs is less than a threshold (1e-6)

## Save the airway resistance for each day using the two days model
The airway resistance inferred using the two days model is believed to be the best available estimate of the true airway resistance. I will use it to redefine F3 (the CPT linking FEF25-75 to airway resistance).

In [3]:
# Code to save the airway resistance for each day using the two days model
def infer_for_id(
    df_for_ID,
    debug,
    diff_threshold=1e-8,
    ecfev1_noise_model_cpt_suffix="_std_0.23",
):
    """
    Infer HFEV1, HO2Sat and AR for every day of one individual.

    Each day is queried together with the individual's max-ecFEV1 day (a
    two-days input); the max day itself is queried alone. Adding the max FEV1
    information allows a better estimation of HFEV1, thereby reducing the
    uncertainty shared between AR and HFEV1. Evidence is ecFEV1 and
    ecFEF2575%ecFEV1.

    Parameters
    ----------
    df_for_ID : pandas.DataFrame
        Measurements of a single ID (ID, Height, Age, Sex, ecFEV1, ecFEF2575,
        "O2 Saturation", plus whatever the query helper reads).
    debug : bool
        Forwarded to `slicing.query_forwardly_across_days`.
    diff_threshold : float
        Forwarded to `slicing.query_forwardly_across_days`.
    ecfev1_noise_model_cpt_suffix : str
        Suffix selecting the ecFEV1 noise model CPT, forwarded to the model
        builder. Defaults to the previously hard-coded "_std_0.23".

    Returns
    -------
    pandas.DataFrame
        One row per input day, in input order, with the columns
        "ID", "Day", HFEV1.name, HO2Sat.name and AR.name taken from the first
        row of each query result. All rows carry index label 0; callers reset
        the index.
    """
    df_for_ID = df_for_ID.reset_index(drop=True)
    print(f"\nID: {df_for_ID.ID.iloc[0]}")
    print(f"#datapoints: {len(df_for_ID)}")

    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]
    (
        _,
        inf_alg,
        HFEV1,
        uecFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEF2575prctecFEV1,
    ) = mb.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars(
        height, age, sex, ecfev1_noise_model_cpt_suffix=ecfev1_noise_model_cpt_suffix
    )

    # Set variables parametrisation (the noise model links HFEV1 to uecFEV1)
    key_hfev1 = f"['{uecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
    key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
    HFEV1.set_factor_node_key(key_hfev1)
    HO2Sat.set_factor_node_key(key_ho2sat)

    day_vars = [AR]
    shared_vars = [HFEV1, HO2Sat]
    # Other evidence sets tried before: [ecFEV1], [ecFEV1, O2Sat],
    # [ecFEV1, O2Sat, ecFEF2575prctecFEV1]
    obs_vars = [ecFEV1.name, ecFEF2575prctecFEV1.name]

    # Day which maximises ecFEV1, then ecFEF2575, then O2 Saturation.
    # The index was reset above, so this label is also the row position.
    idx_max_FEV1 = df_for_ID.sort_values(
        by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
    ).index[0]

    # Precomputed message to speed up inference: a uniform distribution over
    # AR's states for the factor-to-AR message keyed below.
    arr = np.ones(AR.card)
    arr /= arr.sum()
    uniform_from_o2_side = {
        "['O2 saturation if fully functional alveoli (%)', 'Healthy O2 saturation (%)', 'Airway resistance (%)'] -> Airway resistance (%)": arr
    }
    precomp_messages = uniform_from_o2_side

    result_columns = ["ID", "Day", HFEV1.name, HO2Sat.name, AR.name]

    # Collect one single-row frame per day and concatenate once at the end;
    # concatenating inside the loop copies the accumulated frame on every
    # iteration (quadratic in the number of days).
    per_day_rows = []
    for i in range(len(df_for_ID)):
        if i != idx_max_FEV1:
            df_two_days = df_for_ID.iloc[[i, idx_max_FEV1]]
        else:
            # The current day is the max FEV1 day: run a one-day model
            df_two_days = df_for_ID.iloc[[i]]

        df_query_res_two_days, _, _ = slicing.query_forwardly_across_days(
            df_two_days,
            inf_alg,
            shared_vars,
            day_vars,
            obs_vars,
            diff_threshold,
            precomp_messages.copy(),
            debug,
        )

        # Keep the current day's result only (row 0)
        new_row = df_query_res_two_days.loc[0, result_columns]
        per_day_rows.append(pd.DataFrame(new_row).T)

    return pd.concat(per_day_rows)


# Single-ID variant, handy for debugging:
# df_for_ID = df[df["ID"] == "101"]
# res = infer_for_id(df_for_ID, debug=False, diff_threshold=1e-6).reset_index(drop=True)

# Run the per-day inference for every ID; each ID contributes one frame
# (index reset per ID) and groupby.apply stacks them.
res = df.groupby("ID").apply(
    lambda df_for_ID: infer_for_id(
        df_for_ID, debug=False, diff_threshold=1e-6
    ).reset_index(drop=True)
)


ID: 101
#datapoints: 1680

ID: 102
#datapoints: 263

ID: 103
#datapoints: 375

ID: 104
#datapoints: 222

ID: 105
#datapoints: 6

ID: 106
#datapoints: 335

ID: 107
#datapoints: 126

ID: 108
#datapoints: 289

ID: 109
#datapoints: 94

ID: 110
#datapoints: 8

ID: 111
#datapoints: 124

ID: 112
#datapoints: 114

ID: 113
#datapoints: 714

ID: 114
#datapoints: 133

ID: 115
#datapoints: 43

ID: 116
#datapoints: 619

ID: 117
#datapoints: 270

ID: 118
#datapoints: 108

ID: 119
#datapoints: 88

ID: 120
#datapoints: 388

ID: 121
#datapoints: 87

ID: 122
#datapoints: 257

ID: 123
#datapoints: 1128

ID: 124
#datapoints: 33

ID: 125
#datapoints: 334

ID: 126
#datapoints: 60

ID: 127
#datapoints: 165

ID: 128
#datapoints: 39

ID: 129
#datapoints: 5

ID: 130
#datapoints: 282

ID: 131
#datapoints: 29

ID: 132
#datapoints: 27

ID: 133
#datapoints: 1066

ID: 134
#datapoints: 97

ID: 135
#datapoints: 26

ID: 138
#datapoints: 195

ID: 139
#datapoints: 276

ID: 140
#datapoints: 116

ID: 141
#datapoints: 201


In [4]:
# Replace the index produced by groupby.apply with a plain 0..n-1 index
res = res.reset_index(drop=True)

In [5]:
# Save the per-day inference results. The file name records the evidence used
# (ecFEV1, ecFEF2575) and the ecFEV1 noise std (0.23) set in `infer_for_id`;
# keep it in sync if those settings change.
res.to_excel(
    f"{dh.get_path_to_main()}ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_ecFEV1_ecFEF2575_ecfev1noisestd0.23.xlsx",
    index=False,
)

In [6]:
# Display the results: one row per ID and day; the three model-variable columns
# hold one array per cell.
res

Unnamed: 0,ID,Day,Healthy FEV1 (L),Healthy O2 saturation (%),Airway resistance (%)
0,101,2019-01-25,"[8.888450155689279e-05, 0.00010466336733997926...","[1.301313441818203e-35, 8.428052860278614e-31,...","[8.201938384420302e-11, 4.845853906070325e-10,..."
1,101,2019-01-26,"[8.888450155689279e-05, 0.00010466336733997926...","[1.301313441818203e-35, 8.428052860278614e-31,...","[8.201938384420302e-11, 4.845853906070325e-10,..."
2,101,2019-01-27,"[8.888450155689279e-05, 0.00010466336733997926...","[1.301313441818203e-35, 8.428052860278614e-31,...","[8.201938384420302e-11, 4.845853906070325e-10,..."
3,101,2019-01-28,"[8.888450155689279e-05, 0.00010466336733997926...","[1.301313441818203e-35, 8.428052860278614e-31,...","[8.201938384420302e-11, 4.845853906070325e-10,..."
4,101,2019-01-29,"[8.888450155689279e-05, 0.00010466336733997926...","[1.301313441818203e-35, 8.428052860278614e-31,...","[8.201938384420302e-11, 4.845853906070325e-10,..."
...,...,...,...,...,...
41255,553,2023-10-08,"[5.4333570954118006e-08, 7.32922355066062e-08,...","[1.515464415190392e-34, 8.2040210808894e-30, 1...","[3.586353045071328e-13, 1.7485851234805522e-12..."
41256,553,2023-10-11,"[5.816215361310191e-08, 7.77132899197018e-08, ...","[1.515464415190391e-34, 8.204021080889399e-30,...","[4.0364040356365383e-13, 2.0683874143639004e-1..."
41257,553,2023-11-06,"[5.4333570954118006e-08, 7.32922355066062e-08,...","[1.515464415190392e-34, 8.2040210808894e-30, 1...","[3.586353045071328e-13, 1.7485851234805522e-12..."
41258,553,2023-11-08,"[5.4333570954118006e-08, 7.32922355066062e-08,...","[1.515464415190392e-34, 8.2040210808894e-30, 1...","[3.586353045071328e-13, 1.7485851234805522e-12..."


# Two days model with exact inference

In [None]:
import src.data.breathe_data as bd
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.models.var_builders as var_builders
import src.inference.helpers as ih
from plotly.subplots import make_subplots

import pandas as pd
import numpy as np

# df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx_and_heighest_obs_per_id")
df_for_ID = df[df["ID"] == "101"]
# Index *label* of the day with the highest ecFEV1 (ties broken by ecFEF2575, then
# by O2 saturation)
idx_max_FEV1 = df_for_ID.sort_values(
    by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
).index[0]

# Show the first recorded day next to the max FEV1 day.
# idx_max_FEV1 is a label of df's index (df_for_ID is not re-indexed here), so the
# rows are selected with label-based .loc; position-based .iloc would only pick the
# right row when labels and positions happen to coincide.
df_for_ID.loc[[df_for_ID.index[0], idx_max_FEV1]]

In [None]:
# Patient characteristics, read from the first row of the selected ID's data
height = df_for_ID.Height.iloc[0]
age = df_for_ID.Age.iloc[0]
sex = df_for_ID.Sex.iloc[0]
# Build the two-days model and unpack the model, its inference algorithm and its
# variables. The unpacking is positional, so the order of the names below must
# match the builder's return order.
# Names with the `_2` suffix presumably belong to the second day - TODO confirm
# against mb.o2_sat_fev1_two_days_model_light.
(
    model,
    inf_alg,
    HFEV1,
    HO2Sat,
    ecFEV1,
    AR,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    # ecFEF2575prctecFEV1,
    ecFEV1_2,
    AR_2,
    O2SatFFA_2,
    IA_2,
    UO2Sat_2,
    O2Sat_2,
    # ecFEF2575prctecFEV1_2,
) = mb.o2_sat_fev1_two_days_model_light(height, age, sex)

In [None]:
# vars = [HFEV1, HO2Sat, AR, IA, AR_2, IA_2]
# Variables to query.
# NOTE(review): `vars` shadows the Python builtin of the same name.
vars = [AR, IA]
# Evidence given as [variable, observed value] pairs: ecFEV1 and O2Sat for each of
# the two sets of variables of the two-days model
obs_vars = [[ecFEV1, 1.31], [O2Sat, 97], [ecFEV1_2, 1.79], [O2Sat_2, 98]]

res = ih.infer(inference_alg=inf_alg, variables=vars, evidence=obs_vars)

# 1 day, exact vs approximate inference

In [None]:
df1 = df.copy()

In [None]:
df1.columns

In [None]:
# For each ID, pick one random day, pair it with the ID's max FEV1 day and compare
# the HFEV1 / HO2Sat posteriors obtained with approximate and exact inference.

# Seed once so that the random day selection is reproducible across runs
np.random.seed(0)

# One single-row frame per processed day; concatenated once after the loop instead
# of growing df_res with pd.concat at every iteration
res_frames = []

for id in df1.ID.unique():
    df_for_ID = df1[df1["ID"] == id].reset_index()

    # Set values
    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]

    # After reset_index() the labels equal the positions, so this label can be used
    # with .iloc below
    idx_max_FEV1 = df_for_ID.sort_values(
        by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
    ).index[0]

    # Point-in-time model used by the approximate inference
    (
        _,
        inf_alg_approx,
        HFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
    ) = mb.o2sat_fev1_point_in_time_model_shared_healthy_vars_light(height, age, sex)

    # Set variables parametrisation
    key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
    key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
    HFEV1.set_factor_node_key(key_hfev1)
    HO2Sat.set_factor_node_key(key_ho2sat)

    # Set inputs for approximate inference
    shared_vars_approx = [HFEV1, HO2Sat]
    vars_approx = []
    obs_var_names_approx = [ecFEV1.name, O2Sat.name]

    # Two-days model used by the exact inference. From here on HO2Sat, ecFEV1, AR,
    # O2SatFFA, IA, UO2Sat and O2Sat refer to the exact model's variables; HFEV1
    # still refers to the approximate model's variable.
    (
        model,
        inf_alg_exact,
        HFEV1_exact,
        HO2Sat,
        ecFEV1,
        AR,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEV1_2,
        AR_2,
        O2SatFFA_2,
        IA_2,
        UO2Sat_2,
        O2Sat_2,
    ) = mb.o2_sat_fev1_two_days_model_light(height, age, sex)
    # Query the exact model with its own variables (HFEV1_exact rather than the
    # approximate model's HFEV1)
    shared_vars_exact = [HFEV1_exact, HO2Sat]

    # Take random element from df_for_ID.index
    chosen_idx_list = np.random.choice(df_for_ID.index, 1)

    for i in chosen_idx_list:  # df_for_ID.iterrows():
        if i == idx_max_FEV1:
            print(
                f"Max FEV1 entry found (index {i}), skipping to avoid pairing the max FEV1 day with itself"
            )
            continue
        df_two_days = df_for_ID.iloc[[i, idx_max_FEV1]]

        # Approximate inference
        df_query_res, df_res_before_convergence, shared_vars_final = (
            slicing.query_forwardly_across_days(
                df_two_days,
                inf_alg_approx,
                shared_vars_approx,
                vars_approx,
                obs_var_names_approx,
                1e-8,
                debug=False,
                auto_reset_shared_vars=True,
            )
        )
        hfev1_approx = df_query_res.loc[0, HFEV1.name]
        ho2sat_approx = df_query_res.loc[0, HO2Sat.name]

        # Exact inference
        # Re-index once: row 0 is the chosen day, row 1 the max FEV1 day
        df_two_days_reindexed = df_two_days.reset_index()
        fev1 = df_two_days_reindexed.loc[0, "ecFEV1"]
        fev1_max = df_two_days_reindexed.loc[1, "ecFEV1"]
        o2sat = df_two_days_reindexed.loc[0, "O2 Saturation"]
        o2sat_max = df_two_days_reindexed.loc[1, "O2 Saturation"]
        obs_vars = [
            [ecFEV1, fev1],
            [O2Sat, o2sat],
            [ecFEV1_2, fev1_max],
            [O2Sat_2, o2sat_max],
        ]

        res_exact = ih.infer(
            inference_alg=inf_alg_exact,
            variables=shared_vars_exact,
            evidence=obs_vars,
            debug=False,
        )
        hfev1_exact = res_exact[HFEV1_exact.name].values
        ho2sat_exact = res_exact[HO2Sat.name].values

        # Add row to df_res
        row = {
            "ID": df_for_ID.loc[i, "ID"],
            "Date Recorded": df_for_ID.loc[i, "Date Recorded"],
            "Age": df_for_ID.loc[i, "Age"],
            "Height": df_for_ID.loc[i, "Height"],
            "Sex": df_for_ID.loc[i, "Sex"],
            "HFEV1 exact inf": hfev1_exact,
            "HFEV1 approx inf": hfev1_approx,
            "HO2Sat exact inf": ho2sat_exact,
            "HO2Sat approx inf": ho2sat_approx,
        }

        # Single-row frame with index label 0, so that every row of df_res keeps
        # the label 0 (other cells look rows up with .loc[0, ...])
        res_frames.append(pd.Series(row).to_frame().T)

df_res = pd.concat(res_frames) if res_frames else pd.DataFrame({})

In [None]:
# Metrics to compare the models


# Kullback–Leibler divergence
# it is the expectation of the logarithmic difference between the probabilities P and Q, where the expectation is taken using the probabilities P.
def calc_kullback_leibler_divergence(p, q):
    """
    Kullback-Leibler divergence of P from Q: sum(p * log(p / q)).

    p and q are array-likes of probabilities with the same shape. Entries that are
    exactly 0 are replaced by 1e-20 in both distributions before the computation, to
    avoid log(0) and divisions by zero. The inputs themselves are not modified.
    """
    # Work on float copies: the replacement of the zeros below must not leak into
    # the caller's arrays
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    q[q == 0] = 1e-20
    p[p == 0] = 1e-20
    return np.sum(np.log(p / q) * p)


def calc_relative_diff(exact, approx):
    """
    Sum over all entries of the absolute relative difference (in %) between exact
    and approx, relative to exact.

    Entries with an absolute value below 0.0001 are treated as 0 in both inputs.
    Where exact is below that threshold, the difference is divided by 1 instead, to
    keep the division safe. The inputs are not modified.
    """
    negligible_exact = np.abs(exact) < 0.0001
    negligible_approx = np.abs(approx) < 0.0001

    exact_clipped = exact.copy()
    exact_clipped[negligible_exact] = 0
    approx_clipped = approx.copy()
    approx_clipped[negligible_approx] = 0

    # Safe division: divide by 1 wherever the exact value was clipped to 0
    safe_denominator = exact_clipped.copy()
    safe_denominator[negligible_exact] = 1

    relative_diff_prct = (exact_clipped - approx_clipped) / safe_denominator * 100
    return np.sum(np.abs(relative_diff_prct))


def max_diff(exact, approx):
    """
    Largest absolute element-wise difference between exact and approx.

    In the end I will just apply the max diff because 1/ it's easiest to interpret,
    and 2/ it reveals the changes in the highest values, which is what we are interested in

    exact and approx are array-likes of the same shape. NaN differences propagate to
    the result instead of being silently kept or dropped depending on their position.
    """
    # np.max rather than the builtin max: the builtin iterates element by element,
    # which makes the result depend on where a NaN sits in the array
    return np.max(np.abs(np.asarray(exact) - np.asarray(approx)))


# One "<variable> max diff" column per shared variable: the max absolute difference
# between its exact and approximate posteriors, computed row by row
for _node in ("HFEV1", "HO2Sat"):
    df_res[f"{_node} max diff"] = df_res.apply(
        lambda row, node=_node: max_diff(
            row[f"{node} exact inf"], row[f"{node} approx inf"]
        ),
        axis=1,
    )

# df1.loc[df_for_ID.loc[i, "index"], "HFEV1 relative diff"] = calc_relative_diff(
#     hfev1_exact, hfev1_approx
# )
# df1.loc[df_for_ID.loc[i, "index"], "HO2Sat relative diff"] = calc_relative_diff(
#     ho2sat_exact, ho2sat_approx
# )

# kl_div_hfev1 = calc_kullback_leibler_divergence(hfev1_exact, hfev1_approx)
# kl_div_ho2sat = calc_kullback_leibler_divergence(ho2sat_exact, ho2sat_approx)
# df1.loc[df_for_ID.loc[i, "index"], "HFEV1 KL div"] = kl_div_hfev1
# df1.loc[df_for_ID.loc[i, "index"], "HO2Sat KL div"] = kl_div_ho2sat

In [None]:
df_res.sort_values(by="HFEV1 max diff", ascending=False)

In [None]:
# The "HFEV1 max diff" column is computed on df_res (the code writing the metrics
# to df1 is commented out), so the ranking has to be done on df_res
df_res.sort_values(by="HFEV1 max diff", ascending=False).head(10)
# df1.sort_values(by="HFEV1 relative diff", ascending=False).head(10)
# df1.sort_values(by="HFEV1 KL div", ascending=False).head(10)
# The same 8 entries are in the top 10. Meaning that both are similar indicators of the difference between the two models
# The difference is small

In [None]:
# Spot cases where the error is high
# The lower the value, the higher the error -> remove the lowest probabilities

# N days, exact vs approximate inference

In [None]:
# Exact inference: I need a modular way to create Bayesian Networks - that get bigger with the number of days
# Approx inference: just add more days as input

In [3]:
# Obj: study HFEV1, HO2Sat approx posteriors deviation from exact value with increasing number of days
# Uses model with FEV1, O2Sat

n_days_consec_max = 3
with_max_FEV1_after_first_round = True
with_max_FEV1_at_first_round = False

df_diffs = pd.DataFrame({})
first_run = True
debug = False
# Largest number of days used as evidence in any round: the consecutive days, plus
# the max FEV1 day when it is appended
n_days_total_max = (
    n_days_consec_max + 1 if with_max_FEV1_after_first_round else n_days_consec_max
)

with_max_FEV1 = with_max_FEV1_at_first_round


def max_diff(exact, approx):
    """
    Largest absolute element-wise difference between exact and approx.

    In the end I will just apply the max diff because 1/ it's easiest to interpret,
    and 2/ it reveals the changes in the highest values, which is what we are interested in
    """
    # np.max rather than the builtin max, whose result depends on where a NaN sits
    return np.max(np.abs(np.asarray(exact) - np.asarray(approx)))


# Run the first twice, once without the max FEV1 index, then with
for n_days_consec in [1] + list(range(1, n_days_consec_max + 1)):
    # for n_days_consec in list(range(1, n_days_consec_max + 1)):
    # One single-row frame per processed ID; concatenated once after the ID loop
    res_frames = []
    n_days_total = n_days_consec + 1 if with_max_FEV1 else n_days_consec
    print("days concse", n_days_consec, "days total", n_days_total)
    if debug:
        print(f"{n_days_total} days, {n_days_consec} consecutive days")

    for id in df.ID.unique():
        # for id in ["101", "102"]:
        if debug:
            print(f"ID: {id}")
        df_for_ID = df[df["ID"] == id].reset_index()

        # Set values
        height = df_for_ID.Height.iloc[0]
        age = df_for_ID.Age.iloc[0]
        sex = df_for_ID.Sex.iloc[0]

        # After reset_index() the labels equal the positions, so this label can be
        # compared with positions and used with .iloc below
        idx_max_FEV1 = df_for_ID.sort_values(
            by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
        ).index[0]

        # If df length is less than n_days, skip
        if len(df_for_ID) < n_days_total_max:
            print(f"Too few data, skipping ID {id}")
            continue

        # If idx_max_FEV1 is in the first n_days, skip
        if idx_max_FEV1 < n_days_total_max:
            print(f"Max FEV1 in the first {n_days_total_max} days, skipping ID {id}")
            continue

        # Variable listing used the set up parameters, not to infer
        (
            _,
            inf_alg_approx,
            HFEV1,
            ecFEV1,
            AR,
            HO2Sat,
            O2SatFFA,
            IA,
            UO2Sat,
            O2Sat,
        ) = mb.o2sat_fev1_point_in_time_model_shared_healthy_vars_light(
            height, age, sex
        )

        # Set variables parametrisation
        key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
        key_ho2sat = (
            f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
        )
        HFEV1.set_factor_node_key(key_hfev1)
        HO2Sat.set_factor_node_key(key_ho2sat)

        # Set inputs for approximate inference
        shared_vars_approx = [HFEV1, HO2Sat]
        vars_approx = []
        obs_var_names_approx = [ecFEV1.name, O2Sat.name]

        # NOTE(review): the two-days builder is unpacked elsewhere in this notebook
        # as (..., HO2Sat, ecFEV1, AR, ...), whereas this one is unpacked as
        # (..., HO2Sat, AR_vars, ecFEV1_vars, ...). Confirm the return order of
        # mb.o2_sat_fev1_n_days_model_light - the saved output of this cell ends
        # with an IndexError whose origin is not visible from here.
        (
            model_exact,
            inf_alg_exact,
            HFEV1,
            HO2Sat,
            AR_vars,
            ecFEV1_vars,
            O2SatFFA_vars,
            IA_vars,
            UO2Sat_vars,
            O2Sat_vars,
        ) = mb.o2_sat_fev1_n_days_model_light(n_days_total, height, age, sex)
        shared_vars_exact = [HFEV1, HO2Sat]

        # Take random element from df_for_ID.index
        # Re-seeding for every ID makes the selected start index depend only on the
        # ID's data, not on the IDs processed before
        np.random.seed(0)

        chosen_idx_sequence = [idx_max_FEV1]
        # !!! BEWARE THIS IS SEQUENTIAL INDICES, NOT SEQUENTIAL DAYS !!!
        # Redraw until the sequence does not contain the max FEV1 entry
        while idx_max_FEV1 in chosen_idx_sequence:
            chosen_idx_sequence = np.random.choice(
                list(range(0, len(df_for_ID) - n_days_total)), 1
            )
            # For each entry in idx_list, add consecutive idx to get n_days
            chosen_idx_sequence = list(
                range(chosen_idx_sequence[0], chosen_idx_sequence[0] + n_days_consec)
            )

        if with_max_FEV1:
            df_n_days = df_for_ID.iloc[chosen_idx_sequence + [idx_max_FEV1]]
        else:
            df_n_days = df_for_ID.iloc[chosen_idx_sequence]

        # Approximate inference
        df_query_res, df_res_before_convergence, shared_vars_final = (
            slicing.query_forwardly_across_days(
                df_n_days,
                inf_alg_approx,
                shared_vars_approx,
                vars_approx,
                obs_var_names_approx,
                1e-8,
                debug=False,
                auto_reset_shared_vars=True,
            )
        )
        hfev1_approx = df_query_res.loc[0, HFEV1.name]
        ho2sat_approx = df_query_res.loc[0, HO2Sat.name]

        # Exact inference
        # Re-index once so that row j holds the observations given to the j-th set
        # of variables of the exact model
        df_n_days_reindexed = df_n_days.reset_index()
        obs_vars_exact = []
        for j in range(n_days_total):
            ecfev1_obs = df_n_days_reindexed.loc[j, "ecFEV1"]
            o2sat_obs = df_n_days_reindexed.loc[j, "O2 Saturation"]
            obs_vars_exact += [
                [ecFEV1_vars[j], ecfev1_obs],
                [O2Sat_vars[j], o2sat_obs],
            ]

        res_exact = ih.infer(
            inference_alg=inf_alg_exact,
            variables=shared_vars_exact,
            evidence=obs_vars_exact,
            debug=False,
        )
        hfev1_exact = res_exact[HFEV1.name].values
        ho2sat_exact = res_exact[HO2Sat.name].values

        # Add row to df_res
        row = {
            "ID": df_for_ID.loc[chosen_idx_sequence[0], "ID"],
            "Day 1": df_for_ID.loc[chosen_idx_sequence[0], "Date Recorded"],
            f"{n_days_total} days HFEV1 exact inf": hfev1_exact,
            f"{n_days_total} days HFEV1 approx inf": hfev1_approx,
            f"{n_days_total} days HO2Sat exact inf": ho2sat_exact,
            f"{n_days_total} days HO2Sat approx inf": ho2sat_approx,
        }

        # Single-row frame with index label 0: every row of df_res keeps the label
        # 0, which the column-wise concat into df_diffs below and the .loc[0, ...]
        # lookups in other cells rely on
        res_frames.append(pd.Series(row).to_frame().T)

    df_res = pd.concat(res_frames) if res_frames else pd.DataFrame({})

    # Max absolute difference between the exact and approximate posteriors,
    # computed once per round after all IDs have been processed
    df_res[f"{n_days_total} days HFEV1 max diff"] = df_res.apply(
        lambda row: max_diff(
            row[f"{n_days_total} days HFEV1 exact inf"],
            row[f"{n_days_total} days HFEV1 approx inf"],
        ),
        axis=1,
    )
    df_res[f"{n_days_total} days HO2Sat max diff"] = df_res.apply(
        lambda row: max_diff(
            row[f"{n_days_total} days HO2Sat exact inf"],
            row[f"{n_days_total} days HO2Sat approx inf"],
        ),
        axis=1,
    )

    if first_run:
        df_diffs = df_res[
            [
                "ID",
                "Day 1",
                f"{n_days_total} days HFEV1 exact inf",
                f"{n_days_total} days HFEV1 approx inf",
                f"{n_days_total} days HFEV1 max diff",
                f"{n_days_total} days HO2Sat max diff",
            ]
        ]
        first_run = False
        # From the second round on, the max FEV1 day is appended if requested
        with_max_FEV1 = with_max_FEV1_after_first_round
    else:
        # Merge df_res with df_diffs
        df_diffs = pd.concat(
            [
                df_diffs,
                df_res[
                    [
                        f"{n_days_total} days HFEV1 exact inf",
                        f"{n_days_total} days HFEV1 approx inf",
                        f"{n_days_total} days HFEV1 max diff",
                        f"{n_days_total} days HO2Sat max diff",
                    ]
                ],
            ],
            axis=1,
        )

days concse 1 days total 1


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

### Create and save full df_diffs

In [None]:
# Columns of the measurements table to attach to each row of df_diffs
right_cols = [
    "ecFEV1",
    "O2 Saturation",
    "idx ecFEV1 (L)",
    "idx O2 saturation (%)",
    "Max ecFEV1",
    "ID",
    "Date Recorded",
    "Age",
    "Sex",
    "Height",
]
df_diffs1 = df_diffs
# Match each df_diffs row with the measurement of the same ID recorded on its
# "Day 1", then drop the duplicated date column
df_diffs = pd.merge(
    df_diffs1,
    df[right_cols],
    left_on=["ID", "Day 1"],
    right_on=["ID", "Date Recorded"],
).drop(columns=["Date Recorded"])

In [None]:
def _idx_of_max_ecfev1(row):
    # Second element returned by ih.get_bin_for_value for the row's max ecFEV1 -
    # presumably the index of the ecFEV1 bin the value falls in (TODO confirm)
    return ih.get_bin_for_value(row["Max ecFEV1"], ecFEV1)[1]


df_diffs["idx Max ecFEV1"] = df_diffs.apply(_idx_of_max_ecfev1, axis=1)

In [None]:
# if merged two times
# df_diffs = df_diffs.drop(columns=['ecFEV1_x',
#        'O2 Saturation_x', 'idx ecFEV1 (L)_x', 'idx O2 saturation (%)_x',
#        'Age_x', 'Sex_x', 'Height_x'])
# Rename columns: strip the "_y" suffix from the columns listed below
df_diffs = df_diffs.rename(
    columns={
        f"{col}_y": col
        for col in (
            "ecFEV1",
            "O2 Saturation",
            "idx ecFEV1 (L)",
            "idx O2 saturation (%)",
            "Age",
            "Sex",
            "Height",
        )
    }
)

In [None]:
# df_diffs.to_excel(
#     dh.get_path_to_main()
#     + "ExcelFiles/BR/n_days_model_approx_vs_exact_inference_diffs - with max FEV1.xlsx",
#     index=False,
# )

In [None]:
# Load diff data
# Load the saved diff data. The HFEV1 exact/approx posterior columns of the 1 to 4
# days models are listed in `str_cols_to_arrays`.
df_diffs = bd.load_meas_from_excel(
    "n_days_model_approx_vs_exact_inference_diffs - with max FEV1",
    bypass_sanity_checks=True,
    str_cols_to_arrays=[
        f"{n_days} days HFEV1 {inf_kind} inf"
        for n_days in range(1, 5)
        for inf_kind in ("exact", "approx")
    ],
    date_cols=["Day 1"],
)

### Stability of the approximate inference

In [None]:
# Create boxplots for each n days models
import plotly.express as px

var = "HO2Sat"
# var = "HFEV1"

maxfev1 = " (max FEV1)" if with_max_FEV1_after_first_round else ""

ref_cols = ["ID", "Day 1", "Age", "Sex", "Height"]

col_to_plot = ref_cols + [
    f"{n_days} days {var} max diff" for n_days in range(1, n_days_total_max + 1)
]
new_col_names = (
    ref_cols
    + ["1 day"]
    + [f"{n_days} days{maxfev1}" for n_days in range(2, n_days_total_max + 1)]
)

df_diffs_plot = df_diffs[col_to_plot]
# Rename columns
df_diffs_plot.columns = new_col_names

df_diffs_melted = df_diffs_plot.melt(
    # id_vars=["ID", "Day 1", "Day 1 idx", "Age", "Sex", "Height"],
    id_vars=ref_cols,
    value_vars=new_col_names,
    var_name="N days model",
    value_name=f"Max absolute difference for {var}",
)

title = f"Exact vs approximate inference for {var} ({len(df_diffs_plot)} datapoints)"

fig = px.box(
    df_diffs_melted,
    x="N days model",
    y=f"Max absolute difference for {var}",
    color="N days model",
    title=title,
    hover_data=ref_cols,
)
# fig.update_yaxes(type="log")
fig.update_layout(showlegend=False, width=800, height=400)
fig.update_yaxes(tickvals=[1e-4, 0.001, 0.003, 0.005, 0.007])
fig.show()
fig.write_image(
    dh.get_path_to_main() + f"PlotsBreathe/N days model/{title}{maxfev1}.pdf"
)

NameError: name 'with_max_FEV1_after_first_round' is not defined

### Why are some values 1000x higher?

In [None]:
id = "331"
# Rows of df_res for this ID; `a` / `e` hold the approximate / exact 4 days HFEV1
# posteriors of the row labelled 0
df_res_for_id = df_res[df_res.ID == id]
a = df_res_for_id.loc[0, "4 days HFEV1 approx inf"]
e = df_res_for_id.loc[0, "4 days HFEV1 exact inf"]
df_res_for_id

Unnamed: 0,ID,Day 1,4 days HFEV1 exact inf,4 days HFEV1 approx inf,4 days HO2Sat exact inf,4 days HO2Sat approx inf,4 days HFEV1 max diff,4 days HO2Sat max diff
0,331,2022-06-24,"[0.0, 0.9692460478872158, 0.0307539506935627, ...","[0.0, 0.969246048778613, 0.030753949802166513,...","[0.0, 0.0, 1.3200912938369406e-19, 4.288186805...","[0.0, 0.0, 1.320066007473432e-19, 4.2880918256...",8.913972e-10,9.497991e-10


In [None]:
# Select the measurement of the current `id` recorded on 2021-06-28.
# NOTE(review): this comment used to mention ID 122 and the date 2021-12-28, which
# matches neither the date below nor necessarily the current value of `id` -
# confirm which entry was meant.
# NOTE(review): the comparison assumes "Date Recorded" holds datetime.date values -
# TODO confirm.
import datetime

df[(df.ID == id) & (df["Date Recorded"] == datetime.date(2021, 6, 28))]
# Find the max FEV1 index for the current id
# df[df.ID == id].sort_values(by=["ecFEV1", "O2 Saturation"], ascending=False).iloc[0:2]

In [None]:
# Rows looked at while investigating the differences, selected by position in df.
# Only the last expression of the cell is displayed; the other df.iloc[...] lines
# are evaluated and discarded.
# NOTE(review): the positions are hard-coded, so they only point to the intended
# rows for the exact dataframe they were read from - confirm after any change to df.
# highest diffs
# 122
df.iloc[[6258, 6259, 6260, 6120]]
# 232
df.iloc[[21815, 21816, 21817, 21819]]

# 0 diff (if max ecFEV1 is > 5L)
df.iloc[[4320, 4321, 4322, 4106]]

# smallest nonzero diff
# 331
df.iloc[[31307, 31308, 31309, 31205]]
# 261
df.iloc[[25566, 25567, 25568, 25573]]

# When max FEV1 is over 5 L, then probability of HFEV1 in range 5-6L is 1
# Is the difference between the two models biggest when HFEV1 is most uncertain? -> when one bin overly wins, the max diff is low

# The approximate model is most accurate when the exact answer has strong beliefs for one bin

In [None]:
# Print the exact and approximate HFEV1 posteriors of ID 122 with 2 decimals,
# together with their absolute and relative differences
np.set_printoptions(precision=2)
df_res_122 = df_res[df_res.ID == "122"]
e = df_res_122.loc[0, "HFEV1 exact inf"]
a = df_res_122.loc[0, "HFEV1 approx inf"]
abs_diff = np.abs(e - a)
print("exact inference ", e)
print("approx inference", a)
print("absolute diff   ", abs_diff)
print("relative diff (%)", abs_diff / e * 100)
# How to print array with 2 decimal values only

In [None]:
# Entries with a non-zero HFEV1 max diff for the model using the most days, smallest
# first. The same column is used for the filter and for the sort (the filter used to
# be hard-coded to the "4 days" column).
max_diff_col = f"{n_days_total_max} days HFEV1 max diff"
df_diffs[df_diffs[max_diff_col] != 0].sort_values(by=max_diff_col, ascending=True)
# df_diffs.drop(columns=["1 days HFEV1 max diff", "1 days HO2Sat max diff"])

#### Does the HFEV1 diff correlate with max ecFEV1?

In [None]:
# Does HFEV1 diff correlates with high max ecFEV1?
# Box plot of the 4 days model max diff, grouped by the values of `idx_col`
var = "HFEV1"
idx_col = "idx Max ecFEV1"
# idx_col = "idx ecFEV1 (L)"
title = "Does the HFEV1 diff correlates with max ecFEV1 values?"

fig = px.box(
    df_diffs,
    x=idx_col,
    y=f"4 days {var} max diff",
    # color="N days model",
    # title=title,
    # hover_data=ref_cols,
)

# Number of entries per value of idx_col, shown next to each x tick label
val_counts = df_diffs.value_counts(idx_col).sort_index()
xaxislabels = [
    f"{idx_value} (#{count})"
    for idx_value, count in zip(val_counts.index, val_counts.values)
]
# Update x axis labels
fig.update_xaxes(tickvals=val_counts.index, ticktext=xaxislabels)
# Add 10e-4 p on y axis label
fig.update_yaxes(tickvals=[0.0001, 0.001, 0.003, 0.005, 0.007])
fig.update_layout(title=title, showlegend=False, width=800, height=400)
fig.show()
print("Value counts for day 1 idx")

# Save image
fig.write_image(dh.get_path_to_main() + f"PlotsBreathe/N days model/{title}.pdf")

Value counts for day 1 idx


#### Further analysis within the 2nd FEV1 bin

In [None]:
# Entries whose max ecFEV1 falls in the bin of index 1. Copied so that columns can be
# added to this subset later without pandas' SettingWithCopyWarning.
df_diffs_2nd_bin = df_diffs[df_diffs["idx Max ecFEV1"] == 1].copy()

In [None]:
# Patient characteristics, read from the first row of df_diffs
height = df_diffs.Height.iloc[0]
age = df_diffs.Age.iloc[0]
sex = df_diffs.Sex.iloc[0]
# Build the two-days model and unpack the model, its inference algorithm and its
# variables. The unpacking is positional, so the order of the names below must
# match the builder's return order.
# Names with the `_2` suffix presumably belong to the second day - TODO confirm
# against mb.o2_sat_fev1_two_days_model_light.
(
    model,
    inf_alg,
    HFEV1,
    HO2Sat,
    ecFEV1,
    AR,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    # ecFEF2575prctecFEV1,
    ecFEV1_2,
    AR_2,
    O2SatFFA_2,
    IA_2,
    UO2Sat_2,
    O2Sat_2,
    # ecFEF2575prctecFEV1_2,
) = mb.o2_sat_fev1_two_days_model_light(height, age, sex)

In [None]:
# Show avg exact FEV1 above and below 0.001 of diff probability
# Flag the entries whose 4 days HFEV1 max diff reaches 0.001
is_outlying = df_diffs_2nd_bin["4 days HFEV1 max diff"] >= 0.001
# assign() returns a new frame, so no value is set on a slice of df_diffs
# (this is what triggered pandas' SettingWithCopyWarning)
df_diffs_2nd_bin = df_diffs_2nd_bin.assign(**{"Outlying max diff": is_outlying})

# Compute average 4 days exact FEV1 for below and above 0.001 diff
exact_posteriors = df_diffs_2nd_bin["4 days HFEV1 exact inf"]
avg_below = exact_posteriors[~is_outlying].mean()
n_below = int((~is_outlying).sum())
avg_above = exact_posteriors[is_outlying].mean()
n_above = int(is_outlying.sum())

# Plot those two averages on a histogram
title = "What characterises the outlying HFEV1 max diff values?<br>Comparing the HFEV1 exact posteriors averages for 2 groups<br> given max ecFEV1 in bin 1-2L, using the 4 days model"
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1)
# Row 1: average posterior of the entries at or above the threshold
ih.plot_histogram(
    fig,
    HFEV1,
    avg_above,
    HFEV1.a,
    HFEV1.b,
    1,
    1,
    name=f"Above 0.001 (#{n_above})",
    annot=False,
)
# Row 2: average posterior of the entries below the threshold
ih.plot_histogram(
    fig,
    HFEV1,
    avg_below,
    HFEV1.a,
    HFEV1.b,
    2,
    1,
    name=f"Below 0.001 (#{n_below})",
    annot=False,
)
# Add legend title
fig.update_layout(legend_title_text="Absolute max difference:")
fig.update_xaxes(title_standoff=1, title=HFEV1.name, row=2, col=1)
fig.update_layout(
    width=600, height=300, showlegend=True, font=dict(size=9), title=title
)
fig.show()
# Write image
fig.write_image(dh.get_path_to_main() + f"PlotsBreathe/N days model/{title}.pdf")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Within the 2nd max ecFEV1 bin: does the HFEV1 diff correlate with `idx_col`?
var = "HFEV1"
# Alternative x axes:
# idx_col = "idx Max ecFEV1"
# idx_col = "idx ecFEV1 (L)"
idx_col = "idx O2 saturation (%)"
# The title names the variable actually plotted on the x axis
title = f"Does the HFEV1 diff correlate with {idx_col} (2nd max ecFEV1 bin)?"

fig = px.box(
    df_diffs_2nd_bin,
    x=idx_col,
    y=f"4 days {var} max diff",
    # color="N days model",
    # title=title,
    # hover_data=ref_cols,
)
# Count the entries of the plotted frame (df_diffs_2nd_bin, not the full df_diffs),
# so that the counts in the tick labels match the boxes
val_counts = df_diffs_2nd_bin.value_counts(idx_col).sort_index()
xaxislabels = list(
    map(lambda x: f"{x[0]} (#{x[1]})", zip(val_counts.index, val_counts.values))
)
# Update x axis labels
fig.update_xaxes(tickvals=val_counts.index, ticktext=xaxislabels)
# Add 10e-4 p on y axis label
fig.update_yaxes(tickvals=[0.0001, 0.001, 0.003, 0.005, 0.007])
fig.update_layout(title=title, showlegend=False, width=800, height=400)
fig.show()
print("Value counts for day 1 idx")

# Save image
# fig.write_image(dh.get_path_to_main() + f"PlotsBreathe/N days model/{title}.pdf")

Value counts for day 1 idx


## For how many days does the inference run in finite time?

In [None]:
# Run a single exact inference query. `inf_alg_exact`, `shared_vars_exact` and
# `obs_vars_exact` are not defined in this cell: they are reused as left in the
# kernel by an earlier cell, so the size of the model queried here depends on what
# was run last.
res_exact = ih.infer(
    inference_alg=inf_alg_exact,
    variables=shared_vars_exact,
    evidence=obs_vars_exact,
    debug=False,
)