In [1]:
import pandas as pd
import numpy as np

import data.breathe_data as bd
import data.helpers as dh
import models.builders as mb
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import inference.helpers as ih

In [2]:
# The observation indices were checked to be correct; see the ipynb mentioned above (01.05.2025).
df = bd.load_meas_from_excel(
    "BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx",
)

INFO:root:* Checking for same day measurements *


In [3]:
# Load the results saved in "infer_AR_using_fev1_01052025", discard HO2Sat and tag
# the AR / HFEV1 columns with a "_fev1" suffix so they stay distinguishable after merging.
df_fev1 = (
    bd.load_meas_from_excel(
        "infer_AR_using_fev1_01052025",
        ["AR", "HFEV1", "HO2Sat"],
        bypass_sanity_checks=True,
    )
    .drop(columns=["HO2Sat"])
    .rename(columns={"AR": "AR_fev1", "HFEV1": "HFEV1_fev1"})
)

In [4]:
# Load the results saved in "infer_AR_using_o2sat_fev1_01052025", discard IA and HO2Sat
# and tag the AR / HFEV1 columns with a "_fev1_o2" suffix.
df_fev1_o2 = (
    bd.load_meas_from_excel(
        "infer_AR_using_o2sat_fev1_01052025",
        ["AR", "HFEV1"],
        bypass_sanity_checks=True,
    )
    .drop(columns=["IA", "HO2Sat"])
    .rename(columns={"AR": "AR_fev1_o2", "HFEV1": "HFEV1_fev1_o2"})
)

In [5]:
# Load the results saved in "infer_AR_using_two_days_model_o2_fev1_01052025".
# This file uses long column names and a "Day" date column, so they are renamed to the
# "_fev1_o2_2d"-suffixed short names and to "Date Recorded", the key used by the merges
# later in this notebook.
df_fev1_o2_2d = (
    bd.load_meas_from_excel(
        "infer_AR_using_two_days_model_o2_fev1_01052025",
        ["Airway resistance (%)", "Healthy FEV1 (L)"],
        date_cols=["Day"],
        bypass_sanity_checks=True,
    )
    .drop(columns=["Healthy O2 saturation (%)"])
    .rename(
        columns={
            "Airway resistance (%)": "AR_fev1_o2_2d",
            "Healthy FEV1 (L)": "HFEV1_fev1_o2_2d",
            "Day": "Date Recorded",
        }
    )
)

In [6]:
# Attach the three sets of inference results to the measurements.
# Every step is an inner join on (ID, Date Recorded), so only records that are
# present in all four frames survive.
cols = [
    "ID",
    "Date Recorded",
    "ecFEV1 % Predicted",
    "O2 Saturation",
    "Height",
    "Age",
    "Sex",
]
merge_keys = ["ID", "Date Recorded"]
df_res = (
    df[cols]
    .merge(df_fev1, on=merge_keys, how="inner")
    .merge(df_fev1_o2, on=merge_keys, how="inner")
    .merge(df_fev1_o2_2d, on=merge_keys, how="inner")
)

In [7]:
# Add the number of records per ID as a "count" column.
# GroupBy.size() is the built-in for this; it avoids running a Python lambda per group
# through GroupBy.apply.
df_count = df_res.groupby("ID").size().reset_index(name="count")
# Drop a "count" column left over from a previous run of this cell so that re-running
# it does not produce count_x / count_y.
df_res = pd.merge(
    df_res.drop(columns=["count"], errors="ignore"),
    df_count,
    on=["ID"],
)

In [8]:
# Add each ID's maximum ecFEV1 % Predicted, and every record's shortfall from that
# maximum ("FEV delta").
# The built-in GroupBy.max() replaces a per-group lambda run through GroupBy.apply.
df_max_fev = (
    df_res.groupby("ID")["ecFEV1 % Predicted"]
    .max()
    .reset_index(name="max ecFEV1 % Predicted")
)
# Drop the column from a previous run of this cell so that re-running stays idempotent.
df_res = pd.merge(
    df_res.drop(columns=["max ecFEV1 % Predicted"], errors="ignore"),
    df_max_fev,
    on=["ID"],
)
df_res["FEV delta"] = df_res["max ecFEV1 % Predicted"] - df_res["ecFEV1 % Predicted"]

In [9]:
# Add each ID's average O2 saturation.
# The built-in GroupBy.mean() replaces a per-group lambda run through GroupBy.apply.
df_avg_o2 = (
    df_res.groupby("ID")["O2 Saturation"]
    .mean()
    .reset_index(name="Avg O2 saturation")
)
# Drop the column from a previous run of this cell so that re-running stays idempotent.
df_res = pd.merge(
    df_res.drop(columns=["Avg O2 saturation"], errors="ignore"),
    df_avg_o2,
    on=["ID"],
)

In [10]:
# df_res.sort_values(by="FEV delta", ascending=False)[0:5]
# df_res.sort_values(by=["ecFEV1 % Predicted", "O2 Saturation"], ascending=True)[0:5]

# For each ID, the df_res index label of the record with the lowest ecFEV1 % Predicted.
indices_max_avg_o2_min_ecfev1 = (
    df_res.groupby("ID")
    .apply(
        lambda grp: grp.reset_index()
        .sort_values(by=["ecFEV1 % Predicted"], ascending=True)
        .iloc[0]
    )["index"]
    .values
)
# Index labels of records whose ID has more than 30 records / an average O2 saturation below 94.
indices_over_30_datapoints = df_res[df_res["count"] > 30].index
indices_max_avg_o2_below_94 = df_res[df_res["Avg O2 saturation"] < 94].index

# Despite its name, `mask` is a list of index labels (not a boolean mask):
# the records that satisfy all three criteria above.
mask = list(
    set(indices_max_avg_o2_min_ecfev1)
    .intersection(indices_max_avg_o2_below_94)
    .intersection(indices_over_30_datapoints)
)
# These are labels, so select with .loc. (.iloc would interpret them as positions, which
# only gives the same rows while df_res has a default 0..n-1 index.)
df_res.loc[mask].sort_values(
    by=["Avg O2 saturation", "ecFEV1 % Predicted"], ascending=[True, True]
).index

Index([3448, 33235, 36519, 35294, 12396], dtype='int64')

# Plot examples of narrowed in posteriors

In [54]:
# Index labels of the first five df_res_last_30 rows that belong to ID "101".
df_res_last_30.loc[df_res_last_30["ID"] == "101"].head(5).index

Index([1650, 1651, 1652, 1653, 1654], dtype='int64')

In [55]:
# i = 2955
# fev1 and o2: see impact of very low o2
# Impact of min o2 shouldn't get propagated back to hfev1 because it's more due to sudden infection rather than permanent lung damage
# indices = [6311, 35304]
# However, avg o2 is low (or max o2) is relevant. -  Avg o2 sat < 94, min ecfev1, > 30 datapoints
# indices = [35294]
# Unexpectedly small lungs. Below 30% of ecFEV1 % Predicted (life threatening), expects lower max o2, hence healthy oxygenation must be due to smaller lungs. Ex: o2 = 100, min and max ecfev1.
# → use app
# Exclude o2 from longitudinal model due to calibration issues

# Shared hfev1/ar uncertainty: small healthy lungs or big lungs with disease
# Unexpectedly big lungs with disease: high fev1 % pred delta (benefit of 2nd day)
# indices = [18452, 33198, 2330, 2975, 18430]
# Unexpectedly small lungs: low fev1 % pred but no permanent lung damage (high fef2575%fev1) → show without fef2575 then with

# Super high fev1
# indices = [37113]

# Index labels of the df_res records to plot.
indices = [1650, 1651, 1652, 1653, 1654]

# NOTE: add_text_box (used below) is defined in a later cell of this notebook;
# that cell has to be run before this one.

# Grid column of the first histogram; column 1 is kept for text.
first_data_col = 2


def add_fev1_prct_pred_line(fig, val, y_max, row, col):
    """Draw a dashed black vertical line at x=val in subplot (row, col).

    The line runs from y=0 up to 10% above y_max.
    """
    fig.add_shape(
        type="line",
        x0=val,
        y0=0,
        x1=val,
        y1=y_max * 1.1,
        line=dict(color="black", width=2, dash="dash"),
        row=row,
        col=col,
    )
    # Return value kept as-is; the call sites in this cell ignore it.
    return -1


def add_hists(fig, hfev1_col, ar_col, i, row):
    """Fill one grid row with the HFEV1 and AR histograms of record i.

    hfev1_col / ar_col name the df_res columns holding the values to plot.
    HFEV1, AR and first_data_col are module-level names looked up at call time,
    i.e. the variables built for the current record by the loop below.
    A dashed line at max(100 - ecFEV1 % Predicted, 0) is added to the AR panel.
    """
    ih.plot_histogram(
        fig,
        HFEV1,
        df_res.loc[i, hfev1_col],
        HFEV1.a,
        HFEV1.b,
        row,
        first_data_col,
        f" {hfev1_col}",
        "#009e73",
    )
    ih.plot_histogram(
        fig,
        AR,
        df_res.loc[i, ar_col],
        AR.a,
        AR.b,
        row,
        first_data_col + 1,
        f" {ar_col}",
        "#d55e00",
        clean_ticks=True,
    )
    add_fev1_prct_pred_line(
        fig,
        max(100 - df_res.loc[i, "ecFEV1 % Predicted"], 0),
        max(df_res.loc[i, ar_col]),
        row,
        first_data_col + 1,
    )


for i in indices:
    print(i)
    # `i` is a df_res index label, so every lookup goes through df_res.loc:
    # reading the ID from `df` or using .iloc is only equivalent while both
    # frames share the same default 0..n-1 index.
    id_ = df_res.loc[i, "ID"]
    (
        _,
        inf_alg,
        HFEV1,
        uecFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEF2575prctecFEV1,
    ) = mb.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars(
        df_res.loc[i, "Height"],
        df_res.loc[i, "Age"],
        df_res.loc[i, "Sex"],
        ia_prior="uniform",
        ar_prior="uniform",
        ecfev1_noise_model_cpt_suffix="_std_add_mult_ecfev1",
        ar_fef2575_cpt_suffix="_ecfev1_2_days_model_add_mult_noise",
    )

    fig = make_subplots(
        rows=4,
        cols=3,
        shared_xaxes=True,
    )

    # Row 1: the priors (the variables' cpt).
    ih.plot_histogram(
        fig,
        HFEV1,
        HFEV1.cpt,
        HFEV1.a,
        HFEV1.b,
        1,
        first_data_col,
        HFEV1.name + " prior",
        "#009e73",
    )
    ih.plot_histogram(
        fig,
        AR,
        AR.cpt,
        AR.a,
        AR.b,
        1,
        first_data_col + 1,
        AR.name + " prior",
        "#d55e00",
        clean_ticks=True,
    )
    add_fev1_prct_pred_line(
        fig,
        max(100 - df_res.loc[i, "ecFEV1 % Predicted"], 0),
        max(AR.cpt),
        1,
        first_data_col + 1,
    )

    # Rows 2-4: one row per set of loaded inference results.
    add_hists(fig, "HFEV1_fev1", "AR_fev1", i, 2)
    add_hists(fig, "HFEV1_fev1_o2", "AR_fev1_o2", i, 3)
    add_hists(fig, "HFEV1_fev1_o2_2d", "AR_fev1_o2_2d", i, 4)

    # Add "prior knowledge" on col 1, row 1
    fig.add_annotation(
        x=0.2,
        y=3.5 / 4,
        text="Prior knowledge",
        xref="paper",
        yref="paper",
        xanchor="center",
        yanchor="middle",
        showarrow=False,
    )
    add_text_box(fig, text="Your text here<br>Second line<br>Third line", row=2, col=1)

    title = f"{id_}, {df_res.loc[i, 'Date Recorded']}, i={i}, o2 avg={df_res.loc[i, 'Avg O2 saturation']:.2f}%, ecFEV1%={df_res.loc[i, 'ecFEV1 % Predicted']:.2f}"
    print(title)

    fig.update_layout(
        showlegend=False,
        height=500,
        width=800,
        font=dict(size=11),
        bargap=0.01,
        title=title,
    )

    # fig.show()
    # The title doubles as the file name of the exported PDF.
    fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/Narrowing_in/{title}.pdf")

1650
101, 2023-10-13, i=1650, o2 avg=97.46%, ecFEV1%=47.09
1651
101, 2023-10-14, i=1651, o2 avg=97.46%, ecFEV1%=46.81
1652
101, 2023-10-15, i=1652, o2 avg=97.46%, ecFEV1%=44.87
1653
101, 2023-10-16, i=1653, o2 avg=97.46%, ecFEV1%=45.71
1654
101, 2023-10-17, i=1654, o2 avg=97.46%, ecFEV1%=47.09


In [11]:
def add_text_box(fig, text, row, col):
    """Cover subplot (row, col) with a white box and centre `text` inside it.

    Parameters
    ----------
    fig
        Figure with a subplot grid; must provide get_subplot, add_shape and
        add_annotation.
    text : str
        Text to display (the caller in this notebook uses "<br>" for line breaks).
    row, col : int
        Grid position of the subplot to cover.

    Notes
    -----
    The box and the text are positioned in paper coordinates taken from the
    subplot's axis domains. `row` / `col` are only used to look the subplot up
    and are deliberately NOT forwarded to add_shape / add_annotation: when they
    are, plotly re-targets xref / yref to that subplot's axes, so the domain
    values would be interpreted as data coordinates instead.
    """
    subplot = fig.get_subplot(row, col)
    x_start, x_end = subplot.xaxis.domain[0], subplot.xaxis.domain[1]
    y_start, y_end = subplot.yaxis.domain[0], subplot.yaxis.domain[1]

    # White, border-less rectangle spanning exactly the subplot's domain.
    fig.add_shape(
        type="rect",
        x0=x_start,
        x1=x_end,
        y0=y_start,
        y1=y_end,
        line=dict(color="black", width=0),
        fillcolor="white",
        xref="paper",
        yref="paper",
    )

    # Text anchored at the centre of that rectangle.
    fig.add_annotation(
        x=(x_start + x_end) / 2,
        y=(y_start + y_end) / 2,
        text=text,
        showarrow=False,
        font=dict(size=12),
        xref="paper",
        yref="paper",
        xanchor="center",
        yanchor="middle",
    )

# Who can the model help?

In [44]:
# Keep only the records of IDs that have at least 10 records.
# (A ">= 100" filter used to be assigned just before this one and was immediately
# overwritten, so it never had any effect; it has been removed.)
df_res_enough_data = df_res[df_res["count"] >= 10]

In [45]:
# Per ID: the (up to) 30 records with the lowest ecFEV1 % Predicted.
df_res_min_30_ecfev1 = (
    df_res_enough_data.sort_values(["ID", "ecFEV1 % Predicted"]).groupby("ID").head(30)
)

# Per ID: the (up to) 30 most recent records by Date Recorded.
df_res_last_30 = (
    df_res_enough_data.sort_values(["ID", "Date Recorded"]).groupby("ID").tail(30)
)

In [107]:
# Where AR is underestimated
import models.helpers as mh

# Module-level airway-resistance variable read by get_dumbell_plot_data below.
# NOTE: this rebinds the name `AR` that the posterior-plotting loop also assigns.
AR = mh.VariableNode(
    "Airway resistance (%)",
    0,
    90,
    2,
    prior={"type": "uniform"},
)


def get_dumbell_plot_data(df_res, ar_row, ar_var=None):
    """Build the long-format, per-ID table used by the dumbbell plot.

    Parameters
    ----------
    df_res : pandas.DataFrame
        Must contain the columns "ID", "ecFEV1 % Predicted" and `ar_row`.
        The frame is not modified.
    ar_row : str
        Name of the column whose entries are passed to `ar_var.get_mean` and
        `ar_var.get_std` (presumably one discretised AR distribution per record;
        confirm against the loader).
    ar_var : optional
        Object providing `get_mean(entry)` and `get_std(entry)`. Defaults to the
        module-level `AR` variable.

    Returns
    -------
    pandas.DataFrame
        Columns "ID", "measure", "value", with four rows per ID in this order:
        "Mean AR_ecFEV1%", "Mean {ar_row} dist" (mean - std),
        "Mean {ar_row} dist" (mean + std) and "Mean {ar_row} prediction".
        All values are averages over the ID's records. IDs are ordered by
        decreasing mean of ({ar_row} mean - AR_ecFEV1%).
    """
    if ar_var is None:
        # Fall back to the notebook-level variable, as the original implementation did.
        ar_var = AR

    mean_col = f"{ar_row} mean"
    std_col = f"{ar_row} std"
    diff_col = f"{ar_row} - AR_ecFEV1%"
    low_col = f"{ar_row} low"
    high_col = f"{ar_row} high"

    # Work on a copy of the needed columns only: the caller's frame is typically a
    # slice of another frame and must not silently gain helper columns.
    data = df_res[["ID", "ecFEV1 % Predicted", ar_row]].copy()
    data[mean_col] = data[ar_row].apply(ar_var.get_mean)
    data[std_col] = data[ar_row].apply(ar_var.get_std)
    data["AR_ecFEV1%"] = 100 - data["ecFEV1 % Predicted"]
    data[diff_col] = data[mean_col] - data["AR_ecFEV1%"]
    # One-standard-deviation band around the mean.
    data[low_col] = data[mean_col] - data[std_col]
    data[high_col] = data[mean_col] + data[std_col]

    mean_low = f"Mean {ar_row} low"
    mean_high = f"Mean {ar_row} high"
    mean_dist = f"Mean {ar_row} dist"
    mean_pred = f"Mean {ar_row} prediction"
    mean_diff = f"Mean {ar_row} - AR_ecFEV1%"

    # Per-ID averages of every derived quantity.
    df_means = (
        data.groupby("ID")[
            [std_col, mean_col, "AR_ecFEV1%", diff_col, low_col, high_col]
        ]
        .mean()
        .rename(
            columns={
                diff_col: mean_diff,
                mean_col: mean_pred,
                std_col: f"Mean {ar_row} std",
                "AR_ecFEV1%": "Mean AR_ecFEV1%",
                low_col: mean_low,
                high_col: mean_high,
            }
        )
        .reset_index()
    )

    ids_sorted = df_means.sort_values(by=mean_diff, ascending=False)["ID"].values

    # Melt low and high under their own names and relabel them afterwards, rather
    # than melting a frame that carries two identically named columns.
    df_means_melted = pd.melt(
        df_means,
        id_vars=["ID"],
        value_vars=["Mean AR_ecFEV1%", mean_low, mean_high, mean_pred],
        var_name="measure",
        value_name="value",
    )
    df_means_melted["measure"] = df_means_melted["measure"].replace(
        {mean_low: mean_dist, mean_high: mean_dist}
    )

    return df_means_melted.set_index("ID").loc[ids_sorted].reset_index()


# Dumbbell plot

In [112]:
# Preview the first five rows of the melted dumbbell-plot data.
df_ar_fev1_o2.head(5)

Unnamed: 0,ID,measure,value
0,469,Mean AR_ecFEV1%,-23.449175
1,469,Mean AR_fev1_o2 dist,0.649089
2,469,Mean AR_fev1_o2 dist,2.868431
3,469,Mean AR_fev1_o2 prediction,1.75876
4,370,Mean AR_ecFEV1%,-18.105205


In [111]:
# Build the dumbbell-plot tables from df_res_last_30, one per AR column.
df_ar_fev1_o2, df_ar_fev1_o2_2d = (
    get_dumbell_plot_data(df_res_last_30, ar_row)
    for ar_row in ("AR_fev1_o2", "AR_fev1_o2_2d")
)

In [149]:
# Adding 2nd day massively reduces the AR compared to examples with low ecFEV1

import plotly.graph_objects as go

# One row with two side-by-side panels, one per AR column compared below.
fig = make_subplots(rows=1, cols=2)

def plot_dumbell_for_df(fig, df, measures, col):
    """Add the traces of one dumbbell panel to row 1, column `col` of `fig`.

    `df` is a long-format frame with "ID", "measure" and "value" columns.
    For measures[0], one line trace per ID joins that ID's values; for
    measures[1], a single marker trace holds one point per row.
    """
    # measures[0]: one line per ID through all of that ID's values.
    segment_rows = df[df["measure"] == measures[0]]
    for current_id in segment_rows["ID"].unique():
        id_rows = segment_rows[segment_rows["ID"] == current_id]
        segment = go.Scatter(
            x=id_rows["value"],
            y=id_rows["ID"],
            mode="lines",
            name=measures[0],
            marker=dict(size=3, color="red"),
        )
        fig.add_trace(segment, row=1, col=col)

    # measures[1]: all points in one marker trace.
    point_rows = df[df["measure"] == measures[1]]
    points = go.Scatter(
        x=point_rows["value"],
        y=point_rows["ID"],
        mode="markers",
        name=measures[1],
        marker=dict(size=3, color="blue"),
    )
    fig.add_trace(points, row=1, col=col)
        
# Panel 1: AR_fev1_o2, panel 2: AR_fev1_o2_2d; both are drawn against "Mean AR_ecFEV1%".
for panel_col, (panel_df, dist_measure) in enumerate(
    [
        (df_ar_fev1_o2, "Mean AR_fev1_o2 dist"),
        (df_ar_fev1_o2_2d, "Mean AR_fev1_o2_2d dist"),
    ],
    start=1,
):
    plot_dumbell_for_df(fig, panel_df, [dist_measure, "Mean AR_ecFEV1%"], panel_col)

fig.update_layout(
    height=1800,
    width=800,
    font=dict(size=4),
    showlegend=False,
)
fig.show()