In [39]:
import pandas as pd
import numpy as np

import data.breathe_data as bd
import data.helpers as dh
import models.builders as mb
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import inference.helpers as ih

In [19]:
# Load smartcare data
df_sc = dh.load_excel(
    f"{dh.get_path_to_main()}/ExcelFiles/SC/infer_AR_using_two_days_model_o2_fev1_06052025.xlsx",
    date_cols=["Day"],
    str_cols_to_arrays=["Healthy FEV1 (L)", "Airway resistance (%)"],
)
df_sc.rename(
    {
        "Healthy FEV1 (L)": "HFEV1_fev1_o2_2d",
        "Airway resistance (%)": "AR_fev1_o2_2d",
        "Day": "Date Recorded",
    },
    axis=1,
    inplace=True,
)

df_sc_meas = dh.load_excel(
    f"{dh.get_path_to_main()}/ExcelFiles/SC/O2_FEV1_df_conservative_smoothing_with_idx.xlsx",
    date_cols=["Date Recorded"],
)

cols = [
    "ID",
    "Date Recorded",
    "ecFEV1 % Predicted",
    "O2 Saturation",
    "Height",
    "Age",
    "Sex",
]

df_sc = pd.merge(df_sc, df_sc_meas[cols], on=["ID", "Date Recorded"], how="inner")

In [None]:
# Load BR data

In [40]:
# Checked that obs indices are correct, see ipynb mentioned above(01.05.2025)
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")

INFO:root:* Checking for same day measurements *


In [41]:
df_fev1 = bd.load_meas_from_excel(
    "infer_AR_using_fev1_01052025", ["AR", "HFEV1", "HO2Sat"], bypass_sanity_checks=True
).drop(columns=["HO2Sat"])
df_fev1.rename(
    {
        "AR": "AR_fev1",
        "HFEV1": "HFEV1_fev1",
    },
    axis=1,
    inplace=True,
)

In [42]:
df_fev1_o2 = bd.load_meas_from_excel(
    "infer_AR_using_o2sat_fev1_01052025", ["AR", "HFEV1"], bypass_sanity_checks=True
).drop(columns=["IA", "HO2Sat"])
df_fev1_o2.rename(
    {
        "AR": "AR_fev1_o2",
        "HFEV1": "HFEV1_fev1_o2",
    },
    axis=1,
    inplace=True,
)

In [43]:
df_fev1_o2_2d = bd.load_meas_from_excel(
    "infer_AR_using_two_days_model_o2_fev1_01052025",
    ["Airway resistance (%)", "Healthy FEV1 (L)"],
    date_cols=["Day"],
    bypass_sanity_checks=True,
).drop(columns=["Healthy O2 saturation (%)"])
df_fev1_o2_2d.rename(
    {
        "Airway resistance (%)": "AR_fev1_o2_2d",
        "Healthy FEV1 (L)": "HFEV1_fev1_o2_2d",
        "Day": "Date Recorded",
    },
    axis=1,
    inplace=True,
)

In [44]:
df_fev1_o2_fef2575_2d = bd.load_meas_from_excel(
    "infer_AR_using_two_days_model_o2_fev1_fef2575_01052025",
    ["Airway resistance (%)", "Healthy FEV1 (L)"],
    date_cols=["Day"],
    bypass_sanity_checks=True,
).drop(columns=["Healthy O2 saturation (%)"])
df_fev1_o2_fef2575_2d.rename(
    {
        "Airway resistance (%)": "AR_fev1_o2_fef2575_2d",
        "Healthy FEV1 (L)": "HFEV1_fev1_o2_fef2575_2d",
        "Day": "Date Recorded",
    },
    axis=1,
    inplace=True,
)

In [45]:
df_fev1_fef2575_2d = bd.load_meas_from_excel(
    "infer_AR_using_two_days_model_fev1_fef2575_06052025",
    ["Airway resistance (%)", "Healthy FEV1 (L)"],
    date_cols=["Day"],
    bypass_sanity_checks=True,
)
df_fev1_fef2575_2d.rename(
    {
        "Airway resistance (%)": "AR_fev1_fef2575_2d",
        "Healthy FEV1 (L)": "HFEV1_fev1_fef2575_2d",
        "Day": "Date Recorded",
    },
    axis=1,
    inplace=True,
)

In [46]:
# join df to df_fev1 on ID and Date Recorded
cols = [
    "ID",
    "Date Recorded",
    "ecFEV1 % Predicted",
    "O2 Saturation",
    "Height",
    "Age",
    "Sex",
]
df_res = pd.merge(df[cols], df_fev1, on=["ID", "Date Recorded"], how="inner")
df_res = pd.merge(df_res, df_fev1_o2, on=["ID", "Date Recorded"], how="inner")
df_res = pd.merge(df_res, df_fev1_o2_2d, on=["ID", "Date Recorded"], how="inner")
df_res = pd.merge(
    df_res, df_fev1_o2_fef2575_2d, on=["ID", "Date Recorded"], how="inner"
)
df_res = pd.merge(df_res, df_fev1_fef2575_2d, on=["ID", "Date Recorded"], how="inner")

In [47]:
# Add records count per ID
df_count = (
    df_res.groupby("ID")
    .apply(lambda df: len(df))
    .reset_index()
    .rename(columns={0: "count"})
)
df_res = pd.merge(df_res, df_count, on=["ID"])

In [48]:
# Add max ecFEV1 % pred
df_max_fev = (
    df_res.groupby("ID")
    .apply(lambda df: df["ecFEV1 % Predicted"].max())
    .reset_index()
    .rename(columns={0: "max ecFEV1 % Predicted"})
)
df_res = pd.merge(df_res, df_max_fev, on=["ID"])
df_res["FEV delta"] = df_res["max ecFEV1 % Predicted"] - df_res["ecFEV1 % Predicted"]

In [49]:
# Add avg o2 sat
df_avg_o2 = (
    df_res.groupby("ID")
    .apply(lambda df: df["O2 Saturation"].mean())
    .reset_index()
    .rename(columns={0: "Avg O2 saturation"})
)
df_res = pd.merge(df_res, df_avg_o2, on=["ID"])

In [50]:
# df_res.sort_values(by="FEV delta", ascending=False)[0:5]
# df_res.sort_values(by=["ecFEV1 % Predicted", "O2 Saturation"], ascending=True)[0:5]
indices_max_avg_o2_min_ecfev1 = (
    df_res.groupby("ID")
    .apply(
        lambda df: df.reset_index()
        .sort_values(by=["ecFEV1 % Predicted"], ascending=True)
        .iloc[0]
    )["index"]
    .values
)
indices_over_30_datapoints = df_res[df_res["count"] > 30].index
indices_max_avg_o2_below_94 = df_res[df_res["Avg O2 saturation"] < 94].index
mask = list(
    set(indices_max_avg_o2_min_ecfev1)
    .intersection(indices_max_avg_o2_below_94)
    .intersection(indices_over_30_datapoints)
)
df_res.iloc[mask].sort_values(
    by=["Avg O2 saturation", "ecFEV1 % Predicted"], ascending=[True, True]
).index

Index([3448, 33235, 36519, 35294, 12396], dtype='int64')

# Plot examples of narrowed in posteriors

In [54]:
df_res_last_30[df_res_last_30["ID"] == "101"][0:5].index

Index([1650, 1651, 1652, 1653, 1654], dtype='int64')

In [157]:
df[df.ID == "210"]

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,idx ecFEV1 (L),idx O2 saturation (%),idx ecFEF2575%ecFEV1,idx ecFEF25-75 % ecFEV1 (%)
19804,210,2020-08-24,1.79,95,1.84,1.79,1.84,Female,163.0,29,3.245544,98.150426,55.152541,55.152541,96.790206,102.793296,35,45,51,51
19805,210,2020-08-26,1.79,98,2.21,1.79,2.21,Female,163.0,29,3.245544,98.150426,55.152541,55.152541,99.846739,123.463687,35,48,61,61
19806,210,2020-08-28,1.8,97,1.96,1.8,1.96,Female,163.0,29,3.245544,98.150426,55.460655,55.460655,98.827895,108.888889,36,47,54,54
19807,210,2020-08-31,1.88,97,2.03,1.88,2.03,Female,163.0,29,3.245544,98.150426,57.925573,57.925573,98.827895,107.978723,37,47,53,53
19808,210,2020-09-04,1.79,97,1.98,1.79,1.98,Female,163.0,29,3.245544,98.150426,55.152541,55.152541,98.827895,110.614525,35,47,55,55
19809,210,2020-09-06,1.84,98,2.14,1.84,2.82,Female,163.0,29,3.245544,98.150426,56.693114,56.693114,99.846739,116.304348,36,48,58,58
19810,210,2020-09-08,1.88,98,2.82,1.88,2.82,Female,163.0,29,3.245544,98.150426,57.925573,57.925573,99.846739,150.0,37,48,75,75
19811,210,2020-09-27,1.77,96,2.2,1.77,2.82,Female,163.0,29,3.245544,98.150426,54.536311,54.536311,97.809051,124.293785,35,46,62,62
19812,210,2020-10-29,1.84,97,2.32,1.84,2.32,Female,163.0,29,3.245544,98.150426,56.693114,56.693114,98.827895,126.086957,36,47,63,63
19813,210,2020-10-31,1.88,97,2.42,1.88,2.42,Female,163.0,29,3.245544,98.150426,57.925573,57.925573,98.827895,128.723404,37,47,64,64


In [177]:
# i = 2955
# fev1 and o2: see impact of very low o2
# Impact of min o2 shouldn't get propagated back to hfev1 because it's more due to sudden infection rather than permanent lung damage
# indices = [6311, 35304]
# However, avg o2 is low (or max o2) is relevant. -  Avg o2 sat < 94, min ecfev1, > 30 datapoints
# indices = [35294]
# Unexpectedly small lungs. Below 30% of ecFE1%Predicted (life threatening), expects lower max o2, hence healthy oxygenation must be due to smaller lungs. Ex: o2 = 100, min and max ecfev1.
# → use app
# Exclude o2 from longitudinal model due to calibration issues

# Shared hfev1/ar uncertainty: small healthy lungs or big lungs with disease
# Unexpectedly big lungs with disease: high fev1 % pred delta (benefit of 2nd day)
indices = [18452, 33198, 2330, 2975, 18430]
# Unexpectedly small lungs: low fev1 % pred but no permanent lung damage (high fef2575%fev1) → show without fef2575 then with

# Super high fev1
# indices = [37113]

# indices = [19804]

for i in indices:
    print(i)
    id = df.loc[i, "ID"]
    (
        _,
        inf_alg,
        HFEV1,
        uFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEF2575prctecFEV1,
    ) = mb.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars(
        df_res.iloc[i].Height,
        df_res.iloc[i].Age,
        df_res.iloc[i].Sex,
        ia_prior="uniform",
        ar_prior="uniform",
        ecfev1_noise_model_cpt_suffix="_std_add_mult_ecfev1",
        ar_fef2575_cpt_suffix="_ecfev1_2_days_model_add_mult_noise",
    )

    max_row = 3

    fig = make_subplots(
        rows=max_row,
        cols=3,
        shared_xaxes=True,
        vertical_spacing=0.03,
    )
    first_data_col = 2

    def add_fev1_prct_pred_line(fig, val, y_max, row, col):
        fig.add_shape(
            type="line",
            # opacity=0.1,
            x0=val,
            y0=0,
            x1=val,
            y1=y_max * 1.1,
            line=dict(color="black", width=2, dash="dash"),
            # fillcolor="red",
            # line_width=0,
            row=row,
            col=col,
        )
        return -1

    def add_hists(fig, hfev1_col, ar_col, i, row):
        ih.plot_histogram(
            fig,
            HFEV1,
            df_res.loc[i, hfev1_col],
            HFEV1.a,
            HFEV1.b,
            row,
            first_data_col,
            f"{HFEV1.name}" if row == max_row else "",
            "#009e73",
        )
        ih.plot_histogram(
            fig,
            AR,
            df_res.loc[i, ar_col],
            AR.a,
            AR.b,
            row,
            first_data_col + 1,
            f"{AR.name}" if row == max_row else "",
            "#d55e00",
            clean_ticks=True,
        )
        add_fev1_prct_pred_line(
            fig,
            max(100 - df_res.loc[i, "ecFEV1 % Predicted"], 0),
            max(df_res.loc[i, ar_col]),
            row,
            first_data_col + 1,
        )

    ih.plot_histogram(
        fig,
        HFEV1,
        HFEV1.cpt,
        HFEV1.a,
        HFEV1.b,
        1,
        first_data_col,
        # HFEV1.name + " prior",
        "",
        "#009e73",
    )
    ih.plot_histogram(
        fig,
        AR,
        AR.cpt,
        AR.a,
        AR.b,
        1,
        first_data_col + 1,
        # AR.name + " prior",
        "",
        "#d55e00",
        clean_ticks=True,
    )
    add_fev1_prct_pred_line(
        fig,
        max(100 - df_res.loc[i, "ecFEV1 % Predicted"], 0),
        max(AR.cpt),
        1,
        first_data_col + 1,
    )

    def add_text_box(fig, text, row, col):
        # Get the subplot's domain coordinates
        x_domain = fig.get_subplot(row, col).xaxis.domain
        y_domain = fig.get_subplot(row, col).yaxis.domain
        # print(x_domain, y_domain)

        # Add rectangle
        fig.add_shape(
            type="rect",
            x0=x_domain[0] - 2.5,
            x1=x_domain[1] * 28,
            y0=y_domain[0],
            y1=y_domain[1],
            line=dict(color="black", width=0),
            fillcolor="white",
            xref="paper",
            yref="paper",
            row=row,
            col=col,
        )

        fig.update_yaxes(showticklabels=False, row=row, col=col)
        fig.update_xaxes(showticklabels=False, row=row, col=col)

        # Add text
        fig.add_annotation(
            x=x_domain[1] + 2.5,  # center x
            y=(y_domain[0] + y_domain[1]) / 2,  # center y
            text=text,
            showarrow=False,
            font=dict(size=12),
            xref="paper",
            yref="paper",
            xanchor="center",
            yanchor="middle",
            row=row,
            col=col,
        )

    add_hists(fig, "HFEV1_fev1", "AR_fev1", i, 2)
    add_hists(fig, "HFEV1_fev1_o2_2d", "AR_fev1_o2_2d", i, 3)
    # add_hists(fig, "HFEV1_fev1_o2_fef2575_2d", "AR_fev1_o2_fef2575_2d", i, 3)

    add_text_box(fig, text="Prior knowledge", row=1, col=1)
    add_text_box(
        fig,
        text=f"Observing FEV<sub>1</sub>={df.loc[i, 'ecFEV1']}L,<br>             SpO<sub>2</sub>={df.loc[i, 'O2 Saturation']}%",
        row=2,
        col=1,
    )
    max_fev1 = df[df.ID == df.loc[i, "ID"]]["ecFEV1"].max()
    add_text_box(
        fig,
        text=f"Additionally observing<br>a second day<br>with max(FEV<sub>1</sub>)={max_fev1}L",
        row=3,
        col=1,
    )
    # add_text_box(
    #     fig,
    #     text=f"Additionally observing<br>FEV<sub>25-75</sub> in % of FEV<sub>1</sub>={df.loc[i, 'ecFEF2575%ecFEV1']:.0f}%",
    #     row=3,
    #     col=1,
    # )

    title = f"{id}, {df_res.loc[i, 'Date Recorded']}, i={i}, o2 avg={df_res.loc[i, 'Avg O2 saturation']:.2f}%, ecFEV1%={df_res.loc[i, 'ecFEV1 % Predicted']:.2f}"
    print(title)

    fig.update_layout(
        showlegend=False,
        height=350,
        width=800,
        font=dict(size=10),
        bargap=0.01,
        title=title,
        # margin=dict(l=20, r=20, b=20, t=20),
    )

    fig.show()
    fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/Narrowing_in/{title}.pdf")

18452
202, 2022-05-13, i=18452, o2 avg=97.95%, ecFEV1%=92.59


33198
372, 2022-09-02, i=33198, o2 avg=96.78%, ecFEV1%=91.89


2330
104, 2019-02-25, i=2330, o2 avg=96.60%, ecFEV1%=52.80


2975
107, 2022-06-28, i=2975, o2 avg=97.77%, ecFEV1%=42.49


18430
202, 2021-12-03, i=18430, o2 avg=97.95%, ecFEV1%=79.04


# Who can the model help?

In [27]:
# Filter out IDs with less than 30 datapoints
min_datapoints = 10
# min_datapoints = 100

df_res_enough_data = df_res[df_res["count"] >= min_datapoints]
# df_res_enough_data = df_sc[df_sc["count"] >= min_datapoints]
print(df_res_enough_data.ID.nunique())

265


In [28]:
# Get the last 30 days of data for each ID

df_res_min_30_ecfev1 = df_res_enough_data.sort_values(["ID", "ecFEV1 % Predicted"])
df_res_min_30_ecfev1 = df_res_min_30_ecfev1.groupby("ID").head(30)

df_res_last_30 = df_res_enough_data.sort_values(["ID", "Date Recorded"])
df_res_last_30 = df_res_last_30.groupby("ID").tail(30)

df_res_first_30 = df_res_enough_data.sort_values(["ID", "Date Recorded"])
df_res_first_30 = df_res_first_30.groupby("ID").head(30)

In [29]:
# Where AR is underestimated
import models.helpers as mh

AR = mh.VariableNode("Airway resistance (%)", 0, 90, 2, prior={"type": "uniform"})


def get_dumbell_plot_data(df_res, ar_row):

    df_res[f"{ar_row} mean"] = df_res[ar_row].apply(lambda row: AR.get_mean(row))
    df_res[f"{ar_row} std"] = df_res[ar_row].apply(lambda row: AR.get_std(row))
    df_res["AR_ecFEV1%"] = 100 - df_res["ecFEV1 % Predicted"]
    df_res[f"{ar_row} - AR_ecFEV1%"] = df_res[f"{ar_row} mean"] - df_res["AR_ecFEV1%"]
    # Add value low and high
    df_res[f"{ar_row} low"] = df_res[f"{ar_row} mean"] - df_res[f"{ar_row} std"]
    df_res[f"{ar_row} high"] = df_res[f"{ar_row} mean"] + df_res[f"{ar_row} std"]

    # Create DataFrame with means in separate columns
    df_means = (
        df_res.groupby("ID")
        .agg(
            {
                f"{ar_row} std": "mean",
                f"{ar_row} mean": "mean",
                "AR_ecFEV1%": "mean",
                f"{ar_row} - AR_ecFEV1%": "mean",
                f"{ar_row} low": "mean",
                f"{ar_row} high": "mean",
            }
        )
        .rename(
            columns={
                f"{ar_row} - AR_ecFEV1%": f"Mean {ar_row} - AR_ecFEV1%",
                f"{ar_row} mean": f"Mean {ar_row} prediction",
                f"{ar_row} std": f"Mean {ar_row} std",
                "AR_ecFEV1%": "Mean AR_ecFEV1%",
                f"{ar_row} low": f"Mean {ar_row} low",
                f"{ar_row} high": f"Mean {ar_row} high",
            }
        )
    ).reset_index()

    ids_sorted = df_means.sort_values(
        by=f"Mean {ar_row} - AR_ecFEV1%", ascending=False
    )["ID"].values

    # Rename Mean ar row low and high to ar row dist
    df_means_2 = df_means.rename(
        columns={
            f"Mean {ar_row} low": f"Mean {ar_row} dist",
            f"Mean {ar_row} high": f"Mean {ar_row} dist",
        },
    )

    # Melt the dataframe to get four rows per ID for three measures
    df_means_melted = pd.melt(
        df_means_2,
        id_vars=["ID"],
        value_vars=[
            "Mean AR_ecFEV1%",
            f"Mean {ar_row} dist",
            f"Mean {ar_row} prediction",
        ],
        var_name="measure",
        value_name="value",
    )

    return (
        df_means_melted.set_index("ID").loc[ids_sorted].reset_index(),
        df_means,
        ids_sorted,
    )


# Dumbell plot

In [30]:
df_for_dumbell = df_res_last_30
# df_for_dumbell = df_res_first_30

df_ar_fev1_dumbell, df_ar_fev1_means, ids_sorted = get_dumbell_plot_data(
    df_for_dumbell, "AR_fev1"
)
df_ar_fev1_o2_dumbell, df_ar_fev1_o2_means, ids_sorted = get_dumbell_plot_data(
    df_for_dumbell, "AR_fev1_o2"
)
df_ar_fev1_o2_2d_dumbell, df_ar_fev1_o2_2d_means, ids_sorted_2d = get_dumbell_plot_data(
    df_for_dumbell, "AR_fev1_o2_2d"
)
(
    df_ar_fev1_o2_fef2575_2d_dumbell,
    df_ar_fev1_o2_fef2575_2d_means,
    ids_sorted_o2_fef2575,
) = get_dumbell_plot_data(df_for_dumbell, "AR_fev1_o2_fef2575_2d")
df_ar_fev1_fef2575_2d_dumbell, df_ar_fev1_fef2575_2d_means, ids_sorted_fef2575 = (
    get_dumbell_plot_data(df_for_dumbell, "AR_fev1_fef2575_2d")
)
# df_ar_fev1_o2_2d = get_dumbell_plot_data('AR_fev1_o2_2d')

In [31]:
df_ar_fev1_dumbell = (
    df_ar_fev1_dumbell.set_index("ID").loc[ids_sorted_fef2575].reset_index()
)
df_ar_fev1_o2_dumbell = (
    df_ar_fev1_o2_dumbell.set_index("ID").loc[ids_sorted_fef2575].reset_index()
)
df_ar_fev1_o2_2d_dumbell = (
    df_ar_fev1_o2_2d_dumbell.set_index("ID").loc[ids_sorted_fef2575].reset_index()
)
df_ar_fev1_o2_fef2575_2d_dumbell = (
    df_ar_fev1_o2_fef2575_2d_dumbell.set_index("ID")
    .loc[ids_sorted_fef2575]
    .reset_index()
)
df_ar_fev1_fef2575_2d_dumbell = (
    df_ar_fev1_fef2575_2d_dumbell.set_index("ID").loc[ids_sorted_fef2575].reset_index()
)

In [35]:
# Adding 2nd day massively reduces the AR compared to examples with low ecFEV1

import plotly.graph_objects as go

df_to_plot = df_ar_fev1_dumbell
ar_col = "AR_fev1"
df_to_plot = df_ar_fev1_o2_dumbell
ar_col = "AR_fev1_o2"
# df_to_plot = df_ar_fev1_o2_2d_dumbell
# ar_col = "AR_fev1_o2_2d"
# df_to_plot = df_ar_fev1_o2_fef2575_2d_dumbell
# ar_col = "AR_fev1_o2_fef2575_2d"
df_to_plot = df_ar_fev1_fef2575_2d_dumbell
ar_col = "AR_fev1_fef2575_2d"

title = f"Dumbell plot of inferred {ar_col} vs ecFEV1%(>={min_datapoints} points, last 30 days avg)"


fig = make_subplots(
    1, 3, horizontal_spacing=0.1, column_titles=["Mild CF", "Moderate CF", "Severe CF"]
)


def plot_dumbell_for_df(fig, df, measures, col):
    # For measure[0]
    mask = df["measure"] == measures[0]
    for id in df[mask]["ID"].unique():
        mask_id = df["ID"] == id
        # Add mask
        mask_final = mask_id & mask
        fig.add_trace(
            go.Scatter(
                x=df[mask_final]["value"],
                y=df[mask_final]["ID"],
                mode="lines",
                name="Airway resistance inferred",
                marker=dict(color="red"),
                line=dict(width=3),
                # legendgroup="Airway resistance predicted",  # Add legend group
                showlegend=(
                    True if id == df[mask]["ID"].unique()[0] else False
                ),  # Only show legend for first trace
            ),
            row=1,
            col=col,
        )
    # For measure[1]
    mask = df["measure"] == measures[1]
    fig.add_trace(
        go.Scatter(
            x=df[mask]["value"],
            y=df[mask]["ID"],
            mode="markers",
            name="1 - ecFEV1%Predicted",
            marker=dict(size=4, color="blue"),
            # legendgroup="1 - ecFEV1%Predicted",  # Add legend group
        ),
        row=1,
        col=col,
    )


# df_to_plot = df_ar_fev1_o2_2d_dumbell
# ar_col = "AR_fev1_o2_2d"


# Split dataframe between mild, moderate and severe CF lung disease
# Equivalent to Mean AR_ecFEV1% < 30%, 30 to 60 and > 60%
# Get unique IDs and their corresponding Mean AR_ecFEV1% values
mask_ecfev1 = df_to_plot["measure"] == f"Mean {ar_col} prediction"

id_ecfev1_values = df_to_plot[mask_ecfev1].groupby("ID")["value"].mean()

# Split IDs into three groups based on ecFEV1% values
mild_ids = id_ecfev1_values[id_ecfev1_values < 30].index
moderate_ids = id_ecfev1_values[
    (id_ecfev1_values >= 30) & (id_ecfev1_values <= 60)
].index
severe_ids = id_ecfev1_values[id_ecfev1_values > 60].index

# Create the three dataframes
df_mild = df_to_plot[df_to_plot["ID"].isin(mild_ids)]
df_moderate = df_to_plot[df_to_plot["ID"].isin(moderate_ids)]
df_severe = df_to_plot[df_to_plot["ID"].isin(severe_ids)]

# Print the sizes to verify
print(f"Mild size: {len(df_mild)}")
print(f"Moderate size: {len(df_moderate)}")
print(f"Severe size: {len(df_severe)}")

# Plot the three groups
plot_dumbell_for_df(fig, df_mild, [f"Mean {ar_col} dist", "Mean AR_ecFEV1%"], 1)
plot_dumbell_for_df(fig, df_moderate, [f"Mean {ar_col} dist", "Mean AR_ecFEV1%"], 2)
plot_dumbell_for_df(fig, df_severe, [f"Mean {ar_col} dist", "Mean AR_ecFEV1%"], 3)


# Split the dataframe into 3 roughly equal parts
#
#
# ids = df_ar_fev1_o2_fef2575_2d.ID.unique()
# chunk_size = int(np.ceil(len(ids) // 3))
#
# # Get first chunk
# chunk1_ids = ids[0:chunk_size]
#
# # Get second chunk
# chunk2_ids = ids[chunk_size:2*chunk_size]
#
# # Get third chunk
# chunk3_ids = ids[2*chunk_size:len(ids)]
#
# # Create the three dataframes
# df_ar_fev1_o2_fef2575_2d_chunk1 = df_ar_fev1_o2_fef2575_2d.set_index("ID").loc[chunk1_ids].reset_index()
# df_ar_fev1_o2_fef2575_2d_chunk2 = df_ar_fev1_o2_fef2575_2d.set_index("ID").loc[chunk2_ids].reset_index()
# df_ar_fev1_o2_fef2575_2d_chunk3 = df_ar_fev1_o2_fef2575_2d.set_index("ID").loc[chunk3_ids].reset_index()
#
# Print the sizes to verify
# print(f"Chunk 1 size: {len(df_ar_fev1_o2_fef2575_2d_chunk1)}")
# print(f"Chunk 2 size: {len(df_ar_fev1_o2_fef2575_2d_chunk2)}")
# print(f"Chunk 3 size: {len(df_ar_fev1_o2_fef2575_2d_chunk3)}")
#
# plot_dumbell_for_df(fig, df_ar_fev1_o2_fef2575_2d_chunk1, ["Mean AR_fev1_o2_fef2575_2d dist", "Mean AR_ecFEV1%"], 3)
# plot_dumbell_for_df(fig, df_ar_fev1_o2_fef2575_2d_chunk2, ["Mean AR_fev1_o2_fef2575_2d dist", "Mean AR_ecFEV1%"], 2)
# plot_dumbell_for_df(fig, df_ar_fev1_o2_fef2575_2d_chunk3, ["Mean AR_fev1_o2_fef2575_2d dist", "Mean AR_ecFEV1%"], 1)


# Compare different models
#
#
# plot_dumbell_for_df(fig, df_ar_fev1_o2, ["Mean AR_fev1_o2 dist", "Mean AR_ecFEV1%"], 1)
# plot_dumbell_for_df(fig, df_ar_fev1_o2_2d, ["Mean AR_fev1_o2_2d dist", "Mean AR_ecFEV1%"], 2)


fig.update_layout(
    height=900,
    # height=1800,
    width=1200,
    font=dict(size=10),
    showlegend=True,
    title=title,
    plot_bgcolor="white",
    paper_bgcolor="white",
)
print(min(df_to_plot["value"]))
fig.update_xaxes(
    range=[-25, 91],
    tickvals=[0, 30, 60, 90],
    title=AR.name,
    gridcolor="#2a3f5f",
    zeroline=True,
    zerolinecolor="#2a3f5f",
    zerolinewidth=2,
)
fig.update_yaxes(
    showticklabels=False,
    showgrid=False,
    gridcolor="#2a3f5f",
    zeroline=True,
    zerolinecolor="#2a3f5f",
    zerolinewidth=2,
)
fig.write_image(
    f"{dh.get_path_to_main()}PlotsBreathe/Dumbell_plot_AR_ecFEV1_by_severity/{title}.pdf"
)
fig.show()

# On this plot, if a person is very sick, it's best fev1 measurement will contain a lot of inflammatory markers (sputum, airway wall inflammation).
# The sicker the person,the more underestimated the AR pred is because the maximum FEV1 blown is not healthy.
# Let's add the FEF25-75.

# 60% of data falls within red band

# TODO: add vertical lines corresponding to key AR values

# Longitudinal AR profile on the web app

Mild size: 520
Moderate size: 456
Severe size: 84


-23.449175221381445


In [38]:
# For who many people would this model be really useful?

# Count number of individuals where Mean AR_ecFEV1% is below the lower bound of AR_fev1_fef2575_2d
mask = (
    df_ar_fev1_fef2575_2d_means["Mean AR_ecFEV1%"]
    < df_ar_fev1_fef2575_2d_means["Mean AR_fev1_fef2575_2d low"]
)
count_below_lower = mask.sum()
print(
    f"Number of individuals with Mean AR_ecFEV1% below Mean AR_fev1_fef2575_2d lower bound: {count_below_lower}"
)
print(f"Percentage: {count_below_lower/len(df_ar_fev1_fef2575_2d_means)*100:.1f}%")

# Count number of individuals where Mean AR_ecFEV1% is above the upper bound of Mean AR_fev1_fef2575_2d
mask = (
    df_ar_fev1_fef2575_2d_means["Mean AR_ecFEV1%"]
    > df_ar_fev1_fef2575_2d_means["Mean AR_fev1_fef2575_2d high"]
)
count_above_upper = mask.sum()
print(
    f"Number of individuals with Mean AR_ecFEV1% above Mean AR_fev1_fef2575_2d upper bound: {count_above_upper}"
)
print(f"Percentage: {count_above_upper/len(df_ar_fev1_fef2575_2d_means)*100:.1f}%")

Number of individuals with Mean AR_ecFEV1% below Mean AR_fev1_fef2575_2d lower bound: 40
Percentage: 15.1%
Number of individuals with Mean AR_ecFEV1% above Mean AR_fev1_fef2575_2d upper bound: 18
Percentage: 6.8%


In [None]:
ids_ecfev1prct_60 = df_ar_fev1_o2_fef2575_2d_means[
    df_ar_fev1_o2_fef2575_2d_means["Mean AR_ecFEV1%"] > 60
].ID.unique()

In [None]:
ids_ar_60 = df_ar_fev1_o2_fef2575_2d_means[
    df_ar_fev1_o2_fef2575_2d_means["Mean AR_fev1_o2_fef2575_2d prediction"] > 60
].ID.unique()

In [117]:
print(list(ids_ecfev1prct_60))
print(list(ids_ar_60))

['102', '111', '120', '122', '127', '155', '162', '187', '194', '248', '282', '283', '361', '362', '377', '404', '409', '411', '413', '417', '421', '422', '452', '484', '486', '520']
['111', '122', '155', '187', '282', '283', '361', '362', '377', '404', '409', '411', '413', '417', '421', '422', '452', '486', '520']


In [250]:
df_count.set_index("ID").loc[["372", "202", "530", "104", "433", "417"]]

Unnamed: 0_level_0,count
ID,Unnamed: 1_level_1
372,41
202,130
530,205
104,222
433,60
417,20


# Compute breathe avg 

In [64]:
ar_dist_BR = df_res.groupby("ID").apply(lambda df: df["AR_fev1_o2_2d"].mean()).mean()

In [None]:
ar_dist_SC = df_sc.groupby("ID").apply(lambda df: df["AR_fev1_o2_2d"].mean()).mean()

In [None]:
title = f"Comparison of the AR priors learnt on Breathe and Smartcare<br>with FEV1, SpO2, and 2 days model"

fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
ih.plot_histogram(fig, AR, ar_dist_BR, AR.a, AR.b, 1, 1, None, annot=False)
# name trace to breathe data
fig.data[0].name = "Breathe data"
ih.plot_histogram(
    fig, AR, ar_dist_SC, AR.a, AR.b, 2, 1, AR.name, annot=False, clean_ticks=True
)
fig.data[1].name = "Smartcare data"
fig.update_layout(
    showlegend=True, font=dict(size=10), title=title, width=600, height=400
)
fig.write_image(dh.get_path_to_main() + f"PlotsCrossStudies/{title}.pdf")
fig.show()

In [178]:
df_res_drug = bd.add_drug_therapy_to_df(df_res)

ERROR:root:ID 175 - ?? Symkevi start date is wrong, removing it because no clue about the true date (maybe 2019?)
ERROR:root:ID 206 - Updating Ivacaftor stop date to not overlap and changing Ivacaftor + Symkevi to Trikafta
INFO:root:ID 358 - Removing duplicated Trikafta entry
INFO:root:ID 358 - Removing duplicated Trikafta entry
INFO:root:ID 426 - Currently no measures for this ID. Removing two entries with NaN drug therapy type
INFO:root:ID 462 - Symkevi and Ivacaftor prescribed, renaming it to Trikafta
INFO:root:ID 405 - Removing two entries with NaN drug therapy type


In [179]:
df_res_drug.head(1)

Unnamed: 0,ID,Date Recorded,ecFEV1 % Predicted,O2 Saturation,Height,Age,Sex,AR_fev1,HFEV1_fev1,AR_fev1_o2,...,AR_fev1_o2_2d,HFEV1_fev1_o2_fef2575_2d,AR_fev1_o2_fef2575_2d,HFEV1_fev1_fef2575_2d,AR_fev1_fef2575_2d,count,max ecFEV1 % Predicted,FEV delta,Avg O2 saturation,DrugTherapyType
0,101,2019-01-25,36.287474,97,173.0,53,Male,"[6.35674101e-06, 7.86135817e-06, 9.90296787e-0...","[5.34277995e-19, 4.19196617e-15, 5.82450148e-1...","[2.15992358e-05, 2.67116953e-05, 3.36487735e-0...",...,"[3.94487216e-15, 1.75061635e-13, 1.80590472e-1...","[2.09896401e-93, 5.43827739e-80, 5.06181421e-6...","[1.03090217e-18, 1.29831267e-16, 3.47423985e-1...","[3.51373896e-94, 9.10386601e-81, 8.47365351e-6...","[1.72576618e-19, 2.17342068e-17, 5.81599867e-1...",1680,49.583647,13.296174,97.4625,


# Statistical significance

In [120]:
df_res.head(1)

Unnamed: 0,ID,Date Recorded,ecFEV1 % Predicted,O2 Saturation,Height,Age,Sex,AR_fev1,HFEV1_fev1,AR_fev1_o2,HFEV1_fev1_o2,HFEV1_fev1_o2_2d,AR_fev1_o2_2d,HFEV1_fev1_o2_fef2575_2d,AR_fev1_o2_fef2575_2d,count,max ecFEV1 % Predicted,FEV delta,Avg O2 saturation
0,101,2019-01-25,36.287474,97,173.0,53,Male,"[6.35674101e-06, 7.86135817e-06, 9.90296787e-0...","[5.34277995e-19, 4.19196617e-15, 5.82450148e-1...","[2.15992358e-05, 2.67116953e-05, 3.36487735e-0...","[1.81539509e-18, 1.42436613e-14, 1.97907671e-1...","[4.0851858e-90, 1.13905537e-76, 1.12596763e-64...","[3.94487216e-15, 1.75061635e-13, 1.80590472e-1...","[2.09896401e-93, 5.43827739e-80, 5.06181421e-6...","[1.03090217e-18, 1.29831267e-16, 3.47423985e-1...",1680,49.583647,13.296174,97.4625


In [122]:
df_res_drug = bd.add_drug_therapy_to_df(df_res)

ERROR:root:ID 175 - ?? Symkevi start date is wrong, removing it because no clue about the true date (maybe 2019?)
ERROR:root:ID 206 - Updating Ivacaftor stop date to not overlap and changing Ivacaftor + Symkevi to Trikafta
INFO:root:ID 358 - Removing duplicated Trikafta entry
INFO:root:ID 358 - Removing duplicated Trikafta entry
INFO:root:ID 426 - Currently no measures for this ID. Removing two entries with NaN drug therapy type
INFO:root:ID 462 - Symkevi and Ivacaftor prescribed, renaming it to Trikafta
INFO:root:ID 405 - Removing two entries with NaN drug therapy type


In [123]:
df_res_drug.DrugTherapyType.unique()

array(['None', 'Symkevi', 'Trikafta', 'Ivacaftor', 'Orkambi', 'Unknown'],
      dtype=object)

In [None]:
df_res_drug["Mean AR_fev1_o2_fef2575_2d"] = df_res_drug["AR_fev1_o2_fef2575_2d"].apply(
    lambda x: AR.get_mean(x)
)

In [256]:
after = ["Trikafta"]
before = ["None"]


def get_df_stat(df_res_drug, after, before, target_col):
    def get_mean_for_ID(df, drug_types, target_col):
        return df[df.DrugTherapyType.isin(drug_types)][target_col].mean()

    df_none = df_res_drug.groupby("ID").apply(
        lambda df: get_mean_for_ID(df, before, "ecFEV1 % Predicted")
    )
    df_none.name = "None"

    # Get IDs of patients with mild CF (FEV1% > 70%) before Trikafta
    mild_ids = df_none[df_none > 70].index.tolist()
    print("Patients with mild CF (FEV1% > 70%) before Trikafta:")
    print(len(mild_ids), mild_ids)

    df_trikafta = df_res_drug.groupby("ID").apply(
        lambda df: get_mean_for_ID(df, after, target_col)
    )
    df_trikafta.name = "Trikafta"
    df_none = df_res_drug.groupby("ID").apply(
        lambda df: get_mean_for_ID(df, before, target_col)
    )
    df_none.name = "None"

    df_stat = pd.merge(df_trikafta, df_none, on="ID", how="inner")
    # Remove patients with mild CF (FEV1% > 70%) before Trikafta
    df_stat = df_stat.loc[mild_ids]
    df_stat = df_stat.dropna(axis=0)
    return df_stat, df_trikafta, df_none

In [257]:
target_col = "ecFEV1 % Predicted"
target_col = "Mean AR_fev1_o2_fef2575_2d"

df_stat, df_trikafta, df_none = get_df_stat(df_res_drug, after, before, target_col)
run_stat_test(df_stat, target_col)

Patients with mild CF (FEV1% > 70%) before Trikafta:
49 ['109', '113', '121', '123', '124', '126', '130', '141', '144', '151', '152', '156', '163', '164', '166', '177', '178', '182', '189', '190', '209', '213', '231', '236', '242', '257', '262', '267', '271', '273', '278', '281', '288', '290', '292', '296', '302', '310', '313', '314', '327', '352', '406', '412', '490', '506', '511', '532', '535']
%% Mean AR_fev1_o2_fef2575_2d
Mean Mean AR_fev1_o2_fef2575_2d during None: 20.99%
Mean Mean AR_fev1_o2_fef2575_2d  during Trikafta: 15.64%
Mean improvement: -5.35%
p-value: 7.81e-05
Statistically significant: True


In [258]:
target_col = "ecFEV1 % Predicted"
# target_col = "Mean AR_fev1_o2_fef2575_2d"

df_stat, df_trikafta, df_none = get_df_stat(df_res_drug, after, before, target_col)
run_stat_test(df_stat, target_col)

Patients with mild CF (FEV1% > 70%) before Trikafta:
49 ['109', '113', '121', '123', '124', '126', '130', '141', '144', '151', '152', '156', '163', '164', '166', '177', '178', '182', '189', '190', '209', '213', '231', '236', '242', '257', '262', '267', '271', '273', '278', '281', '288', '290', '292', '296', '302', '310', '313', '314', '327', '352', '406', '412', '490', '506', '511', '532', '535']
%% ecFEV1 % Predicted
Mean ecFEV1 % Predicted during None: 84.40%
Mean ecFEV1 % Predicted  during Trikafta: 90.59%
Mean improvement: 6.19%
p-value: 6.50e-05
Statistically significant: True


In [205]:
def run_stat_test(df_stat, target_col):
    print("%%", target_col)

    # Create before/after masks for each patient
    before_after_data = []

    # Perform paired t-test
    from scipy import stats

    t_stat, p_val = stats.ttest_rel(df_stat["Trikafta"], df_stat["None"])

    print(f"Mean {target_col} during None: {df_stat['None'].mean():.2f}%")
    print(f"Mean {target_col}  during Trikafta: {df_stat['Trikafta'].mean():.2f}%")
    print(
        f"Mean improvement: {df_stat['Trikafta'].mean() - df_stat['None'].mean():.2f}%"
    )
    print(f"p-value: {p_val:.2e}")
    print(f"Statistically significant: {p_val < 0.05}")

In [268]:
df_109 = df_res_drug[df_res_drug.ID == "109"]
mask_trikafta = df_109.DrugTherapyType == "Trikafta"
mask_none = df_109.DrugTherapyType == "None"

t_stat, p_val = stats.ttest_rel(df_109[mask_trikafta], df_109[mask_none])

ValueError: unequal length arrays