In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.optimize import curve_fit, minimize
from scipy.interpolate import splrep, BSpline

import src.models.builders as mb
import src.inference.helpers as ih

plotsdir = "../../../../PlotsBreathe/O2_modelling/"

In [2]:
# df = breathe_data.build_O2_FEV1_df()
# Save to excel
# df.to_excel(plotsdir + "Breathe_O2_FEV1.xlsx", index=False)
# Load from excel
df = pd.read_excel(plotsdir + "/Breathe_O2_FEV1.xlsx")

In [3]:
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,ecFEV1,Age,Sex,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy
0,101,2019-02-20,1.31,97.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,36.287474,99.767593
1,101,2019-02-21,1.29,96.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,35.733466,98.739061
2,101,2019-02-22,1.32,96.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,36.564477,98.739061
3,101,2019-02-23,1.28,97.0,1.33,53,Male,173.0,3.610061,97.22596,36.841481,35.456463,99.767593
4,101,2019-02-24,1.33,98.0,1.36,53,Male,173.0,3.610061,97.22596,37.672492,36.841481,100.796125


In [4]:
df.describe()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,ecFEV1,Age,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy
count,20397.0,20397,20397.0,20397.0,20397.0,20397.0,20397.0,20397.0,20397.0,20397.0,20397.0,20397.0
mean,182.41864,2020-08-09 21:07:02.003235840,2.198363,96.966501,2.263423,34.801147,166.24712,3.507772,97.711339,64.659185,62.787907,99.238755
min,101.0,2019-02-08 00:00:00,0.49,76.0,0.5,18.0,143.0,2.213968,96.975001,15.320382,15.013975,77.519654
25%,125.0,2020-03-06 00:00:00,1.55,96.0,1.61,27.0,160.0,2.979444,97.22596,48.198629,46.536607,98.699543
50%,169.0,2020-09-27 00:00:00,2.03,97.0,2.09,34.0,166.0,3.386997,97.989462,62.88092,61.235539,99.767593
75%,237.0,2021-01-26 00:00:00,2.76,98.0,2.83,41.0,173.0,3.987357,98.114941,77.657779,75.719884,100.255981
max,358.0,2021-07-24 00:00:00,5.26,100.0,5.26,64.0,193.0,5.322753,98.340804,149.50535,149.50535,103.026043
std,65.66033,,0.816148,1.649808,0.822063,10.154773,9.151066,0.649323,0.449598,20.301483,20.271387,1.68695


## Factor - Airway resistance vs O2 drop

In [6]:
# Infer airway resistance using the model
def infer_airway_resistance_for_ID(df_for_ID):
    # print(
    #     f"\nRunning for ID {df_for_ID.ID.iloc[0]}, with {len(df_for_ID)} observations"
    # )
    airway_resistances_for_ID = np.array([])
    HFEV1_prior = {
        "type": "default",
        "height": df_for_ID.loc[0, "Height"],
        "age": df_for_ID.loc[0, "Age"],
        "sex": df_for_ID.loc[0, "Sex"],
    }
    ho2sat_prior = {
        "type": "gaussian",
        "mu": 0.98,
        "sigma": 0.01,
    }
    (
        model,
        inf_alg,
        HFEV1,
        ecFEV1,
        _,
        _,
        AR,
    ) = mb.build_FEV1_O2_point_in_time_model(
        HFEV1_prior, ho2sat_prior
    )
    for ecFEV1_obs in df_for_ID.ecFEV1:
        res_ar = ih.infer(
            inf_alg, [AR], [[ecFEV1, ecFEV1_obs]], show_progress=False
        )
        # Get argmax of res_ar
        idx = np.argmax(res_ar.values)
        most_probable_airway_resistance = round(AR.bins[idx], 2)
        most_probable_airway_resistance_bin_str = AR.bins_str[idx]
        # print(f"Most probable airway resistance: {most_probable_airway_resistance}, bin: {most_probable_airway_resistance_bin_str}")
        # Add to airway resistance array
        airway_resistances_for_ID = np.append(
            airway_resistances_for_ID, most_probable_airway_resistance
        )
    return airway_resistances_for_ID


s_AW = df.groupby(["ID"]).apply(infer_airway_resistance_for_ID)

*** Building lung model with HFEV1 and AB ***
Defining gaussian prior with mu = 0.98, sigma = 0.01


  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  val, abserr = integrate.quad(

KeyboardInterrupt



In [None]:
# Read s_AW from excel and add to df
# s_AW.to_excel(f"{plotsdir}airway_resistance.xlsx")
s_AW = pd.read_excel(f"{plotsdir}airway_resistance.xlsx", index_col=0)[0]
s_AW = s_AW.apply(
    lambda x: [
        float(i)
        for i in x.replace("\n", "").replace("  ", " ").strip("[]").split(" ")
        if i != ""
    ]
)
df["Airway Resistance (%)"] = pd.Series(sum(s_AW.to_list(), [])) * 100

In [None]:
# Check that the computed airway resistance makes sense
# Plot ID 101 FEV1 profile with Date Recorded
# '113', '126', '202', '331'
ID = "101"
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df[df.ID == ID]["Date Recorded"],
        y=df[df.ID == ID]["FEV1"],
        mode="markers",
        name="FEV1",
    )
)
# Add trace for predicted FEV1
# fig.add_trace(go.Scatter(x=df[df.ID == "101"]["Date Recorded"], y=df[df.ID == "101"]["Predicted FEV1"], mode="markers", name="Predicted FEV1"))
# Add trace for airway resistance using s_AW
fig.add_trace(
    go.Scatter(
        x=df[df.ID == ID]["Date Recorded"],
        y=df[df.ID == ID]["Airway Resistance (%)"] / 100,
        mode="markers",
        name="Airway Resistance (%)",
    )
)
# fig.add_trace(go.Scatter(x=df[df.ID == ID]["Date Recorded"], y=s_AW[ID], mode="markers", name="Airway Resistance"))

In [None]:
# # 0% airway resistance: FEV1 = Predicted FEV1
# # 25% airway resistance: FEV1 = 0.75 * Predicted FEV1
# # Negative airway resistance: FEV1 > Predicted FEV1
# # Airway resistance = 1 - FEV1/Predicted FEV1
df["Airway Resistance Computed (%)"] = 100 - df["ecFEV1 % Predicted"]

df["Drop from Healthy O2 Saturation (%)"] = (
    df["O2 Saturation"] - df["Healthy O2 Saturation"]
)
df["O2 Saturation % Predicted"] = (
    df["O2 Saturation"] / df["Healthy O2 Saturation"] * 100
)

df.head()

### Raw scatter plot

In [None]:
# Plot Airway resistance vs O2 drop
title = f"O2 Drop vs Computed Airway Resistance vs  ({df.ID.nunique()} IDs, {len(df)} datapoints)"
fig = px.scatter(
    df,
    x="Airway Resistance Computed (%)",
    y="O2 Saturation % Predicted",
    # y="Drop from Healthy O2 Saturation (%)",
    title=title,
)
# Reduce marker size
fig.update_traces(marker=dict(size=2))
fig.show()
fig.write_image(f"{plotsdir}{title}.pdf")
# 100% is about 98 for females and 97.4 for males
# Hence, threshold at 95% => 3-3.5% drop in O2 Saturation => 96.5-97% O2 Saturation

In [None]:
# Plot Airway resistance vs O2 drop
title = f"O2 Drop vs Inferred Airway Resistance ({df.ID.nunique()} IDs, {len(df)} datapoints)"
fig = px.scatter(
    df,
    x="Airway Resistance (%)",
    # y="Drop from Healthy O2 Saturation (%)",
    y="O2 Saturation % Predicted",
    title=title,
)
# Reduce marker size
fig.update_traces(marker=dict(size=2))
fig.update_layout(font=dict(size=10))
fig.show()
fig.write_image(f"{plotsdir}{title}.pdf")
# 100% is about 98 for females and 97.4 for males
# Hence, threshold at 95% => 3-3.5% drop in O2 Saturation => 96.5-97% O2 Saturation

### Compute and interpolate factor profile

In [None]:
# Group by Airway Resistance and take 80th percentile of O2 Sat / Healthy O2 Sat if there are more than 50 observations
def calc_rmax_o2(df_for_AR, rmax_o2_sat_col, percentile=80):
    return np.percentile(df_for_AR[rmax_o2_sat_col], percentile), len(df_for_AR)
    # Take data between 80 and 90th percentile
    # return np.percentile(
    #     df_for_AR[rmax_o2_sat_col],
    #     range(percentile - 5, percentile + 5),
    # ).mean(), len(df_for_AR)


def fit_factor_profile(df_to_fit, rmax_o2_sat_col):
    x_data = df_to_fit["Airway Resistance (%)"].values
    y_data = df_to_fit[rmax_o2_sat_col].values

    # Piecewise fit (constant + polynomial)
    def func(x, x0, y0, k1, k2, k3):
        # x0 = 43
        # y0 = df_to_fit[df_to_fit["Airway Resistance (%)"] < x0][
        #     o2_col
        # ].mean()

        return np.piecewise(
            x,
            [x <= x0],
            [
                lambda x: y0,
                lambda x: k1 * np.power((x - x0), 3)
                + k2 * np.power((x - x0), 2)
                + k3 * (x - x0)
                + y0,
            ],
        )

    # def objective(params, x, y):
    #     return np.sum((func(x, *params) - y)**2)

    # Enforce monotonicity constraint
    # constraints = ({'type': 'ineq', 'fun': lambda params: np.diff(func(x_data, *params))})

    # Initial guess for parameters
    # initial_guess = [4.34232599e+01, 8.92599726e-01, -3.60069643e-04, 1.56798589e-02, -2.12605357e-01]

    # # Minimize the objective function with the constraint
    # result = minimize(objective, initial_guess, args=(x_data, y_data), constraints=constraints)
    # parameters = result.x

    parameters, covariance = curve_fit(
        func,
        df_to_fit["Airway Resistance (%)"].values,
        df_to_fit[rmax_o2_sat_col].values,
    )
    print(f"Parameters: {parameters}")
    df_to_fit["Piecewise fit"] = func(x_data, *parameters)

    # Spline fit
    ## Base value for smoothing parameter
    s = df_to_fit.shape[0] - np.sqrt(2 * df_to_fit.shape[0])
    print(f"Smoothing parameter: {s}")
    ### Create a spline representation of the curve
    ### tck-tuple: (t,c,k) containing the vector of knots, the B-spline coefficients, and the degree of the spline.
    tck = splrep(
        x_data,
        y_data,
        s=0,
    )
    ### Evalute the spline repr on a new set of points
    df_to_fit["Spline"] = BSpline(*tck)(df_to_fit["Airway Resistance (%)"])

    # Mean smoothing
    df_to_fit["Mean Smoothing"] = (
        df_to_fit[rmax_o2_sat_col].rolling(5, center=True).mean()
    )
    return df_to_fit


df_for_AW_O2Sat_study = df[df.ID != "120"].copy()

for prctile in [10, 85, 90, 95, 98, 100]:  # range(60, 90, 5):
    rmax_o2_sat_col = f"{prctile}th-rmax O2 Saturation<br> % Predicted"

    rmax_AW_O2Sat = df_for_AW_O2Sat_study.groupby(["Airway Resistance (%)"]).apply(
        lambda x: calc_rmax_o2(x, "O2 Saturation % Predicted", prctile)
    )
    # Unstack rmax_AR_O2Sat tuples into 2 columns
    rmax_AW_O2Sat = (
        rmax_AW_O2Sat.apply(pd.Series)
        .rename(columns={0: rmax_o2_sat_col, 1: "#datapoints"})
        .reset_index()
    )
    # Add column for >50 datapoints
    rmax_AW_O2Sat[">50 datapoints"] = rmax_AW_O2Sat["#datapoints"] > 50
    # Mask for >50 datapoints
    rmax_AW_O2Sat_plot = fit_factor_profile(
        rmax_AW_O2Sat[rmax_AW_O2Sat[">50 datapoints"]].copy(), rmax_o2_sat_col
    )

    # PLot rmax_AW_O2Sat
    title = f"Max achievable O2 Saturation % Predicted ({prctile}th-rmax) vs Airway Resistance ({df_for_AW_O2Sat_study.ID.nunique()} IDs, {len(df_for_AW_O2Sat_study)} datapoints)"
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=rmax_AW_O2Sat_plot["Airway Resistance (%)"],
            y=rmax_AW_O2Sat_plot[rmax_o2_sat_col],
            mode="markers",
            name="Airway Resistance vs O2 drop",
        ),
    )
    # fig.add_trace(
    #     go.Scatter(
    #         x=rmax_AW_O2Sat_plot["Airway Resistance (%)"],
    #         y=rmax_AW_O2Sat_plot["Spline"],
    #         mode="lines",
    #         name="Spline",
    #     )
    # )
    # fig.add_trace(
    #     go.Scatter(
    #         x=rmax_AW_O2Sat_plot["Airway Resistance (%)"],
    #         y=rmax_AW_O2Sat_plot["Mean Smoothing"],
    #         mode="lines",
    #         name="Mean Smoothing",
    #     )
    # )
    fig.add_trace(
        go.Scatter(
            x=rmax_AW_O2Sat_plot["Airway Resistance (%)"],
            y=rmax_AW_O2Sat_plot["Piecewise fit"],
            mode="lines",
            name="Constant + 3rd order polynomial fit",
        ),
    )
    fig.update_traces(line=dict(width=1), marker=dict(size=3))
    fig.update_yaxes(
        range=[90, 103.5],
        nticks=10,
        title=rmax_o2_sat_col,
    )
    fig.update_xaxes(title="Airway Resistance (%)", range=[-5, 91], nticks=10)
    fig.update_layout(title=title, height=300, width=800, font=dict(size=8))
    fig.show()
    # Save to file
    fig.write_image(
        f"{plotsdir}{title}.pdf",
        width=1000,
        height=400,
    )

# Bar plot of number of datapoints per airway resistance bin
title = f"Number of datapoints per Airway Resistance bin ({df_for_AW_O2Sat_study.ID.nunique()} IDs, {len(df_for_AW_O2Sat_study)} datapoints)"
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=rmax_AW_O2Sat["Airway Resistance (%)"],
        y=rmax_AW_O2Sat["#datapoints"],
        name="#datapoints",
        marker=dict(color="grey"),
    ),
)
# Add line for 50 datapoints
fig.add_trace(
    go.Scatter(
        x=rmax_AW_O2Sat["Airway Resistance (%)"],
        y=np.repeat(50, len(rmax_AW_O2Sat)),
        mode="lines",
        name="50 datapoints",
    )
)
# Log y axis
fig.update_yaxes(type="log", title="#datapoints", nticks=3)
# Range x axis to 91
fig.update_xaxes(range=[-5, 91], nticks=10, title="Binned Airway Resistance (%)")
fig.update_layout(font=dict(size=8), height=250, width=800, title=title)
fig.show()

In [None]:
rmax_AW_O2Sat.sort_values(by="n datapoints").head(10)

### Specific cases for the plot with Airway resistance computed

In [None]:
# Plot FEV1 % Predicted with time for individual 122
def plot_fev1_o2(df, ids, o2_col="O2 Saturation"):
    for id in ids:
        df_for_ID = df[df.ID == id]
        # Create subplot with 2 rows
        fig = make_subplots(rows=2, cols=1)
        # Add trace for FEV1 % Predicted on one subplot
        fig.add_trace(
            go.Scatter(
                x=df_for_ID["Date Recorded"],
                y=df_for_ID["ecFEV1 % Predicted"],
                mode="markers",
                name="ecFEV1 % Predicted",
            ),
            row=1,
            col=1,
        )
        # fig.add_trace(
        #     go.Scatter(
        #         x=df_for_ID["Date Recorded"],
        #         y=df_for_ID["FEV1 % Predicted"],
        #         mode="markers",
        #         name="FEV1 % Predicted",
        #     ),
        #     row=1,
        #     col=1,
        # )
        # Add trace for O2 Saturation on another subplot
        fig.add_trace(
            go.Scatter(
                x=df_for_ID["Date Recorded"],
                y=df_for_ID[o2_col],
                mode="markers",
                name="O2 Saturation",
            ),
            row=2,
            col=1,
        )
        fig.update_traces(marker=dict(size=3), line=dict(width=0.5))
        title = f"ecFEV1 % Predicted and {o2_col} for individual {id} ({len(df_for_ID)} datapoints)"
        fig.update_layout(title=title)
        # Add trace for O2 Saturation on another subplot

        fig.show()

#### Low airway resistance

In [None]:
# Filter airway resistance below 40%
df[df["Airway Resistance (%)"] < -20]
# '113', '126', '202', '331'

In [None]:
plot_fev1_o2(df, ["113", "126", "202", "331"])

#### High airway resistance

In [None]:
# Filter airway resistance below 40%
df[df["Airway Resistance (%)"] > 80].ID.unique()
# 3 individuals '122', '198', '286' have airway resistance > 80%

In [None]:
# Plot FEV1 % Predicted with time for individual 122
plot_fev1_o2(df, ["122", "286"])

#### High O2 Drop

In [None]:
df[df["Drop from Healthy O2 Saturation (%)"] < 90].ID.unique()

In [None]:
plot_fev1_o2(df, ["111", "180", "352"])

#### High positive O2 Drop

In [None]:
df[df["Drop from Healthy O2 Saturation (%)"] > 2.8].ID.unique()

In [None]:
df[df.ID == "120"].sort_values(by="Drop from Healthy O2 Saturation (%)")

In [None]:
plot_fev1_o2(df, ["120"], "Drop from Healthy O2 Saturation (%)")
plot_fev1_o2(df, ["120"])

### Factor function

In [None]:
# We wanna plot a noise-agnostic version of the raw scatter plot's top envelope

# > 50 datapoints per bin

# Discretise airway resistance
# Take 90th percentile for each bin
# Plot 90th percentile and amount of data per bin

df_aw_o2_factor

In [None]:
def calc_cpt_AR_HO2Sat():
    """
    Returns the CPT for the AR_HO2Sat node
    """