In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time

from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import BeliefPropagation
from pgmpy.models import BayesianNetwork

import src.modelling_o2.o2satffa as o2satffa
import src.models.helpers as mh
import src.models.builders as mb
import src.inference.helpers as ih


plotsdir = "../../../../PlotsBreathe/O2_modelling/"

In [None]:
df = pd.read_excel(f"{plotsdir}airwayresistance_o2satffa_df.xlsx", index_col=0)
df.ID = df.ID.astype(str)
# To excel
# df.to_excel(f"{plotsdir}airwayresistance_o2satffa_df.xlsx")

## Infer O2SatFFA after observing FEV1

In [None]:
def model_up_to_O2SatFFA(hfev1_prior, ho2sat_prior):
    """
    Build the point in time model relating FEV1 and O2 saturation:
        FEV1 = HFEV1 * (1-AR)
        O2SatFFA = HO2Sat * drop_func(AR)

    The model is the same as build_HFEV1_AB_FEV1(), with Airway Blockage renamed to
    Airway Resistance.

    Parameters:
        hfev1_prior: prior specification given to the "Healthy FEV1 (L)" node
        ho2sat_prior: prior specification given to the "Healthy O2 Saturation (%)" node

    Returns:
        (model, inf_alg, HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA): the BayesianNetwork,
        a BeliefPropagation engine built on it, and the five variable nodes.
    """
    print("*** Building FEV1 and O2 point in time model ***")

    def root_cpd(node):
        # CPD of a parentless node: its prior, without any evidence
        return TabularCPD(
            variable=node.name,
            variable_card=len(node.bins),
            values=node.prior,
            evidence=[],
            evidence_card=[],
        )

    def child_cpd(node, parents, values):
        # CPD of a node conditioned on `parents` (in the given order)
        return TabularCPD(
            variable=node.name,
            variable_card=len(node.bins),
            values=values,
            evidence=[parent.name for parent in parents],
            evidence_card=[len(parent.bins) for parent in parents],
        )

    # Setting resolution of 0.05 to avoid rounding errors for AR
    HFEV1 = mh.variableNode("Healthy FEV1 (L)", 1, 6, 0.05, prior=hfev1_prior)
    AR = mh.variableNode("Airway Resistance (%)", 0, 90, 1, prior={"type": "uniform"})
    ecFEV1 = mh.variableNode("FEV1 (L)", 0, 6, 0.05, prior=None)
    # Lowest predicted FEV1 is 15% (AR = 1-predictedFEV1)
    HO2Sat = mh.variableNode(
        "Healthy O2 Saturation (%)", 90, 100, 0.5, prior=ho2sat_prior
    )
    # Highest drop is 93% (for AR = 90%), hence the lowest O2SatFFA is 90 * 0.93 = 83.7%
    O2SatFFA = mh.variableNode(
        "O2 Sat if fully functional alveoli (%)", 80, 100, 0.5, prior=None
    )

    # Priors of the three root nodes
    prior_hfev1 = root_cpd(HFEV1)
    prior_ho2sat = root_cpd(HO2Sat)
    prior_ar = root_cpd(AR)

    # Conditional tables of the two observed/inferred child nodes
    cpt_fev1 = child_cpd(
        ecFEV1, [HFEV1, AR], mh.calc_pgmpy_cpt_X_x_1_minus_Y(HFEV1, AR, ecFEV1)
    )
    cpt_o2_sat_ffa = child_cpd(
        O2SatFFA, [HO2Sat, AR], o2satffa.calc_cpt(O2SatFFA, HO2Sat, AR, debug=False)
    )

    # AR is the shared parent of both FEV1 and O2SatFFA
    edges = [
        (HFEV1.name, ecFEV1.name),
        (AR.name, ecFEV1.name),
        (HO2Sat.name, O2SatFFA.name),
        (AR.name, O2SatFFA.name),
    ]
    model = BayesianNetwork(edges)
    model.add_cpds(cpt_fev1, prior_ar, prior_hfev1, prior_ho2sat, cpt_o2_sat_ffa)

    model.check_model()
    inf_alg = BeliefPropagation(model)
    return (model, inf_alg, HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA)

In [None]:
def _infer_ar_and_o2satffa_means(inf_alg, AR, O2SatFFA, ecFEV1, fev1_value):
    """
    Observe `fev1_value` on the FEV1 node and return the posterior means
    (mean AR, mean O2SatFFA), each obtained with its own ih.infer() query.
    """
    res_ar = ih.infer(inf_alg, [AR], [[ecFEV1, fev1_value]], show_progress=False)
    ar_mean = AR.get_mean(res_ar.values)

    res_o2satffa = ih.infer(
        inf_alg, [O2SatFFA], [[ecFEV1, fev1_value]], show_progress=False
    )
    o2satffa_mean = O2SatFFA.get_mean(res_o2satffa.values)
    return ar_mean, o2satffa_mean


# Infer airway resistance (AR) and O2SatFFA using the model, one ID at a time.
# The loop variable is `patient_id` (not `id`) so that the Python builtin `id`
# is not shadowed for the rest of the kernel session.
for patient_id in df.ID.unique():
    df_for_ID = df[df.ID == patient_id].copy().reset_index()
    print(f"\nRunning for ID {patient_id}, with {len(df_for_ID)} observations")
    # Height, sex and age are read from the first row of this ID
    height = df_for_ID.Height[0]
    sex = df_for_ID.Sex[0]
    hfev1_prior = {
        "type": "default",
        "height": height,
        "age": df_for_ID.Age[0],
        "sex": sex,
    }
    ho2sat_prior = {
        "type": "default",
        "height": height,
        "sex": sex,
    }
    tic = time.time()
    (
        model,
        inf_alg,
        HFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
    ) = model_up_to_O2SatFFA(hfev1_prior, ho2sat_prior)
    print(f"model took {time.time() - tic} seconds to build")

    tic = time.time()
    for source_col in ("FEV1", "ecFEV1"):
        df_for_ID[f"AR from {source_col}"] = np.nan
        df_for_ID[f"O2SatFFA from {source_col}"] = np.nan

    # Cache of (mean AR, mean O2SatFFA) keyed by the observed value.
    # The priors are specific to this ID, so the cache is reset for every ID.
    # FEV1 and ecFEV1 are both observed on the same node of the same model, so
    # a value seen in either column can be reused for the other.
    means_by_observation = {}
    for i in range(len(df_for_ID)):
        for source_col in ("FEV1", "ecFEV1"):
            observation = df_for_ID.loc[i, source_col]
            means = means_by_observation.get(observation)
            if means is None:
                means = _infer_ar_and_o2satffa_means(
                    inf_alg, AR, O2SatFFA, ecFEV1, observation
                )
                means_by_observation[observation] = means
            df_for_ID.loc[i, f"AR from {source_col}"] = means[0]
            df_for_ID.loc[i, f"O2SatFFA from {source_col}"] = means[1]
    print(f"inference took {time.time() - tic} seconds to run")

    # Add to df (column order matches the order in which they are written here)
    id_mask = df.ID == patient_id
    for result_col in (
        "AR from ecFEV1",
        "AR from FEV1",
        "O2SatFFA from ecFEV1",
        "O2SatFFA from FEV1",
    ):
        df.loc[id_mask, f"{result_col} (%)"] = df_for_ID[result_col].values

# IntegrationWarning: The maximum number of subdivisions (50) has been achieved.
# This happens when the result becomes too close to 0.

In [None]:
# For ID 101, 0.05 resolution takes 50s + 12s to run
# (presumably model build + inference time -- TODO confirm)
# Preview df, which now holds the inferred AR and O2SatFFA columns
df.head()

In [None]:
# Scatter of the O2SatFFA inferred from ecFEV1 against the AR inferred from ecFEV1
title = "Inferred O2SatFFA vs. AR"
fig = px.scatter(
    data_frame=df,
    x="AR from ecFEV1 (%)",
    y="O2SatFFA from ecFEV1 (%)",
    hover_data=["ID", "ecFEV1"],
    title=title,
)
# Small markers, small font
fig.update_traces(marker={"size": 2})
fig.update_layout(title=title, font={"size": 10})
fig.show()

In [None]:
# Display the O2SatFFA column inferred from ecFEV1
df["O2SatFFA from ecFEV1 (%)"]

# Plot F2

In [None]:
# Measured O2 saturation expressed as a percentage of the inferred O2SatFFA
df["O2Sat % O2SatFFA"] = (
    df["O2 Saturation"].div(df["O2SatFFA from ecFEV1 (%)"]).mul(100)
)

In [None]:
# Scatter of O2Sat (in % of the inferred O2SatFFA) against the AR inferred from ecFEV1
n_ids = df.ID.nunique()
title = f"O2Sat % Inferred O2SatFFA vs. AR ({n_ids} IDs, {len(df)} datapoints)"
fig = px.scatter(
    data_frame=df,
    x="AR from ecFEV1 (%)",
    y="O2Sat % O2SatFFA",
    hover_data=["ID", "ecFEV1"],
    title=title,
)
# Small, semi-transparent markers so that dense regions stay readable
fig.update_traces(marker={"size": 2}, opacity=0.3)
fig.update_layout(font={"size": 10})
fig.show()

In [None]:
import src.o2_fev1_analysis.partition as partition

O2_col = "O2Sat % O2SatFFA"
# O2_col = "O2SatFFA from ecFEV1 (%)"

# Cut Airway Resistance into the bins (0, 20], (20, 40], (40, 60], (60, 80].
# Values outside (0, 80] fall in no bin and get a NaN group.
df["AR group"] = pd.cut(
    df["AR from ecFEV1 (%)"],
    bins=np.arange(0, 100, 20),
    include_lowest=False,
)

# Groups in order of first appearance (may include NaN).
# Kept unchanged because later cells index into `group_labels`.
group_labels = df["AR group"].unique()
print(f"AR groups: {group_labels}")

# Groups to plot: every bin that actually contains data, in increasing AR order.
# Built from the categories rather than from `len(group_labels) - 1`, which only
# worked when unique() happened to return a NaN group as its last element.
ar_group_col = df["AR group"]
plotted_groups = [
    group for group in ar_group_col.cat.categories if (ar_group_col == group).any()
]

# One subplot row per plotted AR group
fig = make_subplots(
    rows=len(plotted_groups), cols=1, shared_xaxes=True, vertical_spacing=0.02
)
for row, group in enumerate(plotted_groups, start=1):
    fig.add_trace(
        go.Histogram(
            x=df.loc[ar_group_col == group, O2_col],
            name=f"Airway Resistance {group}",
            # Bin size of 0.2
            xbins=dict(start=75, end=110, size=0.2),
        ),
        row=row,
        col=1,
    )


title = f"Distribution of {O2_col} for different Airway Resistance groups"
fig.update_layout(
    title=title,
    font=dict(size=10),
)
# The x axis is shared, so only the bottom subplot gets the axis title
fig.update_xaxes(
    title_text="O2 Saturation in % of O2 Saturation if Fully Functional Alveoli",
    row=len(plotted_groups),
    col=1,
)
# Save
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

In [None]:
# Preview df, which now also holds the "O2Sat % O2SatFFA" and "AR group" columns
df.head()

# Remove measurement noise and ho2sat model spread to get F2

In [None]:
# Plot the overall distribution of O2SatFFA with airway resistance
import plotly.figure_factory as ff

# Column holding the measured O2Sat in % of the inferred O2SatFFA
O2_col = "O2Sat % O2SatFFA"


def o2sat_prct_o2satffa_displot(array):
    """
    Show a distplot (histogram with bin size 0.2 plus a density curve, no rug)
    of `array`, labelled "O2Sat % O2SatFFA".

    Always returns -1.
    """
    x_axis_title = "O2 Saturation in % of O2 Saturation if Fully Functional Alveoli"
    distplot_options = {
        "bin_size": 0.2,
        "show_rug": False,
        "show_curve": True,
        "histnorm": "probability density",
        "colors": ["#636EFA"],
    }

    # create_distplot takes one list entry per group; there is a single group here
    fig = ff.create_distplot([array], ["O2Sat % O2SatFFA"], **distplot_options)
    fig.update_layout(font={"size": 10})
    fig.update_xaxes(title_text=x_axis_title)
    fig.show()
    return -1


# Distribution of the raw O2Sat % O2SatFFA values.
# The function returns -1, which shows up as the cell output.
o2sat_prct_o2satffa_displot(df[O2_col])

In [None]:
# Fit a gaussian distribution
import scipy.stats as stats


def fit_gaussian(array, bin_width=0.2):
    """
    Fit a gaussian to `array` in three ways, print each result, and plot the
    last fit on top of the probability-density histogram of `array`.

    The three fits are:
      1. unconstrained maximum-likelihood fit (scipy.stats.norm.fit),
      2. fit with mu fixed to the median,
      3. mu fixed to the median and std defined as the root mean square
         deviation from the median of the values strictly above the median.

    Parameters:
        array: numpy array or pandas Series of values (must support boolean masking)
        bin_width: histogram bin size, also the step of the x grid of the plotted pdf
    """
    mu, std = stats.norm.fit(array)
    print(f"Unconstrained gaussian fit - mu: {mu}, std: {std}")

    # Redo a fit with a fixed mu
    mu = np.median(array)
    std = stats.norm.fit(array, floc=mu)[1]
    print(f"Gaussian fit with mu = median - mu: {mu}, std: {std}")

    # Redo a fit using the same mu, but taking the std as the deviation of the
    # right hand side from the median.
    # The sum of squares only runs over the right hand side points, so it is
    # averaged over those points. Dividing by len(array) instead would
    # under-estimate the std by about sqrt(2) for symmetric data.
    right_hand_side = array[array > mu]
    std = np.sqrt(np.sum((right_hand_side - mu) ** 2) / len(right_hand_side))
    print(
        f"Gaussian fit centered on median, defining std to the right hand side deviation from median - mu: {mu}, std: {std}"
    )

    # Then plot the pdf on top of the histogram
    # Create x vector from 75 to 110 with bin_width
    x = np.arange(75, 110, bin_width)
    pdf = stats.norm.pdf(x, mu, std)

    fig = go.Figure()
    fig.add_trace(
        go.Histogram(
            x=array,
            name="O2Sat % O2SatFFA",
            xbins=dict(start=75, end=110, size=bin_width),
            # Density normalisation so the histogram is comparable with the pdf
            histnorm="probability density",
        )
    )
    # Add gaussian fit
    fig.add_trace(
        go.Scatter(
            x=x,
            y=pdf,
            mode="lines",
            name="Gaussian fit",
            line=dict(color="black", width=1),
        )
    )
    fig.update_layout(
        title="Distribution of O2Sat % O2SatFFA for different airway resistance groups",
        font=dict(size=10),
    )
    fig.update_xaxes(
        title_text="O2 Saturation in % of O2 Saturation if Fully Functional Alveoli"
    )
    fig.show()


# Gaussian fits of the raw O2Sat % O2SatFFA values
fit_gaussian(df[O2_col])

## Use the O2 saturation with gaussian noise to smooth the histogram.

That is, for each observed O2Sat value, get the denoised distribution and sample 100 O2Sat values from it.
Each observed O2Sat value therefore has equal weight in the new denoised dataset.

Then replot the histogram and redo the gaussian fit

In [None]:
import src.modelling_o2.o2sat as o2sat


def smart_sample(bins, p, n_samples=100):
    """
    Deterministically "sample" a discrete distribution so as to respect its
    probabilities: each bin value is repeated round(p * n_samples) times.

    Parameters:
        bins: 1D array-like of bin values
        p: 1D array-like with the probability of each bin (same length as bins)
        n_samples: target size of the returned set (default 100, the value that
            was previously hard-coded)

    Returns:
        1D numpy array of the repeated bin values. Because of the per-bin
        rounding its length may differ slightly from n_samples, and bins with
        p * n_samples < 0.5 are absent.
    """
    # asarray so that list inputs are scaled element-wise rather than repeated
    probabilities = np.asarray(p, dtype=float)
    n_vals_per_bin = np.round(probabilities * n_samples).astype(int)

    # Create an array with n times the values of the bin
    return np.repeat(bins, n_vals_per_bin)


def get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width=0.1, n_samples=100000):
    """
    Return a set of O2Sat values built with smart_sample() from the distribution
    that o2sat.emulate_gaussian_distribution() gives for `o2sat_obs`.

    Parameters:
        o2sat_obs: observed O2 saturation
        bin_width: bin width forwarded to emulate_gaussian_distribution()
        n_samples: NOTE(review): currently unused. It was only used by the
            random sampling that smart_sample() replaced; the set size is
            decided by smart_sample().
    """
    o2sat_distribution = o2sat.emulate_gaussian_distribution(
        o2sat_obs, bin_width=bin_width
    )
    # presumably column 0 of the prior holds the probability of each bin -- TODO confirm
    return smart_sample(o2sat_distribution.bins, o2sat_distribution.prior[:, 0])

In [None]:
# How many points should you sample to have a good representation of the distribution? -> 100000
bin_width = 0.2
sample = get_unbiased_o2sat_set_from_value(100, bin_width=bin_width, n_samples=100)
print(f"Sample size: {len(sample)}")

# Histogram of the set obtained for an observed O2Sat of 100
sample_histogram = go.Histogram(
    x=sample,
    name="O2SatFFA",
    xbins={"start": 75, "end": 110, "size": bin_width},
)
fig = go.Figure()
fig.add_trace(sample_histogram)
fig.update_layout(font={"size": 10}, xaxis={"range": [80, 100]})
fig.show()

In [None]:
# Preview df before building the unbiased O2Sat sets
df.head()

In [None]:
# Remove the (60, 80] AR group.
# The interval is selected explicitly: `group_labels[0]` is whichever group the
# first row of df falls in, which is not guaranteed to be (60, 80].
# Rows without an AR group (NaN) are kept, as before.
excluded_ar_group = pd.Interval(60, 80, closed="right")
print(f"Removing group label: {excluded_ar_group}")
df_trusted_drop = df[df["AR group"] != excluded_ar_group]
print(f"Initial entries: {len(df)}, after removing AR group 80: {len(df_trusted_drop)}")


def calc_unbiased_o2sat_prct_o2satffa(o2sat_obs, o2satffa, bin_width, n_samples):
    """
    Build the unbiased O2Sat set for `o2sat_obs` with
    get_unbiased_o2sat_set_from_value() and express every value of the set
    in % of `o2satffa`.
    """
    o2sat_set = get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width, n_samples)
    fraction_of_o2satffa = o2sat_set / o2satffa
    return fraction_of_o2satffa * 100


print("Initially N values:", len(df_trusted_drop))


def _unbiased_set_for_row(row):
    # One unbiased O2Sat % O2SatFFA set per measurement (row)
    return calc_unbiased_o2sat_prct_o2satffa(
        row["O2 Saturation"],
        row["O2SatFFA from ecFEV1 (%)"],
        bin_width=0.2,
        n_samples=1000,
    )


unbiased_o2sat_prct_o2satffa = df_trusted_drop.apply(_unbiased_set_for_row, axis=1)

# Pool the per-row sets into a single flat array
unbiased_o2sat_prct_o2satffa_flat = np.concatenate(unbiased_o2sat_prct_o2satffa.values)
print("N values:", len(unbiased_o2sat_prct_o2satffa_flat))

In [None]:
# Distribution of the pooled unbiased O2Sat % O2SatFFA values
# (the function returns -1, which shows up as the cell output)
o2sat_prct_o2satffa_displot(unbiased_o2sat_prct_o2satffa_flat)

In [None]:
# Gaussian fits of the pooled unbiased O2Sat % O2SatFFA values
fit_gaussian(unbiased_o2sat_prct_o2satffa_flat)

## Reproduce hist by AR groups

In [None]:
# Reproduce the plot with the 3 AR groups
df["Unbiased O2Sat % O2SatFFA"] = df.apply(
    lambda x: calc_unbiased_o2sat_prct_o2satffa(
        x["O2 Saturation"], x["O2SatFFA from ecFEV1 (%)"], bin_width=0.2, n_samples=1000
    ),
    axis=1,
)


import src.o2_fev1_analysis.partition as partition

O2_col = "Unbiased O2Sat % O2SatFFA"

# Cut Airway Resistance into bins of 0-20, 20-40, 40-60, 60-80
df["AR group"] = pd.cut(
    df["AR from ecFEV1 (%)"],
    bins=np.arange(0, 100, 20),
    include_lowest=False,
)

group_labels = df["AR group"].unique()
print(f"AR groups: {group_labels}")

# Create subplot with 3 rows
fig = make_subplots(
    rows=len(group_labels) - 1, cols=1, shared_xaxes=True, vertical_spacing=0.02
)
# On first subplot add histogram of Drop from O2 Saturation FFA (%) for 1st AR group
for i in range(len(group_labels) - 1):
    fig.add_trace(
        go.Histogram(
            x=np.concatenate(df[df["AR group"] == group_labels[i]][O2_col].values),
            name=f"Airway Resistance {group_labels[i]}",
            # Bin size of 1
            xbins=dict(start=75, end=110, size=0.2),
            # histnorm="probability density",
        ),
        row=i + 1,
        col=1,
    )


title = f"Distribution of {O2_col} for different airway resistance groups"
fig.update_layout(
    title=title,
    font=dict(size=10),
)
fig.update_xaxes(
    title_text="Unbiased O2Sat%O2SatFFA",
    row=len(group_labels) - 1,
    col=1,
)
# Save
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

In [None]:
# Preview df, which now also holds the "Unbiased O2Sat % O2SatFFA" column
df.head()