In this notebook, I will use the information about IA to refine F3. Initially, I buildt F3 using FEF25-75%FEV1 against FEV1-based-AR. The latter variable contains only the aspect of AR as measured by FEV1. In CF, high AR usually correlates with high IA. We can use this correlation to refine the uncertainty present in the FEV1-based-AR. The corrected FEV1-based-AR can therefore be closer to the true AR. We can use it to improve the model fo F3.

In [1]:
import src.data.breathe_data as br
import src.data.helpers as dh
import src.models.helpers as mh
import numpy as np
import pandas as pd
import src.inference.helpers as ih
import src.modelling_fef2575.cpt_and_plots as cpt_and_plots
import src.models.var_builders as var_builders


In [2]:
# Need dataset with O2sat, FEV1, FEF25-75. Use as many datapoints as possible
# Infer FEV1-based-AR using FEV1
# Model F3 using FEV1-based-AR
# Infer FEV1-FEF2575-based-AR using FEV1 and FEF25-75
# Model F3 using FEV1-FEF2575-based-AR
# Infer IA using FEV1-FEF2575-based-AR
# Model AR-IA
# Infer IA-FEV1-FEF2575-based-AR using FEV1, FEF25-75, IA
# Model F3 using this new AR
# Compare the two models: compare the mean, median, std-percentiles plots of both -> std should be smaller

# Optionally repeat until std doesn't change

In [2]:
df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan")
# Remove PEF related rows
df = df.drop(columns=["PEF", "ecPEF (L/s)", "PEF (L/s)"])
# Remove NaN on FEV1, O2 saturation columns
print(df.shape)
df = df.dropna(subset=["FEV1", "O2 Saturation", "FEF2575"])
print(df.shape)
df.head()

(48978, 15)
(32715, 15)


Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Age,Sex,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,53,Male,173.0,3.610061,97.150104,36.287474,36.287474,99.845492
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.67,53,Male,173.0,3.610061,97.150104,36.287474,36.287474,100.874827
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.69,53,Male,173.0,3.610061,97.150104,36.287474,36.287474,98.816157
3,101,2019-01-28,1.3,96.0,0.69,1.31,0.69,53,Male,173.0,3.610061,97.150104,36.287474,36.01047,98.816157
4,101,2019-01-29,1.28,98.0,0.6,1.3,0.69,53,Male,173.0,3.610061,97.150104,36.01047,35.456463,100.874827


In [3]:
(
    HFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
    180, 10, 'Male'
)

In [4]:
# Infer FEV1-based-AR using FEV1

# inf_res_df = ih.infer_vars_and_get_back_df(df,  observed_variables=["ecFEV1", "O2Sat"])
inf_res_df = ih.infer_vars_and_get_back_df(
    df.iloc[np.r_[10:1300, 3000:4007, 10000:11000]],
    variables_to_infer=[AR],
    observed_variables=[ecFEV1, O2Sat],
    ecFEF2575prctecFEV1_cpt=None,
)

In [5]:
# Merge the inferred AR back to the original dataframe
df1 = pd.merge(df, inf_res_df, on=["ID", "Date Recorded"], how="inner")
# df1.head()

In [6]:
# Model F3 using FEV1-based-AR

def model_f3(df, AR, ar_col):
    y_col = "ecFEF2575%ecFEV1"
    df["ecFEF2575%ecFEV1"] = df["ecFEF2575"] / df["ecFEV1"] * 100

    # Parameters
    n_samples = 100

    ecFEF2575prctecFEV1 = mh.VariableNode(
        "ecFEF25-75 % ecFEV1 (%)", 0, 200, 2, prior=None
    )

    df_sampled, df_f3 = cpt_and_plots.get_sampled_df_and_statistics_df(
        df, n_samples, AR
    )

    cpt_and_plots.plot_F3_mean_and_percentiles_per_AR_bin(
        df_f3, ar_col, y_col, save=True
    )
    cpt_f3 = cpt_and_plots.calc_plot_cpt_var_given_AR(
        df_sampled, df_f3, n_samples, AR, ar_col, ecFEF2575prctecFEV1, y_col, save=True
    )
    return cpt_f3


cpt_f3 = model_f3(df1, AR, "FEV1-based-AR")

Max sampled AR values: 90.00


  df_sampled.groupby("AR bin")


In [7]:
inf_res_df_2 = ih.infer_vars_and_get_back_df(
    df.iloc[np.r_[10:1300, 3000:4007, 10000:11000]],
    variables_to_infer=[AR],
    observed_variables=[ecFEV1, O2Sat, ecFEF2575prctecFEV1],
    ecFEF2575prctecFEV1_cpt=cpt_f3,
)
# Merge the inferred AR back to the original dataframe
df2 = pd.merge(df, inf_res_df_2, on=["ID", "Date Recorded"], how="inner")

In [8]:
# Model F3 using FEV1-FEF2575-based-AR

cpt_f3_2 = model_f3(df2, AR, "FEV1-FEF2575-based-AR")

Max sampled AR values: 90.00






In [29]:
import src.models.var_builders as var_builders
import src.models.graph_builders as graph_builders
from src.inference.inf_algs import apply_custom_bp


def infer_vars_and_get_back_df(
    df,
    ecFEF2575prctecFEV1_cpt=None,
    variables_to_infer=["AR", "IA", "HFEV1", "HO2Sat", "O2SatFFA", "UO2Sat"],
    observed_variables=["ecFEV1", "O2Sat", "ecFEF2575prctecFEV1"],
):
    """
    Infer AR, IA, HFEV1, HO2Sat fo each entry in the dataset, for the given observed variables as evidence
    """

    def infer_vars_for_ID(df):
        df.reset_index(inplace=True)

        (
            HFEV1,
            ecFEV1,
            AR,
            HO2Sat,
            O2SatFFA,
            IA,
            UO2Sat,
            O2Sat,
            ecFEF2575prctecFEV1,
        ) = var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
            df.Height[0], df.Age[0], df.Sex[0]
        )

        # Update cpt to custom one if provided
        if ecFEF2575prctecFEV1_cpt is not None:
            ecFEF2575prctecFEV1.set_cpt(ecFEF2575prctecFEV1_cpt)

        model = graph_builders.fev1_fef2575_o2sat_point_in_time_factor_graph(
            HFEV1,
            ecFEV1,
            AR,
            HO2Sat,
            O2SatFFA,
            IA,
            UO2Sat,
            O2Sat,
            ecFEF2575prctecFEV1,
            False,
        )
        inf_alg = apply_custom_bp(model)

        variables = ih._get_vars_for_model(
            variables_to_infer, AR, IA, HFEV1, HO2Sat, O2SatFFA, UO2Sat
        )

        def infer_and_unpack(row):
            # Build evidence
            evidence = []
            if "ecFEV1" in observed_variables:
                evidence.append([ecFEV1, row["ecFEV1"]])
            if "O2Sat" in observed_variables:
                evidence.append([O2Sat, row["O2 Saturation"]])
            if "ecFEF2575prctecFEV1" in observed_variables:
                evidence.append(
                    [ecFEF2575prctecFEV1, row["ecFEF2575"] / row["ecFEV1"] * 100]
                )

            res = ih.infer_on_factor_graph(
                inf_alg,
                variables,
                evidence,
            )

            res_values = (res[var.name].values for var in variables)

            return row["Date Recorded"], *res_values

        res = df.apply(infer_and_unpack, axis=1)
        return res

    variables_to_infer_dict = {
        i + 1: variables_to_infer[i] for i in range(len(variables_to_infer))
    }
    variables_to_infer_dict[0] = "Date Recorded"

    resraw = df.groupby("ID").apply(infer_vars_for_ID)
    # resraw = df.iloc[np.r_[10:13, 3000:3007]].groupby("ID").apply(infer_vars_for_ID)
    res = (
        resraw.apply(pd.Series)
        .reset_index()
        .rename(columns=variables_to_infer_dict)
        .drop(columns="level_1")
    )

    # Build model to get variables
    (
        HFEV1,
        _,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        _,
        _,
    ) = var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
        160, 10, "Female"
    )

    print(variables_to_infer_dict)

    def fn(model_var, row):
        row.shape
        return model_var.get_mean(row)

    for model_var, str_var in ih._get_vars_and_str_for_model(
        variables_to_infer, AR, IA, HFEV1, HO2Sat, O2SatFFA, UO2Sat
    ):
        print(res[str_var][0].shape)
        print(model_var.name, str_var)
        res[f"{str_var} mean"] = res[str_var].apply(lambda x: fn(model_var, x))

    return res

In [30]:
# Infer IA using FEV1-FEF2575-based-AR

inf_res_df_3 = infer_vars_and_get_back_df(
    df.iloc[np.r_[10:1300, 3000:4007, 10000:11000]],
    ecFEF2575prctecFEV1_cpt=cpt_f3_2,
    variables_to_infer=["IA", "AR"],
    observed_variables=["ecFEV1", "O2Sat", "ecFEF2575prctecFEV1"],
)

{1: 'IA', 2: 'AR', 0: 'Date Recorded'}
(30,)
Airway resistance (%) AR


ValueError: operands could not be broadcast together with shapes (30,) (45,) 

In [None]:
df3 = pd.merge(df, inf_res_df_3, on=["ID", "Date Recorded"], how="inner")

In [None]:
# Model the relationship between AR and IA

(45,)

In [None]:
# df["ecFEF2575%ecFEV1"] = df["ecFEF2575"] / df["ecFEV1"] * 100

# Parameters
n_samples = 100

# parameters
IA = mh.VariableNode("Inactive alveoli (%)", 0, 30, 1, prior={"type": "uniform"})
AR = mh.VariableNode("Airway resistance (%)", 0, 90, 2, prior=None)
y_col = "IA mean"
ar_col = "FEV1-FEF2575-based-AR"

df_sampled, df_f3 = cpt_and_plots.get_sampled_df_and_statistics_df(
    df3, n_samples, AR, "IA mean"
)

cpt_and_plots.plot_F3_mean_and_percentiles_per_AR_bin(df_f3, ar_col, y_col, save=True)
# cpt_f3 = cpt_and_plots.calc_plot_cpt_var_given_AR(
#     df_sampled, df_f3, n_samples, AR, ar_col, IA, y_col, save=True
# )

ValueError: 'a' and 'p' must have same size

In [26]:
df_sampled = df3.copy()
df_sampled["AR sampled"] = np.nan

# Renormalise all AR distributions
df_sampled["AR norm"] = df_sampled.apply(lambda row: row.AR / sum(row["AR"]), axis=1)


def fn(row):
    print(n_samples)
    print(row["AR norm"].shape)
    return AR.sample(n=n_samples, p=row["AR norm"])


# Create n AR samples per row
# df_sampled["AR sample"] = df_sampled.apply(
#     lambda row: AR.sample(n=n_samples, p=row["AR norm"]), axis=1
# )
df_sampled["AR sample"] = df_sampled.apply(fn, axis=1)

df_sampled = df_sampled.explode("AR sample").reset_index(drop=True)

print(f'Max sampled AR values: {max(df_sampled["AR sample"]):.2f}')

df_sampled["AR bin"] = pd.cut(
    df_sampled["AR sample"],
    bins=np.arange(
        np.floor(min(df_sampled["AR sample"])),
        np.ceil(max(df_sampled["AR sample"])) + AR.bin_width,
        AR.bin_width,
    ),
)

df_f3 = (
    df_sampled.groupby("AR bin")
    .agg(
        mean=(y_col, "mean"),
        std=(y_col, "std"),
        median=(y_col, "median"),
        p3=(y_col, lambda x: np.percentile(x, 3)),
        p97=(y_col, lambda x: np.percentile(x, 97)),
        p16=(y_col, lambda x: np.percentile(x, 16)),
        p84=(y_col, lambda x: np.percentile(x, 84)),
    )
    .reset_index()
)

df_sampled["AR midbin"] = df_sampled["AR bin"].apply(
    lambda x: x.left + AR.bin_width / 2
)
df_f3["AR midbin"] = df_f3["AR bin"].apply(lambda x: x.left + AR.bin_width / 2)

100
(30,)


ValueError: 'a' and 'p' must have same size

In [25]:
AR.midbins.shape

(45,)