In this notebook, I will use the information about IA to refine F3. Initially, I buildt F3 using FEF25-75%FEV1 against FEV1-based-AR. The latter variable contains only the aspect of AR as measured by FEV1. In CF, high AR usually correlates with high IA. We can use this correlation to refine the uncertainty present in the FEV1-based-AR. The corrected FEV1-based-AR can therefore be closer to the true AR. We can use it to improve the model fo F3.

In [1]:
import src.data.breathe_data as br
import src.data.helpers as dh
import src.models.helpers as mh
import src.inference.helpers as ih
import src.modelling_fef2575.cpt_and_plots as cpt_and_plots
import src.models.var_builders as var_builders

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
(
    HFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
    180, 10, "Male"
)

# Plan to refine F3

In [None]:
# Need dataset with O2sat, FEV1, FEF25-75. Use as many datapoints as possible
# Infer FEV1-based-AR using FEV1
# Model F3 using FEV1-based-AR
# Infer FEV1-FEF2575-based-AR using FEV1 and FEF25-75
# Model F3 using FEV1-FEF2575-based-AR
# Infer IA using FEV1-FEF2575-based-AR
# Model AR-IA
# Infer IA-FEV1-FEF2575-based-AR using FEV1, FEF25-75, IA
# Model F3 using this new AR
# Compare the two models: compare the mean, median, std-percentiles plots of both -> std should be smaller

# Optionally repeat until std doesn't change

# Load data

In [18]:
df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan")
df = df.drop(columns=["PEF", "ecPEF (L/s)", "PEF (L/s)"])
df = df.dropna(subset=["FEV1", "O2 Saturation", "FEF2575"])
df["ecFEF2575%ecFEV1"] = df["FEF2575"] / df["FEV1"] * 100
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,99.845492,41.221374
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,100.874827,43.51145
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,98.816157,51.145038
3,101,2019-01-28,1.3,96.0,0.69,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.01047,98.816157,53.076923
4,101,2019-01-29,1.28,98.0,0.6,1.3,0.69,Male,173.0,53,3.610061,97.150104,36.01047,35.456463,100.874827,46.875


# Inference

## Model F3 using AR inferred from FEV1

In [3]:
# Infer FEV1-based-AR using FEV1

# inf_res_df = ih.infer_vars_and_get_back_df(
#     df,
#     # df.iloc[np.r_[10:1300, 3000:4007, 10000:11000]],
#     variables_to_infer=[AR, IA],
#     observed_variables=[ecFEV1, O2Sat],
#     ecFEF2575prctecFEV1_cpt=None,
# )
# # Merge the inferred AR back to the original dataframe
# df1 = pd.merge(df, inf_res_df, on=["ID", "Date Recorded"], how="inner")
# df1.to_excel(
#     dh.get_path_to_main()
#     + "ExcelFiles/BR/Refining_F3/infer_FEV1-based-AR_using_O2Sat_FEV1.xlsx",
#     index=False,
# )

df1 = dh.load_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/Refining_F3/infer_FEV1-based-AR_using_O2Sat_FEV1.xlsx",
    ["AR"],
)

In [4]:
# Model F3 using FEV1-based-AR
cpt_f3, df_f3, _ = cpt_and_plots.model_f3(df1, AR, "FEV1-based-AR")

Max sampled AR values: 87.91


  df_sampled.groupby("AR bin")


## Model F3 using AR inferred from FEV1 and FEF25-75 (using F3 to model F3)

In [17]:
# Infer FEV1-FEF2575-based-AR
# inf_res_df_2 = ih.infer_vars_and_get_back_df(
#     df,
#     # df.iloc[np.r_[10:1300, 3000:4007, 10000:11000]],
#     variables_to_infer=[AR, IA],
#     observed_variables=[ecFEV1, O2Sat, ecFEF2575prctecFEV1],
#     ecFEF2575prctecFEV1_cpt=cpt_f3,
# )
# # # Merge the inferred AR back to the original dataframe
# df2 = pd.merge(df, inf_res_df_2, on=["ID", "Date Recorded"], how="inner")
# df2.to_excel(
#     dh.get_path_to_main()
#     + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR_using_O2Sat_FEV1_FEF2575.xlsx",
#     index=False,
# )

df2 = dh.load_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR_using_O2Sat_FEV1_FEF2575.xlsx",
    ["AR"],
)

In [18]:
# Model F3 using FEV1-FEF2575-based-AR

## TODO: why it gives nan when it's the first time I run it?
cpt_f3_2, df_f3_2, _ = cpt_and_plots.model_f3(df2, AR, "FEV1-FEF2575-based-AR")

Max sampled AR values: 87.92






## Model AR-IA using AR inferred with FEV1 and FEF25-75 (using F3 refined with F3 once)

In [23]:
# Infer IA, AR using FEV1-FEF2575-based-AR

# inf_res_df_3 = ih.infer_vars_and_get_back_df(
#     df,
#     # df.iloc[np.r_[10:1300, 3000:4007, 10000:11000]],
#     variables_to_infer=[IA, AR],
#     observed_variables=[ecFEV1, O2Sat, ecFEF2575prctecFEV1],
#     ecFEF2575prctecFEV1_cpt=cpt_f3_2,
# )
# df3 = pd.merge(df, inf_res_df_3, on=["ID", "Date Recorded"], how="inner")
# df3.to_excel(
#     dh.get_path_to_main()
#     + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR-and-IA_using_O2Sat_FEV1_FEF2575.xlsx",
#     index=False,
# )

df3 = dh.load_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR-and-IA_using_O2Sat_FEV1_FEF2575.xlsx",
    ["AR", "IA"],
)

In [8]:
# Model the relationship between AR and IA, and add it to the model

AR_bin_width = 4
n_samples = 100

df_sampled_3 = cpt_and_plots.get_sampled_df_and_statistics_df_for_IA(
    df3, n_samples, AR, AR_bin_width, IA
)

# cpt_and_plots.plot_F3_mean_and_percentiles_per_AR_bin(df_f3, ar_col, y_col, save=True)

ar_col = "FEV1-FEF2575-based-AR"
cpt_ia_ar = cpt_and_plots.calc_plot_cpt_IA_given_AR(
    df_sampled_3, AR, AR_bin_width, ar_col, IA, n_samples, save=True, debug=True
)

# The four last histograms have too few data to be reliable
# Replacing them by the -11th
idx_sixty_six_from_back = len(AR.bins) - AR.get_bin_for_value(66)[1]
print(AR.get_bins_str()[-idx_sixty_six_from_back])
for j in range(idx_sixty_six_from_back - 1):
    cpt_ia_ar[:, -j - 1] = cpt_ia_ar[:, -idx_sixty_six_from_back]

# Renormalise the CPT
cpt_ia_ar = cpt_ia_ar / cpt_ia_ar.sum(axis=0)

cpt_and_plots.check_IA_cpt(cpt_ia_ar, IA, AR, True)

Max sampled AR values: 87.99
Max sampled IA values: 26.70
dirichlet factor 890
A: 2.56270, K: 0.90679, C: 0.00314
dirichlet factor 778
A: 2.32935, K: 0.97312, C: 0.00314
dirichlet factor 765
A: 2.19570, K: 1.01710, C: 0.00320
dirichlet factor 784
A: 2.05222, K: 1.07214, C: 0.00318
dirichlet factor 817
A: 1.93648, K: 1.12250, C: 0.00319
dirichlet factor 865
A: 1.85648, K: 1.16117, C: 0.00319
dirichlet factor 931
A: 1.80639, K: 1.18805, C: 0.00314
dirichlet factor 1013
A: 1.80441, K: 1.18853, C: 0.00318
dirichlet factor 1078
A: 1.81910, K: 1.18005, C: 0.00322
dirichlet factor 984
A: 1.70753, K: 1.24503, C: 0.00309
dirichlet factor 852
A: 1.56550, K: 1.34272, C: 0.00287
dirichlet factor 788
A: 1.51657, K: 1.38150, C: 0.00276
dirichlet factor 740
A: 1.51514, K: 1.38253, C: 0.00276
dirichlet factor 709
A: 1.49327, K: 1.39672, C: 0.00294
dirichlet factor 634
A: 1.41844, K: 1.45706, C: 0.00311
dirichlet factor 516
A: 1.33666, K: 1.52595, C: 0.00355
dirichlet factor 352
A: 1.12539, K: 1.78454,

In [4]:
import src.models.cpts.helpers as cpt


def get_cpt(vars, suffix=None):
    path_to_folder = dh.get_path_to_src() + "models/cpts/"
    var_spec = map(
        lambda var: f"{cpt.name_to_abbr(var.name)}_{var.a}_{var.b}_{var.bin_width}",
        vars,
    )
    filename = "_".join(var_spec)

    if suffix is not None:
        filename = filename + suffix

    arr = np.load(f"{path_to_folder}{filename}.npy")

    assert len(vars) == len(arr.shape)
    return arr

## Model F3 using AR inferred from AR-IA factor and FEV1

In [36]:
# # Infer FEV1-based-AR with IA factor using FEV1

# # Longer inference because of bayes net
# inf_res_df_4 = ih.infer_vars_and_get_back_df(
#     df,
#     variables_to_infer=[AR],
#     observed_variables=[ecFEV1, O2Sat],
#     ecFEF2575prctecFEV1_cpt=None,
#     IA_cpt=get_cpt([IA, AR], "_Dirichlet_prior"),
# )
# df4 = pd.merge(df, inf_res_df_4, on=["ID", "Date Recorded"], how="inner")
# # F4 is the IA-AR factor
# df4.to_excel(
#     dh.get_path_to_main()
#     + "ExcelFiles/BR/Refining_F3/infer_FEV1-based-AR_using_O2Sat_FEV1_F4-with-dirichlet-prior.xlsx",
#     index=False,
# )

df4 = dh.load_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/infer_FEV1-based-AR_using_O2Sat_FEV1_F4-with-dirichlet-prior.xlsx",
    ["AR"],
)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

In [37]:
# Model F3 using FEV1-based-AR with IA factor
cpt_f3_4, df_f3_4, _ = cpt_and_plots.model_f3(df4, AR, "FEV1-based-AR with F4")

Max sampled AR values: 87.78






## Model F3 using AR inferred from FEV1, FEF25-75 (refined with itself once), and the AR-IA factor

In [None]:
# Infer FEV1-FEF2575-based-AR with IA factor using FEV1 and F4

inf_res_df_5 = ih.infer_vars_and_get_back_df(
    df,
    # df.iloc[np.r_[10:1300, 3000:4007]],
    variables_to_infer=[AR, IA],
    observed_variables=[ecFEV1, O2Sat, ecFEF2575prctecFEV1],
    ecFEF2575prctecFEV1_cpt=cpt_f3_2,
    IA_cpt=cpt_ia_ar,
)
df5 = pd.merge(df, inf_res_df_5, on=["ID", "Date Recorded"], how="inner")
df5.to_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR_using_O2Sat_FEV1_F4-with-dirichlet-prior.xlsx",
    index=False,
)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

NameError: name 'inf_res_df_4' is not defined

In [None]:
df5 = pd.merge(df, inf_res_df_5, on=["ID", "Date Recorded"], how="inner")
df5.to_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR_using_O2Sat_FEV1_F4.xlsx",
    index=False,
)

In [None]:
# Model F3 using FEV1-based-AR with IA factor
cpt_f3_5, df_f3_5, _ = cpt_and_plots.model_f3(df5, AR, "FEV1-FEF2575-based-AR with F4")

Max sampled AR values: 89.65






## Model F3 using AR from the two days model (FEV1, O2Sat)

In [8]:
df_meas = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_with_idx")

INFO:root:* Checking for same day measurements *


In [9]:
df6_tmp = dh.load_excel(
    f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_FEV1.xlsx",
    [AR.name],
    ["Day"],
).drop(columns=["Unnamed: 0", HO2Sat.name, IA.name, HFEV1.name])

In [10]:
df6 = df6_tmp.merge(
    df_meas, left_on=["ID", "Day"], right_on=["ID", "Date Recorded"], how="inner"
)
print(df_meas.shape, "length raw meas")
print(df6_tmp.shape, "length two days model output")
print(df6.shape, "length merged")
df6 = df6.drop(
    columns=[
        'Day',
        "FEV1",
        "O2 Saturation",
        "FEF2575",
        # "ecFEV1",
        # "ecFEF2575",
        "Sex",
        "Height",
        "Age",
        "Predicted FEV1",
        "Healthy O2 Saturation",
        "ecFEV1 % Predicted",
        "FEV1 % Predicted",
        "O2 Saturation % Healthy",
        "idx ecFEV1 (L)",
        "idx ecFEF25-75 % ecFEV1 (%)",
        "idx O2 saturation (%)",
    ]
).rename(columns={AR.name: 'AR'})

(41260, 19) length raw meas
(41260, 3) length two days model output
(41260, 21) length merged


In [11]:
df6.columns

Index(['ID', 'AR', 'Date Recorded', 'ecFEV1', 'ecFEF2575', 'ecFEF2575%ecFEV1'], dtype='object')

In [12]:
# Model F3
cpt_f3_6, df_f3_6, _ = cpt_and_plots.model_f3(df6, AR, "AR from two days model")

Max sampled AR values: 87.98






## Plot results

In [13]:
def add_traces_F3_mean_and_percentiles_per_AR_bin(fig, df_f3, dash_style="solid"):
    fig.add_traces(
        go.Scatter(
            x=df_f3["AR midbin"],
            y=df_f3["mean"],
            mode="lines+markers",
            line=dict(color="blue", dash=dash_style),
            name=f"Mean",
        )
    )
    fig.add_traces(
        go.Scatter(
            x=df_f3["AR midbin"],
            y=df_f3["median"],
            mode="lines+markers",
            line=dict(color="purple", dash=dash_style),
            name=f"Median",
        )
    )
    fig.add_traces(
        go.Scatter(
            x=df_f3["AR midbin"],
            y=df_f3["p16"],
            mode="lines+markers",
            line=dict(color="red", dash=dash_style),
            name=f"16th prctile",
        )
    )
    fig.add_traces(
        go.Scatter(
            x=df_f3["AR midbin"],
            y=df_f3["p84"],
            mode="lines+markers",
            line=dict(color="red", dash=dash_style),
            name=f"84th prctile",
        )
    )
    fig.add_traces(
        go.Scatter(
            x=df_f3["AR midbin"],
            y=df_f3["p3"],
            mode="lines+markers",
            line=dict(color="green", dash=dash_style),
            name=f"3th prctile",
        )
    )
    fig.add_traces(
        go.Scatter(
            x=df_f3["AR midbin"],
            y=df_f3["p97"],
            mode="lines+markers",
            line=dict(color="green", dash=dash_style),
            name=f"97th prctile",
        )
    )
    return -1

In [19]:
# Conclusion
# df_f3 - F3 from FEV1-based-AR
# df_f3_2 - F3 from FEV1-FEF2575-based-AR
# df_f3_4 - F3 from FEV1-based-AR with IA-AR factor
# F3 from FEV1-FEF2575-based-AR

# ['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot']

fig = go.Figure()
add_traces_F3_mean_and_percentiles_per_AR_bin(fig, df_f3, "dot")
dot_name='AR from FEV1'
add_traces_F3_mean_and_percentiles_per_AR_bin(fig, df_f3_2, "dash")
dash_name='AR from FEV1 and FEF25-75'
add_traces_F3_mean_and_percentiles_per_AR_bin(fig, df_f3_6, "solid")
solid_name='AR from two days model (FEV1 and O2sat)'
fig.add_annotation(
    x=1.1,
    y=1.15,
    text=f"dot: {dot_name}<br>dash: {dash_name}<br>solid: {solid_name}",
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(size=12),
)
fig.update_xaxes(
    title=f"Airway resistance midbin (%)",
    tickvals=np.floor(list(df_f3["AR midbin"].values)),
)
fig.update_traces(marker=dict(size=2))
fig.update_yaxes(title="ecFEF2575prctecFEV1 (%)")
title = f"F3 statistics"
fig.update_layout(title=title, width=1200, height=700)
# fig.write_image(
#     f"{dh.get_path_to_main()}PlotsBreathe/AR_modelling/CPT - Refined F3 - with Dirichlet prior.pdf"
# )

# Plot ecFEF25-75%ecFEV1 vs AR sample

In [None]:
df_sampled, df_f3 = cpt_and_plots.get_sampled_df_and_statistics_df(df3, 1, AR)
df_sampled_tmp = df_sampled[df_sampled["ecFEF2575%ecFEV1"] < 200]
cpt_and_plots.plot_FEF2575_ratio_with_IA(
    df_sampled_tmp, "AR sample", "ecFEF2575%ecFEV1"
)

Max sampled AR values: 84.82






# Model the relationship between AR and IA

In [15]:
df3 = dh.load_excel(
    dh.get_path_to_main()
    + "ExcelFiles/BR/Refining_F3/infer_FEV1-FEF2575-based-AR-and-IA_using_O2Sat_FEV1_FEF2575.xlsx",
    ["AR", "IA"],
)

## Relationship between IA and AR

In [16]:
fig = px.scatter(df3, x="AR mean", y="IA mean", color="ID")
fig.update_traces(marker=dict(size=4))
fig.update_layout(width=1000, height=600, title=f"IA vs AR")
fig.show()

In [17]:
# Why is there fewer points with high IA where AR is in 25-50%?
# Outlying IA values are specific to a few (5) individuals. We just don't have an individual which is in the 25-50%.
# It makes sense to add Dirichlet data prior
df3[(df3["AR mean"] < 30) & (df3["IA mean"] > 8)].drop(columns=["AR", "IA"])

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,IA mean,AR mean
8223,130,2020-02-02,3.03,89,3.14,3.13,3.38,Female,167.0,24,3.489446,98.078678,89.699062,86.833278,90.743474,103.630363,9.255157,11.288598
18426,202,2021-10-29,1.23,82,1.99,1.85,2.61,Female,157.0,64,2.213968,98.258048,83.560392,55.556369,83.453724,161.788618,16.537218,8.653823
19860,215,2020-10-02,2.7,90,2.43,2.72,2.5,Female,173.0,42,3.386997,97.971056,80.307133,79.716639,91.863866,90.0,8.127251,18.384576
25687,265,2021-08-11,2.05,87,2.21,2.18,2.81,Female,158.0,24,3.102588,98.240111,70.263927,66.073876,88.558532,107.804878,11.430453,21.936963
32040,339,2022-06-08,1.62,89,1.22,1.62,1.22,Female,155.0,60,2.257582,98.293922,71.758177,71.758177,90.544764,75.308642,9.387033,27.821145
32044,339,2022-06-13,1.71,89,1.25,1.71,1.3,Female,155.0,60,2.257582,98.293922,75.744743,75.744743,90.544764,73.099415,9.420126,25.119213
32045,339,2022-06-14,1.63,89,1.3,1.71,1.3,Female,155.0,60,2.257582,98.293922,75.744743,72.201129,90.544764,79.754601,9.428928,23.804457
32050,339,2022-06-20,1.68,89,1.22,1.68,1.28,Female,155.0,60,2.257582,98.293922,74.415888,74.415888,90.544764,72.619048,9.40341,26.682316
32086,339,2023-02-01,1.59,90,1.39,1.76,1.55,Female,155.0,60,2.257582,98.293922,77.959501,70.429322,91.562121,87.421384,8.420946,20.534807
32101,339,2023-03-22,1.7,86,1.57,1.77,1.65,Female,155.0,60,2.257582,98.293922,78.402453,75.301791,87.492694,92.352941,12.493225,19.128569


## Relationship between IA and AR after sampling (+ Dirichlet input)

In [18]:
AR_bin_width = 10  # Not necessary for sampling, hence putting a high value
n_samples = 20
df_sampled_tmp = cpt_and_plots.get_sampled_df_and_statistics_df_for_IA(
    df3, n_samples, AR, AR_bin_width, IA
)

Max sampled AR values: 87.98
Max sampled IA values: 26.44


In [19]:
def add_generated_dirichlet_fake_data_prior(df, AR, IA, n):
    dirichlet_data = pd.DataFrame()
    for ar_low, ar_high in AR.get_bins_arr():
        for ia_low, ia_high in IA.get_bins_arr():
            dirichlet_sample = pd.DataFrame(
                {
                    "AR sample": np.random.uniform(ar_low, ar_high, n),
                    "IA sample": np.random.uniform(ia_low, ia_high, n),
                }
            )
            dirichlet_data = pd.concat([dirichlet_data, dirichlet_sample])

    df["Dirichlet"] = False
    dirichlet_data["Dirichlet"] = True

    return pd.concat([df, dirichlet_data])

In [20]:
# Add dirichlet input
AR_bin_width = 4
IA_bin_width = 1
df_sampled_tmp = df_sampled_tmp[["IA sample", "AR sample"]]
df_sampled_tmp = cpt_and_plots.add_binned_up_var(
    df_sampled_tmp, "AR sample", "AR", AR_bin_width
)
df_sampled_tmp = cpt_and_plots.add_binned_up_var(
    df_sampled_tmp, "IA sample", "IA", IA_bin_width
)

# For each AR bin and for each IA bin, add a Dirichlet prior of 10 uniformly distributed datapoints
n_dirichlet = 1  # 20 samples with bin width of 10 = 4 samples with bin width of 2

# Using bins with full resolution
df_sampled_tmp_dirichet = add_generated_dirichlet_fake_data_prior(
    df_sampled_tmp, AR, IA, n_dirichlet
)
print(df_sampled_tmp.shape)

(825760, 7)


In [21]:
fig = px.scatter(df_sampled_tmp, x="AR sample", y="IA sample", color="ID")
# fig = px.scatter(df_sampled_tmp_dirichet, x="AR sample", y="IA sample", color='Dirichlet')
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1000, height=600, title=f"IA vs AR, {n_samples} samples")
fig.show()

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['IA sample', 'AR sample', 'AR bin', 'AR midbin', 'IA bin', 'IA midbin', 'Dirichlet'] but received: ID

## Sample

In [None]:
AR_bin_width = 4
IA_bin_width = 1
n_samples = 100
ar_col = "FEV1-FEF2575-based AR"

df_sampled_3 = cpt_and_plots.get_sampled_df_for_AR_IA(df3, n_samples, AR, IA)

Max sampled AR values: 88.00
Max sampled IA values: 26.83


## Add dirichlet prior

In [None]:
# Add Dirichlet fake data
print(df_sampled_3.shape)
# 4m datapoints is way to much, we need to randomly pick a subset of 100k of it

n_dirichlet = 1
df_sampled_3_dirichlet = add_generated_dirichlet_fake_data_prior(
    df_sampled_3, AR, IA, n_dirichlet
)

n_samples_filtered = 300000
df_sampled_3_dirichlet_reduced = df_sampled_3_dirichlet.sample(n_samples_filtered)

fig = px.scatter(
    df_sampled_3_dirichlet_reduced, x="AR sample", y="IA sample", color="Dirichlet"
)
fig.update_traces(marker=dict(size=3))
fig.update_layout(
    width=1000,
    height=600,
    title=f"IA vs AR, {n_samples_filtered} samples drawn after a {n_samples}x supersampling",
)
fig.show()

(4128800, 23)


NameError: name 'add_generated_dirichlet_fake_data_prior' is not defined

In [None]:
# Bin up variables
df_sample_fit = df_sampled_3_dirichlet
# df_sample_fit = df_sampled_3

df_sample_fit = cpt_and_plots.add_binned_up_var(
    df_sample_fit, "AR sample", "AR", AR_bin_width
)
df_sample_fit = cpt_and_plots.add_binned_up_var(
    df_sample_fit, "IA sample", "IA", IA_bin_width
)

ar_groups = np.sort(list(df_sample_fit["AR midbin"].unique()))
ar_groups

NameError: name 'df_sampled_3_dirichlet' is not defined

## Fit Boltzmann MLE/MSE, custom exp MSE, Log series MLE

In [None]:
# MLE
import scipy.stats as stats
from scipy.optimize import minimize

cpt_ia_ar = np.zeros([len(IA.bins), len(ar_groups), 4])

# ar_groups_with_n = df_sample_fit['AR midbin'].value_counts().sort_index()
# ar_group_index = list(map(lambda ar_value: f"AR={ar_value:g}%", ar_groups_with_n.index))
# ar_group_count = list(map(lambda ar_count: f"n={ar_count}", ar_groups_with_n.values))
# ar_group_legend = [f"AR midbin={ar_group_index:g}%<br>n={ar_group_count}" for ar_group_index, ar_group_count in zip(ar_groups_with_n.index, ar_groups_with_n.values)]


# f"AR={ar_group}%<br>n={df_tmp['IA sample'].shape[0]}

for fit_id in [1, 2, 3, 4]:

    fig = make_subplots(
        cols=2,
        rows=len(ar_groups),
        shared_xaxes="all",
        # y_title="Probability",
        x_title="Inactive alveoli midbin value (%)",
        # row_titles=ar_group_legend,
        column_titles=[
            "Probabilities given in linear scale",
            "Probabilities given in log scale",
        ],
        # vertical_spacing=0.02,
        horizontal_spacing=0.04,
    )

    for idx, ar_group in enumerate(ar_groups):
        df_tmp = df_sample_fit[df_sample_fit["AR midbin"] == ar_group].copy()

        # Create histogram data for IA, binned by IA bins
        s_ia_hist = df_tmp["IA midbin"].value_counts()

        # Add 10% of the data distributed evenly on each bin
        # print("dirichlet factor", round(s_ia_hist.sum() * 0.1 / len(IA.bins)))
        # s_ia_hist_dirichlet = s_ia_hist  # + round(s_ia_hist.sum() * 0.1 / len(IA.bins))
        # s_ia_hist_dirichlet = s_ia_hist  + 100
        # s_ia_hist_norm = s_ia_hist_dirichlet / s_ia_hist_dirichlet.sum()
        s_ia_hist_norm = s_ia_hist / s_ia_hist.sum()

        x = np.array(s_ia_hist_norm.index)
        p = s_ia_hist_norm.values

        # The boltzmann distribution is defined for integer values (not float)
        # That means we can't use the midbin value of the IA bins (since it's usually a float)
        # Instead we'll designate the bin by its lower boundary
        # Input data: the midbin value (uniform approximation within each bin).
        # Output data fetched using the bin's lower boundary value

        def get_y_for_fit(fit_id):
            # Boltzmann fit with MLE
            if fit_id == 1:
                bounds = [(0, 2), (len(IA.bins), len(IA.bins))]
                result = stats.fit(
                    stats.boltzmann, df_tmp["IA midbin"], bounds, method="mle"
                )
                dist_name = "Boltzmann"
                params_str = f"param={result.params.lambda_:.4g}"
                print(f"{result}")
                y_fit = stats.boltzmann(*result.params).pmf(IA.bins)

            # Log series fit with MLE
            if fit_id == 2:
                bounds = [(0, 1)]
                result = stats.fit(
                    stats.logser, df_tmp["IA midbin"].to_numpy() + 1, bounds
                )
                dist_name = "Log-Series"
                params_str = f"param={result.params.p:.4g}"
                print(f"{result}")
                y_fit = stats.logser(*result.params).pmf(IA.bins + 1)

            # Fit decay with MSE
            if fit_id == 3:
                dist_name = "Exponential with MSE"

                def func(x, A, K, C):
                    # C = 0
                    # return K * x + np.log(A)
                    return A * np.exp(-x / K) + C

                def objective(params, x, y):
                    return np.sum((func(x, *params) - y) ** 2)

                # Initial guess for parameters
                initial_guess = [1, 1, 0]
                # Enforce constraint: function can't give negative value
                cons = {
                    "type": "ineq",
                    "fun": lambda params: func(x, *params),
                }
                # cons = {}
                # Minimize the objective function with the constraint
                result = minimize(
                    objective, initial_guess, args=(x, p), constraints=cons
                )
                A, K, C = result.x
                print(f"A: {A:.5f}, K: {K:.5f}, C: {C:.5f}")
                params_str = f"A={A:.2g}<br>K={K:.2g}<br>C={C:.2g}"
                # params_str = f"{A:.2g},{K:.2g},{C:.2g}"
                y_fit = func(IA.midbins, A, K, C)

            if fit_id == 4:
                # Fit Boltzmann with MSE
                dist_name = "Boltzmann with MSE"

                def func(x, lambda_):
                    # C = 0
                    # return K * x + np.log(A)
                    return (
                        (1 - np.exp(-lambda_))
                        * np.exp(-lambda_ * x)
                        / (1 - np.exp(-lambda_ * len(IA.bins)))
                    )

                def objective(params, x, y):
                    return np.sum((func(x, *params) - y) ** 2)

                # Initial guess for parameters
                initial_guess = [0.8]
                # Enforce constraint: function can't give negative value
                cons = {
                    "type": "ineq",
                    "fun": lambda params: func(x, *params),
                }
                # cons = {}
                # Minimize the objective function with the constraint
                result = minimize(
                    objective, initial_guess, args=(x, p), constraints=cons
                )
                [lambda_] = result.x
                print(f"lambda_={lambda_:.5g}")
                params_str = f"param={lambda_:.4g}"
                y_fit = func(IA.midbins, result.x)
            return y_fit, dist_name, params_str

        y_fit_plot, dist_name, params_str = get_y_for_fit(fit_id)
        cpt_ia_ar[:, idx, fit_id - 1] = y_fit_plot

        fig.add_trace(
            go.Bar(
                x=x,
                y=p,
                name="Inferred inactive alveoli data",
                marker=dict(color="#0072b2"),
                legendgroup="2",
                showlegend=True if idx == 0 else False,
            ),
            col=1,
            row=idx + 1,
        )
        fig.add_trace(
            go.Bar(
                x=IA.midbins,
                y=y_fit_plot,
                name=f"Fitted {dist_name} distribution",
                marker=dict(color="#d55e00"),
                legendgroup="1",
                showlegend=True if idx == 0 else False,
            ),
            col=1,
            row=idx + 1,
        )
        fig.add_trace(
            go.Bar(
                x=x,
                y=p,
                name="Inactive alveoli data",
                marker=dict(color="#0072b2"),
                legendgroup="2",
                showlegend=False,
            ),
            col=2,
            row=idx + 1,
        )
        fig.add_trace(
            go.Bar(
                x=IA.midbins,
                y=y_fit_plot,
                name=f"Fitted {dist_name} distribution",
                marker=dict(color="#d55e00"),
                legendgroup="1",
                showlegend=False,
            ),
            col=2,
            row=idx + 1,
        )

        fig.update_yaxes(
            title=f"Probability<br><br>AR={ar_group:g}%<br>n={df_tmp['IA sample'].shape[0]}<br>{params_str}",
            col=1,
            row=idx + 1,
            range=[0, 0.64],
        )
        fig.update_yaxes(
            type="log", col=2, row=idx + 1, range=[np.log(10e-5), np.log(1)]
        )

    fig.update_xaxes(tickvals=IA.midbins)
    title = f"AR-IA CPT with {dist_name} fit - distribution of inactive alveoli (IA) given each bin of airway resistance ({len(ar_groups)} AR groups)"

    fig.update_layout(
        width=1600,
        height=3500,
        font=dict(size=9),
        # title=f"P({IA.name} | Airway resistance in % (ar_col))",
        title=f"{title}<br> using {n_samples} pairs of samples drawn from AR and IA's inferred distributions as input data",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.update_annotations(font_size=10)
    # fig.show()
    fig.write_image(
        dh.get_path_to_main()
        # + f"PlotsBreathe/AR_modelling/CPT IA-AR - 22 groups/{title}.pdf"
        + f"PlotsBreathe/AR_modelling/CPT IA-AR - 22 groups - with Dirichlet fake data/{title}.pdf"
    )

# An Botlzmann fit is not ideal: with an exponential decay, the decay rate has to be small enough to prevent 0 probabilities for high IA values
# Exponential decay is too strong
# Try log decay

NameError: name 'ar_groups' is not defined

In [None]:
cpt_ia_ar.shape

(30, 23, 4)

In [None]:
# Test
fig = go.Figure()
lambda_, N = 0.3, 100
rv1 = stats.boltzmann(lambda_, N, loc=0)
rv2 = stats.logser(p=0.7)
rv3 = stats.poisson(mu=10)
res1 = rv1.pmf(IA.bins)
res2 = rv2.pmf(IA.bins + 1)
# res3 = rv3.pmf(IA.bins+1)
fig.add_trace(go.Bar(x=IA.midbins, y=res1 / sum(res1), name="Boltzmann"))
fig.add_trace(go.Bar(x=IA.midbins, y=res2 / sum(res2), name="Log-series"))
# fig.add_trace(go.Bar(x=IA.midbins, y=res3/sum(res3), name='Poisson'))
# rv2 = stats.planck()
fig.show()

## Analyse IA-AR cpt (with AR rebinned)

In [None]:
import src.inference.helpers as ih

AR_rebinned = mh.VariableNode(
    "Airway resistance (%)", 0, 90, 4, prior={"type": "uniform"}
)

# cpt_ia_ar = cpt.get_cpt([IA, AR])
print(cpt_ia_ar.shape)
print("IA dim", len(IA.bins))

(30, 23, 4)
IA dim 30


In [None]:
# Plot cpt using import src.models.cpts.helpers as cpt_helpers.plot_2d_cpt

# Create subplots
# n_ar_bins = len(AR.bins)
# n_ar_bins = len(ar_groups)

# fig = make_subplots(rows=n_ar_bins, cols=1, shared_xaxes=True, vertical_spacing=0.005)

# for i in range(n_ar_bins):
#     p = cpt_ia_ar[:, i, 1]
#     ih.plot_histogram(fig, IA, p, IA.a, IA.b, i+1, 1, colour='#0072b2', annot=False)
#     fig.update_yaxes(title_text=f"AR={AR_rebinned.midbins[i]:2g}%", row=i + 1, col=1)

# title = f"IA-AR CPT with Dirichlet prior - IA given AR ({len(ar_groups)} AR groups)"

# fig.update_layout(
#     title=title,
#     width=500,
#     height=1300,
#     showlegend=False,
#     font=dict(size=8),
# )
# # fig.show()
# fig.write_image(
#     dh.get_path_to_main()
#     + f"PlotsBreathe/AR_modelling/{title}.pdf"
# )

In [None]:
# Plot cpt using import src.models.cpts.helpers as cpt_helpers.plot_2d_cpt

# Create subplots
fig = make_subplots(
    rows=len(IA.bins), cols=1, shared_xaxes=True, vertical_spacing=0.003
)

for i in range(len(IA.bins)):
    p = cpt_ia_ar[i, :, 1] / sum(cpt_ia_ar[i, :, 1])
    ih.plot_histogram(
        fig,
        AR_rebinned,
        p,
        AR_rebinned.a,
        AR_rebinned.b,
        i + 1,
        1,
        colour="#0072b2",
        annot=False,
    )
    fig.update_yaxes(title_text=f"IA={IA.midbins[i]}%", row=i + 1, col=1)
    # Add tickvals on x axis
    fig.update_xaxes(tickvals=ar_groups, row=i + 1, col=1)
    fig.add_shape(
        type="rect",
        x0=76,
        x1=90,
        y0=0,
        y1=0.37,
        line=dict(color="red", width=1),
        fillcolor="red",
        opacity=0.2,
        row=i + 1,
        col=1,
    )

fig.update_xaxes(title_text="Airway resistance midbin (%)", row=len(IA.bins), col=1)
fig.update_yaxes(range=[0, 0.37], tickvals=[0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35])

title = f"IA-AR CPT with Dirichlet prior - AR given IA ({len(ar_groups)} AR groups)"

fig.update_layout(
    title=title,
    width=600,
    height=2500,
    showlegend=False,
    font=dict(size=8),
)
# fig.show()
fig.write_image(dh.get_path_to_main() + f"PlotsBreathe/AR_modelling/{title}.pdf")

## Format CPT to full AR resolution, analyse it and save

In [None]:
import src.models.cpts.helpers as cpt

cpt_ia_ar.shape

print(len(AR.bins))
print(len(AR_rebinned.bins))

NameError: name 'cpt_ia_ar' is not defined

In [14]:
# Reformat the AR dim for 45 bins instead of 22
cpt_ia_ar_final = np.zeros([len(IA.bins), len(AR.bins)])

# Each bin is replicated 2 times
for i in range(cpt_ia_ar.shape[1] - 1):
    for j in [0, 1]:
        print(i * 2 + j)
        cpt_ia_ar_final[:, i * 2 + j] = cpt_ia_ar[:, i, 1]

# After AR 72-76%, replicate the same data
print(f"Unreliable data at bin: {AR.get_bin_for_value(76)}")
for i in np.arange(38, len(AR.bins)):
    cpt_ia_ar_final[:, i] = cpt_ia_ar_final[:, 37]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
Unreliable data at bin: ('[76.0; 78.0)', 38)


In [None]:
# Create subplots
plot_bins = range(len(AR.bins))
vspace = 0.003
height = 2500
title = (
    f"IA-AR full CPT with Dirichlet prior - IA given AR ({len(ar_groups)} AR groups)"
)

# plot_bins = np.arange(0, len(AR.bins), 6)
# vspace=0.02
# height=700
# title = f"IA-AR full CPT with Dirichlet prior - IA given AR ({len(ar_groups)} AR groups) - summary"

fig = make_subplots(
    rows=len(plot_bins), cols=1, shared_xaxes=True, vertical_spacing=vspace
)

for i, idx in enumerate(plot_bins):
    p = cpt_ia_ar_final[:, idx]
    ih.plot_histogram(fig, IA, p, IA.a, IA.b, i + 1, 1, colour="#0072b2", annot=False)
    fig.update_yaxes(
        title_text=f"{AR.midbins[idx]:2g}%", row=i + 1, col=1, range=[0, 0.6]
    )

fig.update_layout(
    title=title,
    width=500,
    height=height,
    showlegend=False,
    font=dict(size=8),
)
# # fig.show()
# fig.write_image(
#     dh.get_path_to_main()
#     + f"PlotsBreathe/AR_modelling/{title}.pdf"
# )

NameError: name 'ar_groups' is not defined

In [6]:
# Create subplots
plot_bins = IA.bins.astype(int)
vspace = 0.003
height = 2500
title = (
    f"IA-AR full CPT with Dirichlet prior - AR given IA ({len(ar_groups)} AR groups)"
)

# plot_bins = np.arange(0, len(IA.bins), 4)
# vspace=0.02
# height=700
# title = f"IA-AR full CPT with Dirichlet prior - AR given IA ({len(ar_groups)} AR groups) - summary"


fig = make_subplots(
    rows=len(plot_bins), cols=1, shared_xaxes=True, vertical_spacing=vspace
)

for i, idx in enumerate(plot_bins):
    p = cpt_ia_ar_final[idx, :] / sum(cpt_ia_ar_final[idx, :])
    ih.plot_histogram(fig, AR, p, AR.a, AR.b, i + 1, 1, colour="#0072b2", annot=False)
    fig.update_yaxes(title_text=f"IA={IA.midbins[idx]}%", row=i + 1, col=1)
    # Add tickvals on x axis
    fig.update_xaxes(tickvals=ar_groups, row=i + 1, col=1)
    fig.add_shape(
        type="rect",
        x0=76,
        x1=90,
        y0=0,
        y1=0.37,
        line=dict(color="yellow", width=1),
        fillcolor="yellow",
        opacity=0.2,
        row=i + 1,
        col=1,
    )

fig.update_xaxes(title_text="Airway resistance midbin (%)", row=len(IA.bins), col=1)
fig.update_yaxes(range=[0, 0.1], tickvals=[0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35])

fig.update_layout(
    title=title,
    width=600,
    height=height,
    showlegend=False,
    font=dict(size=8),
)
# fig.show()
fig.write_image(dh.get_path_to_main() + f"PlotsBreathe/AR_modelling/{title}.pdf")

NameError: name 'ar_groups' is not defined

In [17]:
# Let's try this cpt
import src.models.cpts.helpers as cpt

cpt.save_cpt([IA, AR], cpt_ia_ar_final, "_Dirichlet_prior")

[0, 4, 8, 12, 16, 20, 24, 28]