In [1]:
import src.data.breathe_data as bd
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.models.var_builders as var_builders
import src.inference.helpers as ih
from plotly.subplots import make_subplots


import pandas as pd
import numpy as np

# Get unblocked FEV1

In [2]:
# Load the measurements table through the project's Excel loader
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan")

In [3]:
# Discard the PEF columns, then keep only rows where FEV1, O2 Saturation and
# FEF2575 are all present
df = df.drop(columns=["PEF", "ecPEF (L/s)", "PEF (L/s)"]).dropna(
    subset=["FEV1", "O2 Saturation", "FEF2575"]
)

# FEF25-75 expressed as a percentage of FEV1
# NOTE(review): despite the "ec" prefix in the column name, this is computed
# from the raw FEF2575 and FEV1 columns, not from ecFEF2575 / ecFEV1 - confirm
# that this is intended
df["ecFEF2575%ecFEV1"] = df["FEF2575"].div(df["FEV1"]).mul(100)

# Size of the cleaned table and number of distinct IDs left in it
print(df.shape)
print("IDs: ", df["ID"].nunique())
df.head()

(41272, 16)
IDs:  353


Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,99.845492,41.221374
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,100.874827,43.51145
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,98.816157,51.145038
3,101,2019-01-28,1.3,96.0,0.69,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.01047,98.816157,53.076923
4,101,2019-01-29,1.28,98.0,0.6,1.3,0.69,Male,173.0,53,3.610061,97.150104,36.01047,35.456463,100.874827,46.875


In [4]:
# Add unblocked FEV1
def get_unblocked_fev1_for_ID(df):
    """
    Add the peak ecFEV1, and the ecFEF2575 recorded on that same row, as constant columns.

    Args:
        df: rows to summarise (one ID's rows when used with groupby.apply);
            must contain "ecFEV1" and "ecFEF2575" columns

    Returns:
        A new DataFrame with the same rows and index as df plus two columns,
        each holding a single value repeated on every row:
        - "Max ecFEV1": the largest ecFEV1 value
        - "Max ecFEF2575": the ecFEF2575 value on the row where ecFEV1 is
          largest (this is not necessarily the largest ecFEF2575)

    The input frame is not modified: pandas does not support functions that
    mutate the object passed to them by groupby.apply.
    """
    # Label of the row where ecFEV1 peaks (idxmax returns the first one on ties)
    idx = df["ecFEV1"].idxmax()
    # assign returns a new frame, so the caller's frame is left untouched
    return df.assign(
        **{
            "Max ecFEV1": df.loc[idx, "ecFEV1"],
            "Max ecFEF2575": df.loc[idx, "ecFEF2575"],
        }
    )


# Run the per-ID helper on every group of rows sharing an ID
df = df.groupby("ID").apply(get_unblocked_fev1_for_ID)
# Drop the ID column before reset_index so that ID can be restored from the
# index, then discard the leftover "level_1" column produced by reset_index
df = df.drop(columns=["ID"]).reset_index()
df = df.drop(columns=["level_1"])

In [5]:
# Question: how often is max ecFEV1 obtained at the same time as max ecFEF2575?
# NOTE(review): "Max ecFEF2575" is filled by get_unblocked_fev1_for_ID with the
# ecFEF2575 value on the first row where ecFEV1 peaks, not with the per-ID
# maximum of ecFEF2575 - confirm the comparison below answers the question

# Keep only the rows where ecFEV1 equals its per-ID maximum
idx_max = df["ecFEV1"] == df["Max ecFEV1"]
df_tmp = df.loc[idx_max].copy()

# Absolute gap between the stored "Max ecFEF2575" and the row's own ecFEF2575
df_tmp["Drop from max ecFEF2575"] = (
    df_tmp["Max ecFEF2575"] - df_tmp["ecFEF2575"]
).abs()

# Count the rows with no gap (True) against the rows with a gap (False)
df_tmp["Drop from max ecFEF2575"].eq(0).value_counts()
# Author's reading: most of the time the two values coincide on these rows

Drop from max ecFEF2575
True     669
False    271
Name: count, dtype: int64

In [6]:
# List the columns to check that "Max ecFEV1" and "Max ecFEF2575" were added
df.columns

Index(['ID', 'Date Recorded', 'FEV1', 'O2 Saturation', 'FEF2575', 'ecFEV1',
       'ecFEF2575', 'Sex', 'Height', 'Age', 'Predicted FEV1',
       'Healthy O2 Saturation', 'ecFEV1 % Predicted', 'FEV1 % Predicted',
       'O2 Saturation % Healthy', 'ecFEF2575%ecFEV1', 'Max ecFEV1',
       'Max ecFEF2575'],
      dtype='object')

# Preprocess Breathe data

In [7]:
# Build the model variables with fixed arguments; in this section they are only
# used to bin the observed values and to name the resulting index columns
# NOTE(review): 160, 40, "Male" are presumably height, age and sex - confirm
# against var_builders
(HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA, IA, UO2Sat, O2Sat, ecFEF2575prctecFEV1) = (
    var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
        160, 40, "Male"
    )
)

In [8]:
# Variable name used below as the suffix of the "idx ..." column
ecFEV1.name

'ecFEV1 (L)'

In [9]:
# Applied get_bin_for_value to all inputs and updated excel file
# For each observed column, store element [1] of get_bin_for_value's result in
# a new column named "idx <variable name>".
# Each value is mapped column-wise: the lookup only needs that one value, so
# there is no need to build a full row object per record with apply(axis=1).
df[f"idx {ecFEV1.name}"] = df["ecFEV1"].map(
    lambda value: ih.get_bin_for_value(value, ecFEV1)[1]
)
df[f"idx {ecFEF2575prctecFEV1.name}"] = df["ecFEF2575%ecFEV1"].map(
    lambda value: ih.get_bin_for_value(value, ecFEF2575prctecFEV1)[1]
)
df[f"idx {O2Sat.name}"] = df["O2 Saturation"].map(
    lambda value: ih.get_bin_for_value(value, O2Sat)[1]
)

In [10]:
# Preview the table, now including the three "idx ..." columns
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,...,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,Max ecFEV1,Max ecFEF2575,idx ecFEV1 (L),idx ecFEF25-75 % ecFEV1 (%),idx O2 saturation (%)
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,Male,173.0,53,...,97.150104,36.287474,36.287474,99.845492,41.221374,1.79,1.14,26,20,47
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.67,Male,173.0,53,...,97.150104,36.287474,36.287474,100.874827,43.51145,1.79,1.14,26,21,48
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.69,Male,173.0,53,...,97.150104,36.287474,36.287474,98.816157,51.145038,1.79,1.14,26,25,46
3,101,2019-01-28,1.3,96.0,0.69,1.31,0.69,Male,173.0,53,...,97.150104,36.287474,36.01047,98.816157,53.076923,1.79,1.14,26,26,46
4,101,2019-01-29,1.28,98.0,0.6,1.3,0.69,Male,173.0,53,...,97.150104,36.01047,35.456463,100.874827,46.875,1.79,1.14,26,23,48


In [11]:
# Save the table with the bin-index columns; the path is relative, so the file
# is written to the current working directory. The DataFrame index is not saved.
df.to_excel("BR_O2_FEV1_FEF2575_with_idx.xlsx", index=False)

# Use approximate inference (slicing)

In [55]:
# Select the rows of a single ID here so that this cell runs on a fresh kernel
# (df_for_ID was otherwise only defined further down the notebook)
df_for_ID = df[df["ID"] == "101"]

# First row for that ID
df_for_ID.iloc[0]

ID                                    101
Date Recorded                  2019-01-25
FEV1                                 1.31
O2 Saturation                        97.0
FEF2575                              0.54
ecFEV1                               1.31
ecFEF2575                            0.67
Sex                                  Male
Height                              173.0
Age                                    53
Predicted FEV1                   3.610061
Healthy O2 Saturation           97.150104
ecFEV1 % Predicted              36.287474
FEV1 % Predicted                36.287474
O2 Saturation % Healthy         99.845492
ecFEF2575%ecFEV1                41.221374
Max ecFEV1                           1.79
Max ecFEF2575                        1.14
idx ecFEV1 (L)                         26
idx ecFEF25-75 % ecFEV1 (%)            20
idx O2 saturation (%)                  47
Name: 0, dtype: object

In [67]:
# Label of the row with the highest ecFEV1; ties are broken by ecFEF2575 and
# then by O2 Saturation
idx = df_for_ID.sort_values(
    by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
).index[0]

# Show the first row of df_for_ID next to the selected row.
# idx is an index label, so both rows are looked up with .loc (labels), not
# .iloc (positions).
df_for_ID.loc[[df_for_ID.index[0], idx]]



Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,...,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,Max ecFEV1,Max ecFEF2575,idx ecFEV1 (L),idx ecFEF25-75 % ecFEV1 (%),idx O2 saturation (%)
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,Male,173.0,53,...,97.150104,36.287474,36.287474,99.845492,41.221374,1.79,1.14,26,20,47
1576,101,2023-07-31,1.79,98.0,1.14,1.79,1.15,Male,173.0,53,...,97.150104,49.583647,49.583647,100.874827,63.687151,1.79,1.14,35,31,48


In [68]:
# Rebuild the model variables with the same fixed arguments as the earlier
# var_builders call in this notebook
(HFEV1, ecFEV1, AR, HO2Sat, O2SatFFA, IA, UO2Sat, O2Sat, ecFEF2575prctecFEV1) = (
    var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
        160, 40, "Male"
    )
)

# Register a factor node key on each of the two shared healthy variables.
# NOTE(review): the key text is built from the variable names; the exact format
# is presumably what the inference code looks up - confirm against
# set_factor_node_key
key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
HFEV1.set_factor_node_key(key_hfev1)
HO2Sat.set_factor_node_key(key_ho2sat)

# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because the call further down in this cell passes it
vars = [AR, IA]
shared_vars = [HFEV1, HO2Sat]
# Observed variables are passed by name
obs_vars = [ecFEV1.name, O2Sat.name]


def infer_and_plot_for_id(df_for_ID, shared_vars, vars, obs_vars, diff_threshold=1e-8):
    """
    Run the two-day sliced inference for the rows of a single ID.

    The first row of df_for_ID is paired with the row that has the highest
    ecFEV1 (ties broken by ecFEF2575, then O2 Saturation), and the pair is
    passed to slicing.query_across_days. Despite the name, nothing is plotted.

    Args:
        df_for_ID: rows of one ID, with at least one row and the columns ID,
            Height, Age, Sex, ecFEV1, ecFEF2575 and "O2 Saturation"
        shared_vars: objects exposing reset(); each is reset on entry and the
            list is forwarded to slicing.query_across_days
        vars: forwarded to slicing.query_across_days unchanged
        obs_vars: forwarded to slicing.query_across_days unchanged
        diff_threshold: forwarded to slicing.query_across_days unchanged
            (presumably the convergence tolerance - TODO confirm)

    Returns:
        Tuple (df_query_res, df_res_before_convergence, shared_vars_final) as
        returned by slicing.query_across_days
    """
    # Reset every shared variable before starting on this ID
    for shared_var in shared_vars:
        shared_var.reset()

    # After this reset, row labels equal row positions; the .iloc call below
    # relies on that when it uses a label taken from .index
    df_for_ID = df_for_ID.reset_index(drop=True)
    print(f"\nID: {df_for_ID.ID.iloc[0]}")
    print(f"#datapoints: {len(df_for_ID)}")

    # Build the model from the first row's demographics; only the second
    # returned element (the inference algorithm) is needed here
    # NOTE(review): the shared_vars / vars supplied by the caller are not the
    # variable objects returned by this builder - confirm that is intended
    height = df_for_ID.Height.iloc[0]
    age = df_for_ID.Age.iloc[0]
    sex = df_for_ID.Sex.iloc[0]
    _, inf_alg, *_ = mb.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
        height, age, sex
    )

    # Row that maximises ecFEV1, then ecFEF2575, then O2 Saturation
    ranked = df_for_ID.sort_values(
        by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
    )
    idx_unblocked_FEV1 = ranked.index[0]

    # Pair the first row with that best row and query across the two days
    df_two_days = df_for_ID.iloc[[0, idx_unblocked_FEV1]]
    query_res, res_before_convergence, final_shared_vars = slicing.query_across_days(
        df_two_days, inf_alg, shared_vars, vars, obs_vars, diff_threshold
    )
    return query_res, res_before_convergence, final_shared_vars


# Run the inference for a single ID (IDs are compared as strings here)
df_for_ID = df[df["ID"] == "101"]
# diff_threshold is set to 1e-2 here instead of the function default of 1e-8
df_query_res, df_res_before_convergence, shared_vars_final = infer_and_plot_for_id(
    df_for_ID, shared_vars, vars, obs_vars, diff_threshold=1e-2
)
# Disabled: the same call applied to every ID, with a tighter threshold
# df.groupby("ID").apply(
#     lambda df_for_ID: infer_and_plot_for_id(
#         df_for_ID, shared_vars, vars, obs_vars, diff_threshold=1e-6
#     )
# )


ID: 101
#datapoints: 1680
epoch 0
Epoch 0 - Posteriors' diff for Healthy FEV1 (L): 1.1591598729487829
Epoch 0 - Posteriors' diff for Healthy O2 saturation (%): 1.5410573754374948
epoch 1
Epoch 1 - Posteriors' diff for Healthy FEV1 (L): 0.050254593374776994
Epoch 1 - Posteriors' diff for Healthy O2 saturation (%): 0.035771146622777555
epoch 2
Epoch 2 - Posteriors' diff for Healthy FEV1 (L): 0.0005364392615121114
Epoch 2 - Posteriors' diff for Healthy O2 saturation (%): 0.0003795449118953611
All diffs are below 0.01, running another epoch to get all posteriors
epoch 3
Epoch 3 - Posteriors' diff for Healthy FEV1 (L): 1.5356144391834845e-16
Epoch 3 - Posteriors' diff for Healthy O2 saturation (%): 1.7594568380366327e-17


In [69]:
# Display the first value returned by infer_and_plot_for_id for the selected ID
df_query_res

Unnamed: 0,Day,Airway resistance (%),Inactive alveoli (%),Healthy FEV1 (L),Healthy O2 saturation (%)
0,2019-01-25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7096781291009916, 0.24579384615574595, 0.04...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2023-07-31,"[0.0011286972120282816, 0.001415695739241656, ...","[0.7968573829498137, 0.18247438856020826, 0.01...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
