In [100]:
import src.data.breathe_data as br
import plotly.express as px
import src.modelling_fef2575.hfef2575 as hfef2575
import src.o2_fev1_analysis.smooth as smooth
import src.data.helpers as dh
import src.models.helpers as mh
import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [2]:
# Alternative input sheets tried during exploration (kept for reference):
# df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_inferred_AR_IA").drop(columns=["AR", "IA"])
# df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan_processed")
# df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_inferred_AR_IA_processed").drop(
#     columns=["AR", "IA"]
# )

# Load the measurements and drop the IA, HFEV1 and HO2Sat columns; the AR column is kept.
# NOTE(review): the second argument ['AR'] presumably names columns the loader has to
# parse specially — confirm against br.load_meas_from_excel.
df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_inferred_AR_IA_HFEV1_HO2Sat", ['AR']).drop(
    columns=["IA", "HFEV1", "HO2Sat"]
)


In [3]:
# Store PEF divided by 60 in a column labelled in L/s.
# NOTE(review): this presumably means the raw PEF column is in L/min — confirm.
df["PEF (L/s)"] = df.PEF / 60

In [4]:
def apply_new_smoothing(df, col, scale, shift):
    """
    Return a copy of `df` with an extra "ec{col}" column: a copy of `col` passed
    through upward-outlier replacement and then smoothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements for one group (the notebook calls this once per ID).
    col : str
        Name of the column to process; the result is written to f"ec{col}".
    scale, shift :
        Forwarded unchanged to smooth.identify_and_replace_outliers_up.

    Returns
    -------
    pandas.DataFrame
        A new frame; rows where `col` is NaN keep NaN in "ec{col}".
    """
    ec_col = f"ec{col}"

    # Work on a copy so the frame handed in by groupby.apply is never mutated and
    # the assignments below cannot hit pandas' view-vs-copy ambiguity.
    df = df.copy()
    df[ec_col] = df[col]

    # Plain numpy boolean mask: accepted by both .iloc and .loc, and purely
    # positional, so it does not depend on the index being unique.
    has_value = df[ec_col].notna().to_numpy()
    if not has_value.any():
        # Nothing to correct or smooth for this group.
        return df

    # Replace upward outliers on the rows that actually hold a measurement.
    df.iloc[has_value] = smooth.identify_and_replace_outliers_up(
        df[has_value], ec_col, scale=scale, shift=shift
    ).copy()

    # Single .loc assignment instead of chained indexing (df[c][mask] = ...), which
    # may write to a temporary and is a silent no-op under copy-on-write.
    df.loc[has_value, ec_col] = smooth.smooth_vector(
        df.loc[has_value, ec_col].to_numpy(), "max"
    )

    return df


# (column, scale, shift) for each measurement that gets an "ec<column>" version.
smoothing_settings = [
    ("FEV1", 3, 0.5),
    ("FEF2575", 3, 0.5),
    ("PEF (L/s)", 3, 1),
]

# Apply the smoothing independently within each ID, one measurement at a time.
for smoothed_col, outlier_scale, outlier_shift in smoothing_settings:
    per_id = df.groupby(by="ID")
    df = per_id.apply(
        lambda group: apply_new_smoothing(
            group, smoothed_col, outlier_scale, outlier_shift
        )
    ).reset_index(drop=True)

ID 221 - Outlier up for ecFEV1, day 2022-08-09: 3.03 > 2.45 and > 2.44 (mean=1.94,std=0.17), update to 2.06
ID 244 - Outlier up for ecFEV1, day 2022-08-27: 4.38 > 3.87 and > 3.61 (mean=3.11,std=0.26), update to 3.06
ID 104 - Outlier up for ecFEF2575, day 2023-07-26: 1.73 > 1.72 and > 1.67 (mean=1.17,std=0.18), update to 1.12
ID 123 - Outlier up for ecFEF2575, day 2022-01-04: 5.66 > 5.65 and > 5.21 (mean=4.71,std=0.31), update to 4.94
ID 123 - Outlier up for ecFEF2575, day 2022-11-27: 5.45 > 5.38 and > 5.14 (mean=4.64,std=0.25), update to 4.55
ID 172 - Outlier up for ecFEF2575, day 2022-08-10: 4.23 > 4.05 and > 3.46 (mean=2.96,std=0.37), update to 2.92
ID 203 - Outlier up for ecFEF2575, day 2022-09-21: 2.25 > 1.98 and > 1.75 (mean=1.25,std=0.24), update to 1.20
ID 221 - Outlier up for ecFEF2575, day 2022-08-09: 3.46 > 2.96 and > 2.78 (mean=2.28,std=0.23), update to 2.44
ID 240 - Outlier up for ecFEF2575, day 2023-01-17: 4.64 > 4.43 and > 2.63 (mean=2.13,std=0.77), update to 1.87
ID 240 

In [5]:
def calc_predicted_FEF2575_LMS_df(df):
    """
    Add a "Predicted FEF2575" column to `df` and return the same DataFrame.

    For each row the value is the "M" entry of
    hfef2575.calc_predicted_value_LMS_straight(Height, Age, Sex).
    The input DataFrame is modified in place.
    """

    def predicted_for_row(row):
        lms = hfef2575.calc_predicted_value_LMS_straight(row.Height, row.Age, row.Sex)
        return lms["M"]

    df["Predicted FEF2575"] = df.apply(predicted_for_row, axis=1)
    return df


def calc_FEVF2575_prct_predicted_df(df):
    """
    Add FEF2575 % predicted columns to `df` and return the same DataFrame.

    Two columns are written, in this order:
      - "ecFEF2575 % Predicted" = ecFEF2575 / Predicted FEF2575 * 100
      - "FEF2575 % Predicted"   = FEF2575   / Predicted FEF2575 * 100
    The input DataFrame is modified in place.
    """
    predicted = df["Predicted FEF2575"]
    for measured in ("ecFEF2575", "FEF2575"):
        df[f"{measured} % Predicted"] = df[measured] / predicted * 100
    return df


# Add the predicted FEF2575 column, then the FEF2575 % predicted columns derived from it
df = calc_predicted_FEF2575_LMS_df(df)
df = calc_FEVF2575_prct_predicted_df(df)

## Analysing PAP individuals to select CT scans candidates

In [None]:
# IDs of the Papworth (PAP) individuals: every integer from 101 to 248 inclusive,
# kept as strings because the ID column holds strings.
pap_ids = [str(patient_number) for patient_number in range(101, 249)]

# Keep only the rows whose ID is listed in pap_ids
df2 = df[df.ID.isin(pap_ids)]

In [10]:
import numpy as np
import pandas as pd


def get_avg_ar_ia(df):
    """Return the NaN-ignoring means of "AR mean" and "IA mean" as a 2-tuple."""
    averages = [np.nanmean(df[column]) for column in ("AR mean", "IA mean")]
    return averages[0], averages[1]


# One (avg AR, avg IA) tuple per ID
dftmp = df2.groupby(by="ID").apply(get_avg_ar_ia)

# Expand the tuples into two columns (0 and 1) and give them readable names
res = (
    dftmp.apply(pd.Series)
    .reset_index()
    .rename(columns={0: "avg ar", 1: "avg ia"})
    # .drop(columns="level_1")
)

# Scatter of average AR against average IA, one point per ID
fig = px.scatter(res, x="avg ar", y="avg ia", color="ID")
fig.show()

In [32]:
import numpy as np
import pandas as pd


def get_avg_ar_ia(df):
    """
    Return (mean of "AR mean", mean of "IA mean") for `df`, ignoring NaNs.

    This cell redefines the function with the same behaviour as the definition
    in the earlier cell.
    """
    ar_average, ia_average = (
        np.nanmean(df[column]) for column in ("AR mean", "IA mean")
    )
    return ar_average, ia_average


# Same per-ID averages as above, restricted to a hand-picked subset of nine IDs
dftmp = df2[df2.ID.isin(['113', '111', '215', '120', '101', '103', '237', '224', '133'])].groupby(by="ID").apply(get_avg_ar_ia)

# Expand the (avg AR, avg IA) tuples into two columns with presentation-ready names
res = (
    dftmp.apply(pd.Series)
    .reset_index()
    .rename(columns={0: "Average airway resistance for this individual (%)", 1: "Average % of inactive alveoli for this individual"})
    # .drop(columns="level_1")
)

# Plot average airway resistance against average % of inactive alveoli, one point per ID
fig = px.scatter(res, x="Average airway resistance for this individual (%)", y="Average % of inactive alveoli for this individual", color="ID")
fig.update_layout(title="Subset of Papworth individuals selected for CT scan analysis")
fig.show()

In [25]:
import datetime
# Rows of ID '127' recorded before 1 July 2021
dff = df2[df2.ID == '127']
dff[dff["Date Recorded"] < datetime.date(2021, 7, 1)]

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,PEF,ecFEV1,Age,Sex,Height,...,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,AR mean,IA mean,HFEV1 mean,HO2Sat mean,PEF (L/s),ecFEF2575,ecPEF (L/s)
3370,127,2021-05-26,1.72,95,1.11,450,1.73,37,Male,177.0,...,40.560781,40.326326,97.859094,57.583564,1.447175,4.137815,97.163018,7.5,1.11,7.5


In [24]:
# "Date Recorded" of ID '127' at index label 3410
df2[df2.ID == '127']["Date Recorded"][3410]

datetime.date(2022, 8, 20)

In [None]:
# Every row of df2: "AR mean" against "IA mean", coloured by ID (wide figure)
fig = px.scatter(df2, x="AR mean", y="IA mean", color="ID")
fig.update_layout(width=2000)
fig.show()

## PEF, FEF2575 data quality

In [None]:
def pef_for_df(df):
    """Show a 500-bin histogram of the "PEF" column of `df`, coloured by ID."""
    histogram = px.histogram(df, x="PEF", color="ID", nbins=500)
    histogram.update_layout(height=700, width=1500)
    histogram.show()

# IDs that have at least one PEF reading of exactly 300
ids = df[df.PEF == 300].ID.unique()
print(ids)
# PEF histogram over the whole data set; the per-ID variants are kept below for reference
pef_for_df(df)
# pef_for_df(df[df.ID.isin(ids)])
# df[df.ID.isin(ids)].groupby("ID").apply(pef_for_df)

['104' '106' '107' '112' '116' '122' '158' '162' '176' '202' '220' '221'
 '224' '229' '238' '265' '272' '310' '334']


In [None]:
# Rows with a PEF reading below 80
df[df.PEF < 80]

In [None]:
# FEF2575 as a percentage of PEF (L/s). The "* 100" matters: the checks that follow
# compare this column against 70 and 100 and report the result in %, and the same
# column is defined with "* 100" further down the notebook.
df["FEF2575%PEF"] = df["FEF2575"] / df["PEF (L/s)"] * 100

# Histogram of FEF2575 as a percentage of PEF
fig = px.histogram(df, x="FEF2575%PEF")
fig.show()

In [None]:
# Why are there measurements > 100? Even > 70?

# If the test is well performed, FEF2575 should never be greater than PEF,
# unless the PEF itself is very low.
# The thresholds below assume "FEF2575%PEF" is expressed in percent.
print(len(df[df["FEF2575%PEF"] > 70]) / len(df) * 100, "% are > 70%")

# Rows where FEF2575 exceeds PEF
df[df["FEF2575%PEF"] > 100]

In [None]:
# PEF should be greater than FEV1: collect the index labels of the rows that violate
# this, then display those rows
idx = df[df["PEF (L/s)"] < df["FEV1"]].index
df[df["PEF (L/s)"] < df["FEV1"]]

In [None]:
# df1 = df1.drop(idx, axis=0, inplace=False)

In [None]:
# FEF2575 should be lower than FEV1. Otherwise the test was not well performed:
# PEF was reached late -> flow increased over a long time (0.5s-0.1s) instead of 0.1s,
# with little dynamic compression of the airways.
# Show the violating rows, sorted by FEV1.
df[df.FEV1 < df.FEF2575].sort_values(by="FEV1")

# Highlight HFEV1 posterior shift

In [48]:
# ecFEV1 as a percentage of the "HFEV1 mean" column
df["ecFEV1%HFEV1"] = df["ecFEV1"] / df["HFEV1 mean"] * 100

In [49]:
# "AR mean" against "ecFEV1 % Predicted", coloured by ID (small markers).
# Alternative y axis kept for reference:
# fig = px.scatter(df1, x="AR mean", y="ecFEV1%HFEV1", color="ID")
fig = px.scatter(df, x="AR mean", y="ecFEV1 % Predicted", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1000, height=700)
fig.show()

In [50]:
import plotly.graph_objects as go

fig = go.Figure()

# One marker trace per FEV1 normalisation, each plotted against "AR mean";
# the legend entry is the column name itself.
for fev1_column in ("ecFEV1%HFEV1", "ecFEV1 % Predicted"):
    fev1_trace = go.Scatter(
        x=df["AR mean"],
        y=df[fev1_column],
        mode="markers",
        marker=dict(size=2),
        name=fev1_column,
    )
    fig.add_trace(fev1_trace)

fig.update_layout(width=1000, height=700)
# Axis titles
fig.update_xaxes(title_text="Airway resistance (%)")
fig.update_yaxes(title_text="FEV1% (prior, posterior)")
fig.show()

# Modelling FEF2575

In [11]:
# ecFEV1 against ecFEF2575, coloured by ID, on a square canvas.
# Raw (unsmoothed) variant kept for reference:
# fig = px.scatter(df, x="FEV1", y="FEF2575", color="ID")
fig = px.scatter(df, x="ecFEV1", y="ecFEF2575", color="ID")
fig.update_traces(marker=dict(size=3))
fig.update_layout(width=800, height=800)
fig.show()

In [22]:
def plot_for_df(df):
    """
    Show a scatter of "ecFEF2575" against "ecPEF (L/s)" for `df`, coloured by ID.

    The figure is built from `df` inside the function, so the result depends only
    on the argument and not on whichever `fig` an earlier cell left in the kernel.
    Returns None.
    """
    # Positional index (0..n-1), so df.ID[0] is the first row's ID if the
    # commented-out export below is re-enabled.
    df = df.reset_index()
    fig = px.scatter(df, x="ecPEF (L/s)", y="ecFEF2575", color="ID")
    # Raw (unsmoothed) alternative:
    # fig = px.scatter(df, x="PEF (L/s)", y="FEF2575", color="ID")
    fig.update_traces(marker=dict(size=3))
    # Fixed axis ranges and canvas size so the two axes have roughly the same scale
    fig.update_xaxes(range=[0, 17])
    fig.update_yaxes(range=[0, 7.5])
    fig.update_layout(width=1200, height=600)
    # fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/AR_modelling/{df.ID[0]}.pdf")
    fig.show()


# Plot for the whole data set; the per-ID variant is kept for reference
# df1.groupby(by="ID").apply(plot_for_df)
plot_for_df(df)

# FEF25-75 against AR mean

In [6]:
# new column -> (numerator column, denominator column); every ratio is stored in %
ratio_columns = {
    "FEF2575%FEV1": ("FEF2575", "FEV1"),
    "ecFEF2575%ecFEV1": ("ecFEF2575", "ecFEV1"),
    "FEF2575%PEF": ("FEF2575", "PEF (L/s)"),
    "ecFEF2575%ecPEF": ("ecFEF2575", "ecPEF (L/s)"),
}
for ratio_name, (numerator, denominator) in ratio_columns.items():
    df[ratio_name] = df[numerator] / df[denominator] * 100

In [7]:
# "AR mean" against "FEF2575%FEV1", coloured by ID
fig = px.scatter(df, x="AR mean", y="FEF2575%FEV1", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
# Fix the y-axis range to 0-250
fig.update_yaxes(range=[0, 250])
fig.show()

In [14]:
# "AR mean" against "FEF2575%PEF", coloured by ID
fig = px.scatter(df, x="AR mean", y="FEF2575%PEF", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [86]:
threshold = 1.7  # NOTE(review): not used anywhere in this cell
# Alternative y axes kept for reference:
# fig = px.scatter(df, x="AR mean", y="FEF2575%PEF", color="IA mean")
# fig = px.scatter(df, x="AR mean", y="ecFEF2575%ecPEF", color="IA mean")
# fig = px.scatter(df, x="AR mean", y="ecFEF2575 % Predicted", color="IA mean")
# fig = px.scatter(df, x="AR mean", y="FEF2575%FEV1", color="IA mean")
fig = px.scatter(df, x="AR mean", y="ecFEF2575%ecFEV1", color="IA mean")

# Stepped colour scale on "IA mean": pale yellow below t2, orange from t2 to t3,
# black from t3 upwards. t1 adds a stop inside the pale-yellow band.
t1 = 1.6
t2 = 2
t3 = 8

# Series.max() skips NaN; the builtin max() gives a NaN-position-dependent result.
ia_max = df["IA mean"].max()
# Colour-scale positions must lie in [0, 1], so clamp thresholds above the data maximum.
stop1, stop2, stop3 = (min(round(t / ia_max, 2), 1) for t in (t1, t2, t3))

colorscale = [
    [0, "#FFFFA1"],
    [stop1 * 0.99, "#FFFFA1"],
    [stop1, "#FFFFA1"],
    [stop2 * 0.99, "#FFFFA1"],
    [stop2, "orange"],
    [stop3 * 0.99, "orange"],
    [stop3, "black"],
    [1, "black"],
]
fig.update_coloraxes(colorscale=colorscale)
# fig.update_yaxes(range=[0,150])
fig.update_traces(marker=dict(size=3))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
# "AR mean" against "ecFEF2575 % Predicted", coloured by ID, y axis fixed to 0-250
fig = px.scatter(df, x="AR mean", y="ecFEF2575 % Predicted", color="ID")
fig.update_yaxes(range=[0, 250])
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# One marker trace per % predicted measure (FEF2575, then FEV1), each plotted
# against "AR mean"; the legend entry is the column name itself.
for pct_predicted_column in ("ecFEF2575 % Predicted", "ecFEV1 % Predicted"):
    pct_predicted_trace = go.Scatter(
        x=df["AR mean"],
        y=df[pct_predicted_column],
        mode="markers",
        marker=dict(size=2),
        name=pct_predicted_column,
    )
    fig.add_trace(pct_predicted_trace)

fig.update_layout(width=1000, height=700)
# Axis titles
fig.update_xaxes(title_text="Airway resistance (%)")
fig.update_yaxes(title_text="Lung function")
fig.show()

# FEF25-75%FEV1 against sampled AR
Sample AR values from the distribution to obtain a realistic spread

In [71]:
# (rows, columns) of df before the AR sampling step
df.shape

(15368, 30)

In [154]:
# --- Settings -------------------------------------------------------------
n_samples = 10                  # AR samples drawn per measurement row
bin_width = 2                   # width of the AR bins used for the statistics
y_col = 'ecFEF2575%ecFEV1'      # quantity summarised within each AR bin

AR = mh.VariableNode("Airway resistance (%)", 0, 90, 2, prior=None)

# --- Sample AR for every row ------------------------------------------------
df_sampled = df.copy()

# Renormalise each row's AR distribution so that it sums to 1
df_sampled['AR norm'] = df_sampled.apply(lambda row: row.AR / sum(row.AR), axis=1)

# Draw n_samples AR values per row from that row's normalised distribution.
# NOTE(review): AR.sample is presumably stochastic; seed it if this figure must be
# reproducible — confirm how mh.VariableNode draws its samples.
df_sampled['AR sample'] = df_sampled.apply(
    lambda row: AR.sample(n=n_samples, p=row['AR norm']), axis=1
)

# One row per sample. explode() leaves an object-dtype column, so cast it to float
# before taking min/max and binning.
df_sampled = df_sampled.explode('AR sample').reset_index()
df_sampled['AR sample'] = df_sampled['AR sample'].astype(float)

# --- Bin the samples ----------------------------------------------------------
# np.arange excludes its stop value, so extend by one bin to make the last edge reach
# the largest sample; include_lowest keeps samples equal to the first edge.
lowest_edge = np.floor(df_sampled['AR sample'].min())
highest_edge = np.ceil(df_sampled['AR sample'].max())
bin_edges = np.arange(lowest_edge, highest_edge + bin_width, bin_width)

df_sampled['AR sample binned'] = pd.cut(
    df_sampled['AR sample'],
    bins=bin_edges,
    include_lowest=True,
)

# --- Statistics of y_col within each AR bin -------------------------------------
# Series.quantile skips NaN, like mean/std/median; np.percentile would return NaN for
# any bin containing a missing y value. Both use linear interpolation by default.
df_f3 = df_sampled.groupby('AR sample binned', observed=False).agg(
    mean=(y_col, 'mean'),
    std=(y_col, 'std'),
    median=(y_col, 'median'),
    p25=(y_col, lambda x: x.quantile(0.25)),
    p75=(y_col, lambda x: x.quantile(0.75)),
    p5=(y_col, lambda x: x.quantile(0.05)),
    p95=(y_col, lambda x: x.quantile(0.95)),
).reset_index()

# Bin centre, measured from the right edge: include_lowest nudges the first bin's left
# edge slightly below the true edge, while the right edges stay exact.
df_f3['AR midbin'] = (
    df_f3['AR sample binned'].apply(lambda x: x.right - bin_width / 2).astype(float)
)

# --- Plot ------------------------------------------------------------------------
# (legend name, y values, line colour), in drawing order
summary_traces = [
    ('Mean', df_f3['mean'], 'blue'),
    ('Mean + std', df_f3['mean'] + df_f3['std'], 'red'),
    ('Mean - std', df_f3['mean'] - df_f3['std'], 'red'),
    ('5th percentile', df_f3['p5'], 'green'),
    ('95th percentile', df_f3['p95'], 'green'),
    ('25th percentile', df_f3['p25'], 'orange'),
    ('75th percentile', df_f3['p75'], 'orange'),
    ('Median', df_f3['median'], 'purple'),
]

fig = go.Figure()
for trace_name, y_values, line_colour in summary_traces:
    fig.add_trace(go.Scatter(
        x=df_f3['AR midbin'],
        y=y_values,
        mode='lines+markers',
        line=dict(color=line_colour),
        name=trace_name,
    ))

# One x tick per bin
fig.update_xaxes(
    title='Airway resistance binned (%)',
    tickvals=np.floor(df_f3['AR midbin'].to_numpy()),
)
fig.update_yaxes(title=y_col)
fig.show()





In [None]:
threshold = 1.7  # NOTE(review): not used anywhere in this cell
# Alternative plots on the unsampled frame, kept for reference:
# fig = px.scatter(df, x="AR mean", y="FEF2575%PEF", color="IA mean")
# fig = px.scatter(df, x="AR mean", y="ecFEF2575%ecPEF", color="IA mean")
# fig = px.scatter(df, x="AR mean", y="ecFEF2575 % Predicted", color="IA mean")
# fig = px.scatter(df, x="AR mean", y="FEF2575%FEV1", color="IA mean")
fig = px.scatter(df_sampled, x="AR sample", y="ecFEF2575%ecFEV1", color="IA mean")

# Stepped colour scale on "IA mean": pale yellow below t2, orange from t2 to t3,
# black from t3 upwards. t1 adds a stop inside the pale-yellow band.
t1 = 1.6
t2 = 2
t3 = 8

# Series.max() skips NaN; the builtin max() gives a NaN-position-dependent result.
ia_max = df_sampled["IA mean"].max()
# Colour-scale positions must lie in [0, 1], so clamp thresholds above the data maximum.
stop1, stop2, stop3 = (min(round(t / ia_max, 2), 1) for t in (t1, t2, t3))

colorscale = [
    [0, "#FFFFA1"],
    [stop1 * 0.99, "#FFFFA1"],
    [stop1, "#FFFFA1"],
    [stop2 * 0.99, "#FFFFA1"],
    [stop2, "orange"],
    [stop3 * 0.99, "orange"],
    [stop3, "black"],
    [1, "black"],
]
fig.update_coloraxes(colorscale=colorscale)
# fig.update_yaxes(range=[0,150])
fig.update_traces(marker=dict(size=3))
fig.update_layout(width=1200, height=800)
fig.show()

# FEF25-75 against IA

In [None]:
# "IA mean" against "ecFEF2575 % Predicted", coloured by ID
fig = px.scatter(df, x="IA mean", y="ecFEF2575 % Predicted", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
# "IA mean" against "FEF2575%PEF", coloured by ID
fig = px.scatter(df, x="IA mean", y="FEF2575%PEF", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
# "IA mean" against raw FEV1, coloured by ID
fig = px.scatter(df, x="IA mean", y="FEV1", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()