In [1]:
import src.data.breathe_data as br
import plotly.express as px
import src.modelling_fef2575.hfef2575 as hfef2575
import src.o2_fev1_analysis.smooth as smooth
import src.data.helpers as dh
import src.models.helpers as mh
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import norm
import src.modelling_fef2575.cpt_and_plots as cpt_and_plots

In [3]:
df = br.load_meas_from_excel(
    "BR_O2_FEV1_FEF2575_PEF_inferred_AR_IA_HFEV1_HO2Sat", ["AR"]
).drop(columns=["IA", "HFEV1", "HO2Sat"])
df["PEF (L/s)"] = df.PEF / 60

In [73]:
df = br.load_meas_from_excel("BR_O2_FEV1_FEF2575_PEF_Nan")
df = df.drop(columns=["PEF", "ecPEF (L/s)", "PEF (L/s)"])
df = df.dropna(subset=["FEV1", "O2 Saturation", "FEF2575"])
df["ecFEF2575%ecFEV1"] = df["FEF2575"] / df["FEV1"] * 100
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1
0,101,2019-01-25,1.31,97.0,0.54,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,99.845492,41.221374
1,101,2019-01-26,1.31,98.0,0.57,1.31,0.67,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,100.874827,43.51145
2,101,2019-01-27,1.31,96.0,0.67,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.287474,98.816157,51.145038
3,101,2019-01-28,1.3,96.0,0.69,1.31,0.69,Male,173.0,53,3.610061,97.150104,36.287474,36.01047,98.816157,53.076923
4,101,2019-01-29,1.28,98.0,0.6,1.3,0.69,Male,173.0,53,3.610061,97.150104,36.01047,35.456463,100.874827,46.875


In [4]:
# Drop rows where FEF2575 > 2*FEV1
# Typical when the person expires for only 1s. And doesn't breathe everything out.
# df = df[df.FEF2575/df.FEV1 > 2].drop(columns=['Age', 'Sex', 'Height', 'AR', 'IA mean', 'HFEV1 mean'])
# Removes 5 entries
df = df[df.FEF2575 / df.FEV1 < 2]

In [5]:
# Smoothing moved to data pipeline now
def apply_new_smoothing(df, col, scale, shift):
    """
    Works on df with NaN in col.
    """

    df[f"ec{col}"] = df[col]
    no_nan_mask = ~df[f"ec{col}"].isna()
    df.iloc[no_nan_mask] = smooth.identify_and_replace_outliers_up(
        df[no_nan_mask], f"ec{col}", scale=scale, shift=shift
    ).copy()
    df[f"ec{col}"][no_nan_mask] = smooth.smooth_vector(
        df[f"ec{col}"][no_nan_mask].to_numpy(), "max"
    )

    return df


df = (
    df.groupby(by="ID")
    .apply(lambda x: apply_new_smoothing(x, "FEV1", 3, 0.5))
    .reset_index(drop=True)
)
df = (
    df.groupby(by="ID")
    .apply(lambda x: apply_new_smoothing(x, "FEF2575", 3, 0.5))
    .reset_index(drop=True)
)
# df = (
#     df.groupby(by="ID")
#     .apply(lambda x: apply_new_smoothing(x, "PEF (L/s)", 3, 1))
#     .reset_index(drop=True)
# )

In [6]:
def calc_predicted_FEF2575_LMS_df(df):
    """
    Returns a Series with Predicted FEV1 from a DataFrame with Sex, Height, Age
    """
    df["Predicted FEF2575"] = df.apply(
        lambda x: hfef2575.calc_predicted_value_LMS_straight(
            x.Height,
            x.Age,
            x.Sex,
        )["M"],
        axis=1,
    )
    # df.apply(lambda x: sanity_checks.predicted_fev1(x["Predicted FEV1"], x.ID), axis=1)
    return df


def calc_FEVF2575_prct_predicted_df(df):
    """
    Returns input DataFrame with FEV1 % Predicted as a new column, after sanity check
    """
    df["ecFEF2575 % Predicted"] = df["ecFEF2575"] / df["Predicted FEF2575"] * 100
    df["FEF2575 % Predicted"] = df["FEF2575"] / df["Predicted FEF2575"] * 100
    # df.apply(
    #     lambda x: sanity_checks.fev1_prct_predicted(x["ecFEV1 % Predicted"], x.ID),
    #     axis=1,
    # )
    # df["FEV1 % Predicted"] = df["FEV1"] / df["Predicted FEV1"] * 100
    # df.apply(
    #     lambda x: sanity_checks.fev1_prct_predicted(x["FEV1 % Predicted"], x.ID), axis=1
    # )
    return df


# Add Healthy FEF2575
df = calc_predicted_FEF2575_LMS_df(df)
df = calc_FEVF2575_prct_predicted_df(df)

## Analysing PAP individuals to select CT scans candidates

In [None]:
# pap_ids = ["101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153","154","155","156","157","158","159","160","161","162","163","164","165","166","167","168","169","170","171","172","173","174","175","176","177","178","179","180","181","182","183","184","185","186","187","188","189","190","191","192","193","194","195","196","197","198","199","200","201","202","203","204","205","206","207","208","209","210","211","212","213","214","215","216","217","218","219","220","221","222","223","224","225","226","227","228","229","230","231","232","233","234","235","236","237","238","239","240","241","242","243","244","245","246","247","248",
# ]

# df2 = df[df.ID.isin(pap_ids)]

In [None]:
import numpy as np
import pandas as pd


def get_avg_ar_ia(df):
    avg_ar = np.nanmean(df["AR mean"])
    avg_ia = np.nanmean(df["IA mean"])
    return avg_ar, avg_ia


dftmp = df2.groupby(by="ID").apply(get_avg_ar_ia)

res = (
    dftmp.apply(pd.Series)
    .reset_index()
    .rename(columns={0: "avg ar", 1: "avg ia"})
    # .drop(columns="level_1")
)

# plot avg ar vs avg ia
fig = px.scatter(res, x="avg ar", y="avg ia", color="ID")
fig.show()

In [None]:
import numpy as np
import pandas as pd


def get_avg_ar_ia(df):
    avg_ar = np.nanmean(df["AR mean"])
    avg_ia = np.nanmean(df["IA mean"])
    return avg_ar, avg_ia


dftmp = (
    df2[df2.ID.isin(["113", "111", "215", "120", "101", "103", "237", "224", "133"])]
    .groupby(by="ID")
    .apply(get_avg_ar_ia)
)

res = (
    dftmp.apply(pd.Series)
    .reset_index()
    .rename(
        columns={
            0: "Average airway resistance for this individual (%)",
            1: "Average % of inactive alveoli for this individual",
        }
    )
    # .drop(columns="level_1")
)

# plot Average airway resistance for this indiviudal (%) vs Average % of inactive alveoli for this individual
fig = px.scatter(
    res,
    x="Average airway resistance for this individual (%)",
    y="Average % of inactive alveoli for this individual",
    color="ID",
)
fig.update_layout(title="Subset of Papworth individuals selected for CT scan analysis")
fig.show()

In [None]:
import datetime

dff = df2[df2.ID == "127"]
dff[dff["Date Recorded"] < datetime.date(2021, 7, 1)]

In [None]:
df2[df2.ID == "127"]["Date Recorded"][3410]

In [None]:
fig = px.scatter(df2, x="AR mean", y="IA mean", color="ID")
fig.update_layout(width=2000)
fig.show()

## PEF, FEF2575 data quality

In [19]:
# Using plotly create histogram of fef2575%pef
def pef_for_df(df):
    fig = px.histogram(df, x="PEF", nbins=500)
    # fig = px.histogram(df, x="FEF2575 % Predicted", nbins=500)
    fig.update_layout(width=1500, height=700)
    fig.show()
    return


ids = df[df.PEF == 300].ID.unique()
print(ids)
pef_for_df(df)

['104' '106' '107' '112' '116' '122' '158' '162' '176' '202' '220' '221'
 '224' '229' '238' '265' '272' '310' '334' '377' '387' '413' '415' '418'
 '435' '444' '459' '466' '470' '502' '506' '509' '519' '529' '554']


In [None]:
df[df.PEF < 80]

In [None]:
df["FEF2575%PEF"] = df.FEF2575 / df["PEF (L/s)"]

# Using plotly create histogram of fef2575%pef
fig = px.histogram(df, x="FEF2575%PEF")
fig.show()

In [None]:
# Why are there measurements > 100? even > 70?

# If the test is well performed, FEF2575 should never be > than PEF
# Or the PEF is super low
print(len(df[df["FEF2575%PEF"] > 70]) / len(df) * 100, "% are > 70%")

df[df["FEF2575%PEF"] > 100]

In [None]:
# PEF should be greater than FEV1
idx = df[df["PEF (L/s)"] < df["FEV1"]].index
df[df["PEF (L/s)"] < df["FEV1"]]

In [None]:
# df1 = df1.drop(idx, axis=0, inplace=False)

In [None]:
# FEF2575 should be lower than FEV1, otherwise:
# Test not well performed: PEF reached late -> flow increase over a long time (0.5s-0.1s) instead of 0.1s, little dynamic compression of the airways
df[df.FEV1 < df.FEF2575].sort_values(by="FEV1")

# Highlight HFEV1 postior shift

In [None]:
df["ecFEV1%HFEV1"] = df["ecFEV1"] / df["HFEV1 mean"] * 100

In [None]:
# fig = px.scatter(df1, x="AR mean", y="ecFEV1%HFEV1", color="ID")
fig = px.scatter(df, x="AR mean", y="ecFEV1 % Predicted", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1000, height=700)
fig.show()

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
# Add ecFEV1%HFEV1 AR mean trace
fig.add_trace(
    go.Scatter(
        x=df["AR mean"],
        y=df["ecFEV1%HFEV1"],
        mode="markers",
        marker=dict(size=2),
        name="ecFEV1%HFEV1",
    )
)
# Add ecFEV1 % Predicted AR mean trace
fig.add_trace(
    go.Scatter(
        x=df["AR mean"],
        y=df["ecFEV1 % Predicted"],
        mode="markers",
        marker=dict(size=2),
        name="ecFEV1 % Predicted",
    )
)
fig.update_layout(width=1000, height=700)
# x axis title
fig.update_xaxes(title_text="Airway resistance (%)")
fig.update_yaxes(title_text="FEV1% (prior, posterior)")
fig.show()

# FEF2575 vs FEV1, predicted, PEF

In [None]:
# fig = px.scatter(df, x="FEV1", y="FEF2575", color="ID")
fig = px.scatter(df, x="ecFEV1", y="ecFEF2575", color="ID")
fig.update_traces(marker=dict(size=3))
fig.update_layout(width=800, height=800)
fig.show()

In [None]:
def plot_for_df(df):
    df = df.reset_index()
    # fig = px.scatter(df, x="ecPEF (L/s)", y="ecFEF2575", color="ID")
    # fig = px.scatter(df1, x="PEF (L/s)", y="FEF2575", color="ID")
    fig.update_traces(marker=dict(size=3))
    # Update the layout to show othonormal axises
    fig.update_xaxes(range=[0, 17])
    fig.update_yaxes(range=[0, 7.5])
    fig.update_layout(width=1200, height=600)
    # fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/AR_modelling/{df.ID[0]}.pdf")
    fig.show()
    return


# df1.groupby(by="ID").apply(plot_for_df)
plot_for_df(df)

# FEF25-75 modelling

In [8]:
df["FEF2575%FEV1"] = df["FEF2575"] / df["FEV1"] * 100
df["ecFEF2575%ecFEV1"] = df["ecFEF2575"] / df["ecFEV1"] * 100
# df["FEF2575%PEF"] = df["FEF2575"] / df["PEF (L/s)"] * 100
# df["ecFEF2575%ecPEF"] = df["ecFEF2575"] / df["ecPEF (L/s)"] * 100

## FEF25-75 ratios against AR mean

In [6]:
fig = px.scatter(df, x="AR mean", y="FEF2575%FEV1", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
# Update y axis ranve
fig.update_yaxes(range=[0, 250])
fig.show()

In [7]:
fig = px.scatter(df, x="AR mean", y="FEF2575%PEF", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [7]:
cpt_and_plots.plot_FEF2575_ratio_with_IA(df, "AR mean", "ecFEF2575%ecFEV1")

In [None]:
fig = px.scatter(df, x="AR mean", y="ecFEF2575 % Predicted", color="ID")
fig.update_yaxes(range=[0, 250])
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
# Add ecFEV1%HFEV1 AR mean trace
fig.add_trace(
    go.Scatter(
        x=df["AR mean"],
        y=df["ecFEF2575 % Predicted"],
        mode="markers",
        marker=dict(size=2),
        name="ecFEF2575 % Predicted",
    )
)
# Add ecFEV1 % Predicted AR mean trace
fig.add_trace(
    go.Scatter(
        x=df["AR mean"],
        y=df["ecFEV1 % Predicted"],
        mode="markers",
        marker=dict(size=2),
        name="ecFEV1 % Predicted",
    )
)
fig.update_layout(width=1000, height=700)
# x axis title
fig.update_xaxes(title_text="Airway resistance (%)")
fig.update_yaxes(title_text="Lung function")
fig.show()

## FEF25-75%FEV1 against sampled AR
Sample AR values from the distribution to obtain a realistic spread

In [None]:
# pap_ids = ["101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120","121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140","141","142","143","144","145","146","147","148","149","150","151","152","153","154","155","156","157","158","159","160","161","162","163","164","165","166","167","168","169","170","171","172","173","174","175","176","177","178","179","180","181","182","183","184","185","186","187","188","189","190","191","192","193","194","195","196","197","198","199","200","201","202","203","204","205","206","207","208","209","210","211","212","213","214","215","216","217","218","219","220","221","222","223","224","225","226","227","228","229","230","231","232","233","234","235","236","237","238","239","240","241","242","243","244","245","246","247","248",
# ]

# df2 = df[df.ID.isin(pap_ids)]

In [9]:
AR = mh.VariableNode("Airway resistance (%)", 0, 90, 2, prior=None)
n_samples = 2

df_sampled, df_f3 = cpt_and_plots.get_sampled_df_and_statistics_df(df, n_samples, AR)

Max sampled AR values: 77.37


  df_sampled.groupby("AR bin")


In [12]:
cpt_and_plots.plot_FEF2575_ratio_with_IA(df_sampled, "AR sample", "ecFEF2575%ecFEV1")

### Understand bimodality

In [10]:
# Due to effect of Trikafta
# After Trikafta, FEV1 gets higher, FEF2575 doesn't because the lung damage stays. Hence FEF2575%FEV1 gets smaller
df_sampled_with_drugs = br.add_drug_therapy_to_df(df_sampled)
df_sampled_with_drugs.DrugTherapyType.value_counts()



DrugTherapyType
Trikafta     16990
Symkevi       9428
Ivacaftor     2574
None          1298
Orkambi        436
Name: count, dtype: int64

In [74]:
df_sampled_with_drugs["Trikafta"] = df_sampled_with_drugs.DrugTherapyType == "Trikafta"

In [75]:
# One ID is just one horizontal line. IDs stays in each mode.
# Can be due to data imbalance per ID.
df_sampled_with_drugs.ID.value_counts()[0:50]

def keep_firsts(df):
    df = df.copy()
    n = 400
    if len(df) > n:
        return df[0:n]

df_balanced = df_sampled_with_drugs.groupby('ID').apply(keep_firsts).drop(columns=['ID']).reset_index().drop(columns=['level_1'])

In [85]:
# fig = px.scatter(df_sampled_with_drugs, x="AR sample", y="ecFEF2575%ecFEV1", color="DrugTherapyType")
idx = 30
fig = px.scatter(
    # df_sampled_with_drugs, x="AR sample", y="ecFEV1", color="DrugTherapyType"
    # df_sampled_with_drugs[df_sampled_with_drugs.ID.isin(df_sampled_with_drugs.ID.unique()[idx:idx+10])], x="AR sample", y="ecFEF2575%ecFEV1", color="ID"
    df_sampled_with_drugs[df_sampled_with_drugs.ID == '101'], x="AR sample", y="ecFEF2575%ecFEV1", color="DrugTherapyType"
    # df_sampled_with_drugs, x="AR sample", y="ecFEF2575%ecFEV1", color="ID"
    # df_sampled_with_drugs, x="AR sample", y="ecFEF2575%ecFEV1", color="Trikafta"
    # df_balanced, x="AR sample", y="ecFEF2575%ecFEV1", color="ID"
    # df_balanced, x="AR sample", y="ecFEF2575%ecFEV1", color="Trikafta"
)
fig.update_traces(marker=dict(size=3))
fig.update_layout(width=1200, height=800)
fig.show()

In [65]:
cpt_and_plots.plot_F3_mean_and_percentiles_per_AR_bin(df_f3)

NameError: name 'y_col' is not defined

# FEF25-75 against IA

In [None]:
fig = px.scatter(df, x="IA mean", y="ecFEF2575 % Predicted", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
fig = px.scatter(df, x="IA mean", y="FEF2575%PEF", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()

In [None]:
fig = px.scatter(df, x="IA mean", y="FEV1", color="ID")
fig.update_traces(marker=dict(size=2))
fig.update_layout(width=1200, height=800)
fig.show()