In [2]:
import numpy as np
import src.models.helpers as mh
import plotly.graph_objects as go
import src.modelling_fev1.uecfev1 as uecfev1
import src.data.breathe_data as bd
import plotly.express as px
import pandas as pd
import src.data.helpers as dh
import src.models.cpts.helpers as cpth
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Load the measurement table through the project's Excel loader.
# `df` is the raw frame that the moving-average and variability cells below start from.
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")

## Pick std gauss from the FEV1 variability study (msc thesis)

In [None]:
ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 6, 0.05, prior={"type": "uniform"})
uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 6, 0.05, prior=None)
# Standard deviation of the gaussian noise used for the PDF below.
# It has to be defined in this cell: the only other definition lives much further
# down the notebook, so a fresh "Restart & Run All" would otherwise fail here.
# std_gauss = 0.3
# median std gauss is 0.068
std_gauss = 0.068

# Plot the PDF of ecFEV1 given the middle bin of uecFEV1
uecfev1_bin = uecFEV1.get_bins_arr()[uecFEV1.card // 2]
print("uecFEV1 bin", uecfev1_bin)
# Evaluate the PDF at the centre of every ecFEV1 bin
pdf = np.zeros(ecFEV1.card)
for i, z in enumerate(ecFEV1.midbins):
    pdf[i] = uecfev1.PDF_conv_uni_gausian(z, uecfev1_bin[0], uecfev1_bin[1], std_gauss)
# Normalise so that the bin values sum to 1
pdf /= np.sum(pdf)
# Plot pdf with graph objects library
fig = go.Figure()
fig.add_trace(go.Bar(x=ecFEV1.midbins, y=pdf))
title = f"P(ecFEV1 | uecFEV1={uecfev1_bin} L)"
fig.update_layout(title=title, height=300, width=650)
fig.update_xaxes(title_text=ecFEV1.name)
fig.show()

### Boxplot of individual-level std

In [None]:
# Pick healthy individuals
def get_std(df):
    """
    Standard deviation of `df`, reported only when enough values are available.

    Returns `df.std()` when `df` holds more than 10 entries (the length is
    taken with `len`, so missing values count too); otherwise returns NaN.
    """
    return df.std() if len(df) > 10 else np.nan


def get_boxplot_of_std_per_healthy_individual(
    df, health_threshold=80, ecFEV1_col="ecFEV1"
):
    """
    Show a box plot of the per-individual standard deviation of `ecFEV1_col`.

    Individuals are kept when their mean "ecFEV1 % Predicted" is strictly above
    `health_threshold`. For each kept individual the std of `ecFEV1_col` is
    computed with `get_std` (NaN results are dropped). The mean and median of
    those stds are printed and a horizontal box plot is displayed.

    Returns the measurements of the kept individuals (not the stds).
    """
    # Health criterion: average % predicted value per individual
    mean_pct_predicted = (
        df.groupby("ID")["ecFEV1 % Predicted"].agg("mean").sort_values()
    )
    healthy_ids = mean_pct_predicted[mean_pct_predicted > health_threshold].index
    df_healthy = df[df["ID"].isin(healthy_ids)]

    # One std per individual, individuals without a usable std are discarded
    per_individual = df_healthy.groupby("ID")[ecFEV1_col]
    stds = per_individual.agg(get_std).dropna().sort_values()

    print(f"Average std: {stds.mean()}")
    print(f"Median std: {stds.median()}")

    fig = px.box(stds, x=ecFEV1_col, orientation="h")
    title = f"Boxplot of individual-level std ({len(stds)} individuals)"
    fig.update_layout(
        title=title,
        height=180,
        width=500,
        showlegend=False,
        font=dict(size=9),
        plot_bgcolor="white",
    )
    # Dense ticks, light grey grid and a black frame on the x axis
    fig.update_xaxes(
        tick0=0,
        dtick=0.05,
        showgrid=True,
        gridwidth=1,
        gridcolor="lightgrey",
        linecolor="black",
        linewidth=1,
        mirror=True,
        title=f"{ecFEV1_col} std",
    )
    # Same black frame on the y axis
    fig.update_yaxes(linecolor="black", linewidth=1, mirror=True)
    fig.show()
    return df_healthy


# Median std: 0.1
# df_healhiest = get_boxplot_of_std_per_healthy_individual(df)

### Implement FEV1 variability model

Model from the msc thesis: noise = measurement - signal

In [None]:
# Load data
# The cells below read the "MAecFEV1" column from this frame.
# NOTE(review): the "_MA_31_7" suffix presumably encodes the window (31) and threshold (7)
# used to pre-compute that column - confirm against the commented-out save cell further down.
df_ma = bd.load_meas_from_excel(
    "BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx_MA_31_7"
)

In [None]:
# Preview the first rows of the loaded frame
df_ma.head()

In [None]:
# Settings passed to moving_average_for_ID below:
# w is the width of the centred window in days,
# t is the number of closest measurements that get averaged.
w = 31
t = 7

In [None]:
# SKIP IF ALREADY LOADED THE DATA
# Noise model
def moving_average_for_ID(df, w, t, fev1_col="FEV1"):
    """
    Add a centred moving average of `fev1_col` to one individual's measurements.

    For every row, the measurements recorded within (w - 1) / 2 days on either
    side of that row's "Date Recorded" are collected (bounds included). When
    strictly more than `t` measurements fall inside the window, the result is
    the mean of the `t` measurements closest in time to the row (the row itself
    included); otherwise the result is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements with a "Date Recorded" column and a `fev1_col` column.
    w : int
        Window width in days.
    t : int
        Number of closest measurements averaged; the window must contain more
        than `t` measurements for an average to be produced.
    fev1_col : str
        Name of the column to smooth.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with an additional f"MA{fev1_col}" column. The input
        frame is left untouched: this function is called through
        `groupby(...).apply`, and pandas does not support functions that mutate
        the group they receive.
    """
    half_window = pd.Timedelta(days=(w - 1) / 2)
    dates = df["Date Recorded"]

    def moving_average(row):
        """
        Mean of the t measurements closest to this row's date, or NaN.
        """
        date = row["Date Recorded"]
        in_window = (dates >= date - half_window) & (dates <= date + half_window)
        window = df.loc[in_window, ["Date Recorded", fev1_col]]
        # Not enough neighbours to compute a meaningful average
        if len(window) <= t:
            return np.nan
        # `assign` works on a copy, so nothing is written back onto a slice of df.
        # The stable sort makes the choice between equidistant measurements
        # deterministic (rows keep their original relative order).
        closest_first = window.assign(
            **{"N days away": (window["Date Recorded"] - date).abs()}
        ).sort_values("N days away", kind="mergesort")
        return closest_first[fev1_col].head(t).mean()

    # Compute moving average and return it on a new frame
    return df.assign(**{f"MA{fev1_col}": df.apply(moving_average, axis=1)})


# Compute the moving average separately for every individual (one group per "ID").
# The commented line is the same call for the "ecFEV1" column.
# df_out = df.groupby("ID").apply(lambda df: moving_average_for_ID(df, w, t, "ecFEV1"))
df_out = df.groupby("ID").apply(lambda df: moving_average_for_ID(df, w, t, "FEV1"))
# Flatten the (ID, level_1) index produced by groupby/apply back into an "ID" column.
# NOTE(review): this relies on each group still carrying its "ID" column and on the
# second index level being named "level_1" - verify with the installed pandas version.
df_out = df_out.drop(columns=["ID"]).reset_index().drop(columns=["level_1"])

In [None]:
# Save df
# df_ma.to_excel(
#     f"{dh.get_path_to_main()}ExcelFiles/BR/BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx_MA_{w}_{t}.xlsx",
#     index=False,
# )

In [None]:
# Attach the freshly computed "MAFEV1" column to df_ma.
# Rows are matched on the (ID, Date Recorded) pair with the default inner join,
# so rows without a counterpart in df_out are dropped.
df_ma = df_ma.merge(
    df_out[["ID", "Date Recorded", "MAFEV1"]], on=["ID", "Date Recorded"]
)

In [None]:
# Difference between the moving average and the raw FEV1 value,
# with missing values removed and sorted in ascending order.
diff = (df_ma["MAFEV1"] - df_ma["FEV1"]).dropna().sort_values()
diff

In [None]:
# Using plotly go, scatter the raw values together with their moving average

# The raw column and its moving-average column are derived from a single setting,
# so the plotted points, trace names, axis label and title always refer to the
# same quantity.
fev1_col = "FEV1"
# fev1_col = "ecFEV1"
mafev1_col = f"MA{fev1_col}"

for patient_id in ["101"]:  # df_ma["ID"].unique():
    dftmp = df_ma[df_ma["ID"] == patient_id]
    # Measurements that have a moving average vs. those that do not
    df_smoothed = dftmp.dropna(subset=[mafev1_col])
    df_isna = dftmp[dftmp[mafev1_col].isna()]
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=dftmp["Date Recorded"],
            y=dftmp[mafev1_col],
            mode="markers",
            name="Moving average",
            marker=dict(color="red", size=6),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=df_smoothed["Date Recorded"],
            y=df_smoothed[fev1_col],
            mode="markers",
            name=f"Initial {fev1_col} values",
            marker=dict(color="blue", size=3),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=df_isna["Date Recorded"],
            y=df_isna[fev1_col],
            mode="markers",
            name="Values excluded from the moving average",
            marker=dict(color="grey", size=3),
        )
    )
    title = (
        f"{patient_id} - Moving average of {fev1_col} with window {w} and threshold {t}"
    )
    fig.update_layout(
        title=title,
        xaxis_title="Date",
        yaxis_title=f"{fev1_col} (L)",
        width=1600,
        height=500,
        font=dict(size=16),
    )
    # Put legend on top
    fig.update_layout(legend=dict(y=1.1, orientation="h"))
    # fig.write_image(
    #     f"{dh.get_path_to_main()}PlotsBreathe/FEV1_modelling/Moving averages/{title}.pdf"
    # )
    fig.show()

In [None]:
# IMPORTANT
# This cell overwrites df_ma with a filtered version; every later cell works on the
# filtered frame. Reload df_ma to get the unfiltered measurements back.

# Moving-average column whose missing values decide which measurements are dropped
mafev1_col = "MAecFEV1"
# mafev1_col = "MAFEV1"

print(f"{len(df_ma)} measurements, {df_ma.ID.nunique()} individuals")
# Drop the measurements that have no moving average
df_ma = df_ma.dropna(subset=[mafev1_col])
print(
    f"{len(df_ma)} measurements, {df_ma.ID.nunique()} individuals after dropping NA values in the moving average"
)
# Keep only the individuals with more than 10 remaining measurements
# (an individual with exactly 10 measurements is dropped as well)
df_ma = df_ma.groupby("ID").filter(lambda x: len(x) > 10)
print(
    f"{len(df_ma)} measurements, {df_ma.ID.nunique()} individuals after dropping individuals with less than 10 values"
)

In [None]:
# Absolute distance between each ecFEV1 measurement and its moving average
# NOTE(review): the box plots below take the std of these *absolute* residuals, which is
# not the same as the std of the signed residuals (measurement - moving average) -
# confirm this is the intended noise estimate.
df_ma["MAecFEV1 residuals"] = (df_ma["ecFEV1"] - df_ma["MAecFEV1"]).abs()
# A threshold of 0 keeps every individual whose mean "ecFEV1 % Predicted" is above 0
df_ma_healthiest = get_boxplot_of_std_per_healthy_individual(
    df_ma, 0, "MAecFEV1 residuals"
)
# Same analysis for the FEV1 column
df_ma["MAFEV1 residuals"] = (df_ma["FEV1"] - df_ma["MAFEV1"]).abs()
# Overwrites the df_ma_healthiest returned by the first call
df_ma_healthiest = get_boxplot_of_std_per_healthy_individual(
    df_ma, 0, "MAFEV1 residuals"
)

### How wrong is the additive std gauss?

#### Overall plot

In [None]:
# Compute mean and std per individual
def get_df_ids_from_df_ma(df_ma, fev1_col, max_std=0.3):
    """
    Summarise the moving-average residuals per individual.

    Parameters
    ----------
    df_ma : pandas.DataFrame
        Measurements with an "ID" column, a `fev1_col` column and a
        f"MA{fev1_col} residuals" column.
    fev1_col : str
        Name of the measurement column, e.g. "FEV1" or "ecFEV1".
    max_std : float, default 0.3
        Individuals whose residual std is not strictly below this value are
        removed as outliers. Individuals for which `get_std` returns NaN are
        removed as well, because NaN never compares as smaller.

    Returns
    -------
    pandas.DataFrame
        One row per kept individual with the columns "ID",
        f"MA{fev1_col} residuals std" and f"Mean {fev1_col}".
    """
    std_col = f"MA{fev1_col} residuals std"
    mean_col = f"Mean {fev1_col}"

    per_id = df_ma.groupby("ID")
    # Both aggregates are indexed by ID, so building the frame from the two
    # Series aligns them on the ID rather than on their position.
    df_ids = (
        pd.DataFrame(
            {
                std_col: per_id[f"MA{fev1_col} residuals"].agg(get_std),
                mean_col: per_id[fev1_col].mean(),
            }
        )
        .rename_axis("ID")
        .reset_index()
    )
    # Remove outliers
    return df_ids[df_ids[std_col] < max_std]

In [None]:
# Create scatter plot
# Column analysed in this figure; every column name and label below is derived from it
# (set it to "FEV1" to look at the FEV1 residuals instead).
fev1_col = "ecFEV1"
df_ids = get_df_ids_from_df_ma(df_ma, fev1_col)

fig = px.scatter(
    df_ids,
    x=f"Mean {fev1_col}",
    y=f"MA{fev1_col} residuals std",
)

title = "How wrong is additive noise? (removed one outlier)"

# Update x-axis and y-axis properties
fig.update_xaxes(
    title_text=f"Mean {fev1_col} (L)",
    range=[0, 6],
    showgrid=True,
    gridwidth=1,
    gridcolor="lightgrey",
    linecolor="black",
    mirror=True,
)
fig.update_yaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor="lightgrey",
    linecolor="black",
    mirror=True,
    title_text=f"Individual-level standard deviation<br>of the {fev1_col} moving average residuals (L)",
)

# Update marker size and layout
fig.update_traces(marker=dict(size=4))
fig.update_layout(plot_bgcolor="white", width=600, height=450, title=title)

# Show the figure
fig.show()

#### Stratify by different FEV1 levels, and check the avg std

In [None]:
# Measurement column analysed in this cell (alternative: "ecFEV1")
# fev1_col = "ecFEV1"
fev1_col = "FEV1"

fev1_prct_pred_col = f"{fev1_col} % Predicted"
ma_fev1_res_std_col = f"MA{fev1_col} residuals std"
fev1_cat_col = f"{fev1_col} category"

# Keep the measurements whose % predicted value is above the threshold
health_threshold = 0
df1 = df_ma[df_ma[fev1_prct_pred_col] > health_threshold]

# Cut the data in 3 bins of FEV1 values, from the observed minimum to the observed
# maximum (both rounded outwards to 0.1 L). The bounds get their own names so that
# the builtins `min` and `max` stay usable in the rest of the notebook.
fev1_min = np.floor(df1[fev1_col].min() * 10) / 10
fev1_max = np.ceil(df1[fev1_col].max() * 10) / 10
bins = [fev1_min, 2, 3, fev1_max]
print(bins)
cat1 = f"[{bins[0]}; {bins[1]}) L"
cat2 = f"[{bins[1]}; {bins[2]}) L"
cat3 = f"[{bins[2]}; {bins[3]}) L"
cat1_midbin = (bins[0] + bins[1]) / 2
cat2_midbin = (bins[1] + bins[2]) / 2
cat3_midbin = (bins[2] + bins[3]) / 2

# One row per individual: std of the moving-average residuals and mean FEV1
df_std = get_df_ids_from_df_ma(df1, fev1_col)

# print median and mean for the std
print(f"Median std: {df_std[ma_fev1_res_std_col].median()}")
print(f"Mean std: {df_std[ma_fev1_res_std_col].mean()}")

# Cut the avg FEV1 in 3 bins. right=False makes the intervals left-closed and
# right-open, which is what the "[a; b)" labels announce.
# `assign` returns a new frame, so nothing is written onto a filtered slice.
df_std = df_std.assign(
    **{
        fev1_cat_col: pd.cut(
            df_std[f"Mean {fev1_col}"], bins, labels=[cat1, cat2, cat3], right=False
        )
    }
)
# Number of individuals per category (empty categories are reported as 0)
value_counts = df_std[fev1_cat_col].value_counts()
cat1_val = f"{cat1}<br>(#{value_counts[cat1]})"
cat2_val = f"{cat2}<br>(#{value_counts[cat2]})"
cat3_val = f"{cat3}<br>(#{value_counts[cat3]})"
# Rename the categories so that the labels carry the individual counts
df_std[fev1_cat_col] = df_std[fev1_cat_col].cat.rename_categories(
    {cat1: cat1_val, cat2: cat2_val, cat3: cat3_val}
)

# Using px plot boxplots for each category
title = f"How wrong is the additive std for {fev1_col}?"
# title = f"How wrong is the additive std for {fev1_col}?<br>(individuals with {health_threshold}%+ avg predicted FEV1)"
fig = px.box(
    df_std,
    x=fev1_cat_col,
    y=ma_fev1_res_std_col,
    color=fev1_cat_col,
    title=title,
)

# Enforce cat order on plot
fig.update_xaxes(categoryorder="array", categoryarray=[cat1_val, cat2_val, cat3_val])
# One fixed colour per category. Iterating over the traces that actually exist
# avoids an IndexError when a category holds no individual.
cat_colors = {cat1_val: "#EF553B", cat2_val: "#AB63FA", cat3_val: "#636EFA"}
for trace in fig.data:
    trace.marker.color = cat_colors.get(trace.name, "#00CC96")


# Hide legend
fig.update_layout(showlegend=False, width=500, height=400)
fig.show()
# Write image
fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/FEV1_modelling/{title}.pdf")

In [None]:
# UNUSED
# NOTE(review): running this cell overwrites df1, bins, cat1/cat2, cat1_midbin/cat2_midbin,
# df_std, value_counts, title and fig, and rebinds the builtins `min` and `max`.
# The later "Compute median of stds for each category" cell reads df_std and the
# cat*_midbin values, so executing this cell first changes what that cell sees.

# Same analysis restricted to one individual (ID "108")
df1 = df1[df1.ID == "108"]

# Cut the data in 2 bins of ecFEV1 values, split at 2.4 L, between the observed
# minimum and maximum (rounded outwards to 0.1 L)
min = np.floor(df1.ecFEV1.min() * 10) / 10
max = np.ceil(df1.ecFEV1.max() * 10) / 10
bins = [min, 2.4, max]
print(bins)
cat1 = f"[{bins[0]}; {bins[1]}) L"
cat2 = f"[{bins[1]}; {bins[2]}) L"
# cat3 = f"[{bins[2]}; {bins[3]}) L"
cat1_midbin = (bins[0] + bins[1]) / 2
cat2_midbin = (bins[1] + bins[2]) / 2
# cat3_midbin = (bins[2] + bins[3]) / 2

# Create dataframe with the mean ecFEV1 per individual
s_std = df1.groupby("ID")["ecFEV1"].mean()
df_std = pd.DataFrame(s_std)
# Compute mean of predicted FEV1
df_std["Mean ecFEV1%"] = df1.groupby("ID")["ecFEV1 % Predicted"].mean()
# Rename column
df_std = df_std.rename(columns={"ecFEV1": "Mean ecFEV1"})
# Compute the std of the raw ecFEV1 values for those individuals
df_std["individual-level std<br>of ecFEV1 measurement"] = df1.groupby("ID")[
    "ecFEV1"
].std()
# Cut the avg ecFEV1 in 2 bins
df_std["ecFEV1 category"] = pd.cut(df_std["Mean ecFEV1"], bins, labels=[cat1, cat2])
value_counts = df_std.value_counts("ecFEV1 category")
cat1_val = f"{cat1}<br>(#{value_counts[cat1]})"
cat2_val = f"{cat2}<br>(#{value_counts[cat2]})"
# cat3_val = f"{cat3}<br>(#{value_counts[cat3]})"
# Rename the bins in ecFEV1 category so that the labels carry the counts
df_std["ecFEV1 category"] = df_std["ecFEV1 category"].replace(
    {cat1: cat1_val, cat2: cat2_val}
)

# Using px plot boxplots for each category
# fig = px.box(dftmp, x="ecFEV1 category", y="ecFEV1", color="ecFEV1 category", title="ecFEV1 distribution per FEV1 category")
# health_threshold is read from an earlier cell
title = f"How wrong is the additive std for ecFEV1?<br>(individuals with {health_threshold}%+ avg predicted FEV1)"
fig = px.box(
    df_std,
    x="ecFEV1 category",
    y="individual-level std<br>of ecFEV1 measurement",
    color="ecFEV1 category",
    title=title,
)

# Enforce cat order on plot
fig.update_xaxes(categoryorder="array", categoryarray=[cat1_val, cat2_val])
# Apply a fixed colour per category (assumes one trace per category exists)
for i in range(2):
    if fig.data[i].name == cat1_val:
        fig.data[i].marker.color = "#EF553B"
    elif fig.data[i].name == cat2_val:
        fig.data[i].marker.color = "#AB63FA"


# Hide legend
fig.update_layout(showlegend=False, width=500, height=400)
fig.show()
# Write image
fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/FEV1_modelling/{title}.pdf")

In [None]:
# UNUSED
# NOTE(review): running this cell overwrites health_threshold, df1, bins, cat1..cat4,
# the cat*_midbin values, df_std, value_counts, title and fig, and rebinds the builtins
# `min` and `max`. The later "Compute median of stds for each category" cell reads
# df_std and the cat*_midbin values, so executing this cell first changes what it sees.

# Average % predicted ecFEV1 per individual, computed on the raw frame df
stmp = df.groupby("ID")["ecFEV1 % Predicted"].agg("mean").sort_values()

# Remove unhealthy individuals (mean % predicted not above the threshold)
health_threshold = 80
ids_healthy = stmp[stmp > health_threshold].index
df1 = df[df.ID.isin(ids_healthy)]

# Keep only the individuals with more than 10 measurements
df1 = df1.groupby("ID").filter(lambda x: len(x) > 10)

# Cut the data in 4 bins of ecFEV1 values between the observed minimum and maximum
# (rounded outwards to 0.1 L)
min = np.floor(df1.ecFEV1.min() * 10) / 10
max = np.ceil(df1.ecFEV1.max() * 10) / 10
bins = [min, 2.5, 3, 4, max]
cat1 = f"[{bins[0]}; {bins[1]}) L"
cat2 = f"[{bins[1]}; {bins[2]}) L"
cat3 = f"[{bins[2]}; {bins[3]}) L"
cat4 = f"[{bins[3]}; {bins[4]}) L"
cat1_midbin = (bins[0] + bins[1]) / 2
cat2_midbin = (bins[1] + bins[2]) / 2
cat3_midbin = (bins[2] + bins[3]) / 2
cat4_midbin = (bins[3] + bins[4]) / 2

# Create dataframe with the mean ecFEV1 per individual (column keeps the name "ecFEV1")
s_std = df1.groupby("ID")["ecFEV1"].mean()
df_std = pd.DataFrame(s_std)
# Compute the std of the raw ecFEV1 values for those individuals
df_std["individual-level std<br>of ecFEV1 measurement"] = df1.groupby("ID")[
    "ecFEV1"
].std()
# Cut the avg ecFEV1 in 4 bins
df_std["ecFEV1 category"] = pd.cut(
    df_std["ecFEV1"], bins, labels=[cat1, cat2, cat3, cat4]
)
value_counts = df_std.value_counts("ecFEV1 category")
cat1_val = f"{cat1}<br>(#{value_counts[cat1]})"
cat2_val = f"{cat2}<br>(#{value_counts[cat2]})"
cat3_val = f"{cat3}<br>(#{value_counts[cat3]})"
cat4_val = f"{cat4}<br>(#{value_counts[cat4]})"
# Rename the bins in ecFEV1 category so that the labels carry the counts
df_std["ecFEV1 category"] = df_std["ecFEV1 category"].replace(
    {cat1: cat1_val, cat2: cat2_val, cat3: cat3_val, cat4: cat4_val}
)

# Using px plot boxplots for each category
# fig = px.box(dftmp, x="ecFEV1 category", y="ecFEV1", color="ecFEV1 category", title="ecFEV1 distribution per FEV1 category")
title = f"How wrong is the additive std for ecFEV1?<br>(individuals with {health_threshold}%+ avg predicted FEV1)"
fig = px.box(
    df_std,
    x="ecFEV1 category",
    y="individual-level std<br>of ecFEV1 measurement",
    color="ecFEV1 category",
    title=title,
)

# Enforce cat order on plot
fig.update_xaxes(
    categoryorder="array", categoryarray=[cat1_val, cat2_val, cat3_val, cat4_val]
)
# Apply a fixed colour per category (assumes one trace per category exists)
for i in range(4):
    if fig.data[i].name == cat1_val:
        fig.data[i].marker.color = "#EF553B"
    elif fig.data[i].name == cat2_val:
        fig.data[i].marker.color = "#AB63FA"
    elif fig.data[i].name == cat3_val:
        fig.data[i].marker.color = "#636EFA"
    else:
        fig.data[i].marker.color = "#00CC96"


# Hide legend
fig.update_layout(showlegend=False, width=500, height=400)
fig.show()
# Write image
fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/FEV1_modelling/{title}.pdf")

In [None]:
# Inspect the per-individual frames.
# Only the last expression of a cell is displayed, so this shows df_std only.
df_ids
df_std

In [None]:
# Compute median of stds for each category
import statsmodels.api as sm

fev1_col = "FEV1"
# fev1_col = "ecFEV1"

# Column holding the per-individual noise estimate
# std_col = f"individual-level std<br>of {fev1_col} measurement"
std_col = f"MA{fev1_col} residuals std"

df_ids = df_std
stds_by_category = df_ids.groupby(f"{fev1_col} category")[std_col]
# median_col = f"Median of the individual-level std<br>of the {fev1_col} residuals (L)"
median_col = "Noise per individual (L)<br>(Q1, median, Q3)"
# Median and quartiles of the individual-level std within each category
df_cats = pd.DataFrame(
    {
        median_col: stds_by_category.agg("median"),
        "Q1": stds_by_category.agg(lambda x: np.percentile(x, 25)),
        "Q3": stds_by_category.agg(lambda x: np.percentile(x, 75)),
    }
)
# Lengths of the lower and upper error bars around the median
df_cats["Q1 err"] = df_cats[median_col] - df_cats["Q1"]
df_cats["Q3 err"] = df_cats["Q3"] - df_cats[median_col]
df_cats = df_cats.reset_index()

# Add category midbins (one value per category, in category order)
df_cats[f"{fev1_col} category midbin"] = [cat1_midbin, cat2_midbin, cat3_midbin]
# df_cats[f"{fev1_col} category midbin"] = [cat1_midbin, cat2_midbin, cat3_midbin, cat4_midbin]

# Fit a trendline (median std as a linear function of the midbin) using statsmodels
X = sm.add_constant(df_cats[f"{fev1_col} category midbin"])
model = sm.OLS(df_cats[median_col], X).fit()
df_cats["trendline"] = model.predict(X)
print(model.summary())
# Fitted [intercept, slope]
print(np.asarray(model.params))

# Scatter plot with median of stds
title = f"Individual-level std of {fev1_col} measurements"
fig = px.scatter(
    df_cats,
    x=f"{fev1_col} category midbin",
    y=median_col,
    color=f"{fev1_col} category",
    title=title,
    error_y="Q3 err",
    error_y_minus="Q1 err",
)
# Use same color for every category of the scatter plot
for trace in fig.data:
    trace.marker.color = "#636EFA"

# Draw the line fitted above, so that the plotted trend always matches the selected
# fev1_col instead of coefficients copied from an earlier run.
# Values recorded from earlier runs, as [intercept, slope]:
# Best fit for ecFEV1
# [0.03032977 0.00510174]
# Best fit for FEV1
# [0.03396603 0.00527939]
fig.add_traces(
    px.line(df_cats, x=f"{fev1_col} category midbin", y="trendline")
    .update_traces(line=dict(color="black", dash="dot"))
    .data
)

# Replace x axis labels by the {fev1_col} category labels
fig.update_xaxes(
    title_text=f"Mean {fev1_col} group (L)<br>(#individuals)",
    tickvals=df_cats[f"{fev1_col} category midbin"],
    ticktext=df_cats[f"{fev1_col} category"],
)
# Remove legend
fig.update_layout(showlegend=False, width=650, height=400)
fig.show()
df_cats

## Validate uniform x gaussian

In [None]:
uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 6, 0.05, prior=None)
ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 6, 0.05, prior={"type": "uniform"})

# uecFEV1 bin used for the validation.
# NOTE(review): `bin` shadows the Python builtin; the name is kept because the
# plotting cell below reads it.
bin_idx = 60
bin = uecFEV1.get_bins_arr()[bin_idx]
print(f"bins uecFEV1: {bin}")

# Fix the seed so that the sampled histogram is the same on every run.
# NOTE(review): this makes np.random.normal reproducible; sample_from_bin is only
# covered if it also draws from numpy's global generator - confirm.
np.random.seed(0)

# Sample from bin
fev1_means = uecFEV1.sample_from_bin(bin, 5000000)
# Add gaussian noise whose std is given by sigma_fn of the sampled value
ecfev1_vals = np.random.normal(fev1_means, uecfev1.sigma_fn(fev1_means))

# Numerical solution, evaluated for every ecFEV1 bin:
# cpt_dbl integrates over the ecFEV1 bin, cpt_s evaluates the PDF at the bin centre
cpt_dbl = np.zeros(ecFEV1.card)
cpt_s = np.zeros(ecFEV1.card)
for i, z in enumerate(ecFEV1.get_bins_arr()):
    cpt_dbl[i] = uecfev1.p_uniform_x_gauss_add_mult_noise(
        z[0], z[1], bin[0], bin[1], abserr_tol=1e-8
    )
    cpt_s[i] = uecfev1.PDF_conv_uni_gausian_add_mult(
        (z[0] + z[1]) / 2, bin[0], bin[1], abserr_tol=1e-8
    )

# Normalise both so that they sum to 1
cpt_dbl /= np.sum(cpt_dbl)
cpt_s /= np.sum(cpt_s)

In [None]:
# Plot the sampled distribution next to the two numerical solutions
fig = go.Figure()
xbins = dict(start=0, end=6, size=0.05)

fig.add_trace(
    go.Histogram(x=ecfev1_vals, xbins=xbins, histnorm="probability", name="Sampling")
)
fig.add_trace(go.Bar(x=ecFEV1.midbins, y=cpt_dbl, name="Double integral"))
fig.add_trace(go.Bar(x=ecFEV1.midbins, y=cpt_s, name="Midbin approximation"))

# Print diff between sampling and double integral
# print(f"Diff between sampling and double integral: {cpt_dbl - cpt_s}")

title = f"ecFEV1 noise for uecFEV1 = {bin}L"
fig.update_layout(title=title, height=300, width=700)
# Zoom on the region around the selected bin
fig.update_xaxes(title_text=ecFEV1.name, range=[2.7, 3.3])
fig.show()

In [None]:
# Show the error between the double integral and the sampling:
# bin the sampled values with the ecFEV1 node (normalised), then subtract bin by bin
ecfev1_binned = ecFEV1.bin_up(ecfev1_vals, normalise=True)
cpt_dbl - ecfev1_binned

## Compute CPT with additive (symmetric) gauss noise

In [None]:
# Wrong: this is the variability between the 5th and 95th percentiles
# std_gauss = 0.23
# Median std from the variability analysis
std_gauss = 0.068

In [None]:
ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 6, 0.05, prior={"type": "uniform"})
uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 6, 0.05, prior=None)
# ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 6, 1, prior=None)
# uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 6, 1, prior=None)

# Select a bin that is not troubled by the borders (the middle one)
uecfev1_bin = uecFEV1.get_bins_arr()[uecFEV1.card // 2]
print("uecFEV1 bin", uecfev1_bin)
# Evaluate the PDF at the centre of every ecFEV1 bin
pdf = np.zeros(ecFEV1.card)
for i, z in enumerate(ecFEV1.midbins):
    pdf[i] = uecfev1.PDF_conv_uni_gausian_additive(
        z, uecfev1_bin[0], uecfev1_bin[1], std_gauss, abserr_tol=1e-8
    )
# Norm the pdf
pdf /= np.sum(pdf)
pdftmp = pdf
pdf = mh.get_p_in_log(uecFEV1, pdftmp)

# The same PDF will be shifted across all uecFEV1/ ecFEV1 pairs
# When hitting a border, the PDF will be truncated
cpt = np.zeros((ecFEV1.card, uecFEV1.card))
pdf_peak_idx = uecFEV1.card // 2
for uecFEV1_idx, uecfev1_bin in enumerate(uecFEV1.get_bins_arr()):
    # Number of bins by which the reference PDF has to be moved so that its peak
    # lands on the current uecFEV1 bin
    peak_diff = pdf_peak_idx - uecFEV1_idx
    # Always fill a fresh array: binding pdf_trunc to `pdf` itself would let the
    # in-place normalisation below rescale the reference PDF
    pdf_trunc = np.zeros(len(pdf))
    if peak_diff == 0:
        pdf_trunc[:] = pdf
    elif peak_diff > 0:
        # Shift towards lower values, the left tail is cut off
        pdf_trunc[0:-peak_diff] = pdf[peak_diff:]
    else:
        # Shift towards higher values, the right tail is cut off
        pdf_trunc[-peak_diff:] = pdf[:peak_diff]
    # Norm the (possibly truncated) pdf
    pdf_trunc /= np.sum(pdf_trunc)
    cpt[:, uecFEV1_idx] = pdf_trunc

In [None]:
import src.inference.helpers as ih

# Import make subplots
from plotly.subplots import make_subplots

In [None]:
# Plot pdf with bins in ecFEV1
# NOTE(review): the figure is created with two columns while a single histogram is
# added; the trailing 1, 1 passed to plot_histogram presumably select row 1 / col 1 - confirm.
fig = make_subplots(rows=1, cols=2)
ih.plot_histogram(fig, ecFEV1, pdf, ecFEV1.a, 1, 1, 1)
# Logarithmic x axis for the subplot in row 1, col 1
fig.update_xaxes(type="log", row=1, col=1)
fig.show()

In [None]:
# Both modules are already imported under the same aliases in the first cell
import src.models.cpts.helpers as cpth
import src.data.helpers as dh

# Plot the CPT computed above with the project's 2D CPT plotting helper
fig, title = cpth.plot_2d_cpt(
    cpt,
    ecFEV1,
    uecFEV1,
    height=5500,
    y_label_two_lines=True,
    p_range=[0, 0.4],
    vspace=0.003,
    invert=False,
)
# Extend the title returned by the helper with the noise level that was used
title = title + f" for an ecFEV1 variability of {std_gauss}"
fig.update_layout(title=title)
fig.show()

# fig.write_image(f"{dh.get_path_to_main()}PlotsBreathe/CPTs/{title}.pdf")

In [None]:
# Save the additive-noise CPT; the suffix records the std that was used
cpth.save_cpt([ecFEV1, uecFEV1], cpt, suffix=f"_std_{std_gauss}_log")

## Compute CPT with multiplicative (asymmetric) gauss noise

In [None]:
ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 6, 0.05, prior={"type": "uniform"})
uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 6, 0.05, prior=None)
# ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 7, 0.2, prior=None)
# uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 7, 0.2, prior=None)

# cpt[row, col]: row indexes the ecFEV1 bin, col indexes the uecFEV1 bin
cpt = np.zeros((ecFEV1.card, uecFEV1.card))
# Every (uecFEV1 bin, ecFEV1 bin) pair is integrated separately
# Original author's notes:
#   Can't integrate from uecFEV1 = 0 to 0.05 because error is too big
#   I have to go through all states, because the std is prop to the value of uecFEV1
for col, uecfev1_bounds in enumerate(uecFEV1.get_bins_arr()):
    u_low, u_up = uecfev1_bounds[0], uecfev1_bounds[1]
    for row, ecfev1_bounds in enumerate(ecFEV1.get_bins_arr()):
        cpt[row, col] = uecfev1.p_uniform_x_gauss_add_mult_noise(
            ecfev1_bounds[0], ecfev1_bounds[1], u_low, u_up, abserr_tol=1e-8
        )
        # Midbin alternative:
        # cpt[row, col] = uecfev1.PDF_conv_uni_gausian_add_mult(
        #     (ecfev1_bounds[0] + ecfev1_bounds[1]) / 2, u_low, u_up, abserr_tol=1e-8
        # )

# Normalise every column (one per uecFEV1 bin) so that it sums to 1
column_totals = cpt.sum(axis=0)
cpt /= column_totals

In [None]:
# Save the CPT computed with p_uniform_x_gauss_add_mult_noise
cpth.save_cpt([ecFEV1, uecFEV1], cpt, suffix=f"_std_add_mult_ecfev1")

## Compute CPT with additive noise (for light model)

In [3]:
# Coarse nodes for the light model: bins of width 1 between 0 and 6
ecFEV1 = mh.VariableNode("ecFEV1 (L)", 0, 6, 1, prior={"type": "uniform"})
uecFEV1 = mh.VariableNode("Underlying ecFEV1 (L)", 0, 6, 1, prior=None)

# Constant std of the additive gaussian noise, passed unchanged to every integration
std = 0.7
# cpt[row, col]: row indexes the ecFEV1 bin, col indexes the uecFEV1 bin
cpt = np.zeros((ecFEV1.card, uecFEV1.card))
for col, uecfev1_bounds in enumerate(uecFEV1.get_bins_arr()):
    u_low, u_up = uecfev1_bounds[0], uecfev1_bounds[1]
    for row, ecfev1_bounds in enumerate(ecFEV1.get_bins_arr()):
        cpt[row, col] = uecfev1.p_uniform_x_gauss_add_noise(
            ecfev1_bounds[0], ecfev1_bounds[1], u_low, u_up, std, abserr_tol=1e-8
        )

# Normalise every column (one per uecFEV1 bin) so that it sums to 1
column_totals = cpt.sum(axis=0)
cpt /= column_totals

In [4]:
# Save the light-model CPT; the suffix records the std that was used
cpth.save_cpt([ecFEV1, uecFEV1], cpt, suffix=f"_std{std}")

### Plot cpt results

In [None]:
# Load the saved CPT with the "_midbin" suffix.
# NOTE(review): ecFEV1/uecFEV1 are whatever nodes were defined last (the coarse light-model
# nodes if the cell above was run) - confirm they match the CPT being loaded.
cpt_midbin = cpth.get_cpt([ecFEV1, uecFEV1], suffix=f"_std_add_mult_ecfev1_midbin")

In [None]:
# Load the CPT that was saved with the "_std_add_mult_ecfev1" suffix
cpt_ecfev1 = cpth.get_cpt([ecFEV1, uecFEV1], suffix=f"_std_add_mult_ecfev1")

In [None]:
# Check that cpt_midbin and cpt_ecfev1 are the same, up to numpy's default tolerances
np.allclose(cpt_midbin, cpt_ecfev1)

In [None]:
# Plot the cpt
# NOTE(review): this plots `cpt`, i.e. the array computed most recently in the notebook,
# not the cpt_midbin / cpt_ecfev1 arrays loaded just above - confirm this is intended.
import src.models.cpts.helpers as cpth

fig, title = cpth.plot_2d_cpt(
    cpt,
    ecFEV1,
    uecFEV1,
    height=2000,
    y_label_two_lines=True,
    p_range=[0, 0.95],
    vspace=0.002,
    invert=True,
)

In [None]:
# Display the CPT figure built in the previous cell
fig.show()