In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import data.helpers as dh

import modelling_o2.o2satffa as o2satffa

# Directory the figures are written to by the fig.write_image calls below
plotsdir = "../../../../PlotsBreathe/O2_modelling/"
# Directory the input Excel files are read from
exceldir = "../../../../ExcelFiles/"

In [None]:
# Load the airway resistance / O2SatFFA table; IDs are handled as strings
df = pd.read_excel(f"{exceldir}airwayresistance_o2satffa_df.xlsx", index_col=0)
df["ID"] = df["ID"].astype(str)

In [None]:
# Preview the first rows of the loaded table
df.head(n=5)

## Infer O2SatFFA after observing FEV1
Done in model_up_to_O2SatFFA.ipynb

In [None]:
# Scatter of the O2SatFFA inferred from ecFEV1 against the AR inferred from ecFEV1
title = "Inferred O2SatFFA vs. AR"
scatter_options = dict(
    x="AR from ecFEV1 (%)",
    y="O2SatFFA from ecFEV1 (%)",
    title=title,
    hover_data=["ID", "ecFEV1"],
)
fig = px.scatter(df, **scatter_options)
# Small markers keep the dense cloud readable
fig.update_traces(marker=dict(size=2))
fig.update_layout(font=dict(size=10), title=title)
fig.show()

In [None]:
# Preview the first rows of the table
df.head(n=5)

# Plot F2
## Using O2SatFFA from ecFEV1

In [None]:
# Measured O2 saturation expressed as a percentage of the O2SatFFA inferred from ecFEV1
df["O2Sat % O2SatFFA"] = (
    df["O2 Saturation"].div(df["O2SatFFA from ecFEV1 (%)"]).mul(100)
)

In [None]:
# Scatter of O2Sat (as % of inferred O2SatFFA) against AR, with dataset size in the title
n_ids = df.ID.nunique()
n_points = len(df)
title = f"O2Sat % Inferred O2SatFFA vs. AR ({n_ids} IDs, {n_points} datapoints)"
fig = px.scatter(
    df,
    x="AR from ecFEV1 (%)",
    y="O2Sat % O2SatFFA",
    title=title,
    hover_data=["ID", "ecFEV1"],
)
# Small, translucent markers so that overlapping points remain visible
fig.update_traces(marker=dict(size=2), opacity=0.3)
fig.update_layout(font=dict(size=10))
fig.show()

In [None]:
import o2_fev1_analysis.partition as partition

# Column analysed by the grouping/plotting cells below: the measured O2 saturation
# as a percentage of the O2SatFFA inferred from ecFEV1.
O2_col = "O2Sat % O2SatFFA"
# Alternative column (disabled):
# O2_col = "O2SatFFA from ecFEV1 (%)"


def create_AR_groups(df, AR_col="AR from ecFEV1 (%)", bins=None):
    """
    Bin an airway resistance column into intervals stored in df["AR group"].

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the airway resistance values. It is modified in place
        (the "AR group" column is added or overwritten) and also returned.
    AR_col : str
        Name of the column to bin.
    bins : sequence of numbers, optional
        Bin edges forwarded to pd.cut. Defaults to np.arange(0, 100, 20), i.e.
        the right-closed intervals (0, 20], (20, 40], (40, 60], (60, 80].

    Returns
    -------
    df : pandas.DataFrame
        The same object that was passed in, with the "AR group" column set.
    group_labels : pandas.Categorical
        Distinct groups present in the data, sorted from the highest interval
        to the lowest. Values that fall outside the bins (including the lowest
        edge itself, because include_lowest=False) get no group; when such
        rows exist, a NaN entry is the last label.
    """
    if bins is None:
        bins = np.arange(0, 100, 20)

    df["AR group"] = pd.cut(df[AR_col], bins=bins, include_lowest=False)

    # Descending order; a NaN label (ungrouped rows), if any, is sorted last
    group_labels = df["AR group"].unique().sort_values(ascending=False)

    print(f"AR groups: {group_labels}")
    return df, group_labels


def plot_drop_from_O2SatFFA(df, O2_col, AR_group_labels):
    """
    Draw one stacked histogram of df[O2_col] per airway resistance group.

    Every label of AR_group_labels except the last one gets its own row. Each
    row also gets a dashed vertical line at x=100. The figure is written as a
    PNG into plotsdir (file name = figure title), shown, and -1 is returned.
    """
    # One row per plotted group (all labels but the last)
    n_rows = len(AR_group_labels) - 1
    fig = make_subplots(
        rows=n_rows, cols=1, shared_xaxes=True, vertical_spacing=0.02
    )

    for row in range(1, n_rows + 1):
        label = AR_group_labels[row - 1]
        in_group = df["AR group"] == label
        histogram = go.Histogram(
            x=df[in_group][O2_col],
            name=f"Airway Resistance {label}",
            # Bins 0.2 wide between 75 and 110
            xbins=dict(start=75, end=110, size=0.2),
        )
        fig.add_trace(histogram, row=row, col=1)
        fig.add_vline(x=100, row=row, col=1, line_width=1, line_dash="dash")

    title = f"Distribution of {O2_col} for different Airway Resistance groups"
    fig.update_layout(title=title, font=dict(size=10))
    # Axis title and one tick per unit on the bottom row
    fig.update_xaxes(title_text=O2_col, tick0=75, dtick=1, row=n_rows, col=1)
    fig.write_image(f"{plotsdir}{title}.png")
    fig.show()

    return -1

In [None]:
# Group the rows by airway resistance, then plot one histogram per group
df, group_labels = create_AR_groups(df)
plot_drop_from_O2SatFFA(df, O2_col=O2_col, AR_group_labels=group_labels)

In [None]:
# Three stacked panels, one per AR group group_labels[1], [2] and [3]
n_rows = 3
fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, vertical_spacing=0.02)

# Seeded generator so the simulated histograms are identical on every run
rng = np.random.default_rng(0)

# For each group, compare the spread of the values above and below 100
for i in range(1, n_rows + 1):
    print(f"Group {group_labels[i]}")
    o2sat_group = df[df["AR group"] == group_labels[i]][O2_col]
    o2sat_group_up = o2sat_group[o2sat_group > 100]
    o2sat_group_down = o2sat_group[o2sat_group <= 100]
    std_up = o2sat_group_up.std()
    std_down = o2sat_group_down.std()
    print(f"Std up: {std_up}")
    print(f"Std down: {std_down}")
    diff = std_down - std_up
    print(f"Std diff: {diff}")

    # Draw 100000 points from a normal distribution with mean 100 and the
    # "down" std, then keep only the lower half (values <= 100)
    sample = rng.normal(100, std_down, 100000)
    sample = sample[sample <= 100]
    fig.add_trace(
        go.Histogram(
            x=sample,
            name=f"Airway Resistance {group_labels[i]}",
            # Bins 0.02 wide between 75 and 110
            xbins=dict(start=75, end=110, size=0.02),
        ),
        row=i,
        col=1,
    )

title = "Distribution of the uncertainty in F2 due to alveoli damage"
fig.update_layout(title=title, font=dict(size=10), height=400)
fig.update_xaxes(range=[80, 100], tick0=75, dtick=1)
# Label the bottom row of THIS figure. Targeting row len(group_labels) - 1 only
# hits an existing row when there are exactly four labels.
fig.update_xaxes(
    title_text="Uncertainty due to alveoli damage",
    row=n_rows,
    col=1,
)
# Row 1 in red, row 2 in green, row 3 in purple
fig.update_traces(marker_color="#EF553B", row=1, col=1)
fig.update_traces(marker_color="#00CC96", row=2, col=1)
fig.update_traces(marker_color="#AB63FA", row=3, col=1)

fig.show()

In [None]:
# Alveoli damage's std as a function of airway resistance: Std = f(AR), with f a
# polynomial of order 4.
# Plotted points: (10, 0.495), (30, 0.574), (50, 0.813)
fig = px.scatter(
    x=[10, 30, 50],
    # TODO: Shouldn't use the diff, but the std down
    y=[0.495, 0.574, 0.813],
    title="Evolution of the uncertainty in F2 due to alveoli damage",
)
fig.update_layout(font=dict(size=10))
fig.update_xaxes(title_text="Airway Resistance (%)")
fig.update_yaxes(title_text="Uncertainty due to alveoli damage")
# Overlay the fitted curve; it has an x**4 term, so it is labelled as an order-4
# polynomial rather than a quadratic
x = np.arange(0, 100, 0.1)
y = 0.000085 * x**2 + 0.000000018 * x**4 + 0.485
fig.add_trace(go.Scatter(x=x, y=y, name="Order-4 polynomial fit"))
fig.show()

## Same using max o2 sat

In [None]:
# Highest O2 saturation per individual, sorted ascending. The column is selected
# before aggregating so that only "O2 Saturation" is reduced; the other columns
# (e.g. the interval-valued "AR group") are never aggregated.
max_o2_sat = df.groupby("ID")["O2 Saturation"].max().sort_values(ascending=True)
# Plot scatter of values
fig = px.scatter(
    max_o2_sat,
    x=max_o2_sat.index,
    y=max_o2_sat.values,
    title="Max O2 Saturation per individual",
)
fig.update_layout(font=dict(size=10))
# Reduce marker size
fig.update_traces(marker=dict(size=2))
fig.show()

In [None]:
# Attach each individual's maximum O2 saturation to every one of their rows
df_max = df.join(max_o2_sat, on="ID", rsuffix="_max")
# The joined column arrives as "O2 Saturation_max". The rename must be stored back
# into df_max: the lines below read df_max["HO2Sat_max"], which would raise a
# KeyError if the renamed frame were assigned to another variable.
df_max = df_max.rename(columns={"O2 Saturation_max": "HO2Sat_max"})
df_max["O2SatFFA_max"] = df_max["HO2Sat_max"] * o2satffa.multiplicative_drop_func(
    df_max["AR from ecFEV1 (%)"].values
)
df_max["O2Sat % O2SatFFA_max"] = df_max["O2 Saturation"] / df_max["O2SatFFA_max"] * 100

O2_col = "O2Sat % O2SatFFA_max"

df_max, group_labels = create_AR_groups(df_max)
plot_drop_from_O2SatFFA(df_max, O2_col, group_labels)

## 01.2024: Using IA inference
Using the model from 2024-01-25_AR_IA_study.ipynb I'll reproduce the plots

In [None]:
# Read the inferred AR / IA results, using the shared exceldir setting instead of
# repeating the directory path
df = dh.load_excel(
    f"{exceldir}inferred_AR_IA_with_FEV1_O2Sat_no_AR-IA_factor.xlsx",
    ["AR", "IA"],
)
df.head()

### Plot drop curve using dist means

In [None]:
# Group by the mean of the inferred AR and analyse the mean of the inferred IA
df, group_labels = create_AR_groups(df, AR_col="AR mean")

O2_col = "IA mean"


def compute_squared_deviation_from_0(arr):
    """
    Mean squared deviation of the values from 0: sum(x**2) / len(x).

    If the values are seen as the right half of a distribution that is
    symmetric around 0, this is the variance of that distribution; its std is
    the square root of the returned value.

    Accepts any array-like of numbers. Returns 0.0 for an empty input.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return 0.0
    return float(np.mean(values**2))


# One histogram of O2_col per AR group; the last label is not plotted
n_rows = len(group_labels) - 1
fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, vertical_spacing=0.02)
for row in range(1, n_rows + 1):
    label = group_labels[row - 1]
    df_tmp = df[df["AR group"] == label]

    deviation = compute_squared_deviation_from_0(df_tmp[O2_col].values)
    print(f"Squared of the deviation from 0 for group {label}: {deviation:.2f}")

    histogram = go.Histogram(
        x=df_tmp[O2_col],
        name=f"Airway Resistance {label}",
        # Bins 0.1 wide between 0 and 50
        xbins=dict(start=0, end=50, size=0.1),
    )
    fig.add_trace(histogram, row=row, col=1)


title = f"Distribution of {O2_col} full dist for different Airway Resistance groups"
fig.update_layout(title=title, font=dict(size=10))
# Axis title and one tick per unit on the bottom row
fig.update_xaxes(title_text=O2_col, tick0=75, dtick=1, row=n_rows, col=1)
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

### Plot drop curve after sampling from dists

In [None]:
from itertools import chain
import models.helpers as mh

In [None]:
def _flat_sample(node, distributions):
    """
    Convert every entry of `distributions` into a sample with
    node.get_distribution_as_sample, then chain all samples into one flat list.
    """
    per_row_samples = distributions.apply(
        lambda dist: node.get_distribution_as_sample(
            dist, p_threshold=0.01, print_sample_size=False
        )
    )
    return list(chain.from_iterable(per_row_samples))


AR = mh.VariableNode("Airway resistance (%)", 0, 90, 2, prior={"type": "uniform"})
ar_sample = _flat_sample(AR, df.AR)

IA = mh.VariableNode("Inactive alveoli (%)", 0, 30, 1, prior={"type": "uniform"})
ia_sample = _flat_sample(IA, df.IA)

In [None]:
# Both flattened samples must have the same length to be paired row by row in a
# DataFrame. Truncate whichever one is longer: truncating only ia_sample fails
# when ar_sample is the longer one.
n_common = min(len(ar_sample), len(ia_sample))
print(
    f"AR sample size: {len(ar_sample)}, IA sample size: {len(ia_sample)}, keeping: {n_common}"
)
ar_sample = ar_sample[:n_common]
ia_sample = ia_sample[:n_common]

In [None]:
# Pair the flattened AR and IA samples position by position
df_from_samples = pd.DataFrame(dict(AR=ar_sample, IA=ia_sample))
df_from_samples.head(n=5)

To plot the drop curve with a higher resolution, I should reduce the variables' bin width before running the inference. Doing this for AR and IA only won't suffice if the other variables have too high a resolution; I'd have to update bin_width for all variables and recompute all CPTs, which is very expensive.

In [None]:
# Group the sampled points by AR and analyse the sampled IA.
# Note: this rebinds df to the sampled table.
df, group_labels = create_AR_groups(df_from_samples, AR_col="AR")

O2_col = "IA"


def compute_squared_deviation_from_0(arr):
    """
    Mean squared deviation of the values from 0: sum(x**2) / len(x).

    If the values are seen as the right half of a distribution that is
    symmetric around 0, this is the variance of that distribution; its std is
    the square root of the returned value.

    Accepts any array-like of numbers. Returns 0.0 for an empty input.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return 0.0
    return float(np.mean(values**2))


# One histogram of O2_col per AR group; the last label is not plotted
n_rows = len(group_labels) - 1
fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, vertical_spacing=0.02)
for row in range(1, n_rows + 1):
    label = group_labels[row - 1]
    df_tmp = df[df["AR group"] == label]

    deviation = compute_squared_deviation_from_0(df_tmp[O2_col].values)
    print(f"Squared of the deviation from 0 for group {label}: {deviation:.2f}")

    histogram = go.Histogram(
        x=df_tmp[O2_col],
        name=f"Airway Resistance {label}",
        # Bins 0.1 wide between 0 and 50
        xbins=dict(start=0, end=50, size=0.1),
    )
    fig.add_trace(histogram, row=row, col=1)


title = f"Distribution of {O2_col} full dist for different Airway Resistance groups"
fig.update_layout(title=title, font=dict(size=10))
# Axis title and one tick per unit on the bottom row
fig.update_xaxes(title_text=O2_col, tick0=75, dtick=1, row=n_rows, col=1)
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

# Archive

## Remove measurement noise and ho2sat model spread to get F2

In [None]:
# Plot the overall distribution of O2SatFFA with airway resistance
import plotly.figure_factory as ff

# Column analysed in this section.
# NOTE(review): the df read below must contain this column; earlier cells rebind
# df to other tables, so confirm which table is loaded before running.
O2_col = "O2Sat % O2SatFFA"


def o2sat_prct_o2satffa_displot(array):
    """
    Show a distplot (histogram plus curve, no rug) of `array` with 0.2-wide
    bins, normalised as a probability density. Returns -1.
    """
    distplot_options = dict(
        bin_size=0.2,
        show_rug=False,
        show_curve=True,
        histnorm="probability density",
        colors=["#636EFA"],
    )
    fig = ff.create_distplot([array], ["O2Sat % O2SatFFA"], **distplot_options)

    fig.update_layout(font=dict(size=10))
    x_title = "O2 Saturation in % of O2 Saturation if Fully Functional Alveoli"
    fig.update_xaxes(title_text=x_title)
    fig.show()
    return -1


# Overall distribution of the analysed column
o2sat_prct_o2satffa_displot(array=df[O2_col])

In [None]:
# Fit a gaussian distribution
import scipy.stats as stats


def fit_gaussian(array, bin_width=0.2):
    """
    Print three Gaussian fits of `array` and plot the last one over its histogram.

    The fits are:
    1. an unconstrained scipy.stats.norm fit;
    2. a fit with the mean fixed to the median;
    3. mean fixed to the median, std taken as the root mean square deviation
       of the values strictly above the median.

    Parameters
    ----------
    array : array-like of numbers
        Values to fit.
    bin_width : float
        Width of the histogram bins and step of the x grid (75 to 110) on which
        the fitted pdf is evaluated.
    """
    array = np.asarray(array)

    mu, std = stats.norm.fit(array)
    print(f"Unconstrained gaussian fit - mu: {mu}, std: {std}")

    # Same fit with the location fixed to the median
    mu = np.median(array)
    std = stats.norm.fit(array, floc=mu)[1]
    print(f"Gaussian fit with mu = median - mu: {mu}, std: {std}")

    # Spread of the right-hand side only. The sum of squares runs over the
    # right-hand side points, so it is normalised by THEIR count: dividing by
    # len(array) would underestimate the std by about sqrt(2).
    right_hand_side = array[array > mu]
    n_right = len(right_hand_side)
    if n_right:
        std = np.sqrt(np.sum((right_hand_side - mu) ** 2) / n_right)
    else:
        # No value above the median: no spread can be measured
        std = 0.0
    print(
        f"Gaussian fit centered on median, defining std to the right hand side deviation from median - mu: {mu}, std: {std}"
    )

    # Evaluate the pdf of the last fit on a grid from 75 to 110
    x = np.arange(75, 110, bin_width)
    pdf = stats.norm.pdf(x, mu, std)

    fig = go.Figure()
    fig.add_trace(
        go.Histogram(
            x=array,
            name="O2Sat % O2SatFFA",
            xbins=dict(start=75, end=110, size=bin_width),
            histnorm="probability density",
        )
    )
    # Add gaussian fit
    fig.add_trace(
        go.Scatter(
            x=x,
            y=pdf,
            mode="lines",
            name="Gaussian fit",
            line=dict(color="black", width=1),
        )
    )
    fig.update_layout(
        title="Distribution of O2Sat % O2SatFFA for different airway resistance groups",
        font=dict(size=10),
    )
    fig.update_xaxes(
        title_text="O2 Saturation in % of O2 Saturation if Fully Functional Alveoli"
    )
    fig.show()


# Fit and plot the overall distribution of the analysed column
fit_gaussian(array=df[O2_col])

### Use the O2 saturation with gaussian noise to smooth the histogram.

That means for each O2Sat value, get the denoised distribution, sample 100 O2Sat values from this distribution.
Thus each O2Sat value has equal weight in this new denoised dataset.

Then replot the histogram and redo the gaussian fit

In [None]:
import modelling_o2.o2sat as o2sat


def smart_sample(bins, p, n_samples=100):
    """
    Deterministic "sample" that respects a probability distribution.

    Each value bins[k] is repeated round(p[k] * n_samples) times, so the
    histogram of the result follows p without any random draw.

    Parameters
    ----------
    bins : array-like
        Value of each bin.
    p : array-like of floats
        Probability of each bin, same length as bins.
    n_samples : int
        Target size of the result when p sums to 1. Because every count is
        rounded separately, the actual size can differ slightly. Defaults to
        100.

    Returns
    -------
    numpy.ndarray
        The bin values, each repeated according to its probability.
    """
    n_vals_per_bin = np.rint(np.asarray(p) * n_samples).astype(int)
    return np.repeat(bins, n_vals_per_bin)


def get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width=0.1, n_samples=100000):
    """
    Build a set of O2 saturation values for one observed value.

    The distribution is obtained from
    o2sat.emulate_gaussian_distribution(o2sat_obs, bin_width=bin_width); its
    bins and the first column of its cpt are handed to smart_sample.

    n_samples is accepted but not used: the size of the result is decided by
    smart_sample.
    """
    distribution = o2sat.emulate_gaussian_distribution(o2sat_obs, bin_width=bin_width)
    probabilities = distribution.cpt[:, 0]
    return smart_sample(distribution.bins, probabilities)

In [None]:
# Build the value set for an observed O2 saturation of 100 and inspect its histogram
bin_width = 0.2
sample = get_unbiased_o2sat_set_from_value(100, bin_width=bin_width, n_samples=100)
print(f"Sample size: {len(sample)}")

# Histogram with the same bin width as the distribution
histogram = go.Histogram(
    x=sample,
    name="O2SatFFA",
    xbins=dict(start=75, end=110, size=bin_width),
)
fig = go.Figure(data=[histogram])
fig.update_layout(font=dict(size=10), xaxis=dict(range=[80, 100]))
fig.show()

In [None]:
# Preview the first rows of the table
df.head(n=5)

In [None]:
# Drop the rows belonging to the first label of group_labels
excluded_label = group_labels[0]
print(f"Removing group label: {excluded_label}")
keep_rows = df["AR group"] != excluded_label
df_trusted_drop = df[keep_rows]
print(f"Initial entries: {len(df)}, after removing AR group 80: {len(df_trusted_drop)}")


def calc_unbiased_o2sat_prct_o2satffa(o2sat_obs, o2satffa, bin_width, n_samples):
    """
    Express the value set built around one observed O2 saturation as a
    percentage of `o2satffa`.

    The set comes from get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width,
    n_samples). Inside this function the `o2satffa` parameter shadows the
    module imported under the same name.
    """
    o2sat_values = get_unbiased_o2sat_set_from_value(
        o2sat_obs, bin_width=bin_width, n_samples=n_samples
    )
    as_percentage = o2sat_values / o2satffa * 100
    return as_percentage


print("Initially N values:", len(df_trusted_drop))


def _row_to_unbiased_prct(row):
    """Value set of one row, as a percentage of that row's inferred O2SatFFA."""
    return calc_unbiased_o2sat_prct_o2satffa(
        row["O2 Saturation"],
        row["O2SatFFA from ecFEV1 (%)"],
        bin_width=0.2,
        n_samples=1000,
    )


# One array per row, then all arrays chained into a single flat array
unbiased_o2sat_prct_o2satffa = df_trusted_drop.apply(_row_to_unbiased_prct, axis=1)

unbiased_o2sat_prct_o2satffa_flat = np.concatenate(unbiased_o2sat_prct_o2satffa.values)
print("N values:", len(unbiased_o2sat_prct_o2satffa_flat))

In [None]:
# Distribution of the flattened value sets
o2sat_prct_o2satffa_displot(array=unbiased_o2sat_prct_o2satffa_flat)

In [None]:
# Gaussian fits of the flattened value sets
fit_gaussian(array=unbiased_o2sat_prct_o2satffa_flat)

### Reproduce hist by AR groups

In [None]:
def _unbiased_prct_for_row(row):
    """Value set of one row, as a percentage of that row's inferred O2SatFFA."""
    return calc_unbiased_o2sat_prct_o2satffa(
        row["O2 Saturation"],
        row["O2SatFFA from ecFEV1 (%)"],
        bin_width=0.2,
        n_samples=1000,
    )


# Store one array of values per row, to reproduce the histograms by AR group
df["Unbiased O2Sat % O2SatFFA"] = df.apply(_unbiased_prct_for_row, axis=1)


import o2_fev1_analysis.partition as partition

O2_col = "Unbiased O2Sat % O2SatFFA"

# Reuse create_AR_groups (same bins as before). It returns the labels sorted in
# descending order with the NaN label, if any, last. The loop below skips the
# last label, so with an unsorted unique() it could instead skip a real group and
# call np.concatenate on the empty NaN group.
df, group_labels = create_AR_groups(df)

# One row per plotted group (all labels but the last)
n_rows = len(group_labels) - 1
fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, vertical_spacing=0.02)
for row in range(1, n_rows + 1):
    label = group_labels[row - 1]
    # Each cell of O2_col holds an array; chain the arrays of the whole group
    group_values = np.concatenate(df[df["AR group"] == label][O2_col].values)
    fig.add_trace(
        go.Histogram(
            x=group_values,
            name=f"Airway Resistance {label}",
            # Bins 0.2 wide between 75 and 110
            xbins=dict(start=75, end=110, size=0.2),
        ),
        row=row,
        col=1,
    )


title = f"Distribution of {O2_col} for different airway resistance groups"
fig.update_layout(
    title=title,
    font=dict(size=10),
)
fig.update_xaxes(
    title_text="Unbiased O2Sat%O2SatFFA",
    row=n_rows,
    col=1,
)
# Save
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

In [None]:
# Preview the first rows of the table
df.head(n=5)