In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time

from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import BeliefPropagation
from pgmpy.models import BayesianNetwork

import src.modelling_o2.o2satffa as o2satffa
import src.models.helpers as mh
import src.models.builders as mb
import src.inference.helpers as ih


plotsdir = "../../../../PlotsBreathe/O2_modelling/"

In [2]:
# Load the airway-resistance / O2SatFFA table from the Excel export that lives in
# the plots directory; the first spreadsheet column becomes the index.
df = pd.read_excel(f"{plotsdir}airwayresistance_o2satffa_df.xlsx", index_col=0)
# IDs are identifiers, not quantities: store them as strings.
df["ID"] = df["ID"].astype(str)

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,ecFEV1,Age,Sex,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,AR from ecFEV1 (%),AR from FEV1 (%),O2SatFFA from ecFEV1 (%),O2SatFFA from FEV1 (%)
0,101,2019-02-20,1.31,97.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,36.287474,99.767593,61.338399,61.338399,95.859373,95.859373
1,101,2019-02-21,1.29,96.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,35.733466,98.739061,61.338399,62.796188,95.859373,95.751622
2,101,2019-02-22,1.32,96.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,36.564477,98.739061,61.338399,61.338399,95.859373,95.859373
3,101,2019-02-23,1.28,97.0,1.33,53,Male,173.0,3.610061,97.22596,36.841481,35.456463,99.767593,61.338399,62.796188,95.859373,95.751622
4,101,2019-02-24,1.33,98.0,1.36,53,Male,173.0,3.610061,97.22596,37.672492,36.841481,100.796125,59.881062,61.338399,95.953329,95.859373


## Infer O2SatFFA after observing FEV1
Done in model_up_to_O2SatFFA.ipynb

In [15]:
# Scatter of the O2SatFFA inferred from ecFEV1 against the airway resistance (AR)
# inferred from ecFEV1, one point per row of df.
title = "Inferred O2SatFFA vs. AR"
fig = px.scatter(
    df,
    x="AR from ecFEV1 (%)",
    y="O2SatFFA from ecFEV1 (%)",
    title=title,
    hover_data=["ID", "ecFEV1"],
)
# Tiny markers and a small font keep the dense cloud readable
fig.update_traces(marker={"size": 2}).update_layout(font={"size": 10}, title=title)
fig.show()

In [28]:
# Preview df again.
# NOTE(review): the saved output of this cell lists the columns "O2Sat % O2SatFFA"
# and "AR group", which are only created by cells further down in this notebook,
# so this output comes from an out-of-order run.
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,ecFEV1,Age,Sex,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,O2Sat % O2SatFFA,AR group,AR from ecFEV1 (%),AR from FEV1 (%),O2SatFFA from ecFEV1 (%),O2SatFFA from FEV1 (%)
0,101,2019-02-20,1.31,97.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,36.287474,99.767593,101.233466,"(60, 80]",61.338399,61.338399,95.818116,95.818116
1,101,2019-02-21,1.29,96.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,35.733466,98.739061,100.189822,"(60, 80]",61.338399,62.796188,95.818116,95.694631
2,101,2019-02-22,1.32,96.0,1.32,53,Male,173.0,3.610061,97.22596,36.564477,36.564477,98.739061,100.189822,"(60, 80]",61.338399,61.338399,95.818116,95.818116
3,101,2019-02-23,1.28,97.0,1.33,53,Male,173.0,3.610061,97.22596,36.841481,35.456463,99.767593,101.233466,"(60, 80]",61.338399,62.796188,95.818116,95.694631
4,101,2019-02-24,1.33,98.0,1.36,53,Male,173.0,3.610061,97.22596,37.672492,36.841481,100.796125,102.158253,"(40, 60]",59.881062,61.338399,95.929597,95.818116


In [59]:
# Highest O2 saturation recorded for each individual, sorted ascending so that the
# scatter reads as a monotonically increasing curve.
# The column is selected *before* aggregating: only "O2 Saturation" is needed, so
# there is no reason to compute the per-ID max of every other column of df.
max_o2_sat = df.groupby("ID")["O2 Saturation"].max().sort_values(ascending=True)

# One point per individual (x: ID, y: max O2 saturation)
fig = px.scatter(
    max_o2_sat,
    x=max_o2_sat.index,
    y=max_o2_sat.values,
    title="Max O2 Saturation per individual",
)
fig.update_layout(font=dict(size=10))
# Reduce marker size
fig.update_traces(marker=dict(size=2))
fig.show()

# Plot F2

In [4]:
# Observed O2 saturation expressed as a percentage of the O2SatFFA inferred from ecFEV1.
df["O2Sat % O2SatFFA"] = (
    df["O2 Saturation"].div(df["O2SatFFA from ecFEV1 (%)"]).mul(100)
)

In [5]:
# Scatter of "O2Sat % O2SatFFA" against airway resistance; the title reports how
# many individuals and datapoints are shown.
title = f"O2Sat % Inferred O2SatFFA vs. AR ({df.ID.nunique()} IDs, {len(df)} datapoints)"
fig = px.scatter(
    df,
    x="AR from ecFEV1 (%)",
    y="O2Sat % O2SatFFA",  # alternative y: "O2SatFFA from ecFEV1 (%)"
    title=title,
    hover_data=["ID", "ecFEV1"],
)
# Small, semi-transparent markers so that dense regions stay legible
fig.update_traces(marker={"size": 2}, opacity=0.3)
fig.update_layout(font={"size": 10})
fig.show()

In [43]:
import src.o2_fev1_analysis.partition as partition

# Column of df whose distribution is plotted for each airway-resistance (AR) group
O2_col = "O2Sat % O2SatFFA"
# O2_col = "O2SatFFA from ecFEV1 (%)"

# # Alternative: create equally spaced bins for Airway Resistance
# df["AR group"] = partition.partition_in_n_equal_groups(
#     df["Airway Resistance mean from ecFEV1 (%)"], 5
# )

# Cut Airway Resistance into the bins (0, 20], (20, 40], (40, 60], (60, 80].
# Rows whose AR falls outside (0, 80] get a NaN group.
df["AR group"] = pd.cut(
    df["AR from ecFEV1 (%)"],
    bins=np.arange(0, 100, 20),
    include_lowest=False,
)

# Labels in order of first appearance in df (NaN included when present).
# This name and its ordering are kept untouched because other cells index into it.
group_labels = df["AR group"].unique()
print(f"AR groups: {group_labels}")

# Only real intervals get a subplot. Filtering NaN explicitly avoids assuming that
# .unique() returned exactly one NaN and that it came last, which is what
# "len(group_labels) - 1" silently relied on.
plotted_groups = [label for label in group_labels if not pd.isna(label)]

# One subplot row per AR group, all sharing the same x axis
fig = make_subplots(
    rows=len(plotted_groups), cols=1, shared_xaxes=True, vertical_spacing=0.02
)
# Histogram of O2_col for each AR group, one group per row
for row_idx, label in enumerate(plotted_groups, start=1):
    fig.add_trace(
        go.Histogram(
            x=df[df["AR group"] == label][O2_col],
            name=f"Airway Resistance {label}",
            # Bin size of 0.2 over [75, 110]
            xbins=dict(start=75, end=110, size=0.2),
        ),
        row=row_idx,
        col=1,
    )


title = f"Distribution of {O2_col} for different Airway Resistance groups"
fig.update_layout(
    title=title,
    font=dict(size=10),
)
# The x axis is shared, so the axis title goes on the bottom subplot only
fig.update_xaxes(
    title_text="O2 Saturation in % of O2 Saturation if Fully Functional Alveoli",
    row=len(plotted_groups),
    col=1,
)
# Save
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

AR groups: [(60.0, 80.0], (40.0, 60.0], (20.0, 40.0], (0.0, 20.0], NaN]
Categories (4, interval[int64, right]): [(0, 20] < (20, 40] < (40, 60] < (60, 80]]


In [8]:
# One subplot row per analysed AR group (3 groups)
fig = make_subplots(
    rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02
)

# Seeded generator so that the simulated histograms are identical on every run
rng = np.random.default_rng(0)

# group_labels[0] is skipped: with the ordering printed by the cell that builds
# group_labels it is the (60, 80] group. group_labels and O2_col both come from
# that earlier cell.
groups_to_model = list(group_labels[1:4])

# For each group compare the spread above and below 100 %
for row_idx, label in enumerate(groups_to_model, start=1):
    print(f"Group {label}")
    o2sat_group = df[df["AR group"] == label][O2_col]
    o2sat_group_up = o2sat_group[o2sat_group > 100]
    o2sat_group_down = o2sat_group[o2sat_group <= 100]
    # Print std for both
    print(f"Std up: {o2sat_group_up.std()}")
    print(f"Std down: {o2sat_group_down.std()}")
    # Extra spread of the lower side relative to the upper side
    # NOTE(review): this subtracts standard deviations; if the two spreads are meant
    # to be independent contributions, variances would be subtracted instead -
    # confirm the intended model.
    diff = o2sat_group_down.std() - o2sat_group_up.std()
    print(f"Std diff: {diff}")

    # Draw 100000 points from a normal distribution with mean 100 and std diff
    sample = rng.normal(100, diff, 100000)
    # Keep only the lower half (values <= 100)
    sample = sample[sample <= 100]
    # Add to plot
    fig.add_trace(
        go.Histogram(
            x=sample,
            name=f"Airway Resistance {label}",
            # Bin size of 0.02 over [75, 110]
            xbins=dict(start=75, end=110, size=0.02),
        ),
        row=row_idx,
        col=1,
    )

title = "Distribution of the uncertainty in F2 due to alveoli damage"
fig.update_layout(
    title=title,
    font=dict(size=10),
)
# The figure has exactly 3 rows, so the shared x-axis title belongs on row 3.
# (It used to target row len(group_labels) - 1, a row this figure does not have.)
fig.update_xaxes(
    title_text="Uncertainty due to alveoli damage",
    row=3,
    col=1,
)

fig.show()

Group (40, 60]
Std up: 0.7463867627522509
Std down: 1.5589558434514992
Std diff: 0.8125690806992483
Group (20, 40]
Std up: 0.570738007727022
Std down: 1.1451593579325807
Std diff: 0.5744213502055587
Group (0, 20]
Std up: 0.4751487369237554
Std down: 0.9696977280878075
Std diff: 0.4945489911640521


In [61]:
# Model the alveoli-damage std as a function of airway resistance: Std = f(AR),
# with f a polynomial of order 4.
# The three points are the "Std diff" values printed for the (0, 20], (20, 40] and
# (40, 60] groups, rounded to 3 decimals and placed at the bin centres:
# (10, 0.495), (30, 0.574), (50, 0.813)
fig = px.scatter(
    x=[10, 30, 50],
    y=[0.495, 0.574, 0.813],
    title="Evolution of the uncertainty in F2 due to alveoli damage",
)
fig.update_layout(
    font=dict(size=10),
)
fig.update_xaxes(
    title_text="Airway Resistance (%)",
)
fig.update_yaxes(
    title_text="Uncertainty due to alveoli damage",
)
# Overlay the curve. It has x**2 and x**4 terms, i.e. it is an order-4 polynomial,
# so the legend entry says so instead of "Quadratic fit".
# The coefficients are hard-coded here; no fitting routine is run in this cell.
x = np.arange(0, 100, 0.1)
y = 0.000085 * x ** 2 + 0.000000018 * x ** 4 + 0.485
fig.add_trace(go.Scatter(x=x, y=y, name="Order-4 polynomial fit"))
fig.show()

# Remove measurement noise and ho2sat model spread to get F2

In [None]:
# Plot the overall distribution of "O2Sat % O2SatFFA", all airway-resistance values pooled
import plotly.figure_factory as ff

# Name of the df column that is plotted by the call at the end of this cell
O2_col = "O2Sat % O2SatFFA"


def o2sat_prct_o2satffa_displot(array):
    """
    Display a distribution plot of `array`.

    The values are drawn as a probability-density histogram with 0.2-wide bins,
    with the curve of `ff.create_distplot` overlaid and the rug plot disabled.
    The figure is shown directly.

    Returns -1 in all cases (the value carries no information).
    """
    x_axis_title = "O2 Saturation in % of O2 Saturation if Fully Functional Alveoli"

    distplot = ff.create_distplot(
        [array],
        ["O2Sat % O2SatFFA"],
        bin_size=0.2,
        colors=["#636EFA"],
        histnorm="probability density",
        show_curve=True,
        show_rug=False,
    )
    distplot.update_layout(font={"size": 10})
    distplot.update_xaxes(title_text=x_axis_title)
    distplot.show()
    return -1


# Distribution plot of the raw column; the function's -1 return value is this cell's output.
o2sat_prct_o2satffa_displot(df[O2_col])

In [None]:
# Fit a gaussian distribution
import scipy.stats as stats


def fit_gaussian(array, bin_width=0.2):
    """
    Estimate Gaussian parameters for `array` in three ways and plot the last one.

    The three estimates, each printed as it is computed:
      1. `stats.norm.fit` with free mean and std;
      2. `stats.norm.fit` with the mean pinned to the median of `array`;
      3. mean pinned to the median, std taken as the root-mean-square deviation
         from the median of the values lying strictly above the median.

    Only estimate 3 is drawn, as a black line on top of a probability-density
    histogram of `array`. The figure is shown directly; nothing is returned.

    Parameters
    ----------
    array : NumPy array or pandas Series of numbers
        Values to fit (must support boolean-mask indexing).
    bin_width : float, default 0.2
        Histogram bin width, also the step of the x grid on [75, 110) on which the
        Gaussian pdf is evaluated.
    """
    # 1. Unconstrained fit
    mu, std = stats.norm.fit(array)
    print(f"Unconstrained gaussian fit - mu: {mu}, std: {std}")

    # 2. Same fit with the location fixed to the median
    mu = np.median(array)
    std = stats.norm.fit(array, floc=mu)[1]
    print(f"Gaussian fit with mu = median - mu: {mu}, std: {std}")

    # 3. Keep mu = median, estimate std from the right-hand side only.
    # The sum runs over the right-hand-side points, so it has to be normalised by
    # their count. Dividing by len(array) (what this code used to do) shrinks the
    # std by roughly sqrt(2), since only about half of the points lie above the median.
    right_hand_side = array[array > mu]
    n_right = len(right_hand_side)
    if n_right > 0:
        std = np.sqrt(np.sum((right_hand_side - mu) ** 2) / n_right)
    else:
        # No value above the median: there is no right-hand-side spread
        std = 0.0
    print(
        f"Gaussian fit centered on median, defining std to the right hand side deviation from median - mu: {mu}, std: {std}"
    )

    # Evaluate the pdf of estimate 3 on a grid from 75 to 110 with step bin_width.
    # It is a density, so it is directly comparable to the
    # "probability density" histogram below without extra normalisation.
    x = np.arange(75, 110, bin_width)
    pdf = stats.norm.pdf(x, mu, std)

    fig = go.Figure()
    fig.add_trace(
        go.Histogram(
            x=array,
            name="O2Sat % O2SatFFA",
            xbins=dict(start=75, end=110, size=bin_width),
            histnorm="probability density",
        )
    )
    # Add gaussian fit
    fig.add_trace(
        go.Scatter(
            x=x,
            y=pdf,
            mode="lines",
            name="Gaussian fit",
            line=dict(color="black", width=1),
        )
    )
    fig.update_layout(
        # A single pooled distribution is plotted here, not one per airway
        # resistance group, so the title no longer mentions groups.
        title="Distribution of O2Sat % O2SatFFA with Gaussian fit",
        font=dict(size=10),
    )
    fig.update_xaxes(
        title_text="O2 Saturation in % of O2 Saturation if Fully Functional Alveoli"
    )
    fig.show()


# Gaussian fits of the raw "O2Sat % O2SatFFA" column (default 0.2 bin width)
fit_gaussian(df[O2_col])

## Use the O2 saturation with gaussian noise to smooth the histogram.

That is, for each observed O2Sat value, build its denoised distribution and sample 100 O2Sat values from it.
This way, every observed O2Sat value carries equal weight in the new, denoised dataset.

Then replot the histogram and redo the gaussian fit

In [None]:
import src.modelling_o2.o2sat as o2sat


def smart_sample(bins, p, n_samples=100):
    """
    Deterministically expand a discrete distribution into a sample that mirrors it.

    Instead of drawing at random, each bin value is repeated
    round(p_i * n_samples) times, so the histogram of the result matches `p`
    up to rounding.

    Parameters
    ----------
    bins : array-like
        Value associated with each bin.
    p : array-like of float
        Probability of each bin, same length as `bins`.
    n_samples : int, default 100
        Target size of the sample. The default reproduces the original
        hard-coded behaviour (p * 100).

    Returns
    -------
    numpy.ndarray
        `bins` values, each repeated according to its probability. Because the
        counts are rounded per bin, the total length can differ slightly from
        `n_samples`.
    """
    # np.asarray lets plain lists work too (list * int would repeat the list)
    n_vals_per_bin_arr = np.round(np.asarray(p) * n_samples).astype(int)

    # Create an array with n times the value of each bin
    return np.repeat(bins, n_vals_per_bin_arr)


def get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width=0.1, n_samples=100000):
    """
    Turn one observed O2 saturation into a set of values spread over its distribution.

    The distribution comes from `o2sat.emulate_gaussian_distribution(o2sat_obs,
    bin_width=bin_width)`; its `.bins` and the first column of its `.prior` are
    handed to `smart_sample`, whose result is returned.

    NOTE: `n_samples` is accepted but currently unused - `smart_sample` is called
    without it, so the size of the result does not depend on this argument.
    """
    o2sat_distribution = o2sat.emulate_gaussian_distribution(
        o2sat_obs, bin_width=bin_width
    )
    # Deterministic expansion (rather than random sampling) of the distribution
    return smart_sample(o2sat_distribution.bins, o2sat_distribution.prior[:, 0])

In [None]:
# How many points should you sample to have a good representation of the distribution? -> 100000
# Sanity check on a single observation (O2Sat = 100): build the sample and look at
# its histogram.
# NOTE: n_samples is passed here, but get_unbiased_o2sat_set_from_value does not use it.
bin_width = 0.2
sample = get_unbiased_o2sat_set_from_value(100, bin_width=bin_width, n_samples=100)
print(f"Sample size: {len(sample)}")

# Histogram of the sample, binned with the same bin_width, zoomed on [80, 100]
sample_hist = go.Histogram(
    x=sample,
    name="O2SatFFA",
    xbins={"start": 75, "end": 110, "size": bin_width},
)
fig = go.Figure()
fig.add_trace(sample_hist)
fig.update_layout(
    font={"size": 10},
    xaxis={"range": [80, 100]},
)
fig.show()

In [None]:
# Preview df before filtering out an airway-resistance group.
df.head()

In [None]:
# Remove AR group (60, 80].
# The interval is named explicitly instead of using group_labels[0]: group_labels
# comes from .unique(), whose order is the order of first appearance in df, so
# index 0 is only (60, 80] by accident of how the rows happen to be sorted.
ar_group_to_drop = pd.Interval(60, 80, closed="right")
print(f"Removing group label: {ar_group_to_drop}")
# NOTE(review): rows whose "AR group" is NaN (AR outside (0, 80]) are kept by this
# filter, exactly as before - confirm whether they should be dropped as well.
df_trusted_drop = df[df["AR group"] != ar_group_to_drop]
print(f"Initial entries: {len(df)}, after removing AR group 80: {len(df_trusted_drop)}")


def calc_unbiased_o2sat_prct_o2satffa(o2sat_obs, o2satffa, bin_width, n_samples):
    """
    Express the unbiased O2Sat set of one observation as a percentage of `o2satffa`.

    The set is built by `get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width,
    n_samples)`; every value in it is divided by `o2satffa` and multiplied by 100.
    """
    o2sat_set = get_unbiased_o2sat_set_from_value(o2sat_obs, bin_width, n_samples)
    ratio = o2sat_set / o2satffa
    return ratio * 100


print("Initially N values:", len(df_trusted_drop))

# For every remaining row, replace the single observed O2Sat by its unbiased set,
# expressed in % of that row's O2SatFFA (one array per row).
unbiased_o2sat_prct_o2satffa = df_trusted_drop.apply(
    lambda row: calc_unbiased_o2sat_prct_o2satffa(
        o2sat_obs=row["O2 Saturation"],
        o2satffa=row["O2SatFFA from ecFEV1 (%)"],
        bin_width=0.2,
        n_samples=1000,
    ),
    axis=1,
)

# Flatten the per-row arrays into one long 1-D array
unbiased_o2sat_prct_o2satffa_flat = np.concatenate(unbiased_o2sat_prct_o2satffa.values)
print("N values:", len(unbiased_o2sat_prct_o2satffa_flat))

In [None]:
# Same distribution plot as for the raw column, now on the flattened unbiased values
o2sat_prct_o2satffa_displot(unbiased_o2sat_prct_o2satffa_flat)

In [None]:
# Redo the Gaussian fits on the flattened unbiased values
fit_gaussian(unbiased_o2sat_prct_o2satffa_flat)

## Reproduce hist by AR groups

In [None]:
# Reproduce the per-AR-group histograms, this time on the unbiased values.
# Each row of df gets its own array of unbiased "O2Sat % O2SatFFA" values.
df["Unbiased O2Sat % O2SatFFA"] = df.apply(
    lambda x: calc_unbiased_o2sat_prct_o2satffa(
        x["O2 Saturation"], x["O2SatFFA from ecFEV1 (%)"], bin_width=0.2, n_samples=1000
    ),
    axis=1,
)


import src.o2_fev1_analysis.partition as partition

# Column of df whose (per-row array) values are plotted for each AR group
O2_col = "Unbiased O2Sat % O2SatFFA"

# Cut Airway Resistance into the bins (0, 20], (20, 40], (40, 60], (60, 80].
# Rows whose AR falls outside (0, 80] get a NaN group.
df["AR group"] = pd.cut(
    df["AR from ecFEV1 (%)"],
    bins=np.arange(0, 100, 20),
    include_lowest=False,
)

# Labels in order of first appearance in df (NaN included when present)
group_labels = df["AR group"].unique()
print(f"AR groups: {group_labels}")

# Only real intervals get a subplot. Filtering NaN explicitly avoids assuming that
# .unique() returned exactly one NaN and that it came last, which is what
# "len(group_labels) - 1" silently relied on.
plotted_groups = [label for label in group_labels if not pd.isna(label)]

# One subplot row per AR group, all sharing the same x axis
fig = make_subplots(
    rows=len(plotted_groups), cols=1, shared_xaxes=True, vertical_spacing=0.02
)
# Histogram of the pooled unbiased values of each AR group, one group per row
for row_idx, label in enumerate(plotted_groups, start=1):
    fig.add_trace(
        go.Histogram(
            # Each cell holds an array: flatten the group's arrays into one
            x=np.concatenate(df[df["AR group"] == label][O2_col].values),
            name=f"Airway Resistance {label}",
            # Bin size of 0.2 over [75, 110]
            xbins=dict(start=75, end=110, size=0.2),
            # histnorm="probability density",
        ),
        row=row_idx,
        col=1,
    )


title = f"Distribution of {O2_col} for different airway resistance groups"
fig.update_layout(
    title=title,
    font=dict(size=10),
)
# The x axis is shared, so the axis title goes on the bottom subplot only
fig.update_xaxes(
    title_text="Unbiased O2Sat%O2SatFFA",
    row=len(plotted_groups),
    col=1,
)
# Save
fig.write_image(f"{plotsdir}{title}.png")
fig.show()

In [None]:
# Preview df, which now also holds the "Unbiased O2Sat % O2SatFFA" column.
df.head()