In [1]:
import data.O2_FEV1_df as O2_FEV1_df
import data.antibiotics_data as antibiotics_data
import o2_fev1_analysis.factors_data as factors_data
import data.ex_labels_data as ex_labels_data
from modelling_fev1.pred_fev1 import *
import modelling_o2.healthy_o2_sat as healthy_o2_sat

from partition import *
from desaturation_fev1 import *
from normalise import *
from smooth import *
from plot_helpers import *
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go


# Settings
plotsdir = "../../../../PlotsSmartcare/O2_FEV1/"

In [2]:
# Process O2-FEV1 data
O2_FEV1 = O2_FEV1_df.create()


** Loading measurements data **


  pd.read_csv(smartcare_data_dir + "mydata.csv")



* Processing measures *

FEV1
* Analysing same day duplicates *
Dropped 1065 same day FEV1 duplicates
Dropping 1 entries with FEV1 = 3.45 for user Kings004

O2 Saturation
* Analysing same day duplicates *
Dropped 93 same day O2 Saturation duplicates
IDs with O2 Saturation outside 70-100 % range: 
          UserName  O2 Saturation
3575      Kings005            127
4836   Papworth001            127
4911   Papworth002            127
6257   Papworth010            103
6261   Papworth010            104
8131   Papworth027            127
8265   Papworth028            127
8284   Papworth028            127
10854   leeds01222            127
11065   leeds01320            127
Dropping 10 entries with O2 Saturation outside 70-100 % range

Loaded measurements data with 14443 entries (initially 129841, removed 115398)

** Loading ID map **
Correct ID 101's Patient_ID from ['0HeWh64M_zc5U512xqzAs4'] to 0HeWh64M_zc5U5l2xqzAs4
Correct ID 125's Patient_ID from ['1au5biSTt0bNWgfl0Wltr5'] to 1au5biSTt0bNWg

  for idx, row in parser.parse():


In [3]:
# What are we analysing?
# Get the number of measurements per patient in O2_FEV1
counts = (
    O2_FEV1.groupby("ID")
    .count()
    .sort_values(by="FEV1", ascending=False)
    .reset_index()[["ID", "Date Recorded"]]
)
# Rename Date Recorded to count
counts.rename(columns={"Date Recorded": "Count"}, inplace=True)
# Bar plot of counts with ID on x-axis and "Date Recorded" on y-axis
fig = px.bar(counts, x="ID", y="Count", title="Number of measurements per patient")
fig.show()

# O2-FEV1 analysis
## Definitions
- O2 Saturation
- FEV 1
- Predicted FEV 1 in %: measure for the amount of airway obstruction either due to sputum load (non-permanent obstruction) or scars in the lungs (permanent obstruction).

## Literature
[The association between forced expiratory volume in one second (FEV1) and pulse oximetric measurements of arterial oxygen saturation (SpO2) in the patients with COPD: A preliminary study](https://pubmed.ncbi.nlm.nih.gov/24949035/).
- Context: 31 patients with COPD
- Key results: There was not statistically significant correlation between FEV1 % predicted and SpO2 values (P < 0.05), but a great correlation existed between FEV1/FVC % predicted and SpO2 values (r = 0.556, P < 0.001).

 [ANALYSIS OF CORRELATION BETWEEN FEV1/FEV6 AND OXYGEN SATURATION DURING SIX-MINUTE WALK TEST (6MWT) IN COPD PATIENTS](https://www.researchgate.net/publication/351322676_ANALYSIS_OF_CORRELATION_BETWEEN_FEV1FEV6_AND_OXYGEN_SATURATION_DURING_SIX-MINUTE_WALK_TEST_6MWT_IN_COPD_PATIENTS)
 - Need to download

Note: nobody segments the input by groups. Probably because they don't have enough data to do this.

## Raw scatter plots

In [3]:
# Raw plot without trendline
fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1")
# fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1", trendline="ols")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
fig.write_image(plotsdir + "FEV1-O2 raw.pdf")

fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1 % Predicted")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
# fig.write_image(plotsdir + "FEV1 % Predicted-O2 raw.pdf")

O2_FEV1.shape

(9526, 12)

In [4]:
var = "FEV1"
xaxis_range = [0, 5.1]
var = "FEV1 % Predicted"
xaxis_range = [0, 155]

# Plot with height
O2_FEV1["Height Group"], height_labels = partition_in_n_equal_groups(
    O2_FEV1["Height"], 3, True
)

fig = px.scatter(
    O2_FEV1,
    y="O2 Saturation",
    x=var,
    color="Height Group",
    marginal_x="histogram",
    marginal_y="histogram",
)
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
# fig.update_xaxes(range=[0.9*min(O2_FEV1["FEV1"]), 1.1*max(O2_FEV1["FEV1"])])
# fig.update_yaxes(range=[0.9*min(O2_FEV1["O2 Saturation"]), max(1.1*O2_FEV1["O2 Saturation"])])
# fig.show()
# Partition FEV1 % Predicted in 3 groups

for height_group in height_labels:
    mask_height = O2_FEV1["Height Group"] == height_group

    fig = px.scatter(
        O2_FEV1[mask_height],
        y="O2 Saturation",
        x=var,
        title="Height group {}".format(height_group),
    )
    fig.update_layout(autosize=False, width=500, height=500)
    fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
    fig.update_xaxes(range=xaxis_range)
    fig.update_yaxes(range=[74, 102])
    fig.show()
    filename = "Height group " + height_group + " " + var + "-O2 raw"
    # fig.write_image(plotsdir + filename + ".pdf")

In [5]:
predicted = True
x_var = "Height"
lung_function_var = "FEV1 % Predicted" if predicted else "FEV1"

fig1, fig2, _ = desaturation_FEV1_for_variable(
    O2_FEV1, x_var, n_var_groups=6, predicted=predicted
)
fig1.show()
# fig1.write_image(
#     plotsdir + "Desaturation {} for {} main plot.pdf".format(lung_function_var, x_var)
# )

fig2.show()

# fig2.write_image(
#     plotsdir + "Desaturation {} for {} with lines.pdf".format(lung_function_var, x_var)
# )

# Observations
# o2 10, var 15 contains a lot of noise and groups have very low number of datapoints. It gets too patient specific

## Per patient plots

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
O2_FEV1.head()


# Include time evolution into the graph
def to_float(dt_time):
    return (365 * dt_time.year + 12 * dt_time.month + dt_time.day) / 12


mindate = to_float(min(O2_FEV1["Date Recorded"]))
O2_FEV1["Months since study start"] = O2_FEV1["Date Recorded"].apply(
    lambda x: to_float(x) - mindate
)

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
IDs = O2_FEV1.ID.unique()
for ID in IDs:
    plot_o2_fev1_raw_for_id(O2_FEV1, ID, plotsdir)

## Summary plot (+ partitioning)

In [7]:
# Add exacerbated labels
exclude_no_ex_ids = True
# Compute exacerbation labels
# O2_FEV1_processed = compute_ex_labels_from_heuristics(antibioticsdata, patientsdata, O2_FEV1)

# Get exacerbation labels from the predictive classifier
O2_FEV1_processed = ex_labels_data.inner_merge_with(
    O2_FEV1, ex_labels_data.load(), exclude_no_ex=exclude_no_ex_ids
)
transition_before = 1
transition_after = 2
O2_FEV1_processed = ex_labels_data.mark_ex_transition_period(
    O2_FEV1_processed, transition_before, transition_after
)

# Smooth
O2_FEV1_processed = smooth(O2_FEV1_processed, ["FEV1", "FEV1 % Predicted"], mode="max")
O2_FEV1_processed = smooth(O2_FEV1_processed, ["O2 Saturation"], mode="mean")
print(
    "Removed {} nan entries after max smoothing, kept {}".format(
        O2_FEV1_processed.isna().sum().max(), O2_FEV1_processed.shape[0]
    )
)
O2_FEV1_processed.dropna(inplace=True)

# Normalise
O2_FEV1_processed = norm_by_stable_baseline(
    O2_FEV1_processed,
    [
        "O2 Saturation",
        "O2 Saturation smoothed",
        "FEV1",
        "FEV1 % Predicted",
        "FEV1 smoothed",
        "FEV1 % Predicted smoothed",
    ],
)

# Settings
prefix, ex_column, x, y = set_x_y_vars(
    with_predicted_labels=True,
    with_predicted_fev1=True,
    is_smoothed_fev=True,
    is_smoothed_o2=False,
    is_normalised=False,
    exclude_no_ex_ids=exclude_no_ex_ids,
)

with_ex_transition = False
transition = (
    f"with {transition_before}-{transition_after} transition"
    if with_ex_transition
    else ""
)
ex_column = "Exacerbation State" if with_ex_transition else "Is Exacerbated"

title = f"Ex labels for {x}-{y} ({O2_FEV1_processed.ID.nunique()} IDs, {prefix}, {transition})"

is_partitioned = False
if not is_partitioned:
    # Plot O2-FEV with respective displots
    fig = plot_o2_fev_with_displots(O2_FEV1_processed, x, y, ex_column, title)
    fig.show()
    fig.write_image(
        f"{plotsdir}/{title}.pdf",
        width=1400,
        height=600,
    )

    # Plot subsampled O2-FEV scatter
    fig = plot_subsampled_scatter(x, y, O2_FEV1_processed, random_state=7)
    fig.show()

else:
    # Use partitioned data. Note that this requires smoothed True and Predicted FEV1 False

    predicted_fev1_thresholds = [2.4, 3.2, 4]

    # Print number of uniaue ID
    print(
        "There are {} different individuals".format(len(O2_FEV1_processed.ID.unique()))
    )

    # Filter by unique ID
    O2_FEV1_processed = O2_FEV1_processed.groupby("ID").filter(lambda l: len(l) > 1)
    # Check if individual's Predicted FEV1 position wrt to groups thresholds
    O2_FEV1_processed["Predicted FEV1 Group"] = O2_FEV1_processed[
        "Predicted FEV1"
    ].apply(lambda l: value_to_group(l, predicted_fev1_thresholds))

    # Create a for loop for each Group
    # parition_variable = 'FEV1 % Predicted smoothed'
    parition_variable = "Predicted FEV1"

    for fev_group in O2_FEV1_processed[parition_variable + " Group"].unique():
        mask = O2_FEV1_processed[parition_variable + " Group"] == fev_group
        # Count patients
        n_patients = len(O2_FEV1_processed[mask].ID.unique())

        # Plot scatter plot using plotly with O2_FEV1_processed[mask], x, y, ex_column with color from get_ex_color() and get_stable_color()
        fig = px.scatter(
            O2_FEV1_processed[mask],
            x=x,
            y=y,
            color=ex_column,
            color_discrete_sequence=[get_stable_color(), get_ex_color()],
        )
        # Set x axes range to min max of O2_FEV1_processed[x]
        fig.update_xaxes(
            range=[O2_FEV1_processed[x].min() * 0.95, O2_FEV1_processed[x].max() * 1.05]
        )
        fig.update_layout(
            title="{} patients, {} {}".format(n_patients, parition_variable, fev_group)
        )

        fig.show()
        fig.write_image(
            "{}/Ex labels {} {}-{} group {}.pdf".format(
                plotsdir, prefix, x, y, fev_group
            ),
            width=1400,
            height=600,
        )

# Save O2_FEV1_processed to excel file in plotsdir
O2_FEV1_processed.to_excel(plotsdir + "O2_FEV1_processed.xlsx")


** Loading exacerbation labels from the predictive classifier **
Initially: 14452 entries (1445 True, 13006 False)
Excluding 1 NaN entry
Finally: 14451 entries (1445 True, 13006 False)

** Inner merge of O2_FEV1 and exacerbated labels on 'ID' and 'Date Recorded' **
Data has now 7994 entries and 103 IDs (initially 9526 & 146 in O2_FEV1, 14451 in pred_ex_labels)

Excluding all datapoints for IDs that have no exacerbated labels
Removed 50 IDs, 4002 stable labels (initially 103 IDs and 7994 labels)
Data has now 3992 entries and 53 IDs

** Marking (1, 2) transition window around exacerbation start **
Initially:
False    3269
True      723
Name: Is Exacerbated, dtype: int64
Finally:
0.0    3235
1.0     622
0.5     135
Name: Exacerbation State, dtype: int64
Removed 0 nan entries after max smoothing, kept 3992


## Test the sensitivity of random states on subsampled plots
We wonder if different random states will give largely different results, to an extend that it could challenge the model assumptions.
It turns out that, despite an effect on the x axis range, the subsampled plots are very similar.

In [None]:
# Add subsampled plot
prefix, ex_column, x, y = set_x_y_vars(
    with_predicted_labels=True,
    with_predicted_fev1=True,
    is_smoothed_fev=True,
    is_smoothed_o2=False,
    is_normalised=False,
)

for random_state in range(1, 10):
    print(random_state)
    fig = plot_subsampled_scatter(x, y, O2_FEV1_processed, random_state=random_state)
    fig.show()

## Full sized displots

In [None]:
# This part fo the code is unused now
# But the displots can be useful for the report, so I keep it here

if predicted:
    if normalised:
        xrange = [-1.8, 1.2]
    else:
        xrange = [0, 155]
    bin_size = 4
else:
    if normalised:
        xrange = [-42, 40]
    else:
        xrange = [0, 5.1]
    bin_size = 0.14

# Raw plot with superposed exacerbation labels
fig = px.scatter(O2_FEV1_processed, y=y, x=x, color=ex_column)
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
# fig.show()
# fig.write_image("{}/{}-{} by exacerbated labels superposed raw.pdf".format(plotsdir, x, y))

# Raw individual plot for each exacerbation label]
fig_scatter = px.scatter(
    O2_FEV1_processed.dropna(),
    y=y,
    x=x,
    # color='Exacerbation Labels',
    facet_col=ex_column,
)
fig_scatter.update_layout(autosize=False, width=1000, height=500)
fig_scatter.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
# fig_scatter.update_xaxes(range=xrange)
if not (normalised):
    fig_scatter.update_yaxes(range=[74, 102])
# fig_scatter.show()
# fig.write_image("{}/Ex labels {} {}-{} individual raw.pdf".format(plotsdir, prefix, x, y), width=1300, height=400)


def distplot(x, group_labels, bin_size):
    data = []
    for exacerbation_label in group_labels:
        mask = O2_FEV1_processed[ex_column] == exacerbation_label
        data.append(O2_FEV1_processed[x][mask])
        print(
            "{} measurements with {} = {}".format(
                sum(mask), ex_column, exacerbation_label
            )
        )
    fig = ff.create_distplot(data, group_labels, bin_size=bin_size, show_rug=False)
    fig.update_xaxes(title_text=x)
    fig.update_layout(autosize=False, width=1000, height=400)
    return fig


# Create distplot for x = lung function
group_labels = (
    [True, False] if with_predicted_labels else ["Exacerbation Period", "Stable Period"]
)
fig_distplot_fev1 = distplot(x, group_labels, bin_size)
fig_distplot_fev1.show()
# fig.write_image("{}/Ex labels {} {} distplot.pdf".format(plotsdir, prefix, x), width=1000, height=400)

# Create distplot for y = O2 saturation
group_labels = (
    [True, False] if with_predicted_labels else ["Exacerbation Period", "Stable Period"]
)
fig_displot_o2 = distplot(y, group_labels, 1)
fig_displot_o2.show()
# fig_displot_o2.write_image("{}/Ex labels {} {} distplot.pdf".format(plotsdir, prefix, y), width=1000, height=400)

## Patient level plots to validate or validate outliers down

In [None]:
O2_FEV1_processed

In [None]:
# O2-FEV1: Outliers down can happen, however if you are persistently getting low reading it means that it's not an outlier
# Checking O2 sat stable period measurements outliers
exacerbation_label = "Stable Period"

# Case 1:
IDs = ["180", "202"]
o2_saturation_threshold = 87

# Case 2: smoothed
o2_saturation_threshold = 90

# Plot
ids = O2_FEV1_processed[
    (O2_FEV1_processed["Exacerbation Labels"] == exacerbation_label)
    & (O2_FEV1_processed["O2 Saturation"] < o2_saturation_threshold)
].ID.unique()
print(ids)
for id in ids:
    plot_o2_fev1_raw_for_id(O2_FEV1_processed, id, save=False, show=True)

## Validate smooth max

In [None]:
def plot_fev1_and_fev1_smoothed_for_id(O2_FEV1, id):
    O2_FEV1 = O2_FEV1[O2_FEV1["ID"] == id]

    mode = "max"
    O2_FEV1 = smooth(O2_FEV1, ["FEV1"], mode=mode)

    # Plot
    # Create a subplots that share the same x and y axes with plotly
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, shared_yaxes=True)
    # Add scatter of FEV1 with Date Recorded with a name 'FEV1' with marker opacity
    fig.add_scatter(
        x=O2_FEV1["Date Recorded"],
        y=O2_FEV1["FEV1"],
        mode="markers+lines",
        name="FEV1",
        opacity=0.5,
        row=1,
        col=1,
        marker=dict(size=5),
        line=dict(width=1),
    )
    # Add scatter of FEV1 smoothed with Date Recorded with a name 'FEV1 smoothed'
    fig.add_scatter(
        x=O2_FEV1["Date Recorded"],
        y=O2_FEV1["FEV1 smoothed"],
        mode="markers+lines",
        name="FEV1 smoothed",
        opacity=1,
        row=1,
        col=1,
        marker=dict(size=5),
        line=dict(width=1),
    )
    # Update figure size
    fig.update_layout(
        autosize=False,
        width=1000,
        height=300,
        title="FEV1 and FEV1 smoothed for ID {}".format(id),
        yaxis_title="Volume (L)",
    )
    fig.show()


# Plot plot_fev1_and_fev1_smoothed_for_id for the first 10 IDs of O2_FEV1
for id in O2_FEV1.ID.unique()[:10]:
    plot_fev1_and_fev1_smoothed_for_id(O2_FEV1, id)

## How to smooth O2?

In [None]:
# Plots O2 saturation against Date for a given patient id
def plot_o2_saturation_for_id(df, id, plotsdir, save=False, show=False):
    mask = df["ID"] == id
    fig = px.scatter(df[mask], x="Date Recorded", y="O2 Saturation")
    fig.update_layout(autosize=False, width=1000, height=400)
    fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
    if save:
        fig.write_image(
            "{}/O2 for ID/O2 saturation for ID {}.pdf".format(plotsdir, id),
            width=1000,
            height=400,
        )
    if show:
        fig.show()


# For each patient id, plot O2 saturation against Date Recorded
for id in O2_FEV1.ID.unique():
    plot_o2_saturation_for_id(O2_FEV1, id, plotsdir, save=True, show=False)

## Why are there measurements with FEV1 % Pred smoothed > 70% and FEV1 smoothed < 1L?
image.png
The scatter plot of O2 vs FEV1 for the FEV1 % Predicted group between 70% and 90% shows several measurements with an FEV1 smoothed below 1L. This looks erroneous.
There are 98 measurements with FEV1 % Pred smoothed > 70% and FEV1 smoothed < 1L. 
- 92 measurements belong to individual with ID 216. There are no other measurements for this individual. Is there a problem for the data collected by this individual?

Action: add function `remove_all_measurements_with_ID_216()`

In [None]:
114 * 0.01 * 3.95 - 44 * 0.025 - 2.6

In [None]:
# Find rows where FEV1 % Predicted smoothed > 70% and FEV1 smoothed < 1 L
mask = (O2_FEV1_processed["FEV1 % Predicted smoothed"] > 70) & (
    O2_FEV1_processed["FEV1 smoothed"] < 1
)
print(
    "{} measurements with FEV1 % Predicted smoothed > 70% and FEV1 < 1 L".format(
        sum(mask)
    )
)
# Set pandas display max row option to 100
# pd.set_option('display.max_rows', 100)
# Apply the mask and filter the folowing columns ID, FEV1, FEV1 smoothed, Age, Sex, Height, Weight, FEV1 % Predicted, FEV1 % Predicted smoothed, Is Exacerbated
O2_FEV1_processed[mask][
    [
        "ID",
        "Age",
        "Sex",
        "Height",
        "Weight",
        "FEV1",
        "FEV1 smoothed",
        "Predicted FEV1",
        "Predicted FEV1",
        "FEV1 % Predicted",
        "FEV1 % Predicted smoothed",
        "Is Exacerbated",
        "O2 Saturation",
    ]
].sort_values(by=["ID", "FEV1 % Predicted smoothed"])

In [None]:
# Filter measurements done by ID 216
mask = O2_FEV1_processed["ID"] == "216"

# Add scatter of FEV1 smoothed against Date Recorded
fig = px.scatter(O2_FEV1_processed[mask], x="Date Recorded", y="FEV1 smoothed")
# Add scatter of FEV1 with trace name FEV1
fig.add_scatter(
    x=O2_FEV1_processed[mask]["Date Recorded"],
    y=O2_FEV1_processed[mask]["FEV1"],
    mode="markers",
    name="FEV1",
)
fig.show()
# Apply the mask and filter the folowing columns ID, FEV1, FEV1 smoothed, Age, Sex, Height, Weight, FEV1 % Predicted, FEV1 % Predicted smoothed, Is Exacerbated
O2_FEV1_processed[mask][
    [
        "ID",
        "Date Recorded",
        "FEV1",
        "FEV1 smoothed",
        "Age",
        "Sex",
        "Height",
        "Weight",
        "FEV1 % Predicted",
        "FEV1 % Predicted smoothed",
        "Is Exacerbated",
    ]
].sort_values(by=["FEV1 smoothed"])

## Predictive classifier

#### Predictive classifier
* 90% of the measurements are labelled as "stable"
* 50% of the data from the predictive classifier belongs to individuals that did not have any exacerbation

In [None]:
# Are missing smartcare IDs from the ID mapping table, present in the pred classifier table?

# IDs that are present in the pred classifier table, but absent in the O2_FEV1 table
df_pred = ex_labels_data.load()
pred_ids = df_pred.reset_index().ID.unique()

mapping_ids = O2_FEV1.ID.unique()

# Find elements in pred_ids that are not in mapping_ids
np.setdiff1d(pred_ids, mapping_ids)

In [None]:
# Are there any individuals with no exacerbation labels?
df_pred = ex_labels_data.inner_merge_with(
    O2_FEV1, ex_labels_data.load()
)  # results can be influenced by the addition of the exclude_no_ex_ids flag on 06.09.2023
df_pred_stable = df_pred[df_pred["Is Exacerbated"] == False]
ids_ex = df_pred_ex.ID.unique()
ids_stable = df_pred_stable.ID.unique()

print("{} IDs with exacerbated labels".format(ids_ex.size))
print("{} IDs with stable labels".format(ids_stable.size))


ids_stable_but_no_ex = np.setdiff1d(ids_stable, ids_ex)

print(
    "IDs with stable labels but no exacerbation labels: {}".format(ids_stable_but_no_ex)
)


# Let's check what's the amount belonging to individuals that have stable but no ex labels
count_stable_but_no_ex = df_pred_stable[
    df_pred_stable.ID.isin(np.setdiff1d(ids_stable, ids_ex))
].shape[0]

print(
    "There are {} rows corresponding to individuals with stable but no ex labels".format(
        count_stable_but_no_ex
    )
)
print(
    "This represents {}% of the total data".format(
        count_stable_but_no_ex / df_pred.shape[0] * 100
    )
)

print(
    "As a comparison, {}% of the labels are stable".format(
        df_pred_stable.shape[0] / df_pred.shape[0] * 100
    )
)

#### Number of exacerbations: looking at the antibiotics data
SmartCare contains individuals that have had a high number of exacerbations.
However, within the SmartCare study 1/3 individuals did not have an exacerbations. As per Damian's plot:

![image.png](attachment:image.png)

It turns out that the plots are not identical at all. Possible reasons for this
* Damian merged exacerbations that were "sequential"

**All individuals have an exacerbation... Why? Why didn't we get the original data?**

In [None]:
# From the patient data, get the ids list
import patient_data

df_patient_data = patient_data.load()
# Get the ids list
patient_ids = df_patient_data.ID.unique()

In [None]:
# Looking at the antibiotics data
# Let's try to reproduce this plot

df_antibio = antibiotics_data.load()

# Get the list of ids that are in patient_ids but not in df_n_antibio
ids_not_in_antibio = np.setdiff1d(patient_ids, df_antibio.ID.unique())

print(
    "\nIDs list of individuals that did not have an antibiotics treatment (i.e. an exacerbation): \n{}".format(
        ids_not_in_antibio
    )
)

# Compute number of antibiotics treatments per individual
df_n_antibio = (
    df_antibio.groupby("ID").size().reset_index(name="Number of antibiotics treatments")
)

# Create dataframe with ids that did not have an antibiotics treatment and 0 antibiotics treatments
df_0_antibio = pd.DataFrame(
    {"ID": ids_not_in_antibio, "Number of antibiotics treatments": 0}
)

df_n_antibio = pd.concat([df_n_antibio, df_0_antibio], ignore_index=True)

# Add column with 0 when 0 antibiotic treatments, and 1 otherwise
df_n_antibio["Has ab"] = df_n_antibio["Number of antibiotics treatments"].apply(
    lambda x: 0 if x == 0 else 1
)

# Plot bar chart of count of individuals with n antibiotics treatments
fig = px.histogram(df_n_antibio, x="Number of antibiotics treatments", color="Has ab")
# Use categorical x axis
fig.update_xaxes(type="category")
# Order by x axis label
fig.update_layout(
    xaxis={"categoryorder": "array", "categoryarray": [str(i) for i in range(0, 30)]},
    autosize=False,
    width=800,
    height=400,
)
# Rename y axis to participants
fig.update_yaxes(title_text="Number of participants")
fig.show()

In [None]:
df_antibio

In [None]:
df_pred

#### Merging antibiotics with predictive classifier
* Are there right or left only IDs?

In [None]:
# Merging antibiotics with predictive classifier

# Add a column when the ID is in the list of IDs with stable but no ex labels
df_n_antibio["Only stable labels"] = df_n_antibio["ID"].isin(ids_stable_but_no_ex)

# Is there any na in the Stable but no ex column?
print(
    "Number of nan in the Stable but no ex column: {}".format(
        df_n_antibio["Only stable labels"].isna().sum()
    )
)

# Plot bar chart of count of individuals with n antibiotics treatments
fig = px.histogram(
    df_n_antibio,
    x="Number of antibiotics treatments",
    color="Only stable labels",
    barmode="stack",
)
# Use categorical x axis
fig.update_xaxes(type="category")
# Order by x axis label
fig.update_layout(
    xaxis={"categoryorder": "array", "categoryarray": [str(i) for i in range(1, 30)]}
)
# Rename y axis to participants
fig.update_yaxes(title_text="Number of participants")
fig.show()

Learnings
* All pred labels correspond to an antibiotic treatment
* 

Are antibiotic treatment present in the pred labels?

In [None]:
# Count number of individuals with n antibiotics treatment
df.ID.tolist()

## Spread of O2 Saturation (IQR)

In [None]:
full = ex_labels_data.inner_merge_with(
    O2_FEV1, ex_labels_data.load()
)  # results can be influenced by the addition of the exclude_no_ex_ids flag on 06.09.2023
print(
    "{} IDs have measurements in exacerbation periods".format(
        full.ID[full["Is Exacerbated"] == True].nunique()
    )
)
print(
    "{} IDs have measurements in stable periods".format(
        full.ID[full["Is Exacerbated"] == False].nunique()
    )
)

nonan = full.dropna()

# Select only rows where isna is True
nan = full[full.isna().any(axis=1)]
full.head()

In [None]:
# What's nan?

nan.head()

In [None]:
# We'd like to plot the O2 saturation per ID to see the patterns at patient level

x_col = "Date Recorded"

# Scatter plot nan with x Date Recorded, y O2 saturation, color ID with px library
full_ex = full[full["Is Exacerbated"] == True]
print(full_ex.shape)


def set_axes(fig):
    # Set y axis range to min max of O2 saturation of full
    fig.update_yaxes(
        range=[full["O2 Saturation"].min() - 1, full["O2 Saturation"].max() + 1]
    )
    # Set x axis range to min max of Date Recorded of full
    fig.update_xaxes(range=[full[x_col].min(), full[x_col].max()])
    return -1


fig = px.scatter(full_ex, x=x_col, y="O2 Saturation", color="ID")
set_axes(fig)
fig.add_shape(
    type="line",
    x0=full[x_col].min(),
    y0=95,
    x1=full[x_col].max(),
    y1=95,
    line=dict(color="Black", width=0.5),
)
fig.update_layout(
    title="Measurements in exacerbated period ({} points)".format(full_ex.shape[0])
)
fig.show()

In [None]:
# There is a signal within the population for the measurements done during exacerbation periods
# Let's try to group individuals that have different behaviours during exacerbation periods.

# Plot patients by interquartile range of O2 saturation during exacerbation periods

# Plot a boxplot of the O2 saturation values by ID
fig = px.box(full, x="ID", y="O2 Saturation")
# Sort by mean O2 saturation
mean_o2 = full.groupby("ID")["O2 Saturation"].mean().sort_values()
fig.update_layout(xaxis={"categoryorder": "array", "categoryarray": mean_o2.index})
# Add title: Individual O2 saturation during exacerbation periods (number of IDs)
fig.update_layout(
    title="Individual O2 saturation during exacerbation periods ({} IDs)".format(
        full_ex["ID"].nunique()
    )
)
fig.show()

In [None]:
# 3 groups of patients can be identified:
# iqr < 94, i.e. max iqr < 94
# iqr in [94, 96], i.e. min iq >= 94 and max iqr <= 96
# iqr > 96, i.e. min iqr > 96

# Because of the data imbalance (much more recordings in stable period), the IQR min is often lower for stable than for ex, surprisingly.

# Let's plot distribution of O2 saturation for measurements made in ex and stable periods for each group


# Compute the interquartile range of O2 saturation during exacerbation periods
def get_upper_iqr_val(series):
    return series.quantile(0.75)


def get_lower_iqr_val(series):
    return series.quantile(0.25)


def get_median(series):
    return series.quantile(0.5)


def get_mean(series):
    return series.mean()


# Create a DataFrame with the interquartile range of O2 saturation during exacerbation periods for each patient
df_iqr_ex = (
    full[full["Is Exacerbated"] == True]
    .groupby("ID")["O2 Saturation"]
    .agg([get_upper_iqr_val, get_lower_iqr_val])
    .reset_index()
)
df_iqr_stable = (
    full[full["Is Exacerbated"] == False]
    .groupby("ID")["O2 Saturation"]
    .agg([get_upper_iqr_val, get_lower_iqr_val])
    .reset_index()
)

# Merge the two DataFrames - do an outer join to keep IDs with stable labels
df_iqr = df_iqr_ex.merge(
    df_iqr_stable, how="outer", on="ID", suffixes=("_ex", "_stable")
)

print("Number of IDs after merge: {}".format(df_iqr.shape[0]))

df_iqr["lower_iqr"] = df_iqr[["get_lower_iqr_val_ex", "get_lower_iqr_val_stable"]].min(
    axis=1
)
df_iqr["upper_iqr"] = df_iqr[["get_upper_iqr_val_ex", "get_upper_iqr_val_stable"]].max(
    axis=1
)
df_iqr["Median"] = (
    full.groupby("ID")["O2 Saturation"].agg(get_median).reset_index()["O2 Saturation"]
)
df_iqr["Mean"] = (
    full.groupby("ID")["O2 Saturation"].agg(get_mean).reset_index()["O2 Saturation"]
)
df_iqr.describe()

In [None]:
def set_o2_sat_iqr(df):
    # Initialize O2 Sat IQR column
    df["O2 Sat IQR"] = ""
    # Set O2 Sat IQR to <94 if max iqr < 94
    df.loc[df["upper_iqr"] < 94, "O2 Sat IQR"] = "<94"
    # Set O2 Sat IQR to [94,96] if min iqr >= 94 and max iqr <= 96
    df.loc[(df["lower_iqr"] >= 94) & (df["upper_iqr"] <= 96), "O2 Sat IQR"] = "[94,96]"
    # Set O2 Sat IQR to >96 if min iqr > 96
    df.loc[df["lower_iqr"] > 96, "O2 Sat IQR"] = ">96"
    return df


df_iqr = set_o2_sat_iqr(df_iqr)

# Merge df_iqr 'O2 Sat IQR' with full on ID
full_iqr = full.merge(df_iqr[["ID", "O2 Sat IQR"]], on="ID", how="inner")

print(full_iqr.shape)
print("{} unique IDs in full_iqr".format(full_iqr.ID.nunique()))
full_iqr["O2 Sat IQR"].value_counts()
# Print number of IDs in each group
print(
    "{} IDs in group <94".format(full_iqr[full_iqr["O2 Sat IQR"] == "<94"].ID.nunique())
)
print(
    "{} IDs in group [94,96]".format(
        full_iqr[full_iqr["O2 Sat IQR"] == "[94,96]"].ID.nunique()
    )
)
print(
    "{} IDs in group >96".format(full_iqr[full_iqr["O2 Sat IQR"] == ">96"].ID.nunique())
)

In [None]:
def set_o2_sat_mean(df, min_o2, type="Median"):
    # Initialize O2 Sat Mean column
    df["O2 Sat {}".format(type)] = ""
    # Set O2 Sat Mean to <94 if mean < 94
    df.loc[df[type] < min_o2, "O2 Sat {}".format(type)] = "<{}".format(min_o2)
    # Set O2 Sat Mean to [94,96] if mean >= 94 and mean <= 96
    df.loc[
        (df[type] >= min_o2) & (df[type] <= 96), "O2 Sat {}".format(type)
    ] = "[{},96]".format(min_o2)
    # Set O2 Sat Mean to >96 if mean > 96
    df.loc[df[type] > 96, "O2 Sat {}".format(type)] = ">96"
    return df


min_o2 = 93

df_iqr = set_o2_sat_mean(df_iqr, min_o2, type="Median")
full_iqr_2 = full_iqr.merge(df_iqr[["ID", "O2 Sat Median"]], on="ID", how="inner")

print("Median")
full_iqr_2["O2 Sat Median"].value_counts()
# Print number of IDs in each group
print(
    "{} IDs in group <{}".format(
        full_iqr_2[full_iqr_2["O2 Sat Median"] == "<{}".format(min_o2)].ID.nunique(),
        min_o2,
    )
)
print(
    "{} IDs in group [{},96]".format(
        full_iqr_2[
            full_iqr_2["O2 Sat Median"] == "[{},96]".format(min_o2)
        ].ID.nunique(),
        min_o2,
    )
)
print(
    "{} IDs in group >96".format(
        full_iqr_2[full_iqr_2["O2 Sat Median"] == ">96"].ID.nunique()
    )
)

df_iqr = set_o2_sat_mean(df_iqr, min_o2, type="Mean")
full_iqr_3 = full_iqr_2.merge(df_iqr[["ID", "O2 Sat Mean"]], on="ID", how="inner")

print("Mean")
full_iqr_3.head()
full_iqr_3["O2 Sat Mean"].value_counts()
# Print number of IDs in each group
print(
    "{} IDs in group {}".format(
        full_iqr_3[full_iqr_3["O2 Sat Mean"] == "<{}".format(min_o2)].ID.nunique(),
        min_o2,
    )
)
print(
    "{} IDs in group [{},96]".format(
        full_iqr_3[full_iqr_3["O2 Sat Mean"] == "[{},96]".format(min_o2)].ID.nunique(),
        min_o2,
    )
)
print(
    "{} IDs in group >96".format(
        full_iqr_3[full_iqr_3["O2 Sat Mean"] == ">96"].ID.nunique()
    )
)

full_iqr = full_iqr_3.copy()

In [None]:
# For a small number of individuals, the base O2 saturation is low (<93) and the O2 saturation in exacerbation shifts even lower
# For most individuals, the base O2 saturation and the one in exacerbation have a similar distribution
# Question is there something wrong with 141?

col = "O2 Sat IQR"
col = "O2 Sat Median"
col = "O2 Sat Mean"

# Create figure with 3 rows and 1 column
fig = make_subplots(
    rows=3,
    cols=1,
    shared_xaxes=True,
    x_title="O2 saturation (%)",
    y_title="Probability",
    subplot_titles=(
        "iqr <{} ({} IDs)".format(
            min_o2, full_iqr[full_iqr[col] == "<{}".format(min_o2)]["ID"].nunique()
        ),
        "iqr in [{},96] ({} IDs)".format(
            min_o2, full_iqr[full_iqr[col] == "[{},96]".format(min_o2)]["ID"].nunique()
        ),
        "iqr >96 ({} IDs)".format(full_iqr[full_iqr[col] == ">96"]["ID"].nunique()),
    ),
)


# Compare the displots before and after removing the rows with nan
def distplot(df, fig, row, col):
    fig.add_trace(
        go.Histogram(
            x=df["O2 Saturation"][df["Is Exacerbated"] == True],
            histnorm="probability",
            marker=dict(color=get_ex_color()),
            name="Exacerbated",
        ),
        row=row,
        col=col,
    )
    fig.add_trace(
        go.Histogram(
            x=df["O2 Saturation"][df["Is Exacerbated"] == False],
            histnorm="probability",
            marker=dict(color=get_stable_color()),
            name="Stable",
        ),
        row=row,
        col=col,
    )
    # Add vertical gridlines x axis
    fig.update_xaxes(showgrid=True, gridwidth=1, row=row, col=col)
    fig.update_layout(barmode="overlay")
    return -1


distplot(full_iqr[full_iqr[col] == "<{}".format(min_o2)], fig, 1, 1)
distplot(full_iqr[full_iqr[col] == "[{},96]".format(min_o2)], fig, 2, 1)
distplot(full_iqr[full_iqr[col] == ">96"], fig, 3, 1)

fig.update_layout(
    width=800,
    height=800,
    title_text="Distribution of O2 saturation during exacerbation and stable periods",
)
fig.show()

# Create Factor Functions
We want the characterise the factor function that links the Unblocked FEV1 with its parents: Healthy FEV1 and Lung Damage. Here's a model of the relations betweeen those three varaibles: the unblocked FEV1 (L) of an individual is the healthy FEV1 (L), the theoretical lung function based on height, DOB, gdner, ethnicity (TBC exactly which), pejorated by the % in lung damage

## Lung damage for FEV1

In [8]:
# Factor function for unblocked FEV1 (L)
# This uses all the data from the O2_FEV1 table - without the exacerbation labels information

df_hfev1_ld_factor = factors_data.compute_hfev1_ld_factor(O2_FEV1)

# Add patient data to the unblocked factor table
df_hfev1_ld_factor = pd.merge(
    df_hfev1_ld_factor, patient_data.load(), on="ID", how="left"
)
print(df_hfev1_ld_factor.shape)
df_hfev1_ld_factor.head()


** Loading patient data **

* Dropping unnecessary columns from patient data *
Columns filtered: ['ID', 'Study Date', 'DOB', 'Age', 'Sex', 'Height', 'Weight', 'Predicted FEV1', 'FEV1 Set As']
Columns dropped: {'Study Number', 'Unable Informed Consent', 'Age 18 Years', 'Remote Monitoring App User ID', 'Date Consent Obtained', 'Telemetric Measures', 'Sputum Samples', 'GP Letter Sent', 'Date Last PE Stop', 'Less Exacerbation', 'Transplant Recipients', 'Inconvenience Payment', 'Freezer Required', 'Informed Consent', 'Genetic Testing', 'Unable Sputum Samples', 'Comments', 'CFQR Quest Comp', 'Pulmonary Exacerbation', 'Study Email', 'Date Last PE Start', 'Hospital'}

* Correcting patient data *
ID 60: Corrected height 60 from 1.63 to 163.0
ID 66: Corrected height for ID 66 from 1.62 to 162.0
Replace Age by calculate age
Drop FEV1 Set As and Predicted FEV1
Compute Calculated Predicted FEV1 using GLI reference equations



Unknown extension is not supported and will be removed




* Applying data sanity checks *
Loaded patient data with 147 entries (147 initially)
(146, 11)


Unnamed: 0,ID,Unblocked FEV1 (L),Lung Damage (%),Healthy FEV1 (L),Study Date,DOB,Age,Sex,Height,Weight,Predicted FEV1
0,100,2.7,27.896812,3.744633,2015-12-14,1990-08-10,25,Female,173.0,55.2,3.744633
1,101,1.87,42.044122,3.226592,2015-12-14,1978-09-09,37,Female,166.0,61.0,3.226592
2,102,1.69,33.167611,2.528714,2015-12-21,1993-12-12,22,Female,143.0,44.6,2.528714
3,107,2.24,49.01124,4.393125,2015-12-21,1997-11-12,18,Male,175.0,57.8,4.393125
4,113,1.32,52.827874,2.798263,2015-12-23,1984-04-28,32,Female,153.0,50.4,2.798263


In [10]:
# Plot a scatter of unblocked FEV1 (L) against healthy FEV1 (L)
def scatter_lung_damage(df, write=False, subsampled=False):
    fig = px.scatter(
        df,
        x="Healthy FEV1 (L)",
        y="Lung Damage (%)",
        color="Age",
        # color_continuous_scale='RdBu')
        color_continuous_scale="YlGnBu",
    )
    # color_continuous_scale='deep')
    fig.add_shape(
        type="line",
        x0=df["Healthy FEV1 (L)"].min(),
        y0=0,
        x1=df["Healthy FEV1 (L)"].max(),
        y1=0,
        line=dict(color="Black", width=1),
    )
    title = "Impact of lung damage on healthy FEV1 ({} points{})".format(
        df.shape[0], ", subsampled" if subsampled else ""
    )
    fig.update_layout(
        autosize=False, width=500, height=500, title=title, title_font_size=15
    )
    fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
    fig.show()
    # Save figure
    fig.write_image(
        "{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500
    )


scatter_lung_damage(df_hfev1_ld_factor)
# Observations:
# Looks like a classic uncorrelated scatter plot
# It's great because it shows that our factors are not correlated with each other
# We rotated the scatter plot by 45 degrees to see if there was a correlation between lung damage and healthy FEV1
# There might be a correlation with age, let's have a look at it.

In [11]:
# Plot a scatter of Lung Damage (%) against Age
fig = px.scatter(df_hfev1_ld_factor, x="Age", y="Lung Damage (%)")
fig.add_shape(
    type="line",
    x0=df_hfev1_ld_factor["Age"].min(),
    y0=0,
    x1=df_hfev1_ld_factor["Age"].max(),
    y1=0,
    line=dict(color="Black", width=1),
)
title = "Impact of lung damage on age ({} points)".format(df_hfev1_ld_factor.shape[0])
fig.update_layout(autosize=False, width=500, height=500, title=title)
fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
fig.show()

# Expectations:
# We're looking at a population, not an individual. Hence, we expect a possible correlation to be weak

# Observations:
# 60 years, 0% lung damage. Let's see who that is - maybe we're the linear approximation is not good enough - it looks ok
# It can seem that lung damage diminishes with age...
# Huge difference in data density. We have a lot of data for young people, and very few for old people, they died unfortunately

In [12]:
def box_lung_damage_age(df, write=False, subsampled=False):
    # Same but grouping by age group; #pointss
    print(df["Age"].describe())
    df["Age Group; #points"] = pd.cut(
        df["Age"].values,
        bins=[18, 30, 40, 50, 66],
        labels=["18-30", "30-40", "40-50", "50-66"],
    )
    print(df["Age Group; #points"].value_counts())
    # Rename the Age Group; #points value '18-30' to '18-29'
    df["Age Group; #points"] = df["Age Group; #points"].replace(
        "18-30", "18-30; {}".format(df["Age Group; #points"].value_counts()["18-30"])
    )
    df["Age Group; #points"] = df["Age Group; #points"].replace(
        "30-40", "30-40; {}".format(df["Age Group; #points"].value_counts()["30-40"])
    )
    df["Age Group; #points"] = df["Age Group; #points"].replace(
        "40-50", "40-50; {}".format(df["Age Group; #points"].value_counts()["40-50"])
    )
    df["Age Group; #points"] = df["Age Group; #points"].replace(
        "50-66", "50-66; {}".format(df["Age Group; #points"].value_counts()["50-66"])
    )
    # Sort by age group, #points
    df = df.sort_values(by=["Age Group; #points"])

    # Plot a boxplot of Lung Damage (%) against Age
    fig = px.box(df, x="Age Group; #points", y="Lung Damage (%)")
    title = "Impact of lung damage on age ({} points{})".format(
        df.shape[0], ", subsampled" if subsampled else ""
    )
    fig.update_layout(
        autosize=False, width=500, height=500, title=title, title_font_size=15
    )
    fig.show()
    fig.write_image(
        "{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500
    )
    return -1


box_lung_damage_age(df_hfev1_ld_factor)

count    146.000000
mean      31.636986
std        9.336528
min       18.000000
25%       24.000000
50%       30.000000
75%       37.000000
max       66.000000
Name: Age, dtype: float64
18-30    70
30-40    48
40-50    20
50-66     4
Name: Age Group; #points, dtype: int64


-1

#### Subsampled version of the LD - Age plots

In [18]:
# Plot a subsampled version of the above plots
df = df_hfev1_ld_factor
# Sub sample df so that there's randomly 20 points per age group
df_unblocked_factor_subampled_age = df.groupby(
    "Age Group; #points", group_keys=False
).apply(lambda x: x.sample(min(len(x), 20)))

print(
    "Initial shape {}, subsampled shape {}".format(
        df_hfev1_ld_factor.shape, df_unblocked_factor_subampled_age.shape
    )
)

box_lung_damage_age(df_unblocked_factor_subampled_age, subsampled=True)
scatter_lung_damage(df_unblocked_factor_subampled_age, subsampled=True)

Initial shape (57, 12), subsampled shape (51, 12)
count    51.000000
mean     33.588235
std      10.511282
min      19.000000
25%      25.500000
50%      32.000000
75%      39.000000
max      66.000000
Name: Age, dtype: float64
18-30    20
30-40    20
40-50     7
50-66     4
Name: Age Group; #points, dtype: int64


## Small airway blockage for FEV1

In [13]:
# This code is needed to run any of the code in the sub chapter
df_ufev1_sab_factor = ex_labels_data.inner_merge_with(
    O2_FEV1, ex_labels_data.load(), exclude_no_ex=True
)

transition_before = 1
transition_after = 2
df_ufev1_sab_factor = ex_labels_data.mark_ex_transition_period(
    df_ufev1_sab_factor, transition_before, transition_after
)

# Dropping unnecessary columns: LD and HFEV1
df_hfev1_ld_factor = factors_data.compute_hfev1_ld_factor(O2_FEV1).drop(
    columns=["Lung Damage (%)", "Healthy FEV1 (L)"]
)

# Left join df_blockage_factor with df_unblocked_factor on ID
df_ufev1_sab_factor = pd.merge(
    df_ufev1_sab_factor, df_hfev1_ld_factor, on="ID", how="left"
)
df_ufev1_sab_factor["Small Airway Blockage (%)"] = (
    1 - df_ufev1_sab_factor.FEV1 / df_ufev1_sab_factor["Unblocked FEV1 (L)"]
) * 100

# Sort values by Unblocked FEV1 (L)
df_ufev1_sab_factor = df_ufev1_sab_factor.sort_values(by=["Unblocked FEV1 (L)"])
df_ufev1_sab_factor.head()


** Loading exacerbation labels from the predictive classifier **
Initially: 14452 entries (1445 True, 13006 False)
Excluding 1 NaN entry
Finally: 14451 entries (1445 True, 13006 False)

** Inner merge of O2_FEV1 and exacerbated labels on 'ID' and 'Date Recorded' **
Data has now 7994 entries and 103 IDs (initially 9526 & 146 in O2_FEV1, 14451 in pred_ex_labels)

Excluding all datapoints for IDs that have no exacerbated labels
Removed 50 IDs, 4002 stable labels (initially 103 IDs and 7994 labels)
Data has now 3992 entries and 53 IDs

** Marking (1, 2) transition window around exacerbation start **
Initially:
False    3269
True      723
Name: Is Exacerbated, dtype: int64
Finally:
0.0    3235
1.0     622
0.5     135
Name: Exacerbation State, dtype: int64


Unnamed: 0,ID,Date Recorded,O2 Saturation,FEV1,Study Date,DOB,Age,Sex,Height,Weight,Predicted FEV1,FEV1 % Predicted,Height Group,O2 Group,Is Exacerbated,Exacerbation State,Unblocked FEV1 (L),Small Airway Blockage (%)
3459,59,2016-03-23,94.0,0.68,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,20.065108,[161.0;165.0[,[93;96[,True,0.5,0.69,1.449275
3465,59,2016-04-04,88.0,0.57,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,16.819281,[161.0;165.0[,<93,False,0.0,0.69,17.391304
3464,59,2016-04-03,91.0,0.53,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,15.638981,[161.0;165.0[,<93,True,1.0,0.69,23.188406
3463,59,2016-04-02,87.0,0.59,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,17.409432,[161.0;165.0[,<93,True,1.0,0.69,14.492754
3462,59,2016-04-01,93.0,0.54,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,15.934056,[161.0;165.0[,[93;96[,True,1.0,0.69,21.73913


### SAB % - vertical line for each ID

In [14]:
# Plot unblocked fev1 (L) in x, measured fev1 in y, and color by Is Exacerbated
fig = px.scatter(
    df_ufev1_sab_factor, x="ID", y="Small Airway Blockage (%)", color="Is Exacerbated"
)
title = "Impact of % small airways blockage on unblocked FEV1 ({} points)".format(
    df_ufev1_sab_factor.shape[0]
)
# Set True to blue and False to red
fig.for_each_trace(
    lambda trace: trace.update(marker_color=get_ex_color(0.2))
    if trace.name == "True"
    else trace.update(marker_color=get_stable_color(0.2))
)
fig.update_layout(autosize=False, width=1300, height=600, title=title)


## THE TRACE BELOW ADDS A BLACK MARK FOR EACH INDIVIDUAL'S UNBLOCKED FEV1 (L) VALUE
## WE REMOVED IT BECAUSE IT GIVES MORE NOISE THAN INFORMATION
# Add scatter plot with marker lines of the Unblocked FEV1 (L) values on a new y axis
# Get df_unblocked_factor with only the IDs present in df_blockage_factor
# df_unblocked_with_pred_ids = df_hfev1_ld_factor[
#     df_hfev1_ld_factor["ID"].isin(df_ufev1_sab_factor["ID"])
# ]
# Put this trace on a new y axis
# fig.update_layout(yaxis2=dict(overlaying="y", side="right", title="Unblocked FEV1 (L)"))
#
# # Plot the Unblocked FEV1 (L) values on the new y axis
# fig.add_trace(
#     go.Scatter(
#         x=df_unblocked_with_pred_ids["ID"],
#         y=df_unblocked_with_pred_ids["Unblocked FEV1 (L)"],
#         mode="markers",
#         marker=dict(symbol="line-ew", color="black", line=dict(width=1)),
#         yaxis="y2",
#     )
# )
# # Hide legend for this trace
# fig.update_traces(showlegend=False, selector=dict(yaxis="y2"))

fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))

# Reduce x axis tick labels size
fig.update_xaxes(tickfont=dict(size=7))
fig.show()
# Save figure
# fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))

# Expectations:
# Measurements done in exacerbated period are related to a higher % of airway blockage
# The higher the unblocked FEV1, the sicker the people = more likely to be in permanent inflammatory state = the higher the % of airway blockage

# Observations:
#

### Boxplot FEV1 % Predicted - stable FEV1 % predicted

In [15]:
# Show the FEV1 % Predicted avg for an ID next to the ID in the x axis
transition = "ignored"  # baseline
# transition = "included"
transition = "excluded"

# transition = "ignored":
ex_col = "Is Exacerbated"
title_suffix = ""
df_ufev1_sab_factor_plot = factors_data.compute_avg_lung_func_stable(
    df_ufev1_sab_factor
)

# Removing IDs that don't have exacerbated labels outside of the transition period
if transition == "excluded" or transition == "included":
    mask_excluding_transition = df_ufev1_sab_factor_plot["Exacerbation State"] != 0.5
    df_ufev1_sab_factor_plot = ex_labels_data.exclude_IDs_where_no_ex_labels(
        df_ufev1_sab_factor_plot[mask_excluding_transition]
    )

if transition == "excluded":
    # Drop rows where Exacerbation State is 0.5
    df_ufev1_sab_factor_plot = df_ufev1_sab_factor_plot[
        df_ufev1_sab_factor_plot["Exacerbation State"] != 0.5
    ]
    title_suffix = (
        f", excluding ({transition_before}, {transition_after}) transition period"
    )
if transition == "included":
    ex_col = "Exacerbation State"
    title_suffix = (
        f", with  ({transition_before}, {transition_after}) transition period"
    )

# Compute mean of mean diffs between stable and exacerbated periods
avg_diff_of_means = factors_data.get_avg_diff_of_means(
    ex_labels_data.at_least_n_datapoints(df_ufev1_sab_factor_plot, 10),
    "FEV1 % Predicted",
    ex_col,
)

# Reproduce the plot but split exacerbated/stable datapoints into two sub box plots
title = "Boxplots for FEV1 % Pred, ordered by avg FEV1 % Pred in stable period ({} points{})".format(
    df_ufev1_sab_factor_plot.shape[0],
    title_suffix,
)

# TODO: use mean instead of median in the boxplot?
fig = px.box(
    df_ufev1_sab_factor_plot,
    x="ID (avg FEV1 % Predicted in stable period)",
    y="FEV1 % Predicted",
    color=ex_col,
)

# Add annotation for mean difference
fig.add_annotation(
    x=0,
    y=1.05,
    xref="paper",
    yref="paper",
    text="Avg diff of means: {:.2f}".format(avg_diff_of_means),
    showarrow=False,
)

# Set True to blue and False to red
fig.for_each_trace(
    lambda trace: trace.update(marker_color=get_ex_color(1))
    if trace.name == "True" or trace.name == "1.0"
    else trace.update(marker_color=get_stable_color(1))
    if trace.name == "False" or trace.name == "0.0"
    else trace.update(marker_color=get_transition_color(0.2))
)
traces = fig.update_traces(boxmean=True)
fig.update_layout(autosize=False, width=2100, height=700, title=title)
fig.show()

fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))


Excluding all datapoints for IDs that have no exacerbated labels
Removed 6 IDs, 234 stable labels (initially 53 IDs and 3857 labels)

Excluding IDs that have less than 10 stable or exacerbated datapoints
Removed 23/47 IDs with too few data (<10 ex or stable measurements)


## Lung damage for O2 saturation

In [16]:
# Assumptions
# Normal Pa O2 is constantly 100%
# Unblocked Pa O2 is the reverse max of O2 saturations measured


df_hfev1_ld_factor = factors_data.compute_ho2_ld_factor(O2_FEV1)
# Add patient data to the unblocked factor table
df_hfev1_ld_factor = pd.merge(
    df_hfev1_ld_factor, patient_data.load(), on="ID", how="left"
)
df_hfev1_ld_factor.head()


** Loading patient data **

* Dropping unnecessary columns from patient data *
Columns filtered: ['ID', 'Study Date', 'DOB', 'Age', 'Sex', 'Height', 'Weight', 'Predicted FEV1', 'FEV1 Set As']
Columns dropped: {'Study Number', 'Unable Informed Consent', 'Age 18 Years', 'Remote Monitoring App User ID', 'Date Consent Obtained', 'Telemetric Measures', 'Sputum Samples', 'GP Letter Sent', 'Date Last PE Stop', 'Less Exacerbation', 'Transplant Recipients', 'Inconvenience Payment', 'Freezer Required', 'Informed Consent', 'Genetic Testing', 'Unable Sputum Samples', 'Comments', 'CFQR Quest Comp', 'Pulmonary Exacerbation', 'Study Email', 'Date Last PE Start', 'Hospital'}

* Correcting patient data *
ID 60: Corrected height 60 from 1.63 to 163.0
ID 66: Corrected height for ID 66 from 1.62 to 162.0
Replace Age by calculate age
Drop FEV1 Set As and Predicted FEV1
Compute Calculated Predicted FEV1 using GLI reference equations



Unknown extension is not supported and will be removed




* Applying data sanity checks *
Loaded patient data with 147 entries (147 initially)


Unnamed: 0,ID,Unblocked O2 Saturation (%),Lung Damage (%),Healthy O2 Saturation (%),Study Date,DOB,Age,Sex,Height,Weight,Predicted FEV1
0,100,98.0,2.0,100,2015-12-14,1990-08-10,25,Female,173.0,55.2,3.744633
1,101,98.0,2.0,100,2015-12-14,1978-09-09,37,Female,166.0,61.0,3.226592
2,102,98.0,2.0,100,2015-12-21,1993-12-12,22,Female,143.0,44.6,2.528714
3,107,94.0,6.0,100,2015-12-21,1997-11-12,18,Male,175.0,57.8,4.393125
4,113,97.0,3.0,100,2015-12-23,1984-04-28,32,Female,153.0,50.4,2.798263


In [8]:
fig = px.scatter(
    df_hfev1_ld_factor,
    x="Age",
    y="Lung Damage (%)",
    color="Sex",
    color_continuous_scale="YlGnBu",
)
fig.add_shape(
    type="line",
    x0=df_hfev1_ld_factor["Healthy O2 Saturation (%)"].min(),
    y0=0,
    x1=df_hfev1_ld_factor["Healthy O2 Saturation (%)"].max(),
    y1=0,
    line=dict(color="Black", width=1),
)
title = "Impact of lung damage on healthy O2 Sat ({} points)".format(
    df_hfev1_ld_factor.shape[0]
)
fig.update_layout(autosize=False, width=700, height=500, title=title)
fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
fig.show()
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

## Small airway blockage for O2 saturation

In [3]:
df_uo2_sab_factor = ex_labels_data.inner_merge_with(
    O2_FEV1, ex_labels_data.load(), exclude_no_ex=True
)

transition_before = 1
transition_after = 2
df_uo2_sab_factor = ex_labels_data.mark_ex_transition_period(
    df_uo2_sab_factor, transition_before, transition_after
)

df_ho2_ld_factor = factors_data.compute_ho2_ld_factor(O2_FEV1).drop(
    columns=["Lung Damage (%)", "Healthy O2 Saturation (%)"]
)

# Left join df_blockage_factor with df_unblocked_factor on ID
df_uo2_sab_factor = pd.merge(df_uo2_sab_factor, df_ho2_ld_factor, on="ID", how="left")
df_uo2_sab_factor["Small Airway Blockage (%)"] = (
    1
    - df_uo2_sab_factor["O2 Saturation"]
    / df_uo2_sab_factor["Unblocked O2 Saturation (%)"]
) * 100

# Sort values by Unblocked O2 Saturation (%)
df_uo2_sab_factor = df_uo2_sab_factor.sort_values(by=["Unblocked O2 Saturation (%)"])
df_uo2_sab_factor.head()


** Loading exacerbation labels from the predictive classifier **
Initially: 14452 entries (1445 True, 13006 False)
Excluding 1 NaN entry
Finally: 14451 entries (1445 True, 13006 False)

** Inner merge of O2_FEV1 and exacerbated labels on 'ID' and 'Date Recorded' **
Data has now 7994 entries and 103 IDs (initially 9526 & 146 in O2_FEV1, 14451 in pred_ex_labels)

Excluding all datapoints for IDs that have no exacerbated labels
Removed 50 IDs, 4002 stable labels (initially 103 IDs and 7994 labels)
Data has now 3992 entries and 53 IDs

** Marking (1, 2) transition window around exacerbation start **
Initially:
False    3269
True      723
Name: Is Exacerbated, dtype: int64
Finally:
0.0    3235
1.0     622
0.5     135
Name: Exacerbation State, dtype: int64


Unnamed: 0,ID,Date Recorded,O2 Saturation,FEV1,Study Date,DOB,Age,Sex,Height,Weight,Predicted FEV1,FEV1 % Predicted,Is Exacerbated,Exacerbation State,Unblocked O2 Saturation (%),Small Airway Blockage (%)
3464,59,2016-04-03,91.0,0.53,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,15.638981,True,1.0,93.0,2.150538
3456,59,2016-03-16,92.0,0.7,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,20.655258,False,0.0,93.0,1.075269
3457,59,2016-03-19,93.0,0.74,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,21.835558,False,0.0,93.0,0.0
3458,59,2016-03-20,90.0,0.68,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,20.065108,False,0.0,93.0,3.225806
3459,59,2016-03-23,94.0,0.68,2016-02-10,1995-01-17,21,Female,164.0,53.0,3.388968,20.065108,True,0.5,93.0,-1.075269


### SAB % - vertical line for each ID - O2

In [18]:
# Plot Unblocked O2 Saturation (%) in x, measured fev1 in y, and color by Is Exacerbated
fig = px.scatter(
    df_uo2_sab_factor,
    x="ID",
    y="Small Airway Blockage (%)",
    color="Is Exacerbated",
    color_discrete_sequence=[get_stable_color(0.2), get_ex_color(0.2)],
)
# Set True to blue and False to red
fig.for_each_trace(
    lambda trace: trace.update(marker_color=get_ex_color(0.2))
    if trace.name == "True"
    else trace.update(marker_color=get_stable_color(0.2))
)
title = "Impact of % small airways blockage on unblocked O2 Sat ({} points)".format(
    df_uo2_sab_factor.shape[0]
)
fig.update_layout(autosize=False, width=1300, height=600, title=title)


# Add scatter plot with marker lines of the Unblocked O2 Saturation (%) values on a new y axis
# Get df_unblocked_factor with only the IDs present in df_blockage_factor
df_unblocked_with_pred_ids = df_ho2_ld_factor[
    df_ho2_ld_factor["ID"].isin(df_ufev1_sab_factor["ID"])
]
# Put this trace on a new y axis
fig.update_layout(
    yaxis2=dict(overlaying="y", side="right", title="Unblocked O2 Saturation (%)")
)
# Plot the Unblocked O2 Saturation (%) values on the new y axis
fig.add_trace(
    go.Scatter(
        x=df_unblocked_with_pred_ids["ID"],
        y=df_unblocked_with_pred_ids["Unblocked O2 Saturation (%)"],
        mode="markers",
        marker=dict(symbol="line-ew", color="black", line=dict(width=1)),
        yaxis="y2",
    )
)
# Hide legend for this trace
fig.update_traces(showlegend=False, selector=dict(yaxis="y2"))


fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
# Reduce x axis tick labels size
fig.update_xaxes(tickfont=dict(size=7))
fig.show()
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))

# Expectations:
# Measurements done in exacerbated period are related to a higher % of airway blockage
# The higher the unblocked FEV1, the sicker the people = more likely to be in permanent inflammatory state = the higher the % of airway blockage

# Observations:
# Can't analyse because of point overlap (O2 sat values are integers)

### Boxplot O2 saturation - stable FEV1 % predicted

In [6]:
# Add Predicted SpO2
df_uo2_sab_factor["Predicted SpO2"] = df_uo2_sab_factor.apply(
    lambda x: healthy_o2_sat.calc_healthy_O2_sat(x["Height"], x.Sex)["mean"],
    axis=1,
)

# Reproduce the plot but split exacerbated/stable datapoints into two sub box plots

## Params
### O2 Saturation values
O2_col = "O2 Saturation"
O2_col = "Predicted SpO2"

### Lung function column name to order the IDs by
use_pred_fev1 = False
# use_pred_fev1 = True
fev1_col = "FEV1 % Predicted"
# fev1_col = "FEV1"

# transition = "ignored"  # baseline
# transition = "included"
transition = "excluded"

# transition = "ignored":
ex_col = "Is Exacerbated"
title_suffix = ""

df_uo2_sab_factor_plot = factors_data.compute_avg_lung_func_stable(
    df_uo2_sab_factor, fev1_col
)

# Removing IDs that don't have exacerbated labels outside of the transition period
if transition == "excluded" or transition == "included":
    mask_excluding_transition = df_uo2_sab_factor_plot["Exacerbation State"] != 0.5
    df_uo2_sab_factor_plot = ex_labels_data.exclude_IDs_where_no_ex_labels(
        df_uo2_sab_factor_plot[mask_excluding_transition]
    )

if transition == "excluded":
    # Drop rows where Exacerbation State is 0.5
    df_uo2_sab_factor_plot = df_uo2_sab_factor_plot[
        df_uo2_sab_factor_plot["Exacerbation State"] != 0.5
    ]
    title_suffix = (
        f", excluding ({transition_before}, {transition_after}) transition period"
    )
if transition == "included":
    ex_col = "Exacerbation State"
    title_suffix = f", with ({transition_before}, {transition_after}) transition period"

# Compute average diff of means between an ID's stable and exacerbated periods
avg_diff_of_means = factors_data.get_avg_diff_of_means(
    ex_labels_data.at_least_n_datapoints(df_uo2_sab_factor_plot, 10),
    O2_col,
    ex_col,
)

if use_pred_fev1:
    # Order by avg FEV1 % Pred in stable period
    df_uo2_sab_factor_plot = df_uo2_sab_factor_plot.sort_values(
        by=["Predicted FEV1"], ascending=True
    )
    df_uo2_sab_factor_plot["ID (predicted FEV1)"] = df_uo2_sab_factor_plot.apply(
        lambda x: f"{x.ID} ({str(round(x[f'Predicted FEV1'],1))}L, {x['Age']}yr, {int(x['Height'])}cm, {x['Sex']})",
        axis=1,
    )
    ## Plot
    title = "Boxplots for O2 Sat, ordered by Predicted FEV1 ({} points{})".format(
        df_uo2_sab_factor_plot.shape[0], title_suffix
    )

    fig = px.box(
        df_uo2_sab_factor_plot,
        x="ID (predicted FEV1)",
        y=O2_col,
        color=ex_col,
    )
else:
    ## Plot
    title = f"Boxplots for {O2_col}, ordered by avg {fev1_col} in stable period ({df_uo2_sab_factor_plot.shape[0]} points{title_suffix})"

    fig = px.box(
        df_uo2_sab_factor_plot,
        x=f"ID (avg {fev1_col} in stable period)",
        y=O2_col,
        color=ex_col,
    )

# Set True to blue and False to red
fig.for_each_trace(
    lambda trace: trace.update(marker_color=get_ex_color(1))
    if trace.name == "True" or trace.name == "1.0"
    else trace.update(marker_color=get_stable_color(1))
    if trace.name == "False" or trace.name == "0.0"
    else trace.update(marker_color=get_transition_color(0.2))
)
fig.update_traces(boxmean=True)
# Add annotation for mean difference
fig.add_annotation(
    x=0,
    y=1.05,
    xref="paper",
    yref="paper",
    text="Avg diff of means: {:.2f}".format(avg_diff_of_means),
    showarrow=False,
)

fig.update_layout(autosize=False, width=2100, height=500, title=title)
fig.show()
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))


Excluding all datapoints for IDs that have no exacerbated labels
Removed 6 IDs, 234 stable labels (initially 53 IDs and 3857 labels)

Excluding IDs that have less than 10 stable or exacerbated datapoints
Removed 23/47 IDs with too few data (<10 ex or stable measurements)


### Plot the mean of stable O2 saturation for each ID

In [28]:
def stable_measurements(df):
    return df["Is Exacerbated"] == False


# Healthy O2 Sat - Lund Damage curve
o2_25_prctile_col_name = "Mean O2 Saturation"
df_ho2_ld_curve = (
    df_uo2_sab_factor_plot.loc[stable_measurements(df_uo2_sab_factor)]
    .groupby(
        [
            "ID (avg FEV1 % Predicted in stable period)",
            "Avg FEV1 % Predicted in stable period",
        ],
        dropna=False,
    )["O2 Saturation"]
    .mean()
    # .quantile(0.25)
    .reset_index()
    .rename(columns={"O2 Saturation": o2_25_prctile_col_name})
    .sort_values(by="Avg FEV1 % Predicted in stable period")
)

# Smooth the curve

## Use medically informed model: increase up to 60% avg FEV1 % Pred in stable period, then constant
medical_smoothing = "increase up to 60%, then constant"
AVG_STABLE_FEV1_PRED_THRESHOLD = 60
o2_constant_mask = (
    df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"]
    > AVG_STABLE_FEV1_PRED_THRESHOLD
)

### Constant part
O2_SAT_LOW_LUNG_DAMAGE = (
    df_ho2_ld_curve[o2_25_prctile_col_name].loc[o2_constant_mask].mean()
)
print(f"O2_SAT_LOW_LUNG_DAMAGE: {O2_SAT_LOW_LUNG_DAMAGE}")

### Regression
### Linear regression of the 25th percentile of O2 saturation for avg FEV1 % Pred in stable period < 60
from sklearn.linear_model import LinearRegression

print("*** Apply linear regression ***")
x = (
    df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"]
    .loc[~o2_constant_mask]
    .values.reshape(-1, 1)
)
y = df_ho2_ld_curve[o2_25_prctile_col_name].loc[~o2_constant_mask]
model = LinearRegression().fit(x, y)
r_sq = model.score(x, y)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}")
print("")
O2_sat_high_lung_damage = model.predict(
    df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"]
    .loc[~o2_constant_mask]
    .values.reshape(-1, 1)
)

df_ho2_ld_curve[o2_25_prctile_col_name + " - " + medical_smoothing] = np.nan
df_ho2_ld_curve[o2_25_prctile_col_name + " - " + medical_smoothing].loc[
    o2_constant_mask
] = O2_SAT_LOW_LUNG_DAMAGE
df_ho2_ld_curve[o2_25_prctile_col_name + " - " + medical_smoothing].loc[
    ~o2_constant_mask
] = O2_sat_high_lung_damage


## Moving average: not used at it leads to information loss
df_ho2_ld_curve[o2_25_prctile_col_name + " - moving avg"] = (
    df_ho2_ld_curve[o2_25_prctile_col_name].rolling(5).mean()
)

## Smoothing spline
from scipy.interpolate import splrep, BSpline

### Base value for smoothing parameter
s = df_ho2_ld_curve.shape[0] - np.sqrt(2 * df_ho2_ld_curve.shape[0])
print(f"Smoothing parameter: {s}")
s = 80
### Create a spline representation of the curve
### tck-tuple: (t,c,k) containing the vector of knots, the B-spline coefficients, and the degree of the spline.
tck = splrep(
    df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"],
    df_ho2_ld_curve[o2_25_prctile_col_name],
    s=s,
)
### Evalute the spline repr on a new set of points
df_ho2_ld_curve[o2_25_prctile_col_name + " - smoothed"] = BSpline(*tck)(
    df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"]
)

# Plot scatter of ID (avg FEV1 % Pred in stable period) in x, 25th percentile of O2 saturation in y, sorted by Avg FEV1 % Predicted in stable period with go
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"],
        y=df_ho2_ld_curve[o2_25_prctile_col_name],
        mode="markers",
        name="raw",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"],
        y=df_ho2_ld_curve[o2_25_prctile_col_name + " - " + medical_smoothing],
        mode="lines",
        name=medical_smoothing,
    )
)
fig.add_trace(
    go.Scatter(
        x=df_ho2_ld_curve["Avg FEV1 % Predicted in stable period"],
        y=df_ho2_ld_curve[o2_25_prctile_col_name + " - smoothed"],
        mode="lines",
        name="smoothed",
    )
)
title = f"Healthy O2 Saturation - Lung Damage ({df_ho2_ld_curve.shape[0]} individuals)"
fig.update_layout(
    autosize=False,
    width=1200,
    height=500,
    title=title,
    xaxis_title="Avg FEV1 % Predicted in stable period",
    yaxis_title=o2_25_prctile_col_name,
)
fig.show()
# Save
fig.write_image(
    "{}/Factors - {}.pdf".format(plotsdir, title),
    width=1200,
    height=500,
)

O2_SAT_LOW_LUNG_DAMAGE: 97.0023013881034
*** Apply linear regression ***
coefficient of determination: 0.34895442178865077
intercept: 90.8899024914976
slope: [0.1030517]

Smoothing parameter: 37.30464028516734


### Study SpO2 sex bias
All: mean female SpO2 - mean male SpO2 = 96.2 (1.8%) - 93.9 (2.2%) = 2.3%

\>60% FEV1 % Predicted avg: 97 (1.4%) - 96 (1.4%) = 1%

To few data for >80%

In [7]:
df_males = df_uo2_sab_factor_plot[df_uo2_sab_factor_plot.Sex == "Male"]
df_females = df_uo2_sab_factor_plot[df_uo2_sab_factor_plot.Sex == "Female"]
assert len(df_males) + len(df_females) == len(df_uo2_sab_factor_plot)

In [9]:
df_females["O2 Saturation"].describe()

count    2360.000000
mean       96.187712
std         1.844512
min        84.000000
25%        95.000000
50%        96.000000
75%        98.000000
max       100.000000
Name: O2 Saturation, dtype: float64

In [8]:
df_males["O2 Saturation"].describe()

count    1263.000000
mean       93.862233
std         2.212561
min        83.000000
25%        93.000000
50%        94.000000
75%        95.000000
max        99.000000
Name: O2 Saturation, dtype: float64

In [17]:
threshold = 60
df_females[df_females["Avg FEV1 % Predicted in stable period"] > threshold][
    "O2 Saturation"
].describe()

count    873.000000
mean      96.982818
std        1.359954
min       92.000000
25%       96.000000
50%       97.000000
75%       98.000000
max      100.000000
Name: O2 Saturation, dtype: float64

In [18]:
df_males[df_males["Avg FEV1 % Predicted in stable period"] > threshold][
    "O2 Saturation"
].describe()

count    189.000000
mean      96.042328
std        1.428549
min       91.000000
25%       95.000000
50%       96.000000
75%       97.000000
max       99.000000
Name: O2 Saturation, dtype: float64

## O2 sat grouped by  vs lung damage

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_lms.Age,
        y=df_lms.M,
        mode="markers+lines",
        name="M curve",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_lms.Age,
        y=df_lms.M - df_lms.S,
        mode="lines",
        name="M - S curve",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_lms.Age,
        y=df_lms.LLN,
        mode="lines",
        name="LLN curve",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_lms.Age,
        y=df_lms.M - df_lms["sigma from LLN"],
        mode="lines",
        name="Sigma curve (LLN - M)/(-1.64)",
    )
)
fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Predicted FEV1 (L) (male, )",
)

In [None]:
# Group O2 sat by IQR

## Archive

In [None]:
# Factor function for unblocked FEV1 (L)
df_hfev1_ld_factor = pd.DataFrame(
    columns=["ID,", "Unblocked FEV1 (L)", "Healthy FEV1 (L)"]
)
for id in O2_FEV1.ID.unique():
    # For a given patient id, filter the FEV1 measurements
    mask = O2_FEV1["ID"] == id
    O2_FEV1_patient = O2_FEV1[mask]
    # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
    # To avoid taking an outlier up, which is third highest FEV1 measurement
    rmax = O2_FEV1_patient["FEV1"].nlargest(3).iloc[-1]
    # Get the theoretical healthy FEV1 (L)
    healthy_fev1 = O2_FEV1_patient["Predicted FEV1"].iloc[0]
    # Add the patient id, reversed max FEV1 and healthy FEV1 (L) to the dataframe
    new_row = pd.DataFrame(
        {"ID": [id], "Unblocked FEV1 (L)": [rmax], "Healthy FEV1 (L)": [healthy_fev1]}
    )
    df_hfev1_ld_factor = pd.concat([df_hfev1_ld_factor, new_row])


def add_lung_damage(fig, df, lung_damage_prct):
    xmax = df["Healthy FEV1 (L)"].max()
    xmin = df["Healthy FEV1 (L)"].min()
    a = 1 - lung_damage_prct / 100
    fig.add_shape(
        type="line",
        x0=xmin,
        y0=a * xmin,
        x1=xmax,
        y1=a * xmax,
        line=dict(color="Red", width=0.5),
    )
    # Add line legend for no lung damage
    fig.add_annotation(
        x=1.02 * xmax,
        y=a * xmax,
        text="{}%".format(lung_damage_prct),
        showarrow=False,
        font=dict(size=10, color="Red"),
    )


# Plot a scatter of unblocked FEV1 (L) against healthy FEV1 (L)
fig = px.scatter(df_hfev1_ld_factor, x="Healthy FEV1 (L)", y="Unblocked FEV1 (L)")
add_lung_damage(fig, df_hfev1_ld_factor, 0)
add_lung_damage(fig, df_hfev1_ld_factor, 50)
title = "Impact of lung damage on healthy FEV1 ({} points)".format(
    df_hfev1_ld_factor.shape[0]
)
fig.update_layout(autosize=False, width=500, height=500, title=title)
fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
fig.show()
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

In [None]:
df_ufev1_sab_factor = ex_labels_data.inner_merge_with(
    O2_FEV1, ex_labels_data.load()
)  # results can be influenced by the addition of the exclude_no_ex_ids flag on 06.09.2023

df_hfev1_ld_factor = pd.DataFrame(
    columns=["ID,", "Unblocked FEV1 (L)", "Healthy FEV1 (L)"]
)
for id in O2_FEV1.ID.unique():
    # For a given patient id, filter the FEV1 measurements
    mask = O2_FEV1["ID"] == id
    O2_FEV1_patient = O2_FEV1[mask]
    # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
    # To avoid taking an outlier up, which is third highest FEV1 measurement
    rmax = O2_FEV1_patient["FEV1"].nlargest(3).iloc[-1]
    # Get the theoretical healthy FEV1 (L)
    healthy_fev1 = O2_FEV1_patient["Predicted FEV1"].iloc[0]
    # Add the patient id, reversed max FEV1 and healthy FEV1 (L) to the dataframe
    new_row = pd.DataFrame(
        {"ID": [id], "Unblocked FEV1 (L)": [rmax], "Healthy FEV1 (L)": [healthy_fev1]}
    )
    df_hfev1_ld_factor = pd.concat([df_hfev1_ld_factor, new_row])

# Left join df_blockage_factor with df_unblocked_factor on ID
df_ufev1_sab_factor = pd.merge(
    df_ufev1_sab_factor, df_hfev1_ld_factor, on="ID", how="left"
)

# Plot unblocked fev1 (L) in x, measured fev1 in y, and color by Is Exacerbated
fig = px.scatter(
    df_ufev1_sab_factor,
    x="Unblocked FEV1 (L)",
    y="FEV1",
    color="Is Exacerbated",
    color_discrete_sequence=[get_stable_color(0.3), get_ex_color(0.3)],
)
title = "Impact of % small airways blockage on unblocked FEV1 ({} points)".format(
    df_ufev1_sab_factor.shape[0]
)
fig.update_layout(autosize=False, width=800, height=800, title=title)
fig.update_traces(marker=dict(size=5), selector=dict(mode="markers"))
fig.show()
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))

In [None]:
# Boxplots with mean instead of median
id = "141"
df_for_ID = df_ufev1_sab_factor[df_ufev1_sab_factor["ID"] == id]

# Compute mean of FEV1 % Predicted for this ID for exacerbated and stable periods
mean_ex = df_for_ID[df_for_ID[ex_col] == True]["FEV1 % Predicted"].mean()
mean_stable = df_for_ID[df_for_ID[ex_col] == False]["FEV1 % Predicted"].mean()
# Compute sd
sd_ex = df_for_ID[df_for_ID[ex_col] == True]["FEV1 % Predicted"].std()
sd_stable = df_for_ID[df_for_ID[ex_col] == False]["FEV1 % Predicted"].std()

max_ex = df_for_ID[df_for_ID[ex_col] == True]["FEV1 % Predicted"].max()
min_ex = df_for_ID[df_for_ID[ex_col] == True]["FEV1 % Predicted"].min()
max_stable = df_for_ID[df_for_ID[ex_col] == False]["FEV1 % Predicted"].max()
min_stable = df_for_ID[df_for_ID[ex_col] == False]["FEV1 % Predicted"].min()

# Create figure trace
fig = go.Figure(
    data=[
        {
            "alignmentgroup": "True",
            "hovertemplate": "<extra></extra>",
            "legendgroup": "True",
            "lowerfence": [min_ex],
            "marker": {"color": "rgba(213,094,000,1)"},
            "median": [mean_ex],
            "name": "",
            "notched": False,
            "offsetgroup": "",
            "orientation": "v",
            "q1": [mean_ex - sd_ex],
            "q3": [mean_ex + sd_ex],
            "showlegend": True,
            "type": "box",
            "upperfence": [max_ex],
            "x0": " ",
            "xaxis": "x",
            "y0": " ",
            "yaxis": "y",
        },
        {
            "alignmentgroup": "True",
            "hovertemplate": "<extra></extra>",
            "legendgroup": "False",
            "lowerfence": [min_stable],
            "marker": {"color": "rgb(000,114,178)"},
            "median": [mean_stable],
            "name": "",
            "notched": False,
            "offsetgroup": "",
            "orientation": "v",
            "q1": [mean_stable - sd_stable],
            "q3": [mean_stable + sd_stable],
            "showlegend": True,
            "type": "box",
            "upperfence": [max_stable],
            "x0": " ",
            "xaxis": "x",
            "y0": " ",
            "yaxis": "y",
        },
    ],
    layout=go.Layout(
        boxmode="group",
        legend={"title": {"text": "Is Exacerbated"}},
        xaxis={"title": {"text": "ID (avg FEV1 % Pred in stable period)"}},
        yaxis={"title": {"text": "FEV1 % Predicted"}},
    ),
)

fig.update_layout(
    autosize=False, width=500, height=500, title="FEV1 % Predicted for ID {}".format(id)
)
fig.show()