The modelling of O2 saturation is difficult because we observed trends/signals in the SmartCare data that are medically hard to explain, or counter intuitive. Hence, we're doing a complementary analysis on the O2 saturation patterns using the Breathe data.

Boxplots of O2 saturation grouped by ID:
- ordered by predicted FEV1
- ordered by avg FEV1 % predicted
- ordered by avg FEV1

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Import biology module
import sys

sys.path.append("../data")
import biology as bio
import breathe_data


plotsdir = "../../../../PlotsSmartcare/O2_FEV1/"


# Data processing

In [3]:
df = breathe_data.build_O2_FEV1_df()


*** Loading patients data ***
Number of IDs:  258
*** Loading measurements data ***
FEV1 and SpO2 NaN together:  0
FEV1 or SpO2 is NaN:  6414
Number of rows:  26812
Dropping NaN rows
Number of rows:  20398
Number of IDs:  258
Removed 149/20398 rows, 40/214 patients


# SpO2 boxplots per individual

In [43]:
# Parametrisation
## O2 Saturation
O2_col = "O2 Saturation"
O2_col = "Predicted SpO2"

# Sort values by ascending FEV1 % Predicted
FEV1_col = "FEV1 % Predicted"
# FEV1_col = "Predicted FEV1"
# FEV1_col = "FEV1"

gender = "both"
# gender = "male"
# gender = "female"

if gender == "both":
    df_plot = df
    title_suffix = ""
elif gender == "male":
    df_plot = df[df.Sex == "Male"]
    title_suffix = ", males only"
elif gender == "female":
    df_plot = df[df.Sex == "Female"]
    title_suffix = ", females only"

df_plot = df_plot.sort_values(
    by=[FEV1_col if FEV1_col == "Predicted FEV1" else f"{FEV1_col}_avg"]
)

title = f"Breathe - Boxplots for {O2_col}, ordered by {FEV1_col} ({df_plot.shape[0]} points{title_suffix})"

fig = px.box(
    df_plot,
    x=f"ID ({FEV1_col if FEV1_col == 'Predicted FEV1' else 'avg ' + FEV1_col})",
    y=O2_col,
)
# Add title
fig.update_layout(title=title)
fig.update_traces(boxmean=True)

# Update fig size
fig.update_layout(height=600, width=3000)
fig.show()
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))


In [162]:
col = "mean FEV1 % Predicted"
col = "Predicted FEV1"
# col = 'mean FEV1'
col = "Height"
col = "Age"

SpO2_is_sex_corrected = True
# SpO2_is_sex_corrected = False

n = 5
O2_type = "mean"
O2_type = f"rmax-{n}"
O2_colname = (
    f"Sex corrected SpO2 {O2_type} agg"
    if SpO2_is_sex_corrected
    else f"SpO2 {O2_type} agg"
)

# Get the FEV1 % Predicted for the each patient
sexes = df.groupby("ID")["Sex"].first()
heights = df.groupby("ID")["Height"].first()
ages = df.groupby("ID")["Age"].first()

# Create a df with means and fev1
df_agg = pd.DataFrame(
    {"ID": O2_agg.index, "Sex": sexes, "Height": heights, "Age": ages}
)


def get_rmax(s_SpO2_for_ID):
    return s_SpO2_for_ID.nlargest(n).iloc[-1]


def apply_correction(row, mean_m, mean_f, std_m, std_f, O2_colname):
    if row.Sex == "Male":
        return (row[O2_colname] - mean_m) / std_m
    if row.Sex == "Female":
        return (row[O2_colname] - mean_f) / std_f


def correct_SpO2_for_sex(df, O2_colname):
    spo2_mean_m = df[df.Sex == "Male"][O2_colname].mean()
    spo2_mean_f = df[df.Sex == "Female"][O2_colname].mean()
    spo2_std_m = df[df.Sex == "Male"][O2_colname].std()
    spo2_std_f = df[df.Sex == "Female"][O2_colname].std()

    return df.apply(
        lambda row: apply_correction(
            row, spo2_mean_m, spo2_mean_f, spo2_std_m, spo2_std_f, O2_colname
        ),
        axis=1,
    )


df_agg["Predicted FEV1"] = df.groupby("ID")["Predicted FEV1"].first()
df_agg["mean FEV1 % Predicted"] = df.groupby("ID")["FEV1 % Predicted"].mean()
df_agg[f"SpO2 rmax-{n} agg"] = df.groupby("ID")["O2 Saturation"].apply(get_rmax)
df_agg[f"SpO2 mean agg"] = df.groupby("ID")["O2 Saturation"].mean()
df_agg[f"Sex corrected SpO2 {O2_type} agg"] = correct_SpO2_for_sex(
    df_agg, f"SpO2 {O2_type} agg"
)

# Order by lung function variable
df_agg = df_agg.sort_values(by=col)

# Scatter plot of means
fig = px.scatter(x=df_agg[col], y=df_agg[O2_colname], color=df_agg.Sex)
# fig = px.scatter(x=df_agg[col], y=df_agg[f"SpO2 {O2_type} agg"], color=df_agg.Sex)
# Update font size
fig.update_layout(
    font=dict(
        size=8,
    ),
)


# Compute Pearson correlation coefficient
def calc_bootstrapped_corr(df, var1, var2, size=0.9, n=2000):
    """
    Calculate Pearson correlation coefficient by bootstrapping n times with 90% of the data
    """
    corr_all = []
    corr_m = []
    corr_f = []
    for i in range(n):
        sample_all = df.sample(frac=size)
        sample_m = sample_all[sample_all.Sex == "Male"]
        sample_f = sample_all[sample_all.Sex == "Female"]
        corr_all.append(sample_all.corr().loc[var1, var2])
        corr_m.append(sample_m.corr().loc[var1, var2])
        corr_f.append(sample_f.corr().loc[var1, var2])
    # Return max min string for corr_all, corr_m, corr_f
    return (
        f"[{round(np.min(corr_all), 2)};{round(np.max(corr_all), 2)}]",
        f"[{round(np.min(corr_m), 2)};{round(np.max(corr_m), 2)}]",
        f"[{round(np.min(corr_f), 2)};{round(np.max(corr_f), 2)}]",
    )


# corr_all = df_agg.corr().loc[col, O2_colname]
# corr_m = df_agg[df_agg.Sex == "Male"].corr().loc[col, O2_colname]
# corr_f = df_agg[df_agg.Sex == "Female"].corr().loc[col, O2_colname]

print(
    f"Calculating Bootstrapped Pearson correlation coefficients for {col} and {O2_colname}"
)
corr_all_range, corr_m_range, corr_f_range = calc_bootstrapped_corr(
    df_agg, col, O2_colname
)

# title = f"Breathe - {O2_colname} vs {col} ({len(df_agg)} individuals, Pearson corr: all={round(corr_all, 2)}, males={round(corr_m,2)}, females={round(corr_f,2)}"
title = f"Breathe - {O2_colname} vs {col} ({len(df_agg)} IDs, boots. corr.: all={corr_all_range}, m={corr_m_range}, f={corr_f_range}"

# Add Pearson correlation coefficient to title
fig.update_layout(title=title, width=800, height=300)
# Reduce marker size
fig.update_traces(marker=dict(size=3))
# Set axis labels
fig.update_xaxes(title_text=f"{col}")
# fig.update_xaxes(title_text=f'{len(df_agg)} individuals (ordered by FEV1 % Predicted)', showgrid=False, showticklabels=False)
# Put the x axis title on bottom
if SpO2_is_sex_corrected:
    fig.update_yaxes(title_text=O2_colname)
if not (SpO2_is_sex_corrected):
    fig.update_yaxes(range=[90, 101], nticks=10, title_text=f"SpO2 {O2_type} agg")
fig.show()
# Save fig
fig.write_image(f"{plotsdir}/Factors - {title}.pdf")


Calculating Bootstrapped Pearson correlation coefficients for Age and Sex corrected SpO2 rmax-5 agg


# Study the 60% FEV1 % Predicted cut-off

In [23]:
# Split dataframe into 2 based on FEV1 % Predicted = 60%
df_low = df[df["FEV1 % Predicted"] <= 60]
df_high = df[df["FEV1 % Predicted"] > 60]

# For each individual, compute the average O2 saturation
means_low = df_low.groupby("ID")["O2 Saturation"].mean()
means_high = df_high.groupby("ID")["O2 Saturation"].mean()


# Compute the achievable range of values between 75th percentile and 25th percentile
achievable_ranges_low = df_low.groupby("ID")["O2 Saturation"].quantile(
    0.75
) - df_low.groupby("ID")["O2 Saturation"].quantile(0.25)
achievable_ranges_high = df_high.groupby("ID")["O2 Saturation"].quantile(
    0.75
) - df_high.groupby("ID")["O2 Saturation"].quantile(0.25)


In [24]:
# Compare mean of means_low with mean of means_high
print(f"Mean of means_low: {means_low.mean()}")
print(f"Mean of means_high: {means_high.mean()}")

# Same for achievable ranges
print(f"Mean of achievable_ranges_low: {achievable_ranges_low.mean()}")
print(f"Mean of achievable_ranges_high: {achievable_ranges_high.mean()}")

Mean of means_low: 96.45803558105217
Mean of means_high: 97.44298835023818
Mean of achievable_ranges_low: 1.231818181818182
Mean of achievable_ranges_high: 0.9890510948905109


In [25]:
# Plot a boxplot of the achievable_ranges_low and achievable_ranges_high side by side
fig = go.Figure()
fig.add_trace(
    go.Box(y=achievable_ranges_low, name="FEV1 % Predicted <60%", boxmean=True)
)
fig.add_trace(
    go.Box(y=achievable_ranges_high, name="FEV1 % Predicted >60%", boxmean=True)
)
# Update fig size
fig.update_layout(
    title="Achievable ranges of O2 saturation (75th-25th percentiles)",
    height=600,
    width=600,
)
fig.show()


# Study SpO2 sex bias

In [26]:
df_males = df[df.Sex == "Male"]
df_females = df[df.Sex == "Female"]
assert len(df_males) + len(df_females) == len(df)

In [27]:
df_females["O2 Saturation"].describe()

count    10988.000000
mean        97.156656
std          1.704953
min         85.000000
25%         96.000000
50%         98.000000
75%         98.000000
max        100.000000
Name: O2 Saturation, dtype: float64

In [28]:
df_males["O2 Saturation"].describe()

count    9255.000000
mean       96.748664
std         1.496404
min        89.000000
25%        96.000000
50%        97.000000
75%        98.000000
max       100.000000
Name: O2 Saturation, dtype: float64

Conclusions:

All: mean female SpO2 - mean male SpO2 = 97.2 (1.7%) - 96.8 (1.5%) = 0.4%

\>60% FEV1 % Predicted avg: 97.6 (1.3%) - 97.2 (1.2%) = 0.4%

\>80% FEV1 % Predicted avg: 97.9 (1.2%) - 97.3 (1.1%) = 0.6%

\>90% FEV1 % Predicted avg: 98.2 (0.9%) - 97.2 (1.2%) = 1%

____________
Only within 18-40yr range (this excludes 20 females and 10 males)

All: 97.2 (1.7%) - 96.8 (1.5%) = 0.4%

\>90% FEV1 % Predicted avg: 98 (0.9%) - 97.2 (1.2%) = 0.8%

In [29]:
# Threshold for healthy individuals
threshold_fev1_prct_pred = 60
df_healthy_males = df_males[
    df_males["FEV1 % Predicted_avg"] >= threshold_fev1_prct_pred
]
df_healthy_females = df_females[
    df_females["FEV1 % Predicted_avg"] >= threshold_fev1_prct_pred
]


In [30]:
df_healthy_females["O2 Saturation"].describe()

count    5576.000000
mean       97.622788
std         1.385999
min        88.000000
25%        97.000000
50%        98.000000
75%        98.000000
max       100.000000
Name: O2 Saturation, dtype: float64

In [31]:
df_healthy_males["O2 Saturation"].describe()

count    5191.000000
mean       97.194259
std         1.189012
min        91.000000
25%        97.000000
50%        97.000000
75%        98.000000
max       100.000000
Name: O2 Saturation, dtype: float64

In [32]:
# Within 18-40 age range
df_young_males = df_males[(df_males["Age"] >= 18) & (df_males["Age"] <= 40)]
df_young_females = df_females[(df_females.Age >= 18) & (df_females.Age <= 40)]
# Print length diff against df_males and df_females
print(
    f"{len(df_young_females.ID.unique())}/{len(df_females.ID.unique())} females between 18 and 40"
)
print(
    f"{len(df_young_males.ID.unique())}/{len(df_males.ID.unique())} males between 18 and 40"
)


77/97 females between 18 and 40
63/77 males between 18 and 40


In [33]:
df_young_females["O2 Saturation"].describe()

count    7937.000000
mean       97.197472
std         1.695689
min        85.000000
25%        97.000000
50%        98.000000
75%        98.000000
max       100.000000
Name: O2 Saturation, dtype: float64

In [34]:
df_young_males["O2 Saturation"].describe()

count    6533.000000
mean       96.775815
std         1.547469
min        89.000000
25%        96.000000
50%        97.000000
75%        98.000000
max       100.000000
Name: O2 Saturation, dtype: float64

In [35]:
# Threshold for healthy individuals
threshold_fev1_prct_pred = 90
df_young_males[df_young_males["FEV1 % Predicted_avg"] >= threshold_fev1_prct_pred][
    "O2 Saturation"
].describe()

count    956.000000
mean      97.284414
std        1.125001
min       93.000000
25%       97.000000
50%       98.000000
75%       98.000000
max      100.000000
Name: O2 Saturation, dtype: float64

In [36]:
df_young_females[df_young_females["FEV1 % Predicted_avg"] >= threshold_fev1_prct_pred][
    "O2 Saturation"
].describe()

count    993.000000
mean      98.008056
std        0.967171
min       89.000000
25%       98.000000
50%       98.000000
75%       99.000000
max      100.000000
Name: O2 Saturation, dtype: float64

# Factors
## Healthy O2 Saturation Factor

In [4]:
df.head()

Unnamed: 0,FEV1,O2 Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted,FEV1 % Predicted_avg,ID (avg FEV1 % Predicted),FEV1_avg,ID (avg FEV1),ID (Predicted FEV1),Predicted SpO2
0,1.31,97.0,101,53,Male,173.0,3.610061,36.287474,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),100.103199
1,1.29,96.0,101,53,Male,173.0,3.610061,35.733466,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),99.071207
2,1.32,96.0,101,53,Male,173.0,3.610061,36.564477,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),99.071207
3,1.28,97.0,101,53,Male,173.0,3.610061,35.456463,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),100.103199
4,1.33,98.0,101,53,Male,173.0,3.610061,36.841481,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),101.135191


## Fit O2 saturation with sex, height

In [200]:
# Fit O2 Saturation = a + b*isMale
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from plotly.subplots import make_subplots

model_type = "sex"  # baseline
# model_type = "sex and height"  # Difficult to interprete as height/sex can be dependent in their relation to O2 Sat
model_type = "sex then height"  # Ensure sex captures all relationship with O2 Sat, then fit height
# model_type = "sex then height (no height intercept)"

# Compute isMale
df["isMale"] = pd.get_dummies(df.Sex).Male


def fit_O2sat_by_sex(df, with_height: bool):
    """
    Run a Linear Regression (OLS) to learn the parameters of the model for healthy O2 Saturation
    O2 Saturation = a + b*isMale (+ c*Height)
    """
    if with_height:
        X = np.array([df.isMale, df["Height"]]).T
    else:
        X = np.array([df.isMale]).T

    Y = df["O2 Saturation"]

    regr = (
        linear_model.LinearRegression()
    )  # Do not use fit_intercept = False if you have removed 1 column after dummy encoding
    regr.fit(X, Y)
    return regr


def compute_O2Sat_residual_after_sex_fit(row, intercept, isMaleScale):
    return row["O2 Saturation"] - intercept - isMaleScale * row.isMale


def compute_O2Sat_residual_after_sex_height_fit(
    row, intercept, isMaleScale, heightScale
):
    return (
        row["O2 Saturation"]
        - intercept
        - isMaleScale * row.isMale
        - heightScale * (row.Height - df.Height.mean())
    )


def further_fit_O2sat_by_height(df, a: float, b: float, with_intercept: bool):
    """
    a: intercept
    b: coefficient of isMale

    Once we have fit O2 Saturation = a + b*isMale, we can compute the residuals of this fit
    We then fit those residuals against Height to capture the remaining relationship of O2 Sat with height
    Id est we fit O2 Saturation - a - b*isMale = c*Height + d using an OLS
    """
    O2Sat_residuals = df.apply(
        lambda x: compute_O2Sat_residual_after_sex_fit(x, a, b), axis=1
    )
    Height_scaled = df.Height - df.Height.mean()

    X = np.array([Height_scaled]).T
    Y = O2Sat_residuals

    regr = linear_model.LinearRegression(fit_intercept=with_intercept)
    regr.fit(X, Y)
    return regr


health_thresholds = range(10, 95, 5)  # avg FEV1 % Predicted
a = []
b = []
if (
    model_type == "sex and height"
    or model_type == "sex then height (no height intercept)"
):
    c = []
if model_type == "sex then height":
    c = []
    d = []
n_IDs = []
n_m_IDs = []
n_f_IDs = []
n_datapoints = []
valid_health_thresholds = []


def filter_healthiest(df, health_threshold):
    return df[df["FEV1 % Predicted"] >= health_threshold]


for health_threshold in health_thresholds:
    df_healthiest = filter_healthiest(df, health_threshold)

    if len(df_healthiest) == 0:
        continue

    n_IDs.append(len(df_healthiest.ID.unique()))
    n_m_IDs.append(len(df_healthiest[df_healthiest.Sex == "Male"].ID.unique()))
    n_f_IDs.append(len(df_healthiest[df_healthiest.Sex == "Female"].ID.unique()))
    n_datapoints.append(len(df_healthiest))
    valid_health_thresholds.append(health_threshold)

    regr = fit_O2sat_by_sex(df_healthiest, model_type == "sex and height")
    tmp_a = regr.intercept_
    tmp_b = regr.coef_[0]
    a.append(tmp_a)
    b.append(tmp_b)
    if model_type == "sex and height":
        c.append(regr.coef_[1])
    elif model_type == "sex then height":
        # In this case, we fit O2 Saturation - a - b*isMale = c*Height + d
        regr_height = further_fit_O2sat_by_height(
            df_healthiest,
            tmp_a,
            tmp_b,
            True,
        )
        c.append(regr_height.coef_[0])
        d.append(regr_height.intercept_)
    elif model_type == "sex then height (no height intercept)":
        # In this case, we fit O2 Saturation - a - b*isMale = c*Height
        regr_height = further_fit_O2sat_by_height(
            df_healthiest,
            tmp_a,
            tmp_b,
            False,
        )
        c.append(regr_height.coef_[0])

prct_datapoints = np.array(n_datapoints) / len(df) * 100
prct_females = np.array(n_f_IDs) / np.array(n_IDs) * 100

# X_chart = [
#     f"{t}% ({f_id}+{m_id}, {p})"
#     for t, p, f_id, m_id in zip(valid_health_thresholds, n_datapoints, n_f_IDs, n_m_IDs)
# ]
X_chart = [
    f"{t}% ({int(datapoints)}%, {int(females)}%)"
    for t, datapoints, females in zip(
        valid_health_thresholds, prct_datapoints, prct_females
    )
]

avg_height_str = f"avg_height_{int(df.Height.mean())}"

title = "Breathe - Fitting O2 Saturation = a + b*isMale"
nrows = 3
fig_height = 600
if model_type == "sex and height":
    nrows = 3
    title = "Breathe - Fitting O2 Saturation = a + b*isMale + c*Height"
    fig_height = 600
if model_type == "sex then height":
    nrows = 5
    title = f"Breathe - Fitting O2 Saturation = a + b*isMale + c*(Height - {avg_height_str})"
    fig_height = 800
if model_type == "sex then height (no height intercept)":
    nrows = 4
    title = f"Breathe - Fitting O2 Saturation = b*isMale + c*(Height - {avg_height_str})"
    fig_height = 600

fig = make_subplots(
    rows=nrows,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.02,
)
# On the first subplot, add a, on the second b, on the third c
fig.add_trace(go.Scatter(x=X_chart, y=a, name="a"), row=1, col=1)
# Add annotation below the point a[-3] to show the value of a[-3]
fig.add_annotation(
    x=X_chart[-3],
    y=a[-3],
    text=f"a={round(a[-3], 2)}",
    showarrow=True,
    arrowhead=1,
    row=1,
    col=1,
)
# Add line at 98.4% on for a
fig.add_shape(
    type="line",
    x0=0,
    y0=98.4,
    x1=len(X_chart) - 1,
    y1=98.4,
    line=dict(color="blue", width=1, dash="dash"),
    row=1,
    col=1,
)
fig.add_trace(go.Scatter(x=X_chart, y=b, name="b"), row=2, col=1)
# Add annotation below the point b[-3] to show the value of b[-3]
fig.add_annotation(
    x=X_chart[-3],
    y=b[-3],
    text=f"b={round(b[-3], 2)}",
    showarrow=True,
    arrowhead=1,
    row=2,
    col=1,
)
# Add a dashed red line at -0.6 for b
fig.add_shape(
    type="line",
    x0=0,
    y0=-0.6,
    x1=len(X_chart) - 1,
    y1=-0.6,
    line=dict(color="red", width=1, dash="dash"),
    row=2,
    col=1,
)
if model_type == "sex":
    fig.add_trace(
        go.Scatter(x=X_chart, y=np.array(b) + np.array(a), name="b+a"), row=3, col=1
    )
    fig.update_traces(row=3, col=1, marker_color="black")
elif model_type == "sex and height":
    fig.add_trace(go.Scatter(x=X_chart, y=c, name="c"), row=3, col=1)
elif (
    model_type == "sex then height"
    or model_type == "sex then height (no height intercept)"
):
    fig.add_trace(
        go.Scatter(x=X_chart, y=np.array(b) + np.array(a), name="b+a"), row=3, col=1
    )
    fig.update_traces(row=3, col=1, marker_color="black")
    fig.add_trace(go.Scatter(x=X_chart, y=c, name="c"), row=4, col=1)
    # Add annotation below the point c[-3] to show the value of c[-3]
    fig.add_annotation(
        x=X_chart[-3],
        y=c[-3],
        text=f"c={round(c[-3], 3)}",
        showarrow=True,
        arrowhead=1,
        row=4,
        col=1,
    )
    if model_type == "sex then height":
        fig.add_trace(go.Scatter(x=X_chart, y=d, name="d"), row=5, col=1)


# Update y axis names for each subplot
fig.update_yaxes(title="a (Female SpO2)", nticks=10, row=1, col=1, range=[94, 102])
fig.update_yaxes(title="b", nticks=10, row=2, col=1, range=[-1.1, -0.1])
if model_type == "sex":
    fig.update_yaxes(title="a+b (Male SpO2)", nticks=10, row=3, col=1, range=[94, 102])
if model_type == "sex and height":
    fig.update_yaxes(title="c", nticks=10, row=3, col=1)
elif model_type == "sex then height" or "sex then height (no height intercept)":
    fig.update_yaxes(title="a+b (Male SpO2)", nticks=10, row=3, col=1, range=[94, 102])
    fig.update_yaxes(title="c", nticks=10, row=4, col=1)
    if model_type == "sex then height":
        fig.update_yaxes(title="d", nticks=10, row=5, col=1)


fig.update_layout(title=title)
fig.update_xaxes(
    title_text="Healthiness threshold in FEV1 % Predicted (%datapoints, %females)"
)
fig.update_layout(height=fig_height, width=800)
fig.show()
fig.write_image(f"{plotsdir}/Factors - {title}.pdf")

# Literature would be a = 98.3, b = -1.4
# The healthier the the higher the base value of O2 saturation, and the bigger the difference between males and females.

In [214]:
# Calculate Residual Standard Error (RSE)

# RSE = sqrt(SSR / (n - p - 1))

# SSR: Sum of squared residuals.
# n: Total number of observations.
# p: Number of predictor variables (including the intercept).
# The degrees of freedom for the residuals is (n - p - 1).

## Taking a = 98.07 and b = -0.71 compute the residuals
df_healthiest = filter_healthiest(df, 80)
df_healthiest["(O2Sat - a)/b"] = df_healthiest.apply(
    lambda x: compute_O2Sat_residual_after_sex_fit(x, a[-3], b[-3]), axis=1
)
df_healthiest["O2Sat fit residuals"] = df_healthiest.apply(
    lambda x: compute_O2Sat_residual_after_sex_height_fit(x, a[-3], b[-3], c[-3]),
    axis=1,
)

# Compute RSS of those residuals
n = len(df_healthiest)
## For the fit O2 Saturation = a + b*isMale
rss = np.sum(df["O2Sat fit residuals"] ** 2)
p = 2 # intercept, isMale
rse = np.sqrt(rss / (n - p - 1))
print(f"For the sex fit: RSS={rss:.4f}, RSE={rse:.4f}")

## For the fit O2 Saturation = a + b*isMale + c*Height
rss_full = np.sum(df["(O2Sat - a)/b"] ** 2)
p = 3 # intercept, isMale, Height
rse_full = np.sqrt(rss_full / (n - p - 1))
print(f"For the sex, height fit, RSS={rss_full:.4f}, RSE={rse_full:.4f}")

print(f"Relative RSS improvement: {(rss-rss_full) / rss * 100:.1f}%")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



For the sex fit: RSS=5004.5283, RSE=1.0876
For the sex, height fit, RSS=5020.8233, RSE=1.0895
Relative RSS improvement: 0.3%




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [202]:
# Using the 80% healthiest individuals, compute the std dev of O2 Saturation measurements
# This is the noise of the measurement
# Then compute the std dev of the residuals of the fit
# This is the noise of the model
# Compare the 2
# If the noise of the model is smaller than the noise of the measurement, then the model is useful
# If the noise of the model is bigger than the noise of the measurement, then the model is not useful
df_healthiest = filter_healthiest(df, 80)
df_healthiest["O2 Saturation"].std()


1.1273008437529146

## Plot residuals of O2 Sat fit by Sex

In [187]:
# Plot the residuals of the fit O2 Saturation = a + b*isMale against Height

## Taking a = 98.07 and b = -0.71 compute the residuals
df_healthiest = filter_healthiest(df, 80)
df_healthiest["(O2Sat - a)/b"] = df_healthiest.apply(
    lambda x: compute_O2Sat_residual_after_sex_fit(x, a[-3], b[-3]), axis=1
)
df_healthiest["Height - mean_height"] = df_healthiest.Height - df_healthiest.Height.mean()

# Compute an print correlation between those residuals and height
print(
    f"Correlation between (O2Sat - a)/b and Height: {df['(O2Sat - a)/b'].corr(df.Height)}"
)

# Plot those residuals with height with trendline linera
fig = px.scatter(
    df,
    x="Height - mean_height",
    # x="Height",
    y="(O2Sat - a)/b",
    trendline="ols",
    trendline_color_override="black",
)
fig.show()

Correlation between (O2Sat - a)/b and Height: -0.05777965964306331


In [169]:
def get_HO2Sat(row, a, b, c, mean_height):
    """
    Return the healthy O2 Saturation
    """
    return a + b * row.isMale + c * row.Height - c * mean_height


df_healthiest = filter_healthiest(df, 80)
df_healthiest["Healthy O2 Saturation"] = df_healthiest.apply(
    lambda x: get_HO2Sat(x, a[-3], b[-3], c[-3], df_healthiest.Height.mean()), axis=1
)