In this notebook, I will model the factor that interconnects the airway resistance between two consecutive days.

In [1]:
import src.models.var_builders as var_builders
import src.data.helpers as dh
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import src.models.helpers as mh

In [2]:
(
    HFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
    180, 10, "Male"
)

In [3]:
df = dh.load_excel(
    f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_FEV1.xlsx",
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_ecFEV1.xlsx",
    [AR.name],
    ["Day"],
).drop(columns=["Unnamed: 0", HO2Sat.name, IA.name, HFEV1.name])

In [6]:
def get_days_elapsed_df(df_for_ID, n_days_offset=1):
    df_for_ID = df_for_ID.copy()
    # In day 1, put the number of days that has pasat since day 0, repeat for each day

    def get_days_elapsed(curr, prev):
        if prev == None:
            return None
        return (curr - prev).total_seconds() / 3600 / 24

    df_for_ID["AR mean"] = df_for_ID.apply(lambda x: AR.get_mean(x[AR.name]), axis=1)
    # df_for_ID['AR skewness'] = df_for_ID.apply(lambda x: AR.get_skewness(x[AR.name]), axis=1)

    df_for_ID["Prev date"] = df_for_ID.shift(n_days_offset)["Day"]
    df_for_ID["Prev AR mean"] = df_for_ID.shift(n_days_offset)["AR mean"]
    # df_for_ID['Prev AR skewness'] = df_for_ID.shift(n_days_offset)['AR skewness']

    df_for_ID["Days elapsed"] = df_for_ID.apply(
        lambda x: get_days_elapsed(x["Day"], x["Prev date"]), axis=1
    )
    df_for_ID["AR mean shift"] = df_for_ID["AR mean"] - df_for_ID["Prev AR mean"]
    # df_for_ID['AR skewness shift'] = df_for_ID['AR skewness'] - df_for_ID['Prev AR skewness']

    return df_for_ID[["ID", "Day", "Days elapsed", "AR mean shift"]]
    # return df_for_ID[['ID', 'Day', 'Days elapsed', 'AR mean shift', 'AR skewness shift']]


# out = df.groupby('ID').apply(get_days_elapsed_df).reset_index(drop=True)

## Analyse and validate results for 1 day offset

In [9]:
df1 = df.merge(
    df.groupby("ID").apply(get_days_elapsed_df).reset_index(drop=True),
    on=["ID", "Day"],
    how="inner",
)

  df1 = df.merge(df.groupby('ID').apply(get_days_passed_df).reset_index(drop=True), on=['ID', 'Day'], how='inner')


In [10]:
df1.head()

Unnamed: 0,ID,Day,Airway resistance (%),Days passed,AR mean shift
0,101,2019-01-25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
1,101,2019-01-26,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,-1.341903
2,101,2019-01-27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,2.332655
3,101,2019-01-28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0
4,101,2019-01-29,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,-0.901317


In [11]:
df1.describe()

Unnamed: 0,Days passed,AR mean shift
count,40908.0,40908.0
mean,4.73702,-0.036692
std,20.357683,4.347104
min,1.0,-45.480774
25%,1.0,-1.63458
50%,1.0,0.0
75%,3.0,1.607764
max,980.0,53.672746


In [12]:
df1[df1["Days elapsed"] > 100]

Unnamed: 0,ID,Day,Airway resistance (%),Days passed,AR mean shift
2296,103,2023-09-25,"[0.0, 0.0, 0.0, 0.0, 6.27525413e-05, 0.0003433...",192.0,2.478966
2406,104,2020-03-23,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",152.0,-8.661761
2476,104,2023-03-20,"[0.0214458477, 0.0266964061, 0.0329117122, 0.0...",708.0,-5.112841
2787,106,2021-11-11,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",117.0,0.179750
2858,106,2023-01-18,"[0.0, 0.0, 0.0, 0.0, 1.09412661e-05, 0.0001043...",146.0,6.280519
...,...,...,...,...,...
38909,507,2022-08-09,"[0.06523372, 0.08521032, 0.09294863, 0.0988228...",125.0,-0.430656
39702,513,2023-05-26,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",308.0,33.354319
39784,514,2023-10-17,"[0.0, 0.000309536674, 0.0019575308, 0.00472002...",110.0,-0.579326
40141,518,2022-09-16,"[0.0, 9.20574027e-05, 0.00250103785, 0.0067597...",117.0,2.459672


In [13]:
# Verify that the prev day is indeed correct)
df1.iloc[2295:2297]
# Count number of None
print(df1["Days elapsed"].isna().sum())
# Count number if ids
print(df1["ID"].nunique())
# They should be equal

352
352


In [14]:
df1[df1["AR mean shift"] > 20]
df1.iloc[2537:2539]

Unnamed: 0,ID,Day,Airway resistance (%),Days passed,AR mean shift
2537,104,2023-10-31,"[0.00645966881, 0.00855669673, 0.011294808, 0....",6.0,-3.175633
2538,104,2023-11-06,"[0.0161213898, 0.0203813231, 0.0255672073, 0.0...",6.0,-4.565804


In [16]:
vc = df1["Days elapsed"].value_counts()
# 1/3 of the consecutive indices are more than 1 day apart (~10k entries)
# 97% of the entries are less than 5 days apart from the previous entry
# For the CPT, I'll take 1, 2, 3, 4, 5 days apart, then avg 6-50 -> this last up to the max days diff


# Plot the histogram with vc index and vc values
fig = px.bar(x=vc.index, y=vc.values / sum(vc.values) * 100)
# Set x axis label to day to day difference
fig.update_xaxes(
    title_text="Number of days between two consecutive entries",
    range=[0, 30],
    tickvals=list(range(0, 31, 1)),
)
# Set y axis label to percentage
fig.update_yaxes(
    title_text="Percentage of total entries (%)", tickvals=[2] + list(range(0, 55, 5))
)

title = "Distribution of the time between two measurements"
# Set title
fig.update_layout(title=title, width=800, height=350, font=dict(size=10))

fig.show()

# Save figure
fig.write_image(
    f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
)

## Compute shift in AR mean

In [7]:
# Build aggregate df of shift in AR for different offsets

df_mixed_offset = pd.DataFrame()

max_days_elapsed = 3

for n_days_offset in range(1, max_days_elapsed + 1):
    df_offset = (
        df.groupby("ID")
        .apply(lambda row: get_days_elapsed_df(row, n_days_offset))
        .reset_index(drop=True)
    )
    df_offset["Offset"] = n_days_offset
    # Remove nan
    df_offset = df_offset.dropna()

    # Add to mix offset
    df_mixed_offset = pd.concat([df_mixed_offset, df_offset])

  .apply(lambda row: get_days_elapsed_df(row, n_days_offset))
  .apply(lambda row: get_days_elapsed_df(row, n_days_offset))
  .apply(lambda row: get_days_elapsed_df(row, n_days_offset))


### Study the shift in AR mean

In [6]:
# Scatter plot with days elapsed on x axis and AR diff on y axis, using px
y_col = "AR mean shift"
# y_col = 'AR skewness shift'
fig = px.scatter(df_mixed_offset, x="Days elapsed", y=y_col, color="ID")
# Set x axis range to 0-100
fig.update_xaxes(range=[0, 200])
fig.update_xaxes(range=[0, 50], title="Number of days elapsed")
# Add more y axi tick vals
fig.update_yaxes(title="Mean airway resistance shift (%)")
# Reduce marker size
fig.update_traces(marker=dict(size=2))
title = "How much does the airway resistance change between different time periods<br>(AR inferred with non smoothed FEV1)?"
fig.update_layout(
    title=title, width=800, height=400, font=dict(size=10), showlegend=False
)
fig.show()
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )

In [7]:
# I want to see the distribution of AR diffs for each day elapsed
from scipy.stats import norm

y_col = "AR mean shift"

fig = make_subplots(rows=6, cols=1, shared_xaxes=True)
# xbin_size = 0.2
xbin_size = 1
# xbin_absolute_span = 50
xbin_absolute_span = 10
xbins = dict(
    start=-xbin_absolute_span - 0.5, end=xbin_absolute_span + 0.5, size=xbin_size
)


def add_plot_for_offset(offset, row):
    df_tmp = df_mixed_offset[df_mixed_offset["Days elapsed"] == offset]
    print(offset, df_tmp.shape)
    fig.add_trace(
        go.Histogram(
            x=df_tmp[y_col],
            xbins=xbins,
            histnorm="probability",
            name=f"{offset} days offset",
        ),
        row=row,
        col=1,
    )
    # Model the data by a normal distribution
    mean = df_tmp[y_col].mean()
    std = df_tmp[y_col].std()
    x = list(range(-10, 11))
    y = norm.pdf(x, loc=mean, scale=std)
    # Add trace
    # fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name=f"Normal distribution for {offset} days offset"), row=row, col=1)


# for offset in range(1, 51):
#     add_plot_for_offset(offset, offset)

add_plot_for_offset(1, 1)
add_plot_for_offset(2, 2)
add_plot_for_offset(8, 3)
add_plot_for_offset(14, 4)
add_plot_for_offset(20, 5)
add_plot_for_offset(50, 6)

# Set y axis range to 0, 0.6
fig.update_yaxes(range=[0, 0.58])
# Set x axis label
fig.update_xaxes(title_text="Shift in mean airway resistance (%)", row=6, col=1)
# fig.update_xaxes(title_text='Change in skewness of airway resistance (%)', row=6, col=1)
# Add x axis tick vals
fig.update_xaxes(tickvals=np.arange(-10, 11, 1), row=6, col=1)
# fig.update_xaxes(tickvals=np.arange(-50, 55, 5), row=6, col=1)
# Update layout
title = f"Shift in airway resistance for different time periods elapsed (bin_width = {xbin_size}%, bin_span = {xbin_absolute_span}, raw FEV1)"
# fig.update_layout(height=2600, width=1000, title=title)
fig.update_layout(height=600, width=1000, title=title)
# Save image
# fig.write_image(f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf")
fig.show()

1 (21855, 5)
2 (22155, 5)
8 (2988, 5)
14 (1773, 5)
20 (556, 5)
50 (84, 5)


### Build CPT assuming only the 1st moment changes (not skewness or spread)


In [8]:
# Building P(AR_next | days_elapsed, AR_prev)
# import src.models.helpers as mh
import numpy as np
import src.modelling_ar.ar as model_ar

AR1 = mh.VariableNode(
    "Airway resistance curr day (%)", 0, 90, 2, prior={"type": "uniform"}
)
AR2 = mh.VariableNode(
    "Airway resistance next day (%)", 0, 90, 2, prior={"type": "uniform"}
)
DE = mh.DiscreteVariableNode("Days elapsed", 1, max_days_elapsed, 1)

In [9]:
# Build the shift distributions
shift_val = np.arange(-5, 6, 1)
shift_p = np.empty((max_days_elapsed, len(shift_val)))
for i, de in enumerate(DE.values):
    print("days elapsed: ", de)
    mean_shift = df_mixed_offset[df_mixed_offset["Days elapsed"] == de]["AR mean shift"]
    # Bin up the mean shift series into bins starting at -5 and ending at 5, with bin size 1
    shift_p[i, :] = np.histogram(
        mean_shift, bins=np.arange(-5.5, 6.5, 1), density=True
    )[0]

days elapsed:  1
days elapsed:  2
days elapsed:  3


In [10]:
model_ar.calc_cpt(AR2, AR1, DE, shift_p, shift_val, debug=True)

Shifting X bin 0 from [0.0;2.0] to [-5.0;-3.0], shift amount=-5%
Shift outside boundaries
Shifting X bin 1 from [2.0;4.0] to [-3.0;-1.0], shift amount=-5%
Shift outside boundaries
Shifting X bin 2 from [4.0;6.0] to [-1.0;1.0], shift amount=-5%
Shift partially outside boundaries, adjusting lower boundary
k=0, bin=[0.0;2.0], p=1.0 (=1.0/1.0)
Shifting X bin 3 from [6.0;8.0] to [1.0;3.0], shift amount=-5%
k=0, bin=[0.0;2.0], p=0.5 (=1.0/2.0)
k=1, bin=[2.0;4.0], p=0.5 (=1.0/2.0)
Shifting X bin 4 from [8.0;10.0] to [3.0;5.0], shift amount=-5%
k=1, bin=[2.0;4.0], p=0.5 (=1.0/2.0)
k=2, bin=[4.0;6.0], p=0.5 (=1.0/2.0)
Shifting X bin 5 from [10.0;12.0] to [5.0;7.0], shift amount=-5%
k=2, bin=[4.0;6.0], p=0.5 (=1.0/2.0)
k=3, bin=[6.0;8.0], p=0.5 (=1.0/2.0)
Shifting X bin 6 from [12.0;14.0] to [7.0;9.0], shift amount=-5%
k=3, bin=[6.0;8.0], p=0.5 (=1.0/2.0)
k=4, bin=[8.0;10.0], p=0.5 (=1.0/2.0)
Shifting X bin 7 from [14.0;16.0] to [9.0;11.0], shift amount=-5%
k=4, bin=[8.0;10.0], p=0.5 (=1.0/2.0)


AssertionError: The sum of the probabilities should be 1, got sum(cpt)=2.9999999999999996])

In [80]:
# For 1 day offset, get distribution of shift
df_tmp = df_mixed_offset[df_mixed_offset["Offset"] == 1]

# Bin up AR mean shift into 1% bins, centered on 0
pd.cut(df_tmp["AR mean shift"], bins=list(range()))

1       -1.341903
2        2.332655
3        0.000000
4       -2.332655
5        1.341903
           ...   
41255    0.000000
41256    0.000000
41257    0.000000
41258    0.000000
41259    0.000000
Name: AR mean shift, Length: 40838, dtype: float64

## Study the shift per bin

In [14]:
df

Unnamed: 0,ID,Day,Airway resistance (%),AR mean
0,101,2019-01-25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",55.623870
1,101,2019-01-26,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.281967
2,101,2019-01-27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",56.614623
3,101,2019-01-28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",56.614623
4,101,2019-01-29,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.281967
...,...,...,...,...
41255,553,2023-10-08,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323
41256,553,2023-10-11,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323
41257,553,2023-11-06,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323
41258,553,2023-11-08,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323


In [218]:
AR.midbins

array([ 1.,  3.,  5.,  7.,  9., 11., 13., 15., 17., 19., 21., 23., 25.,
       27., 29., 31., 33., 35., 37., 39., 41., 43., 45., 47., 49., 51.,
       53., 55., 57., 59., 61., 63., 65., 67., 69., 71., 73., 75., 77.,
       79., 81., 83., 85., 87., 89.])

In [42]:
df_exploded = df1.copy()

for i, row in df_exploded[0:10].iterrows():
    row = pd.DataFrame(data=row[AR.name])
    df_exploded = pd.concat([df_exploded, row], axis=1)

df_exploded

Unnamed: 0,ID,Day,Airway resistance (%),AR mean,Days passed,AR diff,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,101,2019-01-25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",55.623870,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,101,2019-01-26,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.281967,1.0,-1.341903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,101,2019-01-27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",56.614623,1.0,2.332655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101,2019-01-28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",56.614623,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,101,2019-01-29,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.281967,1.0,-2.332655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41255,553,2023-10-08,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323,2.0,0.000000,,,,,,,,,,
41256,553,2023-10-11,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323,3.0,0.000000,,,,,,,,,,
41257,553,2023-11-06,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323,26.0,0.000000,,,,,,,,,,
41258,553,2023-11-08,"[0.0, 0.03160499, 0.10498536, 0.11549848, 0.11...",12.840323,2.0,0.000000,,,,,,,,,,
