In this notebook, I will model the factor that interconnects the airway resistance between two consecutive days.

In [1]:
import src.models.var_builders as var_builders
import src.data.helpers as dh
import src.data.breathe_data as bd
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import src.models.helpers as mh
import src.models.cpts.helpers as cpth
import src.modelling_ar.AR_joint_sampling as model_ar

In [None]:
(
    HFEV1,
    uecFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = var_builders.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars(
    180, 10, "Male", ecfev1_noise_model_cpt_suffix=""
)
DE = mh.DiscreteVariableNode("Days elapsed", 1, 3, 1)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/tristan.trebaol/Desktop/DesktopMacTristan/PhD/Code/phd//src/models/cpts/ecFEV1_0_6_0.05_uecFEV1_0_6_0.05_std_0.23.npy'

# Use independent sampling (archive)

Not correct. Use joint sampling.

In [None]:
df = dh.load_excel(
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_ecFEV1_ecFEF2575.xlsx",
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_FEV1.xlsx",
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_ecFEV1_ecFEF2575_noise.xlsx",
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_ecFEV1_ecFEF2575_ecfev1noisestd0.23.xlsx",
    [AR.name],
    ["Day"],
    # ).drop(columns=["Unnamed: 0", HO2Sat.name, IA.name, HFEV1.name])
).drop(columns=[HO2Sat.name, HFEV1.name])

KeyboardInterrupt: 

In [9]:
def get_days_elapsed_for_offset(df_for_ID, idx_offset=1):
    """
    Links each recording with a previous recording that is idx_offset indices before it
    For idx_offset = 1, consecutive recordings are linked
    """
    df_for_ID = df_for_ID.copy()

    def calc_days_elapsed(curr, prev):
        """
        Takes in dates in format
        """
        if prev == None:
            return None
        return (curr - prev).days
        # return (curr - prev).total_seconds() / 3600 / 24

    df_for_ID["Prev date"] = df_for_ID.shift(idx_offset)["Day"]
    s_days_elapsed = df_for_ID.apply(
        lambda x: calc_days_elapsed(x["Day"], x["Prev date"]), axis=1
    )

    return s_days_elapsed


def get_days_elapsed_and_AR_mean_shift(df_for_ID, idx_offset=1):
    df_for_ID = df_for_ID.copy()
    df_for_ID["Days elapsed"] = get_days_elapsed_for_offset(df_for_ID, idx_offset)

    df_for_ID["AR mean"] = df_for_ID.apply(lambda x: AR.get_mean(x[AR.name]), axis=1)
    # df_for_ID['AR skewness'] = df_for_ID.apply(lambda x: AR.get_skewness(x[AR.name]), axis=1)

    df_for_ID["Prev AR mean"] = df_for_ID.shift(idx_offset)["AR mean"]
    # df_for_ID['Prev AR skewness'] = df_for_ID.shift(idx_offset)['AR skewness']

    df_for_ID["AR mean shift"] = df_for_ID["AR mean"] - df_for_ID["Prev AR mean"]
    # df_for_ID['AR skewness shift'] = df_for_ID['AR skewness'] - df_for_ID['Prev AR skewness']

    return df_for_ID[["ID", "Day", "Days elapsed", "AR mean shift"]]
    # return df_for_ID[['ID', 'Day', 'Days elapsed', 'AR mean shift', 'AR skewness shift']]


def generate_AR_change_sample(df_samples_for_ID, idx_offset=1):
    """
    1. Sample from AR1 and AR2
    2. Compute the change in AR and save it
    3. Repeat 500 times for this ID, then aggregate the results across IDs
    500*300 ID = 150,000 samples, can do more if needed
    """
    df_samples_for_ID = df_samples_for_ID.copy()

    df_samples_for_ID["Days elapsed"] = get_days_elapsed_for_offset(
        df_samples_for_ID, idx_offset
    )

    df_samples_for_ID["Prev AR samples"] = df_samples_for_ID.shift(idx_offset)[
        "AR samples"
    ]

    # Remove entries at the boundaries that have no previous recordings after applying the offset
    df_samples_for_ID = df_samples_for_ID.dropna(subset=["Prev AR samples"])

    df_samples_for_ID["AR samples shift"] = df_samples_for_ID.apply(
        lambda row: row["AR samples"] - row["Prev AR samples"], axis=1
    )
    df_samples_for_ID = df_samples_for_ID.explode("AR samples shift")

    return df_samples_for_ID[["ID", "Day", "Days elapsed", "AR samples shift"]]


# out = df.groupby('ID').apply(get_days_elapsed_and_AR_mean_shift).reset_index(drop=True)

## Compute day elapsed between two consecutive entries

In [10]:
df1 = df.merge(
    df.groupby("ID").apply(get_days_elapsed_and_AR_mean_shift).reset_index(drop=True),
    on=["ID", "Day"],
    how="inner",
)

  df.groupby("ID").apply(get_days_elapsed_and_AR_mean_shift).reset_index(drop=True),


In [11]:
df1.head()

Unnamed: 0,ID,Day,Airway resistance (%),Days elapsed,AR mean shift
0,101,2019-01-25,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",,
1,101,2019-01-26,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
2,101,2019-01-27,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
3,101,2019-01-28,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
4,101,2019-01-29,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0


### Validate the output

In [12]:
df1.describe()

Unnamed: 0,Days elapsed,AR mean shift
count,40908.0,40908.0
mean,4.73702,-0.000209
std,20.357683,0.103493
min,1.0,-0.89112
25%,1.0,0.0
50%,1.0,0.0
75%,3.0,0.0
max,980.0,0.89112


In [13]:
df1[df1["Days elapsed"] > 100]

Unnamed: 0,ID,Day,Airway resistance (%),Days elapsed,AR mean shift
2296,103,2023-09-25,"[1.53542028e-11, 9.70008564e-11, 2.45834002e-1...",192.0,0.000000
2406,104,2020-03-23,"[3.59479655e-11, 2.17398364e-10, 5.50797035e-1...",152.0,0.000000
2476,104,2023-03-20,"[3.10428391e-11, 1.78291168e-10, 4.44103507e-1...",708.0,0.000000
2787,106,2021-11-11,"[7.37139029e-12, 4.21583353e-11, 1.0417696e-10...",117.0,0.000000
2858,106,2023-01-18,"[7.37139029e-12, 4.21583353e-11, 1.0417696e-10...",146.0,-0.301074
...,...,...,...,...,...
38909,507,2022-08-09,"[4.86736748e-14, 2.27725127e-13, 5.13087714e-1...",125.0,0.313681
39702,513,2023-05-26,"[1.10199079e-11, 5.81928243e-11, 1.38740869e-1...",308.0,0.000000
39784,514,2023-10-17,"[1.31990793e-10, 7.46010224e-10, 1.80194708e-0...",110.0,0.000000
40141,518,2022-09-16,"[1.86624098e-12, 1.03606051e-11, 2.46677474e-1...",117.0,0.000000


In [14]:
# Verify that the prev day is indeed correct)
df1.iloc[2295:2297]
# Count number of None
print(df1["Days elapsed"].isna().sum())
# Count number if ids
print(df1["ID"].nunique())
# They should be equal

352
352


In [15]:
df1[df1["AR mean shift"] > 20]
df1.iloc[2537:2539]

Unnamed: 0,ID,Day,Airway resistance (%),Days elapsed,AR mean shift
2537,104,2023-10-31,"[1.50299458e-11, 8.61157534e-11, 2.17305522e-1...",6.0,0.29814
2538,104,2023-11-06,"[3.10428391e-11, 1.78291168e-10, 4.44103507e-1...",6.0,0.199855


### Analyse time between two consecutive entries

In [16]:
vc = df1["Days elapsed"].value_counts()
# 1/3 of the consecutive indices are more than 1 day apart (~10k entries)
# 97% of the entries are less than 5 days apart from the previous entry
# For the CPT, I'll take 1, 2, 3, 4, 5 days apart, then avg 6-50 -> this last up to the max days diff


# Plot the histogram with vc index and vc values
fig = px.bar(x=vc.index, y=vc.values / sum(vc.values) * 100)
# Set x axis label to day to day difference
fig.update_xaxes(
    title_text="Number of days between two consecutive entries",
    range=[0, 30],
    tickvals=list(range(0, 31, 1)),
)
# Set y axis label to percentage
fig.update_yaxes(
    title_text="Percentage of total entries (%)", tickvals=[2] + list(range(0, 55, 5))
)

title = "Distribution of the time between two measurements"
# Set title
fig.update_layout(title=title, width=800, height=350, font=dict(size=10))

fig.show()

# # Save figure
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )

In [17]:
# Study per ID
# Get idx at which the days elapsed is more than 3

df1[df1["Days elapsed"] > 3].index


def get_idx_more_than_n_days_elapsed(df, n=3):
    df = df.reset_index()
    n_days_total = df.shape[0]
    df_tmp = df[df["Days elapsed"] > n]
    if df_tmp.empty:
        return n_days_total, n_days_total
    n_days_consec = df_tmp.index[0]
    return n_days_consec, n_days_total


s_n_entries_to_break = (
    df1.groupby("ID")
    .apply(lambda x: get_idx_more_than_n_days_elapsed(x, 3))
    .sort_values(ascending=False)
)

s_n_entries_to_break

ID
405    (1035, 1035)
101     (592, 1680)
272      (418, 800)
201      (290, 509)
203      (286, 845)
           ...     
213          (1, 1)
225          (1, 1)
354          (1, 1)
516          (1, 1)
355          (1, 1)
Length: 352, dtype: object

In [18]:
df1[df1.ID == "101"]

Unnamed: 0,ID,Day,Airway resistance (%),Days elapsed,AR mean shift
0,101,2019-01-25,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",,
1,101,2019-01-26,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
2,101,2019-01-27,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
3,101,2019-01-28,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
4,101,2019-01-29,"[8.20193838e-11, 4.84585391e-10, 1.1747963e-09...",1.0,0.0
...,...,...,...,...,...
1675,101,2023-11-08,"[7.08391591e-11, 3.9747851e-10, 9.47381625e-10...",1.0,0.0
1676,101,2023-11-09,"[7.08391591e-11, 3.9747851e-10, 9.47381625e-10...",1.0,0.0
1677,101,2023-11-10,"[7.08391591e-11, 3.9747851e-10, 9.47381625e-10...",1.0,0.0
1678,101,2023-11-11,"[7.08391591e-11, 3.9747851e-10, 9.47381625e-10...",1.0,0.0


## Compute AR shift (from sampling - archive)

In [19]:
# Max offset between entries will be equal to the max number of days elapsed in the model, to maximise the contributing data
max_offset = 3

In [20]:
df["AR norm"] = df.apply(lambda row: row[AR.name] / sum(row[AR.name]), axis=1)
df["AR samples"] = df.apply(lambda row: AR.sample(n=50, p=row["AR norm"]), axis=1)

KeyboardInterrupt: 

In [74]:
# Build aggregate df of shift in AR for different offsets

df_mixed_offset = pd.DataFrame()

for n_idx_offset in range(1, max_offset + 1):
    print("offset", n_idx_offset)
    df_offset = (
        df.groupby("ID")
        .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))
        .reset_index(drop=True)
    )
    df_offset["Offset"] = n_idx_offset
    # Remove nan
    df_offset = df_offset.dropna()

    # Add to mix offset
    df_mixed_offset = pd.concat([df_mixed_offset, df_offset])

offset 1


  .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))


offset 2


  .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))


offset 3


  .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))


# Compute AR shift from joint sampling

This code samples AR jointly from two entries separated by a certain offset (from 1 to 3 indices)

It runs for each ID, for each offset, for each datapoint. Since the samples are drawn randomly, I could have done 100 joint samplings (i.e. repeat this whole process 100 times). However, there's enough entries already

The max ecFEV1 entry is also added to df_two_days (if not already there) to have more accurate airway resistance posteriors

Code validated qualitatively and debugged by running the algorithm with mock data, and checking that the results made sense.

In [5]:
# Load data instead of computing it
df_mixed_offset = dh.load_excel(
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/AR_joint_samples_diffs/AR_joint_samples_diff_for_change_factor_ecfev1noiseaddmult.xlsx",
    f"{dh.get_path_to_main()}/ExcelFiles/BR/AR_joint_samples_diffs/AR_joint_samples_diff_for_change_factor_ecfev1noise0.068.xlsx",
    ["AR day 1", "AR day 2"],
    ["Date Recorded"],
)

### Compute joint samples - if not loaded above

In [4]:
# The previous methods was erroneous because it was taking to independent samples from AR day 1 and AR day 2.
# We have to generate joint samples. Given a two days, model,
# 1/ Infer AR1 using the two consecutive days model
# 2/ Sample from AR1
# 3/ Infer HFEV1 and HO2Sat using the sampled AR1 and the observations
# 4/ Infer AR2 using the inferred HFEV1 and HO2Sat
# 5/ Sample from AR2
df_obs = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")

INFO:root:* Checking for same day measurements *


In [47]:
# Used mock values for debugging

df_mock = pd.DataFrame(
    {
        "ID": ["1", "1", "1", "1", "1", "1"],
        "Date Recorded": [1, 2, 3, 4, 5, 6],
        "Height": 180,
        "Age": 35,
        "Sex": "Male",
        "ecFEV1": [1.8, 3.5, 0.1, 1.8, 3.5, 0.1],
        # "ecFEF2575%ecFEV1": [12, 120, 150, 12, 120, 150],
        "ecFEF2575%ecFEV1": [100, 100, 100, 100, 100, 100],
        # "idx ecFEV1 (L)": [1, 3, 0, 1, 3, 0],
        # f"idx ecFEF2575%ecFEV1": [0, 6, 7, 0, 6, 7],
        # f"idx ecFEF25-75 % ecFEV1 (%)": [0, 6, 7, 0, 6, 7],
    }
)
df_mock[f"idx {ecFEV1.name}"] = [
    ecFEV1.get_bin_idx_for_value(x) for x in df_mock["ecFEV1"]
]
df_mock[f"idx {ecFEF2575prctecFEV1.name}"] = [
    ecFEF2575prctecFEV1.get_bin_idx_for_value(x) for x in df_mock["ecFEF2575%ecFEV1"]
]
df_mock["Date Recorded"] = pd.to_datetime(
    df_mock["Date Recorded"], unit="D", origin="2020-01-01"
)
df_mock

Unnamed: 0,ID,Date Recorded,Height,Age,Sex,ecFEV1,ecFEF2575%ecFEV1,idx ecFEV1 (L),idx ecFEF25-75 % ecFEV1 (%)
0,1,2020-01-02,180,35,Male,1.8,100,36,50
1,1,2020-01-03,180,35,Male,3.5,100,70,50
2,1,2020-01-04,180,35,Male,0.1,100,2,50
3,1,2020-01-05,180,35,Male,1.8,100,36,50
4,1,2020-01-06,180,35,Male,3.5,100,70,50
5,1,2020-01-07,180,35,Male,0.1,100,2,50


In [56]:
def get_ar_shift_with_joint_sampling_for_ID(df_for_ID, max_offset=3):
    df_for_ID = df_for_ID.reset_index(drop=True)
    id = df_for_ID.loc[0, "ID"]
    print(f"Processing ID {id} with {len(df_for_ID)} entries")

    res = pd.DataFrame()
    for n_idx_offset in list(np.arange(1, max_offset + 1)):
        print(f"ID: {id}, offset: {n_idx_offset}")

        for i, row in df_for_ID.iterrows():
            # If the offset is too large, break
            if i + n_idx_offset >= len(df_for_ID):
                print(f"ID: {id}, idx: {i}, offset: {n_idx_offset}, breaking")
                break
            # print(f"ID: {id}, idx: {i}, offset: {n_idx_offset}")
            # Find idx of max ecFEV1
            idx_max_ecFEV1 = df_for_ID.sort_values(
                # by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
                by=["ecFEV1", "ecFEF2575%ecFEV1"],
                ascending=False,
            ).index[0]

            # Get two first days as well as idx_max_ecFEV1
            idx_two_days = [i, i + n_idx_offset]

            # If the max ecFEV1 is not in the two days, add it to have more accurate results
            if idx_max_ecFEV1 not in idx_two_days:
                # df_two_days = df_for_ID.iloc[idx_two_days]
                # Check that IQR reduces when adding the max ecFEV1: use ID 134
                df_two_days = df_for_ID.iloc[
                    idx_two_days + [int(idx_max_ecFEV1)]
                ].reset_index(drop=True)
            else:
                df_two_days = df_for_ID.iloc[idx_two_days].reset_index(drop=True)

            day_1 = df_two_days.loc[0, "Date Recorded"]
            day_2 = df_two_days.loc[1, "Date Recorded"]

            ar_shift = model_ar.sample_jointly_from_AR(
                df_two_days, day_1, day_2, debug=False
            )
            days_elapsed = (day_2 - day_1).days

            # Add row to table with format: ID, date, days elapsed, AR shift, offset
            new_row = pd.DataFrame(
                data=[
                    [
                        df_two_days.loc[0, "ID"],
                        df_two_days.loc[0, "Date Recorded"],
                        days_elapsed,
                        ar_shift,
                        n_idx_offset,
                    ]
                ],
                columns=[
                    "ID",
                    "Date Recorded",
                    "Days elapsed",
                    "AR samples shift",
                    "Offset",
                ],
            )
            res = pd.concat([res, new_row])
    return res


# df_for_ID = df_obs[df_obs.ID == "1"].reset_index(drop=True)
# res = get_ar_shift_with_joint_sampling_for_ID(df_mock)

df_in1 = df_obs.iloc[np.r_[1:5]]
res1 = df_in1.groupby("ID").apply(get_ar_shift_with_joint_sampling_for_ID)
df_in2 = df_obs.iloc[np.r_[2400:2405]]
res2 = df_in2.groupby("ID").apply(get_ar_shift_with_joint_sampling_for_ID)

Processing ID 101 with 4 entries
ID: 101, offset: 1
ID: 101, idx: 3, offset: 1, breaking
ID: 101, offset: 2
ID: 101, idx: 2, offset: 2, breaking
ID: 101, offset: 3
ID: 101, idx: 1, offset: 3, breaking
Processing ID 104 with 5 entries
ID: 104, offset: 1
ID: 104, idx: 4, offset: 1, breaking
ID: 104, offset: 2
ID: 104, idx: 3, offset: 2, breaking
ID: 104, offset: 3
ID: 104, idx: 2, offset: 3, breaking


In [None]:
# res.to_excel(
#     f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/AR_samples_shift_for_change_factor_noise.xlsx",
#     index=False,
# )

In [68]:
# df_shift = pd.concat([df_shift, df_shift1], ignore_index=True)
# df_shift.to_excel(
#     f"{dh.get_path_to_main()}/ExcelFiles/BR/AR_joint_samples_diffs/AR_joint_samples_diff_for_change_factor_ecfev1noisestd0.23.xlsx",
#     index=False,
# )

#### Below used for debugging

In [3]:
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb
import src.inference.helpers as ih

height = df_mock.loc[0, "Height"]
age = df_mock.loc[0, "Age"]
sex = df_mock.loc[0, "Sex"]
(
    _,
    inf_alg,
    HFEV1,
    uecFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = mb.o2sat_fev1_fef2575_point_in_time_model_noise_shared_healthy_vars(
    height, age, sex
)

df_res, df_res_conv, _ = slicing.query_forwardly_across_days(
    # slicing.query_back_and_forth_across_days_specific_evidence(
    df_mock.iloc[[0, 1]],
    inf_alg,
    [HFEV1, HO2Sat],
    [AR],
    [ecFEV1.name, ecFEF2575prctecFEV1.name],
    1e-8,
    days_specific_evidence=[],
    # precomp_messages={},
    # precomp_messages=uniform_from_o2_side.copy(),
    debug=False,
)

NameError: name 'df_mock' is not defined

In [15]:
# Plot the AR for df_res_final_epoch1.loc[0, AR.name]
fig = make_subplots(2, 1)
ih.plot_histogram(fig, HFEV1, df_res.loc[0, HFEV1.name], HFEV1.a, HFEV1.b, 1, 1)
ih.plot_histogram(fig, AR, df_res.loc[0, AR.name], AR.a, AR.b, 2, 1)
fig.show()

### Study the shift

In [71]:
# Scatter plot with days elapsed on x axis and AR diff on y axis, using px
y_col = "AR mean shift"
y_col = "AR samples shift"
# y_col = 'AR skewness shift'
fig = px.scatter(df_mixed_offset, x="Days elapsed", y=y_col, color="ID")
# Set x axis range to 0-100
fig.update_xaxes(range=[0, 200])
fig.update_xaxes(range=[0, 50], title="Number of days elapsed")
# Add more y axi tick vals
fig.update_yaxes(title="Mean airway resistance shift (%)")
# Reduce marker size
fig.update_traces(marker=dict(size=2))
title = f"How much does the airway resistance change in n days (1- {max_offset} idx offset)? - samples"
fig.update_layout(
    title=title, width=800, height=400, font=dict(size=10), showlegend=False
)
fig.show()
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )

In [5]:
# Optional: only keep data after trikafta start
df_mixed_offset = bd.add_drug_therapy_to_df(df_mixed_offset)

ERROR:root:ID 175 - ?? Symkevi start date is wrong, removing it because no clue about the true date (maybe 2019?)
ERROR:root:ID 206 - Updating Ivacaftor stop date to not overlap and changing Ivacaftor + Symkevi to Trikafta
INFO:root:ID 358 - Removing duplicated Trikafta entry
INFO:root:ID 358 - Removing duplicated Trikafta entry
INFO:root:ID 426 - Currently no measures for this ID. Removing two entries with NaN drug therapy type
INFO:root:ID 462 - Symkevi and Ivacaftor prescribed, renaming it to Trikafta
INFO:root:ID 405 - Removing two entries with NaN drug therapy type


In [6]:
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")

INFO:root:* Checking for same day measurements *


In [7]:
def get_id_health(df_for_ID):
    return df_for_ID["ecFEV1 % Predicted"].mean()


df_health = (
    df.groupby("ID")
    .apply(get_id_health)
    .reset_index()
    .rename(columns={0: "Mean ecFEV1 % Predicted"})
)
df_mixed_offset = df_mixed_offset.merge(df_health, on="ID", how="inner")

In [8]:
# Skewness seems to be patient specific, because lowering health threshold sometimes leads to higher negative s or positive

df_skew = (
    df_mixed_offset.groupby("ID")
    .apply(lambda df_for_ID: df_for_ID["AR samples shift"].skew())
    .reset_index()
    .rename(columns={0: "Skewness"})
)
df_count = (
    df_mixed_offset.groupby("ID").size().reset_index().rename(columns={0: "Count"})
)
# df_skew = df_skew.merge(df_mixed_offset[['ID', 'AR samples shift']], on='ID', how='inner')
df_skew = df_skew.merge(df_count, on="ID", how="inner")
df_skew = df_skew.sort_values(by=["Count", "Skewness"], ascending=False)
df_skew.head(20)

Unnamed: 0,ID,Skewness,Count
0,101,-0.019107,5034
22,123,-0.042431,3378
119,240,0.007039,3297
32,133,-0.066965,3192
231,405,-0.021315,3099
65,172,0.066147,2940
90,198,-0.038443,2853
94,203,0.015451,2529
144,272,-0.008058,2394
12,113,-0.253492,2136


In [None]:
# I want to see the distribution of AR diffs for each day elapsed
from scipy.stats import norm

y_col = "AR mean shift"
y_col = "AR samples shift"


df_mixed_offset_tmp = df_mixed_offset
# df_mixed_offset_tmp = df_mixed_offset[df_mixed_offset.DrugTherapyType == "Trikafta"]
# df_mixed_offset_tmp = df_mixed_offset[df_mixed_offset['Mean ecFEV1 % Predicted'] > 80]
df_mixed_offset_tmp = df_mixed_offset[
    (df_mixed_offset["Mean ecFEV1 % Predicted"] > 70)
    & (df_mixed_offset.DrugTherapyType == "Trikafta")
]

fig = make_subplots(rows=6, cols=1, shared_xaxes=True)
xbin_size = 0.2
yrange = [0, 0.03]
# xbin_size = 1
# yrange=[0, 0.11]
xbin_absolute_span = 20
# xbin_absolute_span = 10
xbins = dict(
    start=-xbin_absolute_span - 0.5, end=xbin_absolute_span + 0.5, size=xbin_size
)


def add_plot_for_n_days_elapsed(n_days_elapsed, row):
    df_tmp = df_mixed_offset_tmp[df_mixed_offset_tmp["Days elapsed"] == n_days_elapsed]
    print(n_days_elapsed, df_tmp.shape)
    # Compute skewness
    skewness = df_tmp[y_col].skew()
    skewness = df_tmp[y_col].mean() - df_tmp[y_col].median()
    count = df_tmp.shape[0]
    fig.add_trace(
        go.Histogram(
            x=df_tmp[y_col],
            xbins=xbins,
            histnorm="probability",
            name=(
                f"{n_days_elapsed} day elapsed (s={skewness:.2f}, #{count})"
                if n_days_elapsed == 1
                else f"{n_days_elapsed} days elapsed (s={skewness:.2f}, #{count})"
            ),
        ),
        row=row,
        col=1,
    )
    return df_tmp
    # Model the data by a normal distribution
    # mean = df_tmp[y_col].mean()
    # std = df_tmp[y_col].std()
    # x = list(range(-10, 11))
    # y = norm.pdf(x, loc=mean, scale=std)
    # Add trace
    # fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name=f"Normal distribution for {offset} days offset"), row=row, col=1)


# for offset in range(1, 51):
#     add_plot_for_n_days_elapsed(offset, offset)

df_1_DE = add_plot_for_n_days_elapsed(1, 1)
add_plot_for_n_days_elapsed(2, 2)
add_plot_for_n_days_elapsed(3, 3)
add_plot_for_n_days_elapsed(8, 4)
add_plot_for_n_days_elapsed(14, 5)
add_plot_for_n_days_elapsed(20, 6)

# Set y axis range to 0, 0.6
# fig.update_yaxes(range=[0, 0.58])
# Set x axis label
fig.update_xaxes(title_text="AR day 2 - AR day 1", row=6, col=1)
# fig.update_xaxes(title_text='Change in skewness of airway resistance (%)', row=6, col=1)
# Add x axis tick vals
# fig.update_xaxes(tickvals=np.arange(-10, 11, 1), row=6, col=1)
# fig.update_xaxes(tickvals=np.arange(-50, 55, 5), row=6, col=1)
# Update layout
# title = f"Shift in airway resistance for different time periods elapsed (bin_width = {xbin_size}%, bin_span = {xbin_absolute_span})"
title = f"Airway resistance change with n days elapsed<br>(joint samples, 3 days model, obs: ecFEV1 w max val and 0.068noisestd, ecFEF25-75, xbin {xbin_size})"
# fig.update_layout(height=2600, width=1000, title=title)
fig.update_layout(height=600, width=1000, title=title, font=dict(size=14))

# Keep y axis lower
fig.update_yaxes(range=yrange)
#
# Save image
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )
fig.show()

1 (4174, 13)
2 (4044, 13)
3 (3836, 13)
8 (775, 13)
14 (538, 13)
20 (153, 13)


In [13]:
fig.write_image(
    f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
)

## Study the shift for specific AR1 bins

In [6]:
# HP1: The AR change is only a function of the number of days elapsed
# HP2: At the edges, the distribution of AR change is truncated

# First, I want to challenge HP2. Due tot he truncation, probabilities at the edges, get redistributed to the middle, and go close to 0.

df_mixed_offset["Day 1 AR sample bin"] = df_mixed_offset.apply(
    lambda row: AR.get_bin_idx_for_value(row["Day 1 AR sample"]), axis=1
)
df_mixed_offset["Day 2 AR sample bin"] = df_mixed_offset.apply(
    lambda row: AR.get_bin_idx_for_value(row["Day 2 AR sample"]), axis=1
)

In [7]:
df_mixed_offset.describe()

Unnamed: 0,Days elapsed,AR samples shift,Day 1 AR sample,Day 2 AR sample,Offset,Day 1 AR sample bin,Day 2 AR sample bin
count,121759.0,121759.0,121759.0,121759.0,121759.0,121759.0,121759.0
mean,9.007835,-0.086076,35.901397,35.81532,1.994752,17.450349,17.40728
std,29.910754,4.943761,18.194785,18.195543,0.816505,9.093563,9.093596
min,1.0,-42.607858,0.000681,0.006493,1.0,0.0,0.0
25%,2.0,-2.938513,20.66513,20.613373,1.0,10.0,10.0
50%,3.0,-0.052193,35.681626,35.530131,2.0,17.0,17.0
75%,7.0,2.783383,50.231956,50.121747,3.0,25.0,25.0
max,1018.0,44.797406,89.373623,89.857209,3.0,44.0,44.0


In [116]:
xbin_size = 2
# xbin_size = 1
xbin_absolute_span = 20
# xbin_absolute_span = 10
xbins = dict(start=-xbin_absolute_span - 1, end=xbin_absolute_span + 1, size=xbin_size)
print(xbins)

bin_idxs = [0, 1, 5, 15, 25, 35]
de_vals = [1, 6, 20]
fig = make_subplots(
    rows=len(bin_idxs),
    cols=3,
    column_titles=[f"DE={i}" if i < 10 else f"DE={i-3}-{i+3}" for i in de_vals],
    vertical_spacing=0.06,
    horizontal_spacing=0.1,
)

for i, bin_idx in enumerate(bin_idxs):
    for j, de in enumerate(de_vals):
        if bin_idx in [0, 1]:
            merged_bins_idx = [bin_idx]
        else:
            merged_bins_idx = [bin_idx - 1, bin_idx, bin_idx + 1]
        if de >= 20:
            merge_de_vals = list(range(de - 3, de + 4))
            change_from_AR1 = df_mixed_offset[
                (df_mixed_offset["Day 1 AR sample bin"].isin(merged_bins_idx))
                & (df_mixed_offset["Days elapsed"].isin(merge_de_vals))
            ]
        else:
            change_from_AR1 = df_mixed_offset[
                (df_mixed_offset["Day 1 AR sample bin"].isin(merged_bins_idx))
                & (df_mixed_offset["Days elapsed"] == de)
            ]

        fig.add_trace(
            go.Histogram(
                x=change_from_AR1["AR samples shift"],
                # x=change_from_AR1["Day 2 AR sample bin"]
                # - change_from_AR1["Day 1 AR sample bin"],
                histnorm="probability",
                xbins=xbins,
            ),
            row=i + 1,
            col=j + 1,
        )
        fig.data[-1].marker.color = "#636EFA"
        if bin_idx in [0, 1]:
            bins_str = f"{int(AR.bins[merged_bins_idx])}"
        else:
            bins_str = f"{int(AR.bins[merged_bins_idx[0]])}-{int(AR.bins[merged_bins_idx[-1]])+AR.bin_width}"
        fig.update_yaxes(
            title=f"AR1={bins_str}<br>#{change_from_AR1.shape[0]}",
            title_standoff=0.1,
            range=[0, 0.32],
            row=i + 1,
            col=j + 1,
        )
        fig.update_xaxes(
            title="AR2 - AR1 (samples)",
            title_standoff=0,
            range=[-20, 20],
            row=i + 1,
            col=j + 1,
        )

title = "Distribution of AR change given the AR binned sample on day 1 and number of days elapsed"
fig.update_layout(
    title=title, height=600, width=700, font=dict(size=8), showlegend=False
)
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title} merged bins.pdf"
# )
fig.show()

{'start': -21, 'end': 21, 'size': 2}



Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)



In [10]:
dftmp = df_mixed_offset[
    (df_mixed_offset["Days elapsed"] == 1)
    & (df_mixed_offset["Day 1 AR sample bin"] == 0)
]["Day 2 AR sample bin"]

In [12]:
cpttmp = cpth.get_cpt(
    [AR, AR, DE], "_shift_span_[-20;20]_joint_sampling_3_days_model_ecfev1std0.068"
)

In [17]:
fig = make_subplots(rows=3, cols=1)

xbin_size = 2
# xbin_size = 1
xbin_absolute_span = 20
# xbin_absolute_span = 10
xbins = dict(start=-xbin_absolute_span - 1, end=xbin_absolute_span + 1, size=xbin_size)
xbins = dict(start=-2, end=18, size=2)

for de in range(1, 4):
    dftmp = df_mixed_offset[
        (df_mixed_offset["Days elapsed"] == de)
        & (df_mixed_offset["Day 1 AR sample bin"] == 0)
    ]
    fig.add_trace(
        go.Histogram(
            x=dftmp["Day 2 AR sample bin"],
            # x=dftmp["AR samples shift"],
            histnorm="probability",
            xbins=xbins,
            legendgroup="AR2 - AR1",
            name="AR2 - AR1",
            showlegend=True,
        ),
        row=de,
        col=1,
    )
    fig.data[-1].marker.color = "#636EFA"
    # Add histogram of cpt values
    fig.add_trace(
        go.Histogram(
            x=AR.get_distribution_as_sample(cpttmp[:, 0, 0]) / 2,
            histnorm="probability",
            legendgroup="P(AR2 | AR1)",
            name="P(AR2 | AR1)",
            showlegend=True,
        ),
        row=de,
        col=1,
    )
    fig.data[-1].marker.color = "#EF553B"
    fig.update_yaxes(
        title=f"DE={de}",
        title_standoff=0.1,
        row=de,
        col=1,
    )
    fig.update_xaxes(
        title="Airway resistance on day 2",
        title_standoff=0.1,
        range=[-5, 20],
        row=de,
        col=1,
    )

title = "P(AR2 | AR1) from CPT (red)<br>AR2 given AR1=[0; 2) (blue)"
fig.update_layout(
    title=title, height=500, width=500, font=dict(size=8), showlegend=False
)
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )
fig.show()

Sampling 99 values from Airway resistance (%)
Sampling 99 values from Airway resistance (%)
Sampling 99 values from Airway resistance (%)


<!-- ### Build CPT -->


In [3]:
# Building P(AR_next | days_elapsed, AR_prev)
# import src.models.helpers as mh
import numpy as np

max_offset = 3
AR1 = mh.VariableNode(
    "Airway resistance day 1 (%)", 0, 90, 2, prior={"type": "uniform"}
)
AR2 = mh.VariableNode(
    "Airway resistance day 2 (%)", 0, 90, 2, prior={"type": "uniform"}
)
# Set the max number of days elapsed to max offset in order to have as much data as possible per number of days elapsed
DE = mh.DiscreteVariableNode("Days elapsed", 1, max_offset, 1)

In [277]:
def calc_cpt(
    AR_next_day: mh.VariableNode,
    AR_curr_day: mh.VariableNode,
    DE: mh.DiscreteVariableNode,
    shift_p,
    shift_val,
    tol=1e-6,
    debug=False,
):
    cpt = np.zeros([AR_next_day.card, AR_curr_day.card, DE.card])

    for i, de in enumerate(DE.values):
        # For each shift value, get the mapping AR -> AR_next_day for each shifted bin in AR
        # Weight the result by the probability of that shift
        # Add it to the CPT for this day
        for s in range(len(shift_val)):
            if debug:
                print(f"Computing CPT for days elapsed={de}, shift={shift_val[s]}")
            # Summing over the columns of the cpt returned by calc_cpt_X_plus_k should give 1, except at the boundaries
            # Since we weight the 1s by a probability of shift that also sums to one, the sum of the cpt should be 1 (except at the boundaries, see below)
            cpt_contrib = calc_cpt_X_plus_k(
                AR_curr_day,
                AR_next_day,
                shift_val[s],
                tol=tol,
                debug=debug,
            )
            # If has nan
            if (np.isnan(cpt_contrib) == True).any():
                print("issue with cpt contribution")
                print(cpt_contrib)
            print(
                f"s_val: {shift_val[s]}, i {i}, old cpt state {cpt[:, 1, i]}, shift p {shift_p[i, s]}, contrib {cpt_contrib[:, 1]}"
            )
            cpt[:, :, i] += shift_p[i, s] * cpt_contrib
            print(f"new cpt: {cpt[:, 1, i]}")

        print(f"before normalisation {cpt[:, 1, i]}")
        # Normalise the CPT along axis 0 (AR_next_day)
        total = np.sum(cpt[:, :, i], axis=0)
        if (np.isnan(total) == False).all():
            print(cpt[:, :, i])
        print(
            f"Sum along axis 0 before normalisation: np.sum(cpt[:, :, {i}], axis=0) = {total}"
        )
        cpt[:, :, i] /= total
        print(f"final cpt: {cpt[:, 1, i]}")
        # print(f"final cpt: {cpt[1, :, i]}")

        # Check that the sum of probabilities is 1
        total = np.sum(cpt[:, :, i], axis=0)
        assert (
            abs(total - 1) < tol
        ).all(), f"The sum of the probabilities should be 1, got sum(cpt)={total}])"
    return cpt


def calc_cpt_X_plus_k(
    Z: mh.VariableNode,
    X: mh.VariableNode,
    k,
    tol=1e-6,
    debug=False,
):
    """
    Computes the CPT for P(Z|X, Y), when Z is shifted from X by a constant value k
    Z = X + k
    X: parent variable
    Z: child variable
    k: constant, positive or negative

    We compute the CPT with a shift and conquer method:
    1) Start with a CPT zeroed out probabilities
    2) Shift all X bin intervals by the drop amount
    3) For each shifted X bin, spread the X bin evenly onto the overlapping Z bins
    4) Normalise the CPT

    This allows the function to be agnostic of how X and Z are binned.

    - What happens when the function is shifted outside the boundary? -> Raise an error as it shouldn't happen by how the model is built
    """
    nbinsX = X.card
    nbinsZ = Z.card

    cpt = np.zeros([nbinsZ, nbinsX])

    for i in range(nbinsX):
        shifted_X_bin_low = X.bins[i] + k
        shifted_X_bin_up = (X.bins[i] + X.bin_width) + k
        if debug:
            print(
                f"Shifting X bin {i} from [{X.bins[i]};{X.bins[i]+X.bin_width}) to [{shifted_X_bin_low};{shifted_X_bin_up}), shift amount={k}%"
            )
        # If the shifted bin is outside the boundaries of Z, continue:
        if (
            shifted_X_bin_low >= (Z.bins[-1] + Z.bin_width)
            or shifted_X_bin_up <= Z.bins[0]
        ):
            if debug:
                print(
                    f"Shift outside boundaries of Z.bins=[{Z.bins[0]};{Z.bins[-1] + Z.bin_width})"
                )
            continue
        # Handle the case where the shifted bin is partially outside the boundaries
        # Adjust the boundaries of the shifted bin to be within the boundaries of Z
        if shifted_X_bin_low < Z.bins[0]:
            if debug:
                print("Shift partially outside boundaries, adjusting lower boundary")
            shifted_X_bin_low = Z.bins[0]
        if shifted_X_bin_up > Z.bins[-1] + Z.bin_width:
            if debug:
                print("Shift partially outside boundaries, adjusting upper boundary")
            shifted_X_bin_up = Z.bins[-1] + Z.bin_width
        if shifted_X_bin_low in [0, 10]:
            print(
                f"proportion bin width = {(shifted_X_bin_up - shifted_X_bin_low)} / {X.bin_width}"
            )
        proportion_bin_left = (shifted_X_bin_up - shifted_X_bin_low) / X.bin_width

        bin_contribution = mh.get_bin_contribution_to_cpt(
            [shifted_X_bin_low, shifted_X_bin_up], Z.bins, debug=debug
        )
        if debug:
            print(f"i={i}/{nbinsX-1}, z={bin_contribution}")
        bin_contribution = bin_contribution * proportion_bin_left
        if i == 1:
            print(
                f"Shifting X bin {i} from [{X.bins[i]};{X.bins[i]+X.bin_width}) to [{shifted_X_bin_low};{shifted_X_bin_up}), shift amount={k}%"
            )
            print(
                f"Shift {k}, contribution to span: {shifted_X_bin_low};{shifted_X_bin_up}, {bin_contribution}"
            )
        # There is just one bin contribution to the CPT
        cpt[:, i] += bin_contribution
    print(f"CPT progress from bin 1 of X ({X.get_bins_arr()[1]}) {cpt[:, 1]}")

    sum_over_x = np.sum(cpt, axis=0)
    if debug:
        print(f"Results before normalisation sum(cpt)={sum_over_x}")

    # IMPORTANT: there is no boundary check in this function. This allows to have no conditional probability distribution for certain bins of AR2, which are not compatible with the amount of shift applied to AR1
    # Therefore either the sum of probabilities is 0 or 1 for each bin of AR2, summed over AR1
    # for i in range(nbinsZ):
    #     if sum_over_x[i] == 0:
    #         if debug:
    #             print(f"Sum of probabilities is 0 for bin {i}, skipping normalisation")
    #         continue
    #     cpt[i, :] /= sum_over_x[i]
    #     assert (
    #         abs(sum_over_x[i] - 1) < tol
    #     ).all(), f"The sum of the probabilities should be 1, got sum(cpt[i, :])={sum_over_x[i]}])"

    return cpt

In [300]:
# Build the shift distributions
# size = 20
size = 0.2
shift_min = -20
shift_max = 20
shift_val = np.arange(shift_min, shift_max + size / 2, size)
shift_p = np.zeros((max_offset, len(shift_val)))

# Check identity matrix if shift is 0
# cpt_point_mass = np.zeros(len(shift_val))
# cpt_point_mass[100] = 1
print(np.arange(shift_min - size / 2, shift_max + size / 2 + size, size))

for i, de in enumerate(DE.values):
    print("days elapsed: ", de)
    shift = df_mixed_offset[df_mixed_offset["Days elapsed"] == de]["AR samples shift"]

    # Bin up the mean shift series into bins starting at -5 and ending at 5, with bin size 1
    shift_p[i, :] = np.histogram(
        shift,
        bins=np.arange(shift_min - size / 2, shift_max + size / 2 + size - 1e-6, size),
        density=True,
    )[0]
    # shift_p[i, :] = cpt_point_mass

print("shift probability shape: ", shift_p.shape)
print("shift_val: ", shift_val)

[-20.1 -19.9 -19.7 -19.5 -19.3 -19.1 -18.9 -18.7 -18.5 -18.3 -18.1 -17.9
 -17.7 -17.5 -17.3 -17.1 -16.9 -16.7 -16.5 -16.3 -16.1 -15.9 -15.7 -15.5
 -15.3 -15.1 -14.9 -14.7 -14.5 -14.3 -14.1 -13.9 -13.7 -13.5 -13.3 -13.1
 -12.9 -12.7 -12.5 -12.3 -12.1 -11.9 -11.7 -11.5 -11.3 -11.1 -10.9 -10.7
 -10.5 -10.3 -10.1  -9.9  -9.7  -9.5  -9.3  -9.1  -8.9  -8.7  -8.5  -8.3
  -8.1  -7.9  -7.7  -7.5  -7.3  -7.1  -6.9  -6.7  -6.5  -6.3  -6.1  -5.9
  -5.7  -5.5  -5.3  -5.1  -4.9  -4.7  -4.5  -4.3  -4.1  -3.9  -3.7  -3.5
  -3.3  -3.1  -2.9  -2.7  -2.5  -2.3  -2.1  -1.9  -1.7  -1.5  -1.3  -1.1
  -0.9  -0.7  -0.5  -0.3  -0.1   0.1   0.3   0.5   0.7   0.9   1.1   1.3
   1.5   1.7   1.9   2.1   2.3   2.5   2.7   2.9   3.1   3.3   3.5   3.7
   3.9   4.1   4.3   4.5   4.7   4.9   5.1   5.3   5.5   5.7   5.9   6.1
   6.3   6.5   6.7   6.9   7.1   7.3   7.5   7.7   7.9   8.1   8.3   8.5
   8.7   8.9   9.1   9.3   9.5   9.7   9.9  10.1  10.3  10.5  10.7  10.9
  11.1  11.3  11.5  11.7  11.9  12.1  12.3  12.5  1

In [307]:
# Plot histogram
fig = px.bar(x=shift_val, y=shift_p[0, :])
fig.show()
fig = px.bar(x=shift_val, y=shift_p[1, :])
fig.show()
fig.show()

In [302]:
cpt = calc_cpt(AR2, AR1, DE, shift_p, shift_val, debug=False)

proportion bin width = 2.0 / 2
proportion bin width = 2.0 / 2
CPT progress from bin 1 of X ([2. 4.]) [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
s_val: -20.0, i 0, old cpt state [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shift p 0.0, contrib [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
new cpt: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
proportion bin width = 0.1999999999999993 / 2
CPT progress from bin 1 of X ([2. 4.]) [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
s_val: -19.8, i 0, old cpt state [0. 0. 0. 0. 0. 0. 0. 0. 0.

## Test uniform shift distribution to validate the algorithm that creates the CPT

In [None]:
# Conclusion: With the completely uniform CPT, each AR1 bin contributes equally to all AR2 bins, hence the output is uniform over AR2

p_ar2 = np.ones(AR2.card) / AR2.card
# Repeat p_ar2 on each column AR1.card times
cpt_uni = np.repeat(p_ar2[:, np.newaxis], AR1.card, axis=1)
# Repeat this cpt DE.card times
cpt_uni = np.repeat(cpt_uni[:, :, np.newaxis], DE.card, axis=2)

In [None]:
# P of AR2 is uniform over 5 bins centered on the bin of AR1
cpt_ar2_ar1 = np.zeros([AR2.card, AR1.card])
# Add padding left and right
padding = 15
for i in range(AR1.card):
    if i <= padding:
        low = 0
    else:
        low = i - padding
    if AR2.card - 1 <= (i + padding):
        up = AR2.card - 1
    else:
        up = i + padding

    idx_range = list(range(low, up + 1))
    cpt_ar2_ar1[idx_range, i] = 1 / len(idx_range)

cpt_thick_uni = np.repeat(cpt_ar2_ar1[:, :, np.newaxis], DE.card, axis=2)
cpt_thick_uni.shape

## Plot CPT relationships

In [182]:
# Load AR data
df = dh.load_excel(
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_ecFEV1_ecFEF2575_ecfev1noisestd0.23.xlsx",
    f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_using_two_days_model_ecFEV1_ecFEF2575_ecfev1noiseaddmult.xlsx",
    [AR.name],
    ["Day"],
).drop(columns=[HO2Sat.name, HFEV1.name])

In [183]:
import src.inference.helpers as ih

# cpt = cpt_uni


def compare_ARs_for_one_entry(idx):
    title = f"P(AR_next | AR_prev, days_elapsed) for diffent days elapsed (idx {idx}) - samples"
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True)
    ih.plot_histogram(
        fig, AR1, df.loc[idx, AR.name], AR1.a, AR1.b, 1, 1, name="AR day 1", annot=False
    )
    AR_next_day_p = np.matmul(cpt[:, :, 0], df.loc[idx, AR.name])
    ih.plot_histogram(
        fig,
        AR2,
        AR_next_day_p,
        AR2.a,
        AR2.b,
        1,
        1,
        name="AR day 2, days elapsed=1",
        annot=False,
    )
    # AR_next_day_p = np.matmul(cpt[:, :, 2], df.loc[idx, AR.name])
    # ih.plot_histogram(
    #     fig,
    #     AR2,
    #     AR_next_day_p,
    #     AR2.a,
    #     AR2.b,
    #     1,
    #     1,
    #     name="AR day 2, days elapsed=3",
    #     annot=False,
    # )
    # Add x axis title
    fig.update_xaxes(title_text="Airway resistance (%)", row=1, col=1)
    # Reduce figure height
    fig.update_layout(height=200, width=1000, title=title, font=dict(size=10))
    # remove marings
    fig.update_layout(margin=dict(l=2, r=2, t=30, b=2))
    fig.show()
    # Save figure
    # fig.write_image(
    #     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
    # )


compare_ARs_for_one_entry(2)
# compare_ARs_for_one_entry(21000)
compare_ARs_for_one_entry(1000)
compare_ARs_for_one_entry(4400)

In [19]:
cpt = cpth.get_cpt([AR, AR, DE], "_shift_span_[-20;20]_joint_sampling_3_days_model")
shift_min = -20
shift_max = 20

In [13]:
de = 1
fig, title = cpth.plot_2d_cpt(
    cpt[:, :, de - 1], AR, AR, 3000, invert=False, p_range=[0, 0.35]
)
# Update font
title = title + f", {de} days elapsed, shift span [{shift_min};{shift_max}]"
title = (
    "CPT - P(AR day 2|AR day 1)"
    + f", {de} days elapsed, shift span [{shift_min};{shift_max}]<br>(joint samples, 3 days model, obs: ecFEV1 notsurewhichnoise, ecFEF25-75)"
)
# Set x axis
fig.update_xaxes(title_text=f"Airway resistance day 1 (%)", row=AR.card, col=1)
fig.update_layout(font=dict(size=9), title=title)
# fig.show()

# Save figure
fig.write_image(
    f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
)

## Save CPT

In [20]:
# Save cpt
cpth.save_cpt(
    [AR, AR, DE],
    cptnarrower,
    suffix=f"_shift_span_[{shift_min};{shift_max}]_joint_sampling_3_days_model_ecfev1std0.068_mult4x",
)

In [3]:
cpttmp = cpth.get_cpt(
    [AR, AR, DE], "_shift_span_[-20;20]_joint_sampling_3_days_model_ecfev1std0.068"
)

In [16]:
cptnarrower = np.zeros((AR.card, AR.card, DE.card))
for i in range(AR.card):
    for j in range(DE.card):
        cptnarrower[:, i, j] = (
            cpttmp[:, i, j] * cpttmp[:, i, j] * cpttmp[:, i, j] * cpttmp[:, i, j]
        )
        cptnarrower[:, i, j] /= np.sum(cptnarrower[:, i, j])

In [17]:
import src.inference.helpers as ih

fig = make_subplots(rows=1, cols=1, shared_xaxes=True)
ih.plot_histogram(
    fig, AR, cptnarrower[:, 30, 0], AR.a, AR.b, 1, 1, name="AR day 1", annot=False
)
fig.show()
fig = make_subplots(rows=1, cols=1, shared_xaxes=True)
ih.plot_histogram(
    fig, AR, cpttmp[:, 30, 0], AR.a, AR.b, 1, 1, name="AR day 1", annot=False
)
fig.show()

## Study the shift per bin

In [None]:
df

In [None]:
AR.midbins

In [None]:
df_exploded = df1.copy()

for i, row in df_exploded[0:10].iterrows():
    row = pd.DataFrame(data=row[AR.name])
    df_exploded = pd.concat([df_exploded, row], axis=1)

df_exploded