In this notebook, I will model the factor that interconnects the airway resistance between two consecutive days.

In [1]:
import src.models.var_builders as var_builders
import src.data.helpers as dh
import src.data.breathe_data as bd
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import src.models.helpers as mh
import src.models.cpts.helpers as cpth
import src.modelling_ar.ar as model_ar
import src.inference.long_inf_slicing as slicing
import src.models.builders as mb

In [69]:
(
    HFEV1,
    ecFEV1,
    AR,
    HO2Sat,
    O2SatFFA,
    IA,
    UO2Sat,
    O2Sat,
    ecFEF2575prctecFEV1,
) = var_builders.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(
    180, 10, "Male"
)

In [70]:
df = dh.load_excel(
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_ecFEV1_ecFEF2575.xlsx",
    # f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_O2Sat_FEV1.xlsx",
    f"{dh.get_path_to_main()}/ExcelFiles/BR/Refining_F3/infer_AR_with_two_days_model_ecFEV1_ecFEF2575.xlsx",
    [AR.name],
    ["Day"],
    # ).drop(columns=["Unnamed: 0", HO2Sat.name, IA.name, HFEV1.name])
).drop(columns=[HO2Sat.name, IA.name, HFEV1.name])

In [71]:
def get_days_elapsed_for_offset(df_for_ID, idx_offset=1):
    """
    Links each recording with a previous recording that is idx_offset indices before it
    For idx_offset = 1, consecutive recordings are linked
    """
    df_for_ID = df_for_ID.copy()

    def calc_days_elapsed(curr, prev):
        """
        Takes in dates in format
        """
        if prev == None:
            return None
        return (curr - prev).days
        # return (curr - prev).total_seconds() / 3600 / 24

    df_for_ID["Prev date"] = df_for_ID.shift(idx_offset)["Day"]
    s_days_elapsed = df_for_ID.apply(
        lambda x: calc_days_elapsed(x["Day"], x["Prev date"]), axis=1
    )

    return s_days_elapsed


def get_days_elapsed_and_AR_mean_shift(df_for_ID, idx_offset=1):
    df_for_ID = df_for_ID.copy()
    df_for_ID["Days elapsed"] = get_days_elapsed_for_offset(df_for_ID, idx_offset)

    df_for_ID["AR mean"] = df_for_ID.apply(lambda x: AR.get_mean(x[AR.name]), axis=1)
    # df_for_ID['AR skewness'] = df_for_ID.apply(lambda x: AR.get_skewness(x[AR.name]), axis=1)

    df_for_ID["Prev AR mean"] = df_for_ID.shift(idx_offset)["AR mean"]
    # df_for_ID['Prev AR skewness'] = df_for_ID.shift(idx_offset)['AR skewness']

    df_for_ID["AR mean shift"] = df_for_ID["AR mean"] - df_for_ID["Prev AR mean"]
    # df_for_ID['AR skewness shift'] = df_for_ID['AR skewness'] - df_for_ID['Prev AR skewness']

    return df_for_ID[["ID", "Day", "Days elapsed", "AR mean shift"]]
    # return df_for_ID[['ID', 'Day', 'Days elapsed', 'AR mean shift', 'AR skewness shift']]


def generate_AR_change_sample(df_samples_for_ID, idx_offset=1):
    """
    1. Sample from AR1 and AR2
    2. Compute the change in AR and save it
    3. Repeat 500 times for this ID, then aggregate the results across IDs
    500*300 ID = 150,000 samples, can do more if needed
    """
    df_samples_for_ID = df_samples_for_ID.copy()

    df_samples_for_ID["Days elapsed"] = get_days_elapsed_for_offset(
        df_samples_for_ID, idx_offset
    )

    df_samples_for_ID["Prev AR samples"] = df_samples_for_ID.shift(idx_offset)[
        "AR samples"
    ]

    # Remove entries at the boundaries that have no previous recordings after applying the offset
    df_samples_for_ID = df_samples_for_ID.dropna(subset=["Prev AR samples"])

    df_samples_for_ID["AR samples shift"] = df_samples_for_ID.apply(
        lambda row: row["AR samples"] - row["Prev AR samples"], axis=1
    )
    df_samples_for_ID = df_samples_for_ID.explode("AR samples shift")

    return df_samples_for_ID[["ID", "Day", "Days elapsed", "AR samples shift"]]


# out = df.groupby('ID').apply(get_days_elapsed_and_AR_mean_shift).reset_index(drop=True)

## Compute day elapsed between two consecutive entries

In [None]:
df1 = df.merge(
    df.groupby("ID").apply(get_days_elapsed_and_AR_mean_shift).reset_index(drop=True),
    on=["ID", "Day"],
    how="inner",
)

In [None]:
df1.head()

### Validate the output

In [None]:
df1.describe()

In [None]:
df1[df1["Days elapsed"] > 100]

In [None]:
# Verify that the prev day is indeed correct)
df1.iloc[2295:2297]
# Count number of None
print(df1["Days elapsed"].isna().sum())
# Count number if ids
print(df1["ID"].nunique())
# They should be equal

In [None]:
df1[df1["AR mean shift"] > 20]
df1.iloc[2537:2539]

### Analyse time between two consecutive entries

In [None]:
vc = df1["Days elapsed"].value_counts()
# 1/3 of the consecutive indices are more than 1 day apart (~10k entries)
# 97% of the entries are less than 5 days apart from the previous entry
# For the CPT, I'll take 1, 2, 3, 4, 5 days apart, then avg 6-50 -> this last up to the max days diff


# Plot the histogram with vc index and vc values
fig = px.bar(x=vc.index, y=vc.values / sum(vc.values) * 100)
# Set x axis label to day to day difference
fig.update_xaxes(
    title_text="Number of days between two consecutive entries",
    range=[0, 30],
    tickvals=list(range(0, 31, 1)),
)
# Set y axis label to percentage
fig.update_yaxes(
    title_text="Percentage of total entries (%)", tickvals=[2] + list(range(0, 55, 5))
)

title = "Distribution of the time between two measurements"
# Set title
fig.update_layout(title=title, width=800, height=350, font=dict(size=10))

fig.show()

# # Save figure
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )

In [None]:
# Study per ID
# Get idx at which the days elapsed is more than 3

df1[df1["Days elapsed"] > 3].index


def get_idx_more_than_n_days_elapsed(df, n=3):
    df = df.reset_index()
    n_days_total = df.shape[0]
    df_tmp = df[df["Days elapsed"] > n]
    if df_tmp.empty:
        return n_days_total, n_days_total
    n_days_consec = df_tmp.index[0]
    return n_days_consec, n_days_total


s_n_entries_to_break = (
    df1.groupby("ID")
    .apply(lambda x: get_idx_more_than_n_days_elapsed(x, 3))
    .sort_values(ascending=False)
)

s_n_entries_to_break

In [None]:
df1[df1.ID == "101"]

## Compute shift in airway resistance

In [72]:
# Max offset between entries will be equal to the max number of days elapsed in the model, to maximise the contributing data
max_offset = 3

In [73]:
df["AR norm"] = df.apply(lambda row: row[AR.name] / sum(row[AR.name]), axis=1)
df["AR samples"] = df.apply(lambda row: AR.sample(n=50, p=row["AR norm"]), axis=1)

In [74]:
# Build aggregate df of shift in AR for different offsets

df_mixed_offset = pd.DataFrame()

for n_idx_offset in range(1, max_offset + 1):
    print("offset", n_idx_offset)
    df_offset = (
        df.groupby("ID")
        .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))
        .reset_index(drop=True)
    )
    df_offset["Offset"] = n_idx_offset
    # Remove nan
    df_offset = df_offset.dropna()

    # Add to mix offset
    df_mixed_offset = pd.concat([df_mixed_offset, df_offset])

offset 1


  .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))


offset 2


  .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))


offset 3


  .apply(lambda df_for_ID: generate_AR_change_sample(df_for_ID, n_idx_offset))


In [75]:
df_mixed_offset

Unnamed: 0,ID,Day,Days elapsed,AR samples shift,Offset
0,101,2019-01-26,1.0,-1.525903,1
1,101,2019-01-26,1.0,6.065192,1
2,101,2019-01-26,1.0,2.344994,1
3,101,2019-01-26,1.0,-2.113095,1
4,101,2019-01-26,1.0,-6.67779,1
...,...,...,...,...,...
2013445,553,2023-11-10,30.0,-5.272935,3
2013446,553,2023-11-10,30.0,-0.47702,3
2013447,553,2023-11-10,30.0,-3.613054,3
2013448,553,2023-11-10,30.0,-2.605032,3


## Compute the shift in airway resistance 

In [2]:
# The previous methods was erroneous because it was taking to independent samples from AR day 1 and AR day 2.
# We have to generate joint samples. Given a two days, model,
# 1/ Infer AR1 using the two consecutive days model
# 2/ Sample from AR1
# 3/ Infer HFEV1 and HO2Sat using the sampled AR1 and the observations
# 4/ Infer AR2 using the inferred HFEV1 and HO2Sat
# 5/ Sample from AR2
df_obs = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")

INFO:root:* Checking for same day measurements *


In [47]:
def sample_jointly_from_AR(df_two_days, date_1, date_2):
    df_two_days = df_two_days.copy().reset_index(drop=True)
    height = df_two_days.loc[0, "Height"]
    age = df_two_days.loc[0, "Age"]
    sex = df_two_days.loc[0, "Sex"]
    id = df_two_days.loc[0, "ID"]
    (
        model,
        inf_alg,
        HFEV1,
        ecFEV1,
        AR,
        HO2Sat,
        O2SatFFA,
        IA,
        UO2Sat,
        O2Sat,
        ecFEF2575prctecFEV1,
    ) = mb.o2sat_fev1_fef2575_point_in_time_model_shared_healthy_vars(height, age, sex)

    # Set variables parametrisation
    key_hfev1 = f"['{ecFEV1.name}', '{HFEV1.name}', '{AR.name}'] -> {HFEV1.name}"
    key_ho2sat = f"['{O2SatFFA.name}', '{HO2Sat.name}', '{AR.name}'] -> {HO2Sat.name}"
    HFEV1.set_factor_node_key(key_hfev1)
    HO2Sat.set_factor_node_key(key_ho2sat)

    # 1/ Infer AR1 using the two consecutive days model
    df_res_final_epoch1, _, _ = slicing.query_back_and_forth_across_days_joint_samples(
        df_two_days,
        inf_alg,
        [HFEV1, HO2Sat],
        [AR],
        [ecFEV1.name, ecFEF2575prctecFEV1.name],
        1e-8,
        days_specific_evidence=[],
        max_passes=5,
    )

    df_res_final_epoch1.set_index("Day", inplace=True)

    # 2/ Sample from AR1
    ar_day1_dist = df_res_final_epoch1.loc[date_1, AR.name]
    [ar_day1_sample] = AR.sample(n=1, p=ar_day1_dist)
    idx_ar = AR.get_bin_for_value(ar_day1_sample)[1]

    v = np.zeros(len(df_two_days)) - 1
    v[0] = ar_day1_sample
    df_two_days["AR"] = v

    v = np.zeros(len(df_two_days)) + 10000
    v[0] = idx_ar
    v = v.astype(int)
    df_two_days[f"idx {AR.name}"] = v

    # 3/ Infer AR2 using with sampled AR1 as evidence specific to day 1
    days_specific_evidence = [(AR.name, [date_1])]

    df_res_final_epoch2, _, _ = slicing.query_back_and_forth_across_days_joint_samples(
        df_two_days,
        inf_alg,
        [HFEV1, HO2Sat],
        [AR],
        [ecFEV1.name, ecFEF2575prctecFEV1.name],
        1e-8,
        days_specific_evidence,
        max_passes=5,
        debug=False,
    )
    df_res_final_epoch2.set_index("Day", inplace=True)
    ar_day2_dist = df_res_final_epoch2.loc[date_2, AR.name]

    # Print the interquartile ranges of the AR distributions
    # ar1_1 = AR.get_val_at_quantile(ar_day1_dist, 0.25)
    # ar1_2 = AR.get_val_at_quantile(ar_day1_dist, 0.75)
    # print(f"AR1: {ar1_2} - {ar1_1} = {ar1_2 - ar1_1}")

    # ar2_1 = AR.get_val_at_quantile(ar_day2_dist, 0.25)
    # ar2_2 = AR.get_val_at_quantile(ar_day2_dist, 0.75)
    # print(f"AR2: {ar2_2} - {ar2_1} = {ar2_2 - ar2_1}")

    # 5/ Sample from AR2
    [ar_day2_sample] = AR.sample(n=1, p=ar_day2_dist)

    ar_shift = ar_day2_sample - ar_day1_sample

    return ar_shift

In [48]:
idx_max_ecFEV1

1576

In [61]:
res = pd.DataFrame()

max_offset = 3

for id in ["527"]:  # df_obs["ID"][0:1].unique():
    print(f"ID: {id}, N: {len(df_obs[df_obs['ID'] == id])}")
    df_for_ID = df_obs[df_obs["ID"] == id].reset_index(drop=True)

    for n_idx_offset in list(np.arange(1, max_offset + 1)):
        print(f"ID: {id}, offset: {n_idx_offset}")

        for i, row in df_for_ID.iterrows():
            if i + n_idx_offset >= len(df_for_ID):
                print(f"ID: {id}, idx: {i}, offset: {n_idx_offset}, breaking")
                break
            # print(f"ID: {id}, idx: {i}, offset: {n_idx_offset}")
            # Find idx of max ecFEV1
            idx_max_ecFEV1 = idx_max_FEV1 = df_for_ID.sort_values(
                by=["ecFEV1", "ecFEF2575", "O2 Saturation"], ascending=False
            ).index[0]
            # Get two first days as well as idx_max_ecFEV1
            idx_two_days = [i, i + n_idx_offset]
            if idx_max_ecFEV1 in idx_two_days:
                print("** Max ecFEV1 in the two days, skipping **")
                continue

            # df_two_days = df_for_ID.iloc[idx_two_days]
            # Check that IQR reduces when adding the max ecFEV1: use ID 134
            df_two_days = df_for_ID.iloc[idx_two_days + [idx_max_ecFEV1]].reset_index(
                drop=True
            )

            day_1 = df_two_days.loc[0, "Date Recorded"]
            day_1_str = day_1.strftime("%Y-%m-%d")
            day_2 = df_two_days.loc[1, "Date Recorded"]
            day_2_str = day_2.strftime("%Y-%m-%d")

            ar_shift = sample_jointly_from_AR(df_two_days, day_1_str, day_2_str)
            days_elapsed = (day_2 - day_1).days

            # Add row to table with format: ID, date, days elapsed, AR shift, offset
            new_row = pd.DataFrame(
                data=[
                    [
                        df_two_days.loc[0, "ID"],
                        df_two_days.loc[0, "Date Recorded"],
                        days_elapsed,
                        ar_shift,
                        n_idx_offset,
                    ]
                ],
                columns=[
                    "ID",
                    "Date Recorded",
                    "Days elapsed",
                    "AR samples shift",
                    "Offset",
                ],
            )
            res = pd.concat([res, new_row])

ID: 527, N: 5
ID: 527, offset: 1
** Max ecFEV1 in the two days, skipping **
ID: 527, idx: 4, offset: 1, breaking
ID: 527, offset: 2
** Max ecFEV1 in the two days, skipping **
ID: 527, idx: 3, offset: 2, breaking
ID: 527, offset: 3
** Max ecFEV1 in the two days, skipping **
ID: 527, idx: 2, offset: 3, breaking


In [62]:
res

Unnamed: 0,ID,Date Recorded,Days elapsed,AR samples shift,Offset
0,527,2022-05-24,1,-2.701479,1
0,527,2022-05-25,1,1.707074,1
0,527,2022-05-26,1,3.231084,1
0,527,2022-05-24,2,0.957549,2
0,527,2022-05-25,2,0.919641,2
0,527,2022-05-24,3,2.018751,3


### Study the shift

In [None]:
# Scatter plot with days elapsed on x axis and AR diff on y axis, using px
y_col = "AR mean shift"
y_col = "AR samples shift"
# y_col = 'AR skewness shift'
fig = px.scatter(df_mixed_offset, x="Days elapsed", y=y_col, color="ID")
# Set x axis range to 0-100
fig.update_xaxes(range=[0, 200])
fig.update_xaxes(range=[0, 50], title="Number of days elapsed")
# Add more y axi tick vals
fig.update_yaxes(title="Mean airway resistance shift (%)")
# Reduce marker size
fig.update_traces(marker=dict(size=2))
title = f"How much does the airway resistance change in n days (1- {max_offset} idx offset)? - samples"
fig.update_layout(
    title=title, width=800, height=400, font=dict(size=10), showlegend=False
)
fig.show()
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )

In [None]:
# I want to see the distribution of AR diffs for each day elapsed
from scipy.stats import norm

y_col = "AR mean shift"
y_col = "AR samples shift"

fig = make_subplots(rows=6, cols=1, shared_xaxes=True)
xbin_size = 0.2
# xbin_size = 1
xbin_absolute_span = 50
# xbin_absolute_span = 10
xbins = dict(
    start=-xbin_absolute_span - 0.5, end=xbin_absolute_span + 0.5, size=xbin_size
)


def add_plot_for_n_days_elapsed(n_days_elapsed, row):
    df_tmp = df_mixed_offset[df_mixed_offset["Days elapsed"] == n_days_elapsed]
    print(n_days_elapsed, df_tmp.shape)
    fig.add_trace(
        go.Histogram(
            x=df_tmp[y_col],
            xbins=xbins,
            histnorm="probability",
            name=(
                f"{n_days_elapsed} day elapsed"
                if n_days_elapsed == 1
                else f"{n_days_elapsed} days elapsed"
            ),
        ),
        row=row,
        col=1,
    )
    return df_tmp
    # Model the data by a normal distribution
    # mean = df_tmp[y_col].mean()
    # std = df_tmp[y_col].std()
    # x = list(range(-10, 11))
    # y = norm.pdf(x, loc=mean, scale=std)
    # Add trace
    # fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name=f"Normal distribution for {offset} days offset"), row=row, col=1)


# for offset in range(1, 51):
#     add_plot_for_n_days_elapsed(offset, offset)

df_1_DE = add_plot_for_n_days_elapsed(1, 1)
add_plot_for_n_days_elapsed(2, 2)
add_plot_for_n_days_elapsed(8, 3)
add_plot_for_n_days_elapsed(14, 4)
add_plot_for_n_days_elapsed(20, 5)
add_plot_for_n_days_elapsed(50, 6)

# Set y axis range to 0, 0.6
# fig.update_yaxes(range=[0, 0.58])
# Set x axis label
fig.update_xaxes(title_text="Shift in mean airway resistance (%)", row=6, col=1)
# fig.update_xaxes(title_text='Change in skewness of airway resistance (%)', row=6, col=1)
# Add x axis tick vals
# fig.update_xaxes(tickvals=np.arange(-10, 11, 1), row=6, col=1)
# fig.update_xaxes(tickvals=np.arange(-50, 55, 5), row=6, col=1)
# Update layout
# title = f"Shift in airway resistance for different time periods elapsed (bin_width = {xbin_size}%, bin_span = {xbin_absolute_span})"
title = f"Shift in airway resistance - O2sat, ecFEV1"
# fig.update_layout(height=2600, width=1000, title=title)
fig.update_layout(height=600, width=1000, title=title)

# Keep y axis lower
fig.update_yaxes(range=[0, 0.01])
#
# Save image
fig.write_image(
    f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
)
fig.show()

In [None]:
fig.write_image(
    f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title} - samples.pdf"
)

### Build CPT


In [None]:
# Building P(AR_next | days_elapsed, AR_prev)
# import src.models.helpers as mh
import numpy as np
import src.modelling_ar.ar as model_ar

AR1 = mh.VariableNode(
    "Airway resistance day 1 (%)", 0, 90, 2, prior={"type": "uniform"}
)
AR2 = mh.VariableNode(
    "Airway resistance day 2 (%)", 0, 90, 2, prior={"type": "uniform"}
)
# Set the max number of days elapsed to max offset in order to have as much data as possible per number of days elapsed
DE = mh.DiscreteVariableNode("Days elapsed", 1, max_offset, 1)

In [None]:
def calc_cpt(
    AR_next_day: mh.VariableNode,
    AR_curr_day: mh.VariableNode,
    DE: mh.DiscreteVariableNode,
    shift_p,
    shift_val,
    tol=1e-6,
    debug=False,
):
    cpt = np.zeros([AR_next_day.card, AR_curr_day.card, DE.card])

    for i, de in enumerate(DE.values):
        # For each shift value, get the mapping AR -> AR_next_day for each shifted bin in AR
        # Weight the result by the probability of that shift
        # Add it to the CPT for this day
        for s in range(len(shift_val)):
            if debug:
                print(f"Computing CPT for days elapsed={de}, shift={shift_val[s]}")
            # Summing over the columns of the cpt returned by calc_cpt_X_plus_k should give 1, except at the boundaries
            # Since we weight the 1s by a probability of shift that also sums to one, the sum of the cpt should be 1 (except at the boundaries, see below)
            cpt_contrib = calc_cpt_X_plus_k(
                AR_curr_day,
                AR_next_day,
                shift_val[s],
                tol=tol,
                debug=debug,
            )
            # If has nan
            if (np.isnan(cpt_contrib) == True).any():
                print("issue with cpt contribution")
                print(cpt_contrib)
            cpt[:, :, i] += shift_p[i, s] * cpt_contrib
        # Normalise the CPT along axis 0 (AR_next_day)
        total = np.sum(cpt[:, :, i], axis=0)
        if (np.isnan(total) == False).all():
            print(cpt[:, :, i])
        print(
            f"Sum along axis 0 before normalisation: np.sum(cpt[:, :, {i}], axis=0) = {total}"
        )
        cpt[:, :, i] /= total

        # Check that the sum of probabilities is 1
        total = np.sum(cpt[:, :, i], axis=0)
        assert (
            abs(total - 1) < tol
        ).all(), f"The sum of the probabilities should be 1, got sum(cpt)={total}])"
    return cpt


def calc_cpt_X_plus_k(
    Z: mh.VariableNode,
    X: mh.VariableNode,
    k,
    tol=1e-6,
    debug=False,
):
    """
    Computes the CPT for P(Z|X, Y), when Z is shifted from X by a constant value k
    Z = X + k
    X: parent variable
    Z: child variable
    k: constant, positive or negative

    We compute the CPT with a shift and conquer method:
    1) Start with a CPT zeroed out probabilities
    2) Shift all X bin intervals by the drop amount
    3) For each shifted X bin, spread the X bin evenly onto the overlapping Z bins
    4) Normalise the CPT

    This allows the function to be agnostic of how X and Z are binned.

    - What happens when the function is shifted outside the boundary? -> Raise an error as it shouldn't happen by how the model is built
    """
    nbinsX = len(X.bins)
    nbinsZ = len(Z.bins)

    cpt = np.zeros([nbinsZ, nbinsX])

    for i in range(nbinsX):
        shifted_X_bin_low = X.bins[i] + k
        shifted_X_bin_up = (X.bins[i] + X.bin_width) + k
        if debug:
            print(
                f"Shifting X bin {i} from [{X.bins[i]};{X.bins[i]+X.bin_width}) to [{shifted_X_bin_low};{shifted_X_bin_up}), shift amount={k}%"
            )
        # If the shifted bin is outside the boundaries of Z, continue:
        if (
            shifted_X_bin_low >= (Z.bins[-1] + Z.bin_width)
            or shifted_X_bin_up <= Z.bins[0]
        ):
            if debug:
                print(
                    f"Shift outside boundaries of Z.bins=[{Z.bins[0]};{Z.bins[-1] + Z.bin_width})"
                )
            continue
        # Handle the case where the shifted bin is partially outside the boundaries
        # Adjust the boundaries of the shifted bin to be within the boundaries of Z
        if shifted_X_bin_low < Z.bins[0]:
            if debug:
                print("Shift partially outside boundaries, adjusting lower boundary")
            shifted_X_bin_low = Z.bins[0]
        if shifted_X_bin_up > Z.bins[-1] + Z.bin_width:
            if debug:
                print("Shift partially outside boundaries, adjusting upper boundary")
            shifted_X_bin_up = Z.bins[-1] + Z.bin_width

        bin_contribution = mh.get_bin_contribution_to_cpt(
            [shifted_X_bin_low, shifted_X_bin_up], Z.bins, debug=debug
        )
        if debug:
            print(f"i={i}/{nbinsX-1}, z={bin_contribution}")
        # There is just one bin contribution to the CPT
        cpt[:, i] = bin_contribution

    sum_over_x = np.sum(cpt, axis=0)
    if debug:
        print(f"Results before normalisation sum(cpt)={sum_over_x}")

    # IMPORTANT: there is no boundary check in this function. This allows to have no conditional probability distribution for certain bins of AR2, which are not compatible with the amount of shift applied to AR1
    # Therefore either the sum of probabilities is 0 or 1 for each bin of AR2, summed over AR1
    for i in range(nbinsZ):
        if sum_over_x[i] == 0:
            if debug:
                print(f"Sum of probabilities is 0 for bin {i}, skipping normalisation")
            continue
        cpt[i, :] /= sum_over_x[i]
        assert (
            abs(sum_over_x[i] - 1) < tol
        ).all(), f"The sum of the probabilities should be 1, got sum(cpt[i, :])={sum_over_x[i]}])"

    return cpt

In [None]:
# Build the shift distributions
size = 0.2
shift_min = -20
shift_max = 20
shift_val = np.arange(shift_min, shift_max + size / 2, 0.2)
shift_p = np.empty((max_offset, len(shift_val)))

# Check identity matrix if shift is 0
# cpt_point_mass = np.zeros(len(shift_val))
# cpt_point_mass[100] = 1

for i, de in enumerate(DE.values):
    print("days elapsed: ", de)
    shift = df_mixed_offset[df_mixed_offset["Days elapsed"] == de]["AR samples shift"]

    # Bin up the mean shift series into bins starting at -5 and ending at 5, with bin size 1
    shift_p[i, :] = np.histogram(
        shift,
        bins=np.arange(shift_min - size / 2, shift_max + size, size),
        density=True,
    )[0]
    # shift_p[i, :] = cpt_point_mass

print("shift probability shape: ", shift_p.shape)
print("shift_val: ", shift_val)

In [None]:
cpt = calc_cpt(AR2, AR1, DE, shift_p, shift_val, debug=False)

## Test uniform shift distribution

In [None]:
# Conclusion: With the completely uniform CPT, each AR1 bin contributes equally to all AR2 bins, hence the output is uniform over AR2

p_ar2 = np.ones(AR2.card) / AR2.card
# Repeat p_ar2 on each column AR1.card times
cpt_uni = np.repeat(p_ar2[:, np.newaxis], AR1.card, axis=1)
# Repeat this cpt DE.card times
cpt_uni = np.repeat(cpt_uni[:, :, np.newaxis], DE.card, axis=2)

In [None]:
# P of AR2 is uniform over 5 bins centered on the bin of AR1
cpt_ar2_ar1 = np.zeros([AR2.card, AR1.card])
# Add padding left and right
padding = 15
for i in range(AR1.card):
    if i <= padding:
        low = 0
    else:
        low = i - padding
    if AR2.card - 1 <= (i + padding):
        up = AR2.card - 1
    else:
        up = i + padding

    idx_range = list(range(low, up + 1))
    cpt_ar2_ar1[idx_range, i] = 1 / len(idx_range)

cpt_thick_uni = np.repeat(cpt_ar2_ar1[:, :, np.newaxis], DE.card, axis=2)
cpt_thick_uni.shape

## Plot CPT relationships

In [None]:
cpt.shape

In [None]:
import src.inference.helpers as ih

# cpt = cpt_uni


def compare_ARs_for_one_entry(idx):
    title = f"P(AR_next | AR_prev, days_elapsed) for diffent days elapsed (idx {idx}) - samples"
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True)
    ih.plot_histogram(
        fig, AR1, df.loc[idx, AR.name], AR1.a, AR1.b, 1, 1, name="AR day 1", annot=False
    )
    AR_next_day_p = np.matmul(cpt[:, :, 0], df.loc[idx, AR.name])
    ih.plot_histogram(
        fig,
        AR2,
        AR_next_day_p,
        AR2.a,
        AR2.b,
        1,
        1,
        name="AR day 2, days elapsed=1",
        annot=False,
    )
    # AR_next_day_p = np.matmul(cpt[:, :, 2], df.loc[idx, AR.name])
    # ih.plot_histogram(
    #     fig,
    #     AR2,
    #     AR_next_day_p,
    #     AR2.a,
    #     AR2.b,
    #     1,
    #     1,
    #     name="AR day 2, days elapsed=3",
    #     annot=False,
    # )
    # Add x axis title
    fig.update_xaxes(title_text="Airway resistance (%)", row=1, col=1)
    # Reduce figure height
    fig.update_layout(height=200, width=1000, title=title, font=dict(size=10))
    # remove marings
    fig.update_layout(margin=dict(l=2, r=2, t=30, b=2))
    fig.show()
    # Save figure
    # fig.write_image(
    #     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
    # )


compare_ARs_for_one_entry(20000)
# compare_ARs_for_one_entry(21000)
compare_ARs_for_one_entry(1000)
compare_ARs_for_one_entry(4400)

In [None]:
de = 3
fig, title = cpth.plot_2d_cpt(cpt_thick_uni[:, :, de - 1], AR2, AR1, 3000, invert=False)
# Update font
title = title + f", {de} days elapsed, shift span [{shift_min};{shift_max}] - samples"
fig.update_layout(font=dict(size=7), title=title)
fig.show()

# Save figure
# fig.write_image(
#     f"{dh.get_path_to_main()}/PlotsBreathe/Interconnecting_ARs_entries/{title}.pdf"
# )

## Save CPT

In [None]:
# Save cpt
cpth.save_cpt([AR2, AR1, DE], cpt, suffix=f"_shift_span_[{shift_min};{shift_max}]")

## Study the shift per bin

In [None]:
df

In [None]:
AR.midbins

In [None]:
df_exploded = df1.copy()

for i, row in df_exploded[0:10].iterrows():
    row = pd.DataFrame(data=row[AR.name])
    df_exploded = pd.concat([df_exploded, row], axis=1)

df_exploded