In [1]:
import data.breathe_data as bd
import inference.long_inf_slicing as slicing
import models.builders as mb
import models.var_builders as var_builders
import inference.helpers as ih
from plotly.subplots import make_subplots
import models.helpers as mh
import data.helpers as dh
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np

In [2]:
df = bd.load_meas_from_excel("BR_O2_FEV1_FEF2575_conservative_smoothing_with_idx")

INFO:root:* Checking for same day measurements *


In [3]:
# Get longest consec series

df_consec = pd.DataFrame(index=df.ID.unique())
for i in range(len(df_consec)):
    id = df_consec.index[i]
    df_for_ID = df[df.ID == id]
    subdf, stard_idx, end_idx = dh.find_longest_consec_series(df_for_ID, n_days=1)
    df_consec.loc[id, "1 day"] = len(subdf)
    subdf, stard_idx, end_idx = dh.find_longest_consec_series(df_for_ID, n_days=2)
    df_consec.loc[id, "2 days"] = len(subdf)
    subdf, stard_idx, end_idx = dh.find_longest_consec_series(df_for_ID, n_days=3)
    df_consec.loc[id, "3 days"] = len(subdf)
    subdf, stard_idx, end_idx = dh.find_longest_consec_series(df_for_ID, n_days=4)
    df_consec.loc[id, "4 days"] = len(subdf)
    df_consec.loc[id, "count"] = len(df_for_ID)
df_consec = df_consec.astype(int)

In [4]:
df_consec.head()

Unnamed: 0,1 day,2 days,3 days,4 days,count
101,536,592,592,592,1680
102,16,24,47,47,263
103,72,88,88,88,375
104,8,10,14,32,222
105,4,6,6,6,6


In [5]:
# Create df with columns: 1 day, 2 days, 3 days, count, and rows: number of data, number of IDs with over 50 datapoints
n_ids_col = "IDs with over 10 datapoints"
n_ids_col2 = "IDs with over 100 datapoints"
df1 = pd.DataFrame(index=["Number of datapoints", n_ids_col, n_ids_col2])
df1.loc["Number of datapoints", "1 day"] = df_consec["1 day"].sum()
df1.loc["Number of datapoints", "2 days"] = df_consec["2 days"].sum()
df1.loc["Number of datapoints", "3 days"] = df_consec["3 days"].sum()
df1.loc["Number of datapoints", "4 days"] = df_consec["4 days"].sum()
df1.loc["Number of datapoints", "count"] = df_consec["count"].sum()
df1.loc[n_ids_col, "1 day"] = df_consec[df_consec["1 day"] > 10].shape[0]
df1.loc[n_ids_col, "2 days"] = df_consec[df_consec["2 days"] > 10].shape[0]
df1.loc[n_ids_col, "3 days"] = df_consec[df_consec["3 days"] > 10].shape[0]
df1.loc[n_ids_col, "4 days"] = df_consec[df_consec["3 days"] > 10].shape[0]
df1.loc[n_ids_col, "count"] = df_consec.shape[0]
df1.loc[n_ids_col2, "1 day"] = df_consec[df_consec["1 day"] > 100].shape[0]
df1.loc[n_ids_col2, "2 days"] = df_consec[df_consec["2 days"] > 100].shape[0]
df1.loc[n_ids_col2, "3 days"] = df_consec[df_consec["3 days"] > 100].shape[0]
df1.loc[n_ids_col2, "4 days"] = df_consec[df_consec["3 days"] > 100].shape[0]
df1.loc[n_ids_col2, "count"] = df_consec.shape[0]
df1 = df1.astype(int)
df1.head()


Unnamed: 0,1 day,2 days,3 days,4 days,count
Number of datapoints,4568,9049,12224,15445,41260
IDs with over 10 datapoints,86,130,154,154,352
IDs with over 100 datapoints,5,14,22,22,352


In [None]:
# Run takes 20 minutes
# I have 8h = 480 minutes, meaning I can run 24 experiments

In [10]:
# Sort by number of 1 day
df_consec = df_consec.sort_values("2 days", ascending=True)
# Bar plot of the number of days for each ID
fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Bar(y=df_consec.index, x=df_consec["count"], orientation="h", name="all data", marker_color=dh.get_blind_colours()[0]), row=1, col=1
)
fig.add_trace(
    go.Bar(y=df_consec.index, x=df_consec["3 days"], orientation="h", name="3 days", marker_color=dh.get_blind_colours()[1]), row=1, col=1
)
fig.add_trace(
    go.Bar(y=df_consec.index, x=df_consec["2 days"], orientation="h", name="2 days", marker_color=dh.get_blind_colours()[2]), row=1, col=1
)
fig.add_trace(
    go.Bar(y=df_consec.index, x=df_consec["1 day"], orientation="h", name="1 day", marker_color=dh.get_blind_colours()[3]), row=1, col=1
)
fig.update_layout(title_text="Number of consecutive days for each ID", height=4000, font_size=8, 
                  barmode="overlay")

fig.show()

In [81]:
# Given an df_for_ID, find the longest series of consecutive measurements


def find_longest_consec_series(df_for_ID):

    df_for_ID = df_for_ID.reset_index(drop=True)
    df_for_ID["Prev day"] = df_for_ID["Date Recorded"].shift(1)
    df_for_ID["Days elapsed"] = df_for_ID["Date Recorded"] - df_for_ID["Prev day"]

    # Get first idx where Days elapsed is greater than 1
    idx = df_for_ID[df_for_ID["Days elapsed"] > pd.Timedelta(days=3)].index
    # Add the first idx
    idx = idx.insert(0, 0)
    # Add last idx
    idx = idx.insert(len(idx), len(df_for_ID))

    # Make the difference between the idxs
    diff = np.diff(idx)

    # Get the longest series of consecutive measurements
    idx_max_diff = np.argmax(diff)
    start_idx = idx[idx_max_diff]

    end_idx = idx[idx_max_diff + 1]

    df_for_ID[start_idx:end_idx]
    return df_for_ID[start_idx:end_idx]

(845, 20)
Index([0, 286, 294, 298, 618, 621, 628, 756, 766, 845], dtype='int64')
start idx 298
end idx 618


Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,...,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,idx ecFEV1 (L),idx O2 saturation (%),idx ecFEF2575%ecFEV1,idx ecFEF25-75 % ecFEV1 (%),Prev day,Days elapsed
298,203,2021-09-20,1.90,98,0.62,1.90,1.05,Female,164.0,38,...,60.859190,60.859190,99.86499,32.631579,38,48,16,16,2021-09-16,"4 days, 0:00:00"
299,203,2021-09-21,1.91,98,1.03,1.91,1.03,Female,164.0,38,...,61.179502,61.179502,99.86499,53.926702,38,48,26,26,2021-09-20,"1 day, 0:00:00"
300,203,2021-09-22,1.95,99,1.08,1.95,1.08,Female,164.0,38,...,62.460748,62.460748,100.88402,55.384615,39,49,27,27,2021-09-21,"1 day, 0:00:00"
301,203,2021-09-23,1.91,99,1.00,1.91,1.00,Female,164.0,38,...,61.179502,61.179502,100.88402,52.356021,38,49,26,26,2021-09-22,"1 day, 0:00:00"
302,203,2021-09-24,1.95,99,1.01,1.95,1.01,Female,164.0,38,...,62.460748,62.460748,100.88402,51.794872,39,49,25,25,2021-09-23,"1 day, 0:00:00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
613,203,2022-12-08,1.95,98,1.10,1.95,1.10,Female,164.0,38,...,62.460748,62.460748,99.86499,56.410256,39,48,28,28,2022-12-07,"1 day, 0:00:00"
614,203,2022-12-09,1.87,99,1.11,1.87,1.11,Female,164.0,38,...,59.898256,59.898256,100.88402,59.358289,37,49,29,29,2022-12-08,"1 day, 0:00:00"
615,203,2022-12-12,1.82,99,1.12,1.82,1.12,Female,164.0,38,...,58.296698,58.296698,100.88402,61.538462,36,49,30,30,2022-12-09,"3 days, 0:00:00"
616,203,2022-12-15,1.86,98,1.00,1.86,1.00,Female,164.0,38,...,59.577944,59.577944,99.86499,53.763441,37,48,26,26,2022-12-12,"3 days, 0:00:00"


In [82]:
df_for_ID.iloc[start_idx : end_idx + 1]
# df[df["ID"] == "272"].iloc[:417].reset_index(drop=True)

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,ecFEF2575,Sex,Height,Age,...,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy,ecFEF2575%ecFEV1,idx ecFEV1 (L),idx O2 saturation (%),idx ecFEF2575%ecFEV1,idx ecFEF25-75 % ecFEV1 (%),Prev day,Days elapsed
298,203,2021-09-20,1.90,98,0.62,1.90,1.05,Female,164.0,38,...,60.859190,60.859190,99.86499,32.631579,38,48,16,16,2021-09-16,"4 days, 0:00:00"
299,203,2021-09-21,1.91,98,1.03,1.91,1.03,Female,164.0,38,...,61.179502,61.179502,99.86499,53.926702,38,48,26,26,2021-09-20,"1 day, 0:00:00"
300,203,2021-09-22,1.95,99,1.08,1.95,1.08,Female,164.0,38,...,62.460748,62.460748,100.88402,55.384615,39,49,27,27,2021-09-21,"1 day, 0:00:00"
301,203,2021-09-23,1.91,99,1.00,1.91,1.00,Female,164.0,38,...,61.179502,61.179502,100.88402,52.356021,38,49,26,26,2021-09-22,"1 day, 0:00:00"
302,203,2021-09-24,1.95,99,1.01,1.95,1.01,Female,164.0,38,...,62.460748,62.460748,100.88402,51.794872,39,49,25,25,2021-09-23,"1 day, 0:00:00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,203,2022-12-09,1.87,99,1.11,1.87,1.11,Female,164.0,38,...,59.898256,59.898256,100.88402,59.358289,37,49,29,29,2022-12-08,"1 day, 0:00:00"
615,203,2022-12-12,1.82,99,1.12,1.82,1.12,Female,164.0,38,...,58.296698,58.296698,100.88402,61.538462,36,49,30,30,2022-12-09,"3 days, 0:00:00"
616,203,2022-12-15,1.86,98,1.00,1.86,1.00,Female,164.0,38,...,59.577944,59.577944,99.86499,53.763441,37,48,26,26,2022-12-12,"3 days, 0:00:00"
617,203,2022-12-16,1.88,98,1.12,1.88,1.12,Female,164.0,38,...,60.218567,60.218567,99.86499,59.574468,37,48,29,29,2022-12-15,"1 day, 0:00:00"


In [1]:
def get_consec_days(df):
    df = df.sort_values(by="Date Recorded")
    df["Prev day"] = df["Date Recorded"].shift(1)
    # Compute n days between measurements
    df["Days elapsed"] = df["Date Recorded"] - df["Prev day"]
    # Remove nan
    df = df.dropna(subset=["Days elapsed"])

    # Get first idx where Days elapsed is greater than 1
    idx = df[df["Days elapsed"] > pd.Timedelta(days=3)].index
    print(idx)

    return df


df_tmp = df[df.ID == "405"]
df_tmp = get_consec_days(df_tmp)
# Not finished

NameError: name 'df' is not defined