In [1]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import duckdb
import pyCLIF as pc
import pySBT as t1code
from tqdm import tqdm
from datetime import datetime
from tableone import TableOne, load_dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
con = pc.load_config()


Loaded configuration from config.json
{'site_name': 'RUSH', 'tables_path': 'C:/Users/vchaudha/Downloads/rush_parquet/', 'file_type': 'parquet', 'your_site_timezone': 'US/Central'}
Imported SBT Helper!
Loaded configuration from config.json


In [2]:
cohort = pd.read_csv('../output/intermediate/study_cohort.csv')
t1_cohort = cohort.copy()

In [3]:
# Construct the full directory path
directory_path = os.path.join("../output/final/", pc.helper["site_name"], "SAT_standard")

# Create the directory if it doesn't exist
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created.")
else:
    print(f"Directory '{directory_path}' already exists.")

Directory '../output/final/RUSH\SAT_standard' already exists.


In [4]:
cohort['sat_delivery_pass_fail'] = cohort['sat_delivery_pass_fail'].map({0:1,1:1})
cohort['sat_screen_pass_fail'] = cohort['sat_screen_pass_fail'].map({0:1,1:1})

In [5]:
# Ensure 'event_time' is in datetime format
cohort['event_time'] = pd.to_datetime(cohort['event_time'])
cohort['admission_dttm'] = pd.to_datetime(cohort['admission_dttm'], utc=True)
cohort['discharge_dttm'] = pd.to_datetime(cohort['discharge_dttm'], utc=True)

# Ensure the data is sorted by 'hosp_id_day_key' and 'event_time'
cohort = cohort.sort_values(by=['hospitalization_id', 'event_time']).reset_index(drop=True)

cohort['device_category_ffill'] = cohort.groupby('hospitalization_id')['device_category'].ffill()
cohort['location_category_ffill'] = cohort.groupby('hospitalization_id')['location_category'].ffill()

active_sedation_n_col = [
    'fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine'
]

for col in active_sedation_n_col:
    if col not in cohort.columns:
        cohort[col] = np.nan
        print(f"Column '{col}' is missing. Please check your CLIF Meds table — it might be missing, or it's okay if your site doesn't use it.")


# Fill forward the meds by hospitalization columns by 'hosp_id'
cohort[['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']] = cohort.groupby('hospitalization_id')[
    ['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']
].ffill()

# Ensure the min value is greater than 0
cohort['min_sedation_dose'] = cohort[['fentanyl', 'propofol', 'lorazepam', 'midazolam','hydromorphone','morphine']].min(axis=1, skipna=True)
cohort['min_sedation_dose_2'] = cohort[['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']].where(cohort[['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']] > 0).min(axis=1, skipna=True)
cohort['min_sedation_dose_non_ops'] = cohort[['propofol', 'lorazepam', 'midazolam']].min(axis=1, skipna=True)
cohort['min_sedation_dose_non_ops'] = cohort['min_sedation_dose_non_ops'].fillna(0)

# Fill forward the paralytic by hospitalization columns by 'hosp_id'
cohort[["cisatracurium"
        ,"vecuronium"
        ,"rocuronium"]] = cohort.groupby('hospitalization_id')[
    ["cisatracurium"
        ,"vecuronium"
        ,"rocuronium"]
].ffill()
# paralytic max to remove from consideration
cohort['max_paralytics'] = cohort[["cisatracurium"
        ,"vecuronium"
        ,"rocuronium"
        ]].max(axis=1, skipna=True).fillna(0)

# Ensure the data is sorted again by 'hosp_id_day_key' and 'event_time'
cohort = cohort.sort_values(by=['hospitalization_id', 'event_time']).reset_index(drop=True)

#### Identify eligible days

In [6]:
def process_cohort(df):
    df = df.sort_values(by=['hospitalization_id', 'event_time']).reset_index(drop=True)
    df['device_category_ffill'] = df.groupby('hospitalization_id')['device_category'].ffill()
    df['location_category_ffill'] = df.groupby('hospitalization_id')['location_category'].ffill()
    # Ensure 'event_time' is datetime
    df['event_time'] = pd.to_datetime(df['event_time'])
   
    df['all_conditions_check'] = (
            (df['device_category_ffill'].str.lower() == 'imv') &
            (df['min_sedation_dose_2'] > 0) &
            (df['location_category_ffill'].str.lower() == 'icu') &
            (df['max_paralytics'] <= 0)
        ).astype(int)

    # Initialize result list
    result = []

    vented_day = df[(df['device_category'] == 'imv')]['hosp_id_day_key'].unique()
    # Group by 'hospitalization_id' and 'date'
    grouped_hosp = df[df['hosp_id_day_key'].isin(vented_day)].groupby(['hospitalization_id', df['event_time'].dt.normalize()])
    
    # Use tqdm for the outer loop to show progress
    for (hosp_id, date), group in tqdm(grouped_hosp, desc='Processing Hospitalizations by Date'):
        group = group.sort_values('event_time')

        # Get the entire hospitalization data for the current hospitalization_id
        temp_df = df[df['hospitalization_id'] == hosp_id].sort_values(by=['hospitalization_id', 'event_time']).reset_index(drop=True)

        # Define start and end times for the current day
        # Start time is 10 PM of the previous day
        start_time = date - pd.Timedelta(days=1) + pd.Timedelta(hours=22)
        # End time is 6 AM of the current day
        end_time = date + pd.Timedelta(hours=6)

        # Filter data in this time window for the entire hospitalization
        mask_time = (temp_df['event_time'] >= start_time) & (temp_df['event_time'] <= end_time)
        df_time_window = temp_df[mask_time].copy()

        if df_time_window.empty:
            continue

        # Use the existing 'device_category_ffill' and 'location_category_ffill' columns
        df_time_window['all_conditions_met'] = (df_time_window['all_conditions_check']>0
        )

        # If no times where all conditions are met, skip
        if not df_time_window['all_conditions_met'].any():
            continue

        # Ensure data is sorted by 'event_time'
        df_time_window = df_time_window.sort_values('event_time').reset_index(drop=True)

        # Create a group identifier for continuous periods where conditions are met
        df_time_window['condition_met_group'] = (df_time_window['all_conditions_met'] != df_time_window['all_conditions_met'].shift()).cumsum()

        # Filter rows where all conditions are met
        df_conditions = df_time_window[df_time_window['all_conditions_met']].copy()
        if df_conditions.empty:
            continue

        # Group by 'condition_met_group' to identify continuous periods
        grouped_conditions = df_conditions.groupby('condition_met_group')

        found_four_hours = False
        for group_id, group_df in grouped_conditions:
            group_df = group_df.reset_index(drop=True)

            # Calculate the duration of each continuous period where all conditions are met
            group_df['duration'] = group_df['event_time'].diff().fillna(pd.Timedelta(seconds=0))
            group_df['cumulative_duration'] = group_df['duration'].cumsum()
            total_duration = group_df['cumulative_duration'].iloc[-1]

            if total_duration >= pd.Timedelta(hours=4):
                # Calculate the exact event_time when cumulative duration reaches four hours
                cumulative_duration = pd.Timedelta(seconds=0)
                for idx in range(len(group_df)):
                    cumulative_duration += group_df['duration'].iloc[idx]
                    if cumulative_duration >= pd.Timedelta(hours=4):
                        event_time_at_4_hours = group_df['event_time'].iloc[idx]
                        break

                # Append to result
                result.append({
                    'hospitalization_id': hosp_id,
                    'current_day_key': date,
                    'event_time_at_4_hours': event_time_at_4_hours
                })
                found_four_hours = True
                # Since we found a period of at least 4 hours continuous conditions met, we can proceed to the next day
                break  # Exit the loop over condition_met_group
        if found_four_hours:
            continue  # Proceed to the next day

    # Convert result to DataFrame for better representation
    result_df = pd.DataFrame(result)
    return result_df

result_df = process_cohort(cohort)
print('Encounter days with at least 4 hours of conditions met from 10 PM to 6 AM:', len(result_df))

Processing Hospitalizations by Date: 100%|██████████| 18387/18387 [02:45<00:00, 111.29it/s]


Encounter days with at least 4 hours of conditions met from 10 PM to 6 AM: 7955


In [None]:
# Merge the result back into the cohort DataFrame
cohort = cohort.merge(result_df[['hospitalization_id', 'current_day_key', 'event_time_at_4_hours']], 
                      how='left', 
                      left_on=['hospitalization_id', cohort['event_time'].dt.normalize()], 
                      right_on=['hospitalization_id', 'current_day_key'])

# Initialize 'eligible_event' column with NaN and used for validation of exact time the event of 4 hr completed
cohort['eligible_event'] = np.nan
has_event_time = cohort['event_time_at_4_hours'].notna()
for (hosp_id, date), group in cohort[has_event_time].groupby(['hospitalization_id', cohort['event_time'].dt.normalize()]):
    event_time_at_4_hours = group['event_time_at_4_hours'].iloc[0]
    subset = cohort[(cohort['hospitalization_id'] == hosp_id) & (cohort['event_time'] >= event_time_at_4_hours)]
    if not subset.empty:
        idx = subset['event_time'].idxmin()
        cohort.loc[idx, 'eligible_event'] = 1
    else:
        subset = cohort[cohort['hospitalization_id'] == hosp_id]
        idx = subset['event_time'].idxmax()
        cohort.loc[idx, 'eligible_event'] = 1

# fix where last row should not be eligible
cohort = cohort.sort_values(['hospitalization_id', 'event_time']).reset_index(drop=True)
for hosp_id, group in cohort.groupby('hospitalization_id'):
    last_idx = group.index[-1]
    if cohort.loc[last_idx, 'eligible_event'] == 1:
        cohort.loc[last_idx, 'eligible_event'] = np.nan


# Flag all that date rows where eligible_event = 1
filtered_cohort = cohort[cohort['eligible_event'] == 1][['hosp_id_day_key', 'eligible_event']]
merged_cohort = cohort.merge(filtered_cohort, on='hosp_id_day_key', how='left', suffixes=('', '_filtered'))
merged_cohort['on_vent_and_sedation'] = merged_cohort['eligible_event_filtered'].fillna(0).astype(int)
merged_cohort = merged_cohort.drop(columns=['eligible_event_filtered'])

del filtered_cohort,result_df

In [None]:
merged_cohort['eligible_event'].value_counts()

eligible_event
1.0    7943
Name: count, dtype: int64

In [None]:
merged_cohort[merged_cohort['on_vent_and_sedation']==1]['hosp_id_day_key'].nunique()

7943

In [None]:
df = merged_cohort[merged_cohort['on_vent_and_sedation']==1].sort_values(by=['hospitalization_id', 'event_time']).reset_index(drop=True)  

df['rank_sedation'] = np.nan
for hosp_id_day_key, hosp_data in tqdm(df[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key'), desc='Processing hosp_id_day_keys'):
    zero_mask = hosp_data['min_sedation_dose'] == 0
    ranks = zero_mask.cumsum() * zero_mask
    df.loc[hosp_data.index, 'rank_sedation'] = ranks.replace(0, np.nan)


df['rank_sedation_non_ops'] = np.nan
for hosp_id_day_key, hosp_data in tqdm(df[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key'), desc='Processing hosp_id_day_keys'):
    zero_mask = hosp_data['min_sedation_dose_non_ops'] == 0
    ranks = zero_mask.cumsum() * zero_mask
    df.loc[hosp_data.index, 'rank_sedation_non_ops'] = ranks.replace(0, np.nan)

Processing hosp_id_day_keys: 100%|██████████| 7943/7943 [00:09<00:00, 853.82it/s] 
Processing hosp_id_day_keys: 100%|██████████| 7943/7943 [00:09<00:00, 831.29it/s] 


#### SAT EHR all meds hard stop flaging

In [None]:
df['SAT_EHR_delivery'] = np.nan
med_columns = ['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']

# Use groupby and vectorized operations for meds check
for hosp_id_day_key, hosp_data in tqdm(df[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key'), desc='Processing hosp_id_day_keys for meds check'):
    hosp_data_sorted = hosp_data.sort_values('event_time')
    for index, row in hosp_data_sorted.iterrows():
        if not np.isnan(row['rank_sedation']):
            current_time = row['event_time']
            thirty_min_forward = hosp_data_sorted[(hosp_data_sorted['event_time'] >= current_time) &
                                                  (hosp_data_sorted['event_time'] <= current_time + pd.Timedelta(minutes=30))]
            # Check if all med_columns are either NaN or 0 and device & location categories are "imv" and "icu" in this timeframe
            if (
                 (thirty_min_forward[med_columns].isna() | (thirty_min_forward[med_columns] == 0)).all(axis=None) and
                 (thirty_min_forward['device_category_ffill'] == 'imv').all() and
                 (thirty_min_forward['location_category_ffill'] == 'icu').all()
            ):
                df.at[index, 'SAT_EHR_delivery'] = 1

Processing hosp_id_day_keys for meds check: 100%|██████████| 7943/7943 [07:51<00:00, 16.85it/s]


#### SAT EHR all meds hard stop flaging (modified meds / non ops)

In [None]:
df['SAT_modified_delivery'] = np.nan
med_columns = ['propofol', 'lorazepam', 'midazolam']

# Use groupby and vectorized operations for meds check
for hosp_id_day_key, hosp_data in tqdm(df[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key'), desc='Processing hosp_id_day_keys for meds check'):
    hosp_data_sorted = hosp_data.sort_values('event_time')
    for index, row in hosp_data_sorted.iterrows():
        if not np.isnan(row['rank_sedation_non_ops']):
            current_time = row['event_time']
            thirty_min_forward = hosp_data_sorted[(hosp_data_sorted['event_time'] >= current_time) &
                                                  (hosp_data_sorted['event_time'] <= current_time + pd.Timedelta(minutes=30))]

            # Check if all med_columns are either NaN or 0 and device & location categories are "imv" and "icu" in this timeframe
            if (
                (thirty_min_forward[med_columns].isna() | (thirty_min_forward[med_columns] == 0)).all(axis=None) and
                 (thirty_min_forward['device_category_ffill'] == 'imv').all() and
                 (thirty_min_forward['location_category_ffill'] == 'icu').all()
            ):
                df.at[index, 'SAT_modified_delivery'] = 1

Processing hosp_id_day_keys for meds check: 100%|██████████| 7943/7943 [10:08<00:00, 13.06it/s]


### Delta plot

In [None]:

# 1. Identify initial sat failure events and delivery events
mask_initial = (df['sat_delivery_pass_fail'] == 1) | (df['sat_screen_pass_fail'] == 1)
mask_ehr = df['SAT_EHR_delivery'] == 1
mask_mod = df['SAT_modified_delivery'] == 1

initial_times = df[mask_initial].groupby('hosp_id_day_key')['event_time'].min().rename('initial_time')
ehr_times = df[mask_ehr].groupby('hosp_id_day_key')['event_time'].min().rename('ehr_time')
mod_times = df[mask_mod].groupby('hosp_id_day_key')['event_time'].min().rename('mod_time')

# 2. Merge into a single DataFrame and drop incomplete cases
times_df = pd.concat([initial_times, ehr_times, mod_times], axis=1).dropna()

# 3. Convert event_time columns to datetime
for col in ['initial_time', 'ehr_time', 'mod_time']:
    times_df[col] = pd.to_datetime(times_df[col])

# 4. Compute deltas in minutes
times_df['delta_to_ehr'] = (times_df['ehr_time'] - times_df['initial_time']).dt.total_seconds() / 60
times_df['delta_to_mod'] = (times_df['mod_time'] - times_df['initial_time']).dt.total_seconds() / 60

# 5. Filter deltas to positive values within 24 hours (0–1440 minutes)
times_df = times_df[
    (times_df['delta_to_ehr'] >= 0) & (times_df['delta_to_ehr'] <= 1440) &
    (times_df['delta_to_mod'] >= 0) & (times_df['delta_to_mod'] <= 1440)
]

# 5. Save CSVs
binned_df = pd.DataFrame({
    'hour_bin': labels,
    'count_to_SAT_EHR_delivery': ehr_counts.values,
    'count_to_SAT_modified_delivery': mod_counts.values
})
binned_df.to_csv(f'{directory_path}/binned_delta_counts.csv', index=False)

# 6. Bin into hourly intervals
bins = list(range(0, 24*60 + 1, 60))  # [0, 60, 120, ..., 1440]
labels = [f'{i}-{i+1}hr' for i in range(24)]

ehr_binned = pd.cut(times_df['delta_to_ehr'], bins=bins, labels=labels, right=False)
mod_binned = pd.cut(times_df['delta_to_mod'], bins=bins, labels=labels, right=False)

ehr_counts = ehr_binned.value_counts().sort_index()
mod_counts = mod_binned.value_counts().sort_index()

# 7. Plot both lines hour-wise
plt.figure(figsize=(12, 6))
plt.plot(labels, ehr_counts.values, marker='o', label='To SAT_EHR_delivery')
plt.plot(labels, mod_counts.values, marker='s', label='To SAT_modified_delivery')
plt.xlabel('Hours since initial failure event')
plt.ylabel('Count of Hospital-Day Keys')
plt.title('Hourly Distribution of Time to EHR and Modified Deliveries')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

NameError: name 'labels' is not defined

#### Icu los calculation

In [None]:
icu_los = cohort[['hospitalization_id', 'event_time', 'location_category_ffill']]
icu_los = icu_los.sort_values(by=['hospitalization_id', 'event_time']).reset_index(drop=True)

icu_los['segment'] = (icu_los['location_category_ffill'] != icu_los['location_category_ffill'].shift()).cumsum()

icu_segments = icu_los[icu_los['location_category_ffill'].str.lower() == 'icu'].groupby(
    ['hospitalization_id', 'segment']
).agg(
    location_start=('event_time', 'first'),
    location_end=('event_time', 'last')
).reset_index()

icu_segments['los_days'] = (icu_segments['location_end'] - icu_segments['location_start']).dt.total_seconds() / (24 * 3600)
icu_los_per_encounter = icu_segments[['hospitalization_id', 'los_days']]

total_icu_los_per_hosp = icu_los_per_encounter.groupby('hospitalization_id', as_index=False).agg(
    ICU_LOS=('los_days', 'sum')
)
total_icu_los_per_hosp.shape

(4176, 2)

#### last dishcharge hosptial_id

In [None]:
last_hosp = cohort[['hospitalization_id', 'event_time', 'hospital_id']]

last_hosp = last_hosp.sort_values(by=['hospitalization_id','event_time'], ascending=False).groupby(
    ['hospitalization_id'], as_index=False
).agg(({'hospital_id': 'first'})).reset_index(drop=True)
last_hosp.shape

(4176, 2)

#### Table one df 

In [None]:
main = df[['patient_id', 'hospitalization_id', 'admission_dttm', 'discharge_dttm',
       'age_at_admission', 'discharge_category', 'sex_category',
       'race_category', 'ethnicity_category','hosp_id_day_key']].drop_duplicates()
main.shape

(7943, 10)

In [None]:
main = pd.merge(main, total_icu_los_per_hosp, on='hospitalization_id', how='left')
main = pd.merge(main, last_hosp, on='hospitalization_id', how='left')
main.shape

(7943, 12)

In [None]:
# Columns to group by
group_cols = [
 'hosp_id_day_key'
]

max_cols = ['sat_screen_pass_fail','sat_delivery_pass_fail','SAT_EHR_delivery', 'SAT_modified_delivery', 'eligible_event']
agg_dict = {col: 'max' for col in max_cols}

df_grouped = df.groupby(group_cols).agg(agg_dict).reset_index()

df_grouped = df_grouped.sort_values('hosp_id_day_key').reset_index(drop=True)

df_grouped['sat_flowsheet_delivery_flag'] = np.where(
    (
        (df_grouped['sat_screen_pass_fail'] == 1) |
        (df_grouped['sat_delivery_pass_fail'] == 1)
    ) &
    (df_grouped['eligible_event'] == 1),
    1,  # Flag is set to 1 (True) if conditions are met
    np.nan   # Flag nan
)

final_df = main.merge(df_grouped, on='hosp_id_day_key', how='inner')
final_df.shape

(7943, 18)

In [None]:
for x in ['sat_delivery_pass_fail', 'sat_screen_pass_fail', 'SAT_EHR_delivery',
       'SAT_modified_delivery', 'eligible_event',
       'sat_flowsheet_delivery_flag']:
    print(final_df[x].value_counts())
    print()

sat_delivery_pass_fail
1.0    2386
Name: count, dtype: int64

sat_screen_pass_fail
1.0    4234
Name: count, dtype: int64

SAT_EHR_delivery
1.0    2907
Name: count, dtype: int64

SAT_modified_delivery
1.0    5535
Name: count, dtype: int64

eligible_event
1.0    7943
Name: count, dtype: int64

sat_flowsheet_delivery_flag
1.0    4248
Name: count, dtype: int64



#### concordance

In [None]:
# DataFrame to collect all metrics
metrics_list = []

for z in ['SAT_EHR_delivery', 'SAT_modified_delivery']:
    x = final_df['sat_flowsheet_delivery_flag']
    y = final_df[z]

    # compute confusion matrix
    tn, fp, fn, tp = confusion_matrix(x, y).ravel()

    # compute derived metrics
    accuracy    = (tp + tn) / (tp + tn + fp + fn)
    precision   = tp / (tp + fp) if (tp + fp) else 0
    recall      = tp / (tp + fn) if (tp + fn) else 0
    f1          = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    specificity = tn / (tn + fp) if (tn + fp) else 0

    # plot and save confusion matrix
    disp = ConfusionMatrixDisplay.from_predictions(
        x, y,
        display_labels=["No Delivery", "Delivery"],
        cmap=plt.cm.Blues,
        normalize=None
    )
    disp.ax_.set_title(f"Confusion Matrix for {z}")
    fig = disp.figure_
    plot_path = os.path.join(directory_path, f"confusion_matrix_{z}.png")
    fig.savefig(plot_path, bbox_inches='tight')
    plt.close(fig)

    # print metrics
    print(f"Column      : {z}")
    print(f"Accuracy    : {accuracy:.3f}")
    print(f"Precision   : {precision:.3f}")
    print(f"Recall      : {recall:.3f}")
    print(f"F1 Score    : {f1:.3f}")
    print(f"Specificity : {specificity:.3f}\n")

    # collect metrics in a dict
    metrics_dict = {
        "Column": z,
        "True Positives (TP)": tp,
        "False Positives (FP)": fp,
        "False Negatives (FN)": fn,
        "True Negatives (TN)": tn,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Specificity": specificity,
        "Plot Path": plot_path
    }
    metrics_list.append(metrics_dict)

# convert to DataFrame and save
metrics_df = pd.DataFrame(metrics_list)
metrics_df.to_csv(os.path.join(directory_path, "delivery_metrics_summary.csv"), index=False)


NameError: name 'final_df' is not defined

In [None]:
final_df.to_csv('../output/intermediate/final_df_SAT.csv', index=False)

#### table one print

In [None]:
categorical_columns = ['sex_category', 'race_category', 'ethnicity_category','discharge_category']
non_categorical_columns = ['age_at_admission',  'ICU_LOS', 'Inpatient_LOS']

final_df['admission_dttm'] = pd.to_datetime(final_df['admission_dttm'],utc=True)
final_df['discharge_dttm'] = pd.to_datetime(final_df['discharge_dttm'],utc=True)

In [None]:
### SAT FLAG Table 1


sat_flow_t1 = final_df[final_df['sat_flowsheet_delivery_flag'] == 1][[ 'hospitalization_id', 'admission_dttm', 'discharge_dttm', 'age_at_admission', 'discharge_category', 'sex_category','race_category', 'ethnicity_category','ICU_LOS']].drop_duplicates()
sat_flow_t1['Inpatient_LOS'] = (sat_flow_t1['discharge_dttm'] - sat_flow_t1['admission_dttm']).dt.total_seconds() / (24 * 3600)

if len(sat_flow_t1)>1:
    table1 = TableOne(sat_flow_t1, categorical=categorical_columns, nonnormal=non_categorical_columns, columns=categorical_columns+non_categorical_columns )

    table1.to_csv(f'{directory_path}/table1_sat_flowhseet_{pc.helper["site_name"]}.csv')
    print(table1)

                                                                           Missing           Overall
n                                                                                               1922
sex_category, n (%)              Female                                                   844 (43.9)
                                 Male                                                    1078 (56.1)
race_category, n (%)             American Indian or Alaska Native                            3 (0.2)
                                 Asian                                                      82 (4.3)
                                 Black or African American                                673 (35.0)
                                 Native Hawaiian or Other Pacific Islander                   3 (0.2)
                                 Other                                                    359 (18.7)
                                 Unknown                                                   

In [None]:
### SAT EHR FLAG Table 1

sat_ehr_t1 = final_df[(final_df['SAT_EHR_delivery'] == 1) | (final_df['SAT_modified_delivery'] == 1)][[ 'hospitalization_id', 'admission_dttm', 'discharge_dttm', 'age_at_admission', 'discharge_category', 'sex_category','race_category', 'ethnicity_category','ICU_LOS']].drop_duplicates()
sat_ehr_t1['Inpatient_LOS'] = (sat_ehr_t1['discharge_dttm'] - sat_ehr_t1['admission_dttm']).dt.total_seconds() / (24 * 3600)

if len(sat_ehr_t1)>1:
    table2 = TableOne(sat_ehr_t1, categorical=categorical_columns, nonnormal=non_categorical_columns, columns=categorical_columns+non_categorical_columns )

    table2.to_csv(f'{directory_path}/table1_sat_ehr_{pc.helper["site_name"]}.csv')
    print(table2)

                                                                           Missing           Overall
n                                                                                               2286
sex_category, n (%)              Female                                                  1027 (44.9)
                                 Male                                                    1259 (55.1)
race_category, n (%)             American Indian or Alaska Native                            4 (0.2)
                                 Asian                                                      90 (3.9)
                                 Black or African American                                808 (35.3)
                                 Native Hawaiian or Other Pacific Islander                   3 (0.1)
                                 Other                                                    435 (19.0)
                                 Unknown                                                   

In [None]:
### all Table 1

all_t1 = final_df[[ 'hospitalization_id', 'admission_dttm', 'discharge_dttm', 'age_at_admission', 'discharge_category', 'sex_category','race_category', 'ethnicity_category','ICU_LOS']].drop_duplicates()
all_t1['Inpatient_LOS'] = (all_t1['discharge_dttm'] - all_t1['admission_dttm']).dt.total_seconds() / (24 * 3600)

if len(all_t1)>1:
    table3 = TableOne(all_t1, categorical=categorical_columns, nonnormal=non_categorical_columns, columns=categorical_columns+non_categorical_columns )

    table3.to_csv(f'{directory_path}/table1_all_t1_{pc.helper["site_name"]}.csv')
    print(table3)

                                                                           Missing           Overall
n                                                                                               2525
sex_category, n (%)              Female                                                  1129 (44.7)
                                 Male                                                    1396 (55.3)
race_category, n (%)             American Indian or Alaska Native                            4 (0.2)
                                 Asian                                                      98 (3.9)
                                 Black or African American                                896 (35.5)
                                 Native Hawaiian or Other Pacific Islander                   3 (0.1)
                                 Other                                                    465 (18.4)
                                 Unknown                                                   

#### Per hospital stats

In [None]:
# Initialize an empty list to store each hospital's data
data_list = []

# Iterate over unique hospital IDs as strings
for x in final_df['hospital_id'].astype(str).unique():
    # Calculate counts based on specific conditions
    eligible_event_count = final_df[(final_df['eligible_event'] == 1) & (final_df['hospital_id'].astype(str) == x)].shape[0]
    sat_flowsheet_delivery_flag_count = final_df[(final_df['sat_flowsheet_delivery_flag'] == 1) & (final_df['hospital_id'].astype(str) == x)].shape[0]
    SAT_modified_delivery_count = final_df[(final_df['SAT_modified_delivery'] == 1) & (final_df['hospital_id'].astype(str) == x)].shape[0]
    SAT_EHR_delivery_count = final_df[(final_df['SAT_EHR_delivery'] == 1) & (final_df['hospital_id'].astype(str) == x)].shape[0]

    SAT_EHR_uni_pats = final_df[(final_df['SAT_EHR_delivery'] == 1) & (final_df['hospital_id'].astype(str) == x)]['patient_id'].nunique()
    SAT_EHR_hosp = final_df[(final_df['SAT_EHR_delivery'] == 1) & (final_df['hospital_id'].astype(str) == x)]['hospitalization_id'].nunique()

    SAT_modified_uni_pats = final_df[(final_df['SAT_modified_delivery'] == 1) & (final_df['hospital_id'].astype(str) == x)]['patient_id'].nunique()
    SAT_modified_hosp = final_df[(final_df['SAT_modified_delivery'] == 1) & (final_df['hospital_id'].astype(str) == x)]['hospitalization_id'].nunique()

    SAT_EHR_modified_uni_pats = final_df[((final_df['SAT_EHR_delivery'] == 1) | (final_df['SAT_modified_delivery'] == 1)) & (final_df['hospital_id'].astype(str) == x)]['patient_id'].nunique()
    SAT_EHR_modified_hosp = final_df[((final_df['SAT_EHR_delivery'] == 1) | (final_df['SAT_modified_delivery'] == 1)) & (final_df['hospital_id'].astype(str) == x)]['hospitalization_id'].nunique()

    SAT_flowsheet_uni_pats = final_df[(final_df['sat_flowsheet_delivery_flag'] == 1) & (final_df['hospital_id'].astype(str) == x)]['patient_id'].nunique()
    SAT_flowsheet_hosp = final_df[(final_df['sat_flowsheet_delivery_flag'] == 1) & (final_df['hospital_id'].astype(str) == x)]['hospitalization_id'].nunique()

    # Safeguard against division by zero
    if eligible_event_count > 0:
        percent_sat_flowsheet_delivery_flag = (sat_flowsheet_delivery_flag_count / eligible_event_count) * 100
        percent_SAT_modified_delivery = (SAT_modified_delivery_count / eligible_event_count) * 100
        percent_SAT_EHR_delivery = (SAT_EHR_delivery_count / eligible_event_count) * 100
    else:
        percent_sat_flowsheet_delivery_flag = 0
        percent_SAT_modified_delivery = 0
        percent_SAT_EHR_delivery = 0

    # Append the data for this hospital to the list
    data_list.append({
        'Site_Name_Hosp': pc.helper["site_name"] + '_' + x,  
        '%_of_SAT_flowsheet_delivery_flag': percent_sat_flowsheet_delivery_flag,
        '%_of_SAT_modified_delivery': percent_SAT_modified_delivery,
        '%_of_SAT_EHR_delivery': percent_SAT_EHR_delivery,
        'eligible_event_count': eligible_event_count,
        'sat_flowsheet_delivery_flag_count': sat_flowsheet_delivery_flag_count,
        'SAT_modified_delivery_count': SAT_modified_delivery_count,
        'SAT_EHR_delivery_count': SAT_EHR_delivery_count,

        'SAT_EHR_unique_patients': SAT_EHR_uni_pats,
        'SAT_EHR_unique_hospitalizations': SAT_EHR_hosp,
        'SAT_modified_unique_patients': SAT_modified_uni_pats,
        'SAT_modified_unique_hospitalizations': SAT_modified_hosp,
        'SAT_EHR_modified_unique_patients': SAT_EHR_modified_uni_pats,
        'SAT_EHR_modified_unique_hospitalizations': SAT_EHR_modified_hosp,
        'SAT_flowsheet_unique_patients': SAT_flowsheet_uni_pats, 
        'SAT_flowsheet_unique_hospitalizations': SAT_flowsheet_hosp   
    })

# Create a DataFrame from the list
final_data_df = pd.DataFrame(data_list)
final_data_df.to_csv(f'{directory_path}/sat_stats_{pc.helper["site_name"]}.csv',index=False)
# Display the final DataFrame
final_data_df.T

Unnamed: 0,0
Site_Name_Hosp,RUSH_RUMC
%_of_SAT_flowsheet_delivery_flag,53.481052
%_of_SAT_modified_delivery,69.683998
%_of_SAT_EHR_delivery,36.598263
eligible_event_count,7943
sat_flowsheet_delivery_flag_count,4248
SAT_modified_delivery_count,5535
SAT_EHR_delivery_count,2907
SAT_EHR_unique_patients,1851
SAT_EHR_unique_hospitalizations,1908


# New Table 1 Code

In [None]:
# Aggregate functions
def documented(series):
    return "Documented" if series.notna().any() else "Not Documented"

def age_bucket(mean_age):
    if pd.isna(mean_age):
        return None
    elif mean_age < 40:
        return "18-39"
    elif mean_age < 60:
        return "40-59"
    elif mean_age < 80:
        return "60-79"
    else:
        return "80+"

# Clean 'language_name' to only "English", "Spanish", or "Other"
def categorize_language(lang):
    if re.search(r'english', str(lang), re.IGNORECASE):
        return 'English'
    elif re.search(r'spanish', str(lang), re.IGNORECASE):
        return 'Spanish'
    else:
        return 'Other'

t1_col = [
    "patient_id",
    "hospitalization_id",
    "hosp_id_day_key",
    "age_at_admission",    "sex_category",    "race_category",    "ethnicity_category",    "language_name",    "weight_kg",
    "height_cm", "cisatracurium",    "vecuronium",    "rocuronium",    "dobutamine",    "dopamine",    "epinephrine",
    "fentanyl",    "hydromorphone",    "isoproterenol",    "lorazepam",    "midazolam",    "milrinone",    "morphine",
    "norepinephrine",    "phenylephrine",    "propofol",    "vasopressin",    "angiotensin",     "rass", "gcs_total"
]

medication_columns = [
    "rass", "gcs_total", "cisatracurium", "vecuronium", "rocuronium",
    "dobutamine", "dopamine", "epinephrine", "fentanyl", "hydromorphone",
    "isoproterenol", "lorazepam", "midazolam", "milrinone", "morphine",
    "norepinephrine", "phenylephrine", "propofol", "vasopressin", "angiotensin"
]

demographic_columns = ["sex_category", "race_category", "ethnicity_category", "language_name"]

continuous_cols = [
    "rass", "gcs_total", "cisatracurium", "vecuronium", "rocuronium",
    "dobutamine", "dopamine", "epinephrine", "fentanyl", "hydromorphone",
    "isoproterenol", "lorazepam", "midazolam", "milrinone", "morphine",
    "norepinephrine", "phenylephrine", "propofol", "vasopressin",
    "angiotensin", "bmi"
]

drugs = [
    "cisatracurium", "vecuronium", "rocuronium",
    "dobutamine", "dopamine", "epinephrine", "fentanyl", "hydromorphone",
    "isoproterenol", "lorazepam", "midazolam", "milrinone", "morphine",
    "norepinephrine", "phenylephrine", "propofol", "vasopressin", "angiotensin"
]

# Apply the transformation
t1_cohort[drugs] = t1_cohort[drugs].applymap(lambda x: x if x > 0 else np.nan)

t1_cohort['bmi'] = t1_cohort['weight_kg'] / ((t1_cohort['height_cm'] / 100) ** 2)

# Apply the function to 'language_name'
t1_cohort['language_name'] = t1_cohort['language_name'].apply(categorize_language)
t1_cohort['rass'] = t1_cohort['rass'].astype(float)

#### Table 1 By ID for Categorical

In [None]:
for x in ['hospitalization_id', 'patient_id']:
    t1_summary = t1_cohort.groupby(x).agg(
        {
            "age_at_admission": "mean",
            **{col: documented for col in medication_columns},
            **{col: "first" for col in demographic_columns}
        }
    )

    t1_summary["age_bucket"] = t1_summary["age_at_admission"].apply(age_bucket)
    t1_summary = t1_summary.drop(columns=["age_at_admission"])
    t1_summary = t1_summary.reset_index()

    summary_df = t1code.manual_categorical_tableone(
        t1_summary, 
        medication_columns + demographic_columns + ["age_bucket"]
    )

    if x == 'hospitalization_id':
        summary_df.to_csv(f"{directory_path}/table1_hospitalization_id_categorical.csv", index=False)
    else:
        summary_df.to_csv(f"{directory_path}/table1_patient_id_categorical.csv", index=False)

#### Table 1 By ID for Continuous

In [None]:
hospitalization_summary = None
patient_summary = None

hosp = (
    t1_cohort
      .groupby("hospitalization_id")
      .agg(
        {
          **{c: "median" for c in continuous_cols}
        }
      )
      .reset_index()
)

patient = (
    t1_cohort
      .groupby("patient_id")
      .agg(
        {
          **{c: "median" for c in continuous_cols}
        }
      )
      .reset_index()
)

# Build for hospitalization level and patient level
hospitalization_summary = t1code.manual_tableone(hosp, continuous_cols)
patient_summary = t1code.manual_tableone(patient, continuous_cols)

hospitalization_summary.to_csv(f"{directory_path}/table1_hospitalization_id_continuous.csv", index=False)
patient_summary.to_csv(f"{directory_path}/table1_patient_id_continuous.csv", index=False)

#### Table 1 By Days for Categorical

In [None]:
for x in tqdm(
    [
        "eligible_event",
        "SAT_EHR_delivery",
        "SAT_modified_delivery"
    ]
):
    ids_to_use = final_df[final_df[x]==1].hosp_id_day_key.unique()
    # Groupby aggregation by hospitalization_id
    t1_summary = t1_cohort[t1_cohort['hosp_id_day_key'].isin(ids_to_use)].groupby("hosp_id_day_key").agg(
        {
            "age_at_admission": "mean",
            **{col: documented for col in medication_columns},
            **{col: "first" for col in demographic_columns},
        }
    )

    # Apply age bucketing
    t1_summary["age_bucket"] = t1_summary["age_at_admission"].apply(age_bucket)

    # Drop raw age if you don't need it
    t1_summary = t1_summary.drop(columns=["age_at_admission"])

    # Reset index if needed
    t1_summary = t1_summary.reset_index()

    summary_df = t1code.manual_categorical_tableone(
        t1_summary, 
        medication_columns + demographic_columns + ["age_bucket"]
    )
    summary_df.to_csv(f"{directory_path}/table1_{x}_categorical.csv", index=False)


100%|██████████| 3/3 [00:29<00:00,  9.98s/it]


In [None]:
for x in tqdm(
    [
        "eligible_event",
        "SAT_EHR_delivery",
        "SAT_modified_delivery",
    ]
):
    # --- filter to only the days in this subcohort
    ids = final_df.loc[final_df[x] == 1, "hosp_id_day_key"].unique()
    sub = t1_cohort[t1_cohort["hosp_id_day_key"].isin(ids)]

    # --- 1) Day-level medians + flags + demographics
    day_summary = (
        sub.groupby("hosp_id_day_key")
        .agg({**{c: "median" for c in continuous_cols}})
        .reset_index()
    )
    summary_df = t1code.manual_tableone(day_summary, continuous_cols)
    summary_df.to_csv(f"{directory_path}/table1_{x}_continuous.csv", index=False)

100%|██████████| 3/3 [00:01<00:00,  2.21it/s]


#### Thank You!!! keep latest timestamp files and upload to box :)