In [None]:
import pandas as pd
import numpy as np
import duckdb
import pyCLIF as pc
from tqdm import tqdm

In [2]:
# Load the study cohort table from the project's output directory (path is relative
# to the notebook's working directory).
cohort = pd.read_csv('../output/study_cohort.csv')

In [3]:


# # Ensure 'time_line' is in datetime format
# cohort['time_line'] = pd.to_datetime(cohort['time_line'])
# cohort['date'] = cohort['time_line'].dt.date

# # Sort the DataFrame by 'hospitalization_id' and 'date' to prepare for day numbering
# cohort = cohort.sort_values(['hospitalization_id', 'date'])

# # Assign day numbers to each 'hospitalization_id'
# cohort['day_number'] = cohort.groupby('hospitalization_id')['date'].rank(method='dense').astype(int)

# # Create the combo_key by combining 'hospitalization_id' and 'day_number'
# cohort['hosp_id_day_key'] = cohort['hospitalization_id'].astype(str) + '_day_' + cohort['day_number'].astype(str)

# Parse timestamps up front: 'time_line' arrives from the CSV as text, and every
# sort / forward-fill below must follow chronological order rather than the
# lexicographic order of the raw strings.
cohort['time_line'] = pd.to_datetime(cohort['time_line'])

# Order rows chronologically within each hospitalization; the forward-fills
# below carry values "forward" in row order, so this ordering is load-bearing.
cohort = cohort.sort_values(by=['hospitalization_id', 'time_line']).reset_index(drop=True)

# Forward-fill 'rass' within each encounter-day ('hosp_id_day_key'), so a value
# is never carried across a day boundary.
cohort['rass_ffill'] = cohort.groupby('hosp_id_day_key')['rass'].ffill()

# Forward-fill the medication columns across the whole hospitalization.
sedation_meds = ['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']
cohort[sedation_meds] = cohort.groupby('hospitalization_id')[sedation_meds].ffill()

# Row-wise minimum across the medication columns, ignoring missing values.
# The result is NaN only when every medication is missing, and it can be 0:
# nothing here forces it to be positive.
cohort['min_sedation_dose'] = cohort[sedation_meds].min(axis=1, skipna=True)

# Re-sort by encounter-day key, then time. The key is sorted as-is, so if it is a
# string the day order within a hospitalization is lexicographic, not calendar
# order; code that needs chronological order across days must sort by 'time_line'.
cohort = cohort.sort_values(by=['hosp_id_day_key', 'time_line']).reset_index(drop=True)

In [None]:
# Display the column names available after the preprocessing cell.
cohort.columns

In [5]:
# def process_cohort_12_4(df):
    # # Step 1: Fill forward the 'device_category' column within each 'hosp_id_day_key' group
    # df['device_category_ffill'] = df.groupby('hosp_id_day_key')['device_category'].ffill()
    
    # # Step 2: Filter rows where 'time_line' is between 12 AM and 4 AM 

    # mask_time = (df['time_line'].dt.hour >= 0) & (df['time_line'].dt.hour < 4)
    # df_filtered = df[mask_time]
    
    # # Step 3: Group by 'hosp_id_day_key'
    # grouped = df_filtered.groupby('hosp_id_day_key')
    
    # # Step 4: Define function to check if both conditions are met within each group
    # def conditions_met(group):
    #     has_ivm = (group['device_category_ffill'].str.lower() == 'imv').any()
    #     has_sedation = group['min_sedation_dose'].notna().any()
    #     return has_ivm and has_sedation
    
    # # Step 5: Apply the function to each group and filter groups that meet the conditions
    # filtered_groups = grouped.filter(lambda x: conditions_met(x))
    
    # # Step 6: Get the list of unique 'hosp_id_day_key's that meet the conditions
    # result = filtered_groups['hosp_id_day_key'].unique()
    
    # return result

# # Example usage
# result = process_cohort_12_4(cohort)

# print('encounter_days with imv and sedation on from 12am to 4am : ',len(result))

In [None]:

def process_cohort(df):
    """Return encounter-day keys with at least 4 hours of IMV plus sedation overnight.

    For every (hospitalization, ``day_number``) pair a time window is built:

    * start: 8 PM on the earliest date of the rows with ``day_number - 1``; when
      no such rows exist, 8 PM on the calendar day before the current day's
      earliest date;
    * end: 8 AM on the current day's earliest date (both ends inclusive).

    Each row inside the window is credited with the time until the next row in
    the window (the last row is credited zero). The credits of rows where
    ``device_category_ffill`` equals ``'imv'`` (case-insensitive) and
    ``min_sedation_dose`` is not null are summed, and the day qualifies when the
    sum reaches 4 hours.

    Side effects: ``df`` is modified in place. ``time_line`` is converted to
    datetime and a ``device_category_ffill`` column is added. Later cells do
    ``pd.Timedelta`` arithmetic on ``time_line``, so the conversion is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'hospitalization_id', 'hosp_id_day_key', 'day_number',
        'time_line', 'device_category' and 'min_sedation_dose'.

    Returns
    -------
    list
        The 'hosp_id_day_key' of every qualifying day, ordered by
        hospitalization and then by first appearance of the day in time order.
    """
    # Parse timestamps before anything order-dependent happens.
    df['time_line'] = pd.to_datetime(df['time_line'])

    # Forward-fill the device chronologically within each hospitalization.
    # The caller's row order cannot be trusted for this: a frame sorted by a
    # string day key places "..._day_10" before "..._day_2", which would carry a
    # later day's device back into an earlier day. The fill is computed on a
    # positionally indexed view and written back by position, so it does not
    # depend on the caller's index either.
    device_view = df[['hospitalization_id', 'time_line', 'device_category']].reset_index(drop=True)
    device_filled = (
        device_view
        .sort_values(['hospitalization_id', 'time_line'])
        .groupby('hospitalization_id')['device_category']
        .ffill()
    )
    df['device_category_ffill'] = device_filled.reindex(device_view.index).to_numpy()

    result = []
    required_duration = pd.Timedelta(hours=4)
    zero_duration = pd.Timedelta(seconds=0)

    for _, group in tqdm(df.groupby('hospitalization_id'), desc='Processing Hospitalizations'):
        # Stable sort keeps rows with identical timestamps in a deterministic order.
        group = group.sort_values('time_line', kind='mergesort')

        for day_num in group['day_number'].unique():
            current_day_data = group[group['day_number'] == day_num]
            current_day_keys = current_day_data['hosp_id_day_key'].unique()
            if len(current_day_keys) == 0:
                continue
            current_day_key = current_day_keys[0]

            current_day_date = current_day_data['time_line'].dt.normalize().min()

            # Window start: 8 PM on the previous day_number's date when that day
            # has rows, otherwise 8 PM on the calendar day before the current date.
            prev_day_data = group[group['day_number'] == (day_num - 1)]
            if not prev_day_data.empty:
                prev_day_date = prev_day_data['time_line'].dt.normalize().min()
                start_time = prev_day_date + pd.Timedelta(hours=20)
            else:
                start_time = current_day_date - pd.Timedelta(days=1) + pd.Timedelta(hours=20)

            # Window end: 8 AM of the current day.
            end_time = current_day_date + pd.Timedelta(hours=8)

            in_window = (group['time_line'] >= start_time) & (group['time_line'] <= end_time)
            window = group[in_window]
            if window.empty:
                continue

            both_conditions_met = (
                window['device_category_ffill'].str.lower() == 'imv'
            ) & window['min_sedation_dose'].notna()
            if not both_conditions_met.any():
                continue

            # Each row lasts until the next row in the window; the final row has
            # no successor and contributes nothing.
            duration = (window['time_line'].shift(-1) - window['time_line']).fillna(zero_duration)
            total_duration = duration[both_conditions_met].sum()

            if total_duration >= required_duration:
                result.append(current_day_key)

    return result

# Screen the whole cohort and report how many encounter-days qualify.
# Note: process_cohort also modifies `cohort` in place (see its body).
result = process_cohort(cohort)
n_qualifying_days = len(result)
print('Encounter days with at least 4 hours of IMV and sedation from 8 PM to 8 AM:', n_qualifying_days)



In [7]:
# Flag every row that belongs to a qualifying encounter-day: 1 if its key is in
# `result`, otherwise 0.
cohort['on_vent_and_sedation'] = cohort['hosp_id_day_key'].isin(result).astype('int64')

# # Ensure the data is sorted by 'hosp_id_day_key' and 'time_line'
# cohort = cohort.sort_values(by=['hosp_id_day_key', 'time_line']).reset_index(drop=True)

# # Forward-fill the 'rass' values within each 'hosp_id_day_key'
# cohort['rass_ffill'] = cohort.groupby('hosp_id_day_key')['rass'].ffill()

# # Fill forward the meds by hospitalization columns by 'hosp_id'
# cohort[['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']] = cohort.groupby('hospitalization_id')[
#     ['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']
# ].ffill()

In [None]:
# Display the column names again, now including the 'on_vent_and_sedation' flag.
cohort.columns

In [None]:
# Work on a copy so the flags added below do not touch `cohort`.
df = cohort.copy()

# Step 1: rank_sedation
# Within each flagged encounter-day, number the rows whose minimum sedation dose
# is exactly 0 as 1, 2, 3, ... in row order. Every other row stays NaN.
df['rank_sedation'] = np.nan

flagged_days = df.loc[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key')
for _, day_rows in tqdm(flagged_days, desc='Processing hosp_id_day_keys'):
    is_zero_dose = day_rows['min_sedation_dose'].eq(0)
    running_count = is_zero_dose.cumsum()
    # Keep the running count only on the zero-dose rows themselves.
    df.loc[day_rows.index, 'rank_sedation'] = running_count.where(is_zero_dose)


In [None]:
# Step 2: for each ranked zero-dose row, check whether every medication stays at
# zero (or missing) over the next 1 hr, 30 min and 1.5 hr, and number the rows
# that pass each check 1, 2, 3, ... within their encounter-day.
med_columns = ['fentanyl', 'propofol', 'lorazepam', 'midazolam', 'hydromorphone', 'morphine']

# Output column -> length of its look-ahead window. Insertion order fixes the
# order in which the columns are added to the frame.
forward_windows = {
    'all_meds_0_dose_1hr_forward': pd.Timedelta(hours=1),
    'all_meds_0_dose_30min_forward': pd.Timedelta(minutes=30),
    'all_meds_0_dose_1_5hr_forward': pd.Timedelta(hours=1, minutes=30),
}
for flag_column in forward_windows:
    df[flag_column] = np.nan

flagged_days = df[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key')
for _, day_rows in tqdm(flagged_days, desc='Processing hosp_id_day_keys for meds check'):
    day_sorted = day_rows.sort_values('time_line')
    times = day_sorted['time_line']
    meds = day_sorted[med_columns]

    # True for a row when each medication is either missing or exactly zero.
    row_is_clear = (meds.isna() | meds.eq(0)).all(axis=1)

    # One independent counter per window length, restarted for every encounter-day.
    next_rank = dict.fromkeys(forward_windows, 1)

    for row_label, anchor_time, sedation_rank in zip(day_sorted.index, times, day_sorted['rank_sedation']):
        if np.isnan(sedation_rank):
            continue
        for flag_column, span in forward_windows.items():
            # The window covers rows of the same encounter-day only, from the
            # anchor row through anchor + span inclusive.
            in_window = (times >= anchor_time) & (times <= anchor_time + span)
            # An empty selection counts as "all clear".
            if row_is_clear[in_window].all():
                df.at[row_label, flag_column] = next_rank[flag_column]
                next_rank[flag_column] += 1

In [None]:
# Step 3: net change in rass_ffill per look-ahead window.
# Start every output column as NaN; values are filled in only for rows that
# carry a rank in the matching 'all_meds_0_dose_*_forward' column.
for improvement_column in (
    'rass_net_improvement_30min',
    'rass_net_improvement_1hr',
    'rass_net_improvement_1_5hr',
):
    df[improvement_column] = np.nan

def calculate_rass_net_improvement(group):
    """Add the RASS change to the next observation, per look-ahead window.

    For each row the change is ``rass_ffill`` of the next row (in time order)
    minus ``rass_ffill`` of the row itself. It is written to
    ``rass_net_improvement_<window>`` only when both hold:

    * the next row's ``time_line`` is no later than this row's ``time_line``
      plus the window length (30 min, 1 hr, 1.5 hr), and
    * the row has a non-null rank in the matching
      ``all_meds_0_dose_<window>_forward`` column.

    Otherwise the value is NaN. The last row has no successor, so it is always
    NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one encounter-day with 'time_line' (datetime), 'rass_ffill' and
        the three 'all_meds_0_dose_*_forward' columns.

    Returns
    -------
    pandas.DataFrame
        A time-sorted copy of ``group`` with the three
        'rass_net_improvement_*' columns set.
    """
    # Stable sort so rows sharing a timestamp keep their incoming order.
    group = group.sort_values('time_line', kind='mergesort')
    time_lines = group['time_line']
    rass_values = group['rass_ffill']

    next_time = time_lines.shift(-1)
    rass_change = rass_values.shift(-1) - rass_values

    windows = (
        ('all_meds_0_dose_30min_forward', 'rass_net_improvement_30min', pd.Timedelta(minutes=30)),
        ('all_meds_0_dose_1hr_forward', 'rass_net_improvement_1hr', pd.Timedelta(hours=1)),
        ('all_meds_0_dose_1_5hr_forward', 'rass_net_improvement_1_5hr', pd.Timedelta(hours=1, minutes=30)),
    )
    for rank_column, output_column, span in windows:
        # The next observation must fall inside the window. Comparing
        # `time_lines + span` against `time_lines` itself is always true and
        # would make the three windows indistinguishable.
        next_in_window = next_time <= (time_lines + span)
        has_rank = group[rank_column].notna()
        group[output_column] = rass_change.where(next_in_window & has_rank)

    return group

# Compute the RASS changes for the flagged encounter-days only and write them
# back by index. Unlike groupby(...).apply(...), this keeps every row of `df`
# (including rows whose key is missing), leaves the index and row order alone,
# and keeps 'hosp_id_day_key' as a regular column on every pandas version.
rass_improvement_columns = [
    'rass_net_improvement_30min',
    'rass_net_improvement_1hr',
    'rass_net_improvement_1_5hr',
]
for _, day_rows in df[df['on_vent_and_sedation'] == 1].groupby('hosp_id_day_key'):
    # Pass a copy: the helper adds columns to the frame it receives.
    scored_rows = calculate_rass_net_improvement(day_rows.copy())
    df.loc[scored_rows.index, rass_improvement_columns] = scored_rows[rass_improvement_columns]


In [12]:
# Write the flagged frame to the notebook's working directory, without the index.
df.to_csv('sat_with_flags_jc.csv',index=False)

In [13]:
#cohort.to_csv('temp.csv',index=False)