#### Import packages

In [None]:
import pandas as pd
import numpy as np
import time
import pickle
import random
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
from matplotlib.pyplot import figure

#### Read in files

In [None]:
# Load the raw lab-results extract (CSV) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, and the raw
# string mixes doubled and single backslashes - confirm it resolves on the
# machine running the notebook.
labFilePath = r'C:\\Users\\momenzadeha\\Documents\hypogly_pred\EHR_extracts\meyer_2306_labs_with_order_time_2023_03_06.csv'
originalLabDf = pd.read_csv(labFilePath)

#### Pre-processing

In [None]:
# Clean duplicate lab names: map alternate spellings onto one canonical name.
replacement_dict = {
    'GLUCOSE (POC)': 'GLUCOSE-POC',
    'SERUM CREATININE': 'CREATININE',
    'SERUM ALBUMIN': 'ALBUMIN',
}
# Restrict the replacement to COMPONENT_NAME (nested-dict form of replace).
# A bare dict would be matched against every column of the extract, which is
# slower and could rewrite unrelated columns that happen to hold these strings.
originalLabDf = originalLabDf.replace({'COMPONENT_NAME': replacement_dict})

In [None]:
# Labs of interest; every other component is discarded.
lab_list = [
    'GLUCOSE-POC',
    'HEMOGLOBIN A1C',
    'C-REACTIVE PROTEIN',
    'CREATININE',
    'EGFR NON-AFRICAN AMERICAN',
    'EGFR AFRICAN AMERICAN',
    'GLUCOSE RANDOM',
    'AST (SGOT)',
    'ALT (SGPT)',
    'ALBUMIN',
    'L-LACTATE',
    'EGFR CREATININE CKD-EPI 2021',
]
# Keep only the rows whose component name is one of the labs above.
targLabDf = originalLabDf.loc[originalLabDf['COMPONENT_NAME'].isin(lab_list)]

In [None]:
# Function to format the lab DataFrame and remove missing lab entries
def format_lab_df(originalLabDf):
    """
    Return a cleaned copy of a raw lab-results DataFrame.

    Steps performed:
    - Rename 'PAT_ENC_CSN_ID' to 'CSN'.
    - Keep only MRN, CSN, CONTACT_DATE, RESULT_TIME, COMPONENT_NAME and ORD_VALUE.
    - Parse 'CONTACT_DATE' and 'RESULT_TIME' as datetimes.
    - Map known free-text 'ORD_VALUE' entries (e.g. 'BS 175') to their numeric text.
    - Convert 'ORD_VALUE' to numeric; anything unparseable becomes NaN.
    - Drop rows whose 'ORD_VALUE' is NaN.

    The input DataFrame is never modified.

    Parameters:
    originalLabDf (pd.DataFrame): The original lab data DataFrame. Must contain the
                                  columns MRN, PAT_ENC_CSN_ID, CONTACT_DATE,
                                  RESULT_TIME, COMPONENT_NAME and ORD_VALUE.

    Returns:
    pd.DataFrame: A cleaned and formatted lab DataFrame.
    """
    # Known free-text entries and the numeric text each one stands for.
    ord_value_corrections = {
        '154bs': '154',
        'BS 175': '175',
        'bs120': '120',
        '168 mg/dL': '168',
        '159=bs': '159',
        'bs 120': '120',
        '`185': '185',
        '250 mg/dL': '250',
        'BG 171 mg/dL': '171',
        '240 mg/dL': '240',
        '130  mg/dL': '130',
        'BS 145': '145',
        'bs-134': '134',
    }
    relevant_columns = ['MRN', 'CSN', 'CONTACT_DATE', 'RESULT_TIME', 'COMPONENT_NAME', 'ORD_VALUE']

    # rename() returns a new frame, and the explicit copy() after the column
    # selection makes the assignments below write to an independent frame
    # rather than to a slice of the caller's data.
    labDf = originalLabDf.rename(columns={'PAT_ENC_CSN_ID': 'CSN'})[relevant_columns].copy()

    # Convert the date columns to datetime for time-based operations.
    for column in ('CONTACT_DATE', 'RESULT_TIME'):
        labDf[column] = pd.to_datetime(labDf[column])

    # Assign the result back instead of calling replace(inplace=True) on the
    # column: an in-place call on a selected column is chained assignment and
    # is not guaranteed to reach the parent frame (it does not under pandas
    # Copy-on-Write), which would turn these entries into NaN and drop them.
    corrected = labDf['ORD_VALUE'].replace(ord_value_corrections)

    # Non-convertible values become NaN and are then dropped as invalid entries.
    labDf['ORD_VALUE'] = pd.to_numeric(corrected, errors='coerce')
    return labDf.dropna(subset=['ORD_VALUE'])

In [None]:
# Clean the filtered lab rows (column selection, datetime parsing, numeric ORD_VALUE).
labDf = format_lab_df(targLabDf)
# Work on an independent copy so later steps cannot alter labDf.
filteredLabDf = labDf.copy()

#### Calculate differences between lab times and random BG time

In [None]:
# CSNs to merge on, exported from the meds dataset.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load pickles from a trusted source.
# NOTE(review): despite its name, `list_for_labs` is used below as a DataFrame
# (it is passed through .rename(columns=...) and column selection).
with open(r'/Users/momenzadeha/Documents/mixed effects LR/LMM/MELRoutputCSNs.pkl', 'rb') as handle:
    list_for_labs = pickle.load(handle)

In [None]:
# Reduce the event table to one row per (CSN, random BG time): the event
# timestamp column 'RESULT_TIME' is renamed to 'rand_BG_time' so it does not
# collide with the lab 'RESULT_TIME' column during the merge.
list_for_labs = (
    list_for_labs
    .rename(columns={'RESULT_TIME': 'rand_BG_time'})
    .loc[:, ['CSN', 'rand_BG_time']]
    .drop_duplicates()
)

# Inner join on CSN: every lab row is paired with every random BG time of the
# same encounter; encounters missing from either side are dropped.
merge = list_for_labs.merge(filteredLabDf, on='CSN', how='inner')

In [None]:
def create_time_difference_column_in_lab_before_event_df(merge):
    """
    Add a 'TIME_DIFF' column holding 'rand_BG_time' - 'RESULT_TIME', floored to whole minutes.

    The difference is positive when the lab result precedes the random blood glucose
    measurement. The computation is vectorised and the input DataFrame is left
    untouched; a new DataFrame is returned.

    NOTE(review): a function with this same name is defined again in the next
    notebook cell; that later definition is the one in effect - consider
    deleting one of the two.

    Parameters:
    merge (pd.DataFrame): A DataFrame containing datetime columns 'rand_BG_time' and
                          'RESULT_TIME', plus 'CSN', 'COMPONENT_NAME' and 'ORD_VALUE'.

    Returns:
    pd.DataFrame: A new DataFrame with the columns CSN, rand_BG_time, RESULT_TIME,
                  TIME_DIFF, COMPONENT_NAME and ORD_VALUE.
    """
    output_columns = ['CSN', 'rand_BG_time', 'RESULT_TIME', 'TIME_DIFF', 'COMPONENT_NAME', 'ORD_VALUE']

    # Column-wise subtraction, then floor to minute resolution (NaT stays NaT).
    time_diff = (merge['rand_BG_time'] - merge['RESULT_TIME']).dt.floor('min')

    # assign() builds a new frame, so the caller's DataFrame does not gain a column.
    return merge.assign(TIME_DIFF=time_diff)[output_columns]

In [None]:
def create_time_difference_column_in_lab_before_event_df(merge):
    """
    Compute, for every row, how long before the random BG time the lab resulted.

    A 'TIME_DIFF' column is produced as 'rand_BG_time' - 'RESULT_TIME', floored to
    whole minutes (positive when the lab result came first). The subtraction is
    done column-wise and the caller's DataFrame is not modified.

    NOTE(review): a function with this same name is also defined in the preceding
    notebook cell; this later definition is the one in effect - consider
    deleting one of the two.

    Parameters:
    merge (pd.DataFrame): A pandas DataFrame containing the datetime columns
                          'rand_BG_time' and 'RESULT_TIME', plus 'CSN',
                          'COMPONENT_NAME' and 'ORD_VALUE'.

    Returns:
    pd.DataFrame: A new DataFrame restricted to CSN, rand_BG_time, RESULT_TIME,
                  TIME_DIFF, COMPONENT_NAME and ORD_VALUE.
    """
    output_columns = ['CSN', 'rand_BG_time', 'RESULT_TIME', 'TIME_DIFF', 'COMPONENT_NAME', 'ORD_VALUE']

    # Vectorised difference at minute resolution; missing timestamps give NaT.
    time_diff = (merge['rand_BG_time'] - merge['RESULT_TIME']).dt.floor('min')

    # assign() returns a new frame instead of writing a column into the input.
    return merge.assign(TIME_DIFF=time_diff)[output_columns]

In [None]:
# Add the TIME_DIFF column (rand_BG_time - RESULT_TIME) to the merged lab/event rows.
merge_diff = create_time_difference_column_in_lab_before_event_df(merge)

#### Filter for labs only in lookback window

In [None]:
def establish_random_delay_before_lab_test_for_each_csn(csnRts, delayTimeLength):
    """
    Build a lookup that maps every (CSN, RESULT_TIME) pair to a delay.

    Despite the function name, no randomness is involved here: every pair
    receives the same value, `delayTimeLength`.

    Parameters:
    csnRts (set): Unique (CSN, RESULT_TIME) tuples to use as dictionary keys.
    delayTimeLength (int): The delay (in hours) stored for every key.

    Returns:
    dict: {(CSN, RESULT_TIME): delayTimeLength} for each pair in `csnRts`.
    """
    # dict.fromkeys assigns the one shared delay value to every key.
    return dict.fromkeys(csnRts, delayTimeLength)

In [None]:
def divide_times_into_segments_for_each_csn_result_time(csnRt, timeDiffs, randomDelayDict, numSections, timeWindowHours):
    """
    Assign each time difference of one (CSN, RESULT_TIME) group to a segment index.

    The lookback window starts `randomDelayDict[csnRt]` hours before the event and
    spans `timeWindowHours` hours; it is cut into `numSections` equal parts. The
    returned index is 0 for differences shorter than the delay, 1..numSections
    for differences inside the window, and numSections + 1 for differences at or
    beyond the far edge of the window.

    Parameters:
    csnRt (tuple): The (CSN, RESULT_TIME) key used to look up the delay.
    timeDiffs (iterable): Time differences; each one is converted to whole minutes.
    randomDelayDict (dict): Maps (CSN, RESULT_TIME) keys to a delay in hours.
    numSections (int): Number of equal segments the window is split into.
    timeWindowHours (int): Length of the window in hours.

    Returns:
    numpy.ndarray: One integer segment index per entry of `timeDiffs`.
    """
    # Near edge of the window: the delay, rounded to whole minutes.
    delay_minutes = int(np.round(randomDelayDict[csnRt] * 60))
    window_near = np.timedelta64(delay_minutes, 'm')

    # Far edge of the window: near edge plus the window length.
    window_far = window_near + np.timedelta64(timeWindowHours * 60, 'm')

    # numSections + 1 evenly spaced boundaries, truncated to whole minutes.
    boundaries = np.linspace(window_near.astype(int), window_far.astype(int), numSections + 1)
    cutoff_minutes = boundaries.astype(np.int64).astype('timedelta64[m]')

    # Express every time difference at minute resolution.
    minute_diffs = np.array([np.timedelta64(diff, 'm') for diff in timeDiffs])

    # side='right' places a difference equal to a boundary in the segment above it.
    return np.searchsorted(cutoff_minutes, minute_diffs, side='right')

In [None]:
def identify_time_section_window_from_lab_result(merge, numSections, randomDelayMaxLength, timeWindowHours):
    """
    Label every row with the lookback-window segment its TIME_DIFF falls into.

    Rows are grouped by (CSN, RESULT_TIME); each group's TIME_DIFF values are
    turned into segment indices by
    divide_times_into_segments_for_each_csn_result_time, using the per-group
    delay from establish_random_delay_before_lab_test_for_each_csn.

    Parameters:
    merge (DataFrame): Input rows with at least CSN, RESULT_TIME and TIME_DIFF columns.
    numSections (int): Number of segments the time window is divided into.
    randomDelayMaxLength (int): Delay in hours handed to the delay helper for every group.
    timeWindowHours (int): Total length (in hours) of the window to segment.

    Returns:
    DataFrame: A sorted, re-indexed copy of the input with an added 'SEGMENT' column.
    dict: The delay stored for each (CSN, RESULT_TIME) pair.
    """
    group_keys = ['CSN', 'RESULT_TIME']

    # sort_values returns a new frame, so the caller's DataFrame stays untouched.
    ordered = merge.sort_values(group_keys + ['TIME_DIFF']).reset_index(drop=True)

    # One delay entry per distinct (CSN, RESULT_TIME) pair.
    csn_result_pairs = set(zip(ordered['CSN'], ordered['RESULT_TIME']))
    randomDelayDict = establish_random_delay_before_lab_test_for_each_csn(
        csn_result_pairs, randomDelayMaxLength
    )

    def assign_segments(group_time_diffs):
        # Inside transform, the group's .name is its (CSN, RESULT_TIME) key.
        return divide_times_into_segments_for_each_csn_result_time(
            group_time_diffs.name, group_time_diffs, randomDelayDict, numSections, timeWindowHours
        )

    ordered['SEGMENT'] = ordered.groupby(group_keys)['TIME_DIFF'].transform(assign_segments)

    return ordered, randomDelayDict

In [None]:
# Lookback-window configuration
numSegments = 1           # number of equal segments the window is cut into
randomDelayMaxLength = 4  # delay (hours) between the window and the event; applied as a fixed value to every pair
timeWindowHours = 24      # length (hours) of the lookback window

# Seed Python's random module for reproducibility
random.seed(0)

# Tag each lab row with its window segment and collect the per-pair delays
merge_seg, randomDelayDict = identify_time_section_window_from_lab_result(
    merge_diff,
    numSections=numSegments,
    randomDelayMaxLength=randomDelayMaxLength,
    timeWindowHours=timeWindowHours,
)

In [None]:
def aggregate_labs_data(merge_seg, numSections=1):
    """
    Summarise the in-window lab values of every encounter into one wide row per CSN.

    Only rows whose SEGMENT lies in 1..numSections are used: segment 0 is closer
    to the event than the delay allows, and segments above numSections fall
    beyond the lookback window. For each (CSN, COMPONENT_NAME) pair the mean,
    min, max and most recent ('last', by RESULT_TIME) ORD_VALUE are computed and
    spread into columns named '<COMPONENT_NAME>_<stat>'.

    Parameters:
    merge_seg (pd.DataFrame): Rows with CSN, COMPONENT_NAME, RESULT_TIME,
                              ORD_VALUE and SEGMENT columns.
    numSections (int): Number of segments the window was divided into when
                       SEGMENT was computed. Defaults to 1, which keeps only
                       segment 1.

    Returns:
    pd.DataFrame: One row per CSN with a 'CSN' column followed by the mean, min,
                  max and last columns (in that order). A lab that was not
                  measured for a CSN is NaN.
    """
    keys = ['CSN', 'COMPONENT_NAME']

    # Keep only the rows that fall inside the lookback window.
    labs_red = merge_seg[merge_seg['SEGMENT'].between(1, numSections)]

    # Newest result first within each (CSN, lab), so 'first' yields the latest value.
    labs_red_sorted = labs_red.sort_values(by=keys + ['RESULT_TIME'], ascending=[True, True, False])

    def create_pivot(df, agg_func, suffix):
        # Collapse to one value per (CSN, lab), then spread the labs into columns.
        per_lab = df.groupby(keys).agg({'ORD_VALUE': agg_func})
        wide = pd.pivot_table(per_lab, values='ORD_VALUE', index='CSN', columns='COMPONENT_NAME')
        return wide.rename(columns=lambda col: f'{col}_{suffix}')

    pivots = [
        create_pivot(labs_red, 'mean', 'mean'),
        create_pivot(labs_red, 'min', 'min'),
        create_pivot(labs_red, 'max', 'max'),
        create_pivot(labs_red_sorted, 'first', 'last'),
    ]

    # Every pivot is indexed by the same set of CSNs, so the inner joins lose nothing.
    labs_merged = pivots[0]
    for piv in pivots[1:]:
        labs_merged = labs_merged.join(piv, how='inner')

    # Return CSN as a regular column and drop the leftover column-axis label.
    return labs_merged.reset_index().rename_axis(columns=None)

In [None]:
# Build the per-CSN table of mean/min/max/last lab values from the segmented rows.
labs_merged = aggregate_labs_data(merge_seg)