# WORK IN PROGRESS: Convert EHR data to the format used in this repo

In [None]:
# connect to EMAP
# retrieve bed moves
# identify moment of exit from ED/SDEC
# set up snapshot datetimes to sample
# sample ED visits at those times

## DB connection script

In [None]:
# An example of how to set up database connection (NOT RUN)
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def get_credentials():
    """
    Read the database connection settings from the '../../secret-emap' file.

    The first five lines of the file are read in order and stripped of
    surrounding whitespace: username, password, host, database name, port.
    Returns them as a 5-tuple of strings (the port is not converted to int).
    A missing line yields an empty string for that field.
    """
    field_count = 5
    with open('../../secret-emap', 'r') as secret_file:
        fields = [secret_file.readline().strip() for _ in range(field_count)]
    return tuple(fields)

# Get credentials from secret file
username, password, database_host, database_name, database_port = get_credentials()

# Percent-encode the user name and password before placing them in the URL:
# characters such as '@', ':' or '/' would otherwise be read as URL delimiters
# and corrupt the connection string. safe="" makes quote() encode '/' as well.
from urllib.parse import quote

# Database connection URL
DATABASE_URL = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{database_host}:{database_port}/{database_name}"
)

# Create engine
engine = create_engine(DATABASE_URL)

## Retrieve the data

In [None]:
# function to mask the encounter numbers
import hashlib

def hash_csn(df, salt="your_fixed_salt_here", column='csn'):
    """
    Consistently hash the values of one column (by default 'csn') in a dataframe.

    Each non-missing value is converted to a string, the salt is appended, and
    the first 12 hex characters of the SHA-256 digest replace the value.
    Missing values (as judged by pd.isna) become None.

    Parameters
    ----------
    df : DataFrame containing `column`. It is not modified.
    salt : string appended to every value before hashing. The same salt must
        be used on every call for the hashes to match across dataframes.
        NOTE: the default is a placeholder; pass a private value for real data.
    column : name of the column to hash.

    Returns
    -------
    A copy of `df` whose `column` holds the hashed values.

    Raises
    ------
    KeyError if `column` is not present in `df`.
    """
    def hash_value(value):
        if pd.isna(value):
            return None
        salted = f"{str(value)}{salt}".encode()
        # Truncated to 12 hex characters to keep the identifier short
        return hashlib.sha256(salted).hexdigest()[:12]

    # Work on a copy so the caller's dataframe keeps the raw values
    df_hashed = df.copy()
    df_hashed[column] = df_hashed[column].apply(hash_value)

    return df_hashed


In [None]:
import random
from datetime import timedelta

# Read the date-shift seed: the first line of the '../../seed' file, stripped of
# surrounding whitespace. It is kept as a string.
with open('../../seed', 'r') as file:
    seed = file.readline().strip()

def shift_dates_inplace(df, seed, min_weeks=52, max_weeks=52*2):
    """
    Return a copy of `df` with every datetime column shifted forward in time.

    Despite the name, the input dataframe is NOT modified; a shifted copy is
    returned. All datetime columns (timezone-naive and timezone-aware) are
    moved by the same whole number of weeks, drawn uniformly from
    [min_weeks, max_weeks] after seeding with `seed`, so the same seed always
    gives the same shift. Missing timestamps stay missing.

    Side effect: reseeds Python's global `random` generator with `seed`.

    Parameters
    ----------
    df : DataFrame to shift.
    seed : value passed to random.seed.
    min_weeks, max_weeks : inclusive bounds of the shift, in weeks.

    Returns
    -------
    A new DataFrame; non-datetime columns are unchanged.
    """
    df_copy = df.copy()

    random.seed(seed)
    weeks_to_add = random.randint(min_weeks, max_weeks)
    shift_delta = timedelta(weeks=weeks_to_add)

    # 'datetimetz' is included so timezone-aware columns are shifted as well;
    # leaving them untouched would keep the real dates in the output.
    datetime_cols = df_copy.select_dtypes(
        include=['datetime64[ns]', 'datetime64', 'datetimetz']
    ).columns

    for col in datetime_cols:
        # Vectorised addition; NaT + timedelta stays NaT
        df_copy[col] = df_copy[col] + shift_delta

    return df_copy

In [None]:
# get the data
from pathlib import Path
import pandas as pd

# Date window used for the extraction
arrived_after, arrived_before = '2024-01-01', '2024-01-31'

# Bind parameters handed to the SQL query
params = dict(arrived_after=arrived_after, arrived_before=arrived_before)

# Assemble the SQL: the subquery text, wrapped in parentheses, replaces the
# '[subquery]' placeholder inside the main query
SQL_DIR = Path("/home/jovyan/work/zella/zbeds/sql")
subquery = SQL_DIR.joinpath("EMAP_ed_subquery.sql").read_text()
mainquery = SQL_DIR.joinpath("EMAP_test_script.sql").read_text()
final_query = mainquery.replace('[subquery]', '(' + subquery + ')')

# Run the query, then hash the CSNs and shift the dates before anything is displayed
df = shift_dates_inplace(
    hash_csn(pd.read_sql(final_query, engine, params=params)),
    seed,
)

## Identify moment of departure from ED

In [None]:
# Order the rows by encounter and then by arrival time at each location (sorts df in place)
df.sort_values(['csn', 'location_arrival'], inplace= True)

In [None]:
# Create a mask for ED locations: the location string starts with 'ED^' or with
# one of two numeric location prefixes (presumably other ED/SDEC areas - confirm).
# na=False counts a missing location_string as "not ED"; without it the mask
# holds NaN for those rows and cannot be used for boolean indexing.
ed_mask = (
    df['location_string'].str.startswith('ED^', na=False) |
    df['location_string'].str.startswith('1020100166^', na=False) |
    df['location_string'].str.startswith('1020100170^', na=False)
)

# Filter for ED locations and group by CSN. Note that .max() takes the LATEST
# departure from any ED location, i.e. the moment the patient finally left the
# ED, even though the resulting column is called 'first_ed_departure'.
first_ed_departure = (
    df[ed_mask]
    .groupby('csn')['location_departure']
    .max()
    .reset_index()
    .rename(columns={'location_departure': 'first_ed_departure'})
)

# Merge this back with the original dataframe; encounters with no ED row get a
# missing first_ed_departure because of the left join
df_with_departure = df.merge(
    first_ed_departure,
    on='csn',
    how='left'
)

## Identify whether patient was admitted

In [None]:
# Calculate admission status
def determine_admission(group):
    """
    Report whether an encounter has any location after leaving the ED.

    `group` holds the rows of one encounter. Returns True when at least one
    row has a location_arrival at or after that row's first_ed_departure,
    otherwise False (including when first_ed_departure is missing).
    """
    arrived_after_ed = group['location_arrival'] >= group['first_ed_departure']
    return bool(arrived_after_ed.any())

# Run the admission check once per CSN
admitted_by_csn = df_with_departure.groupby('csn').apply(determine_admission)

# Move csn out of the index and label the result column (named 0 after reset_index)
admissions = admitted_by_csn.reset_index().rename(columns={0: 'is_admitted'})

# Attach the per-encounter admission flag to every row of that encounter
df_with_admission_status = df_with_departure.merge(admissions, on='csn', how='left')

In [None]:
# Preview the first 20 rows, which now carry first_ed_departure and is_admitted
df_with_admission_status.head(20)

In [None]:
# Keep only the ED rows: those whose departure is at or before the moment the
# patient left the ED (rows with a missing first_ed_departure are dropped)
left_by_ed_exit = (
    df_with_admission_status['location_departure']
    <= df_with_admission_status['first_ed_departure']
)
df_final = df_with_admission_status[left_by_ed_exit]

## Set up an array of snapshot datetimes

In [None]:
# Switch: True when the notebook is run locally for UCLH, False for the public datasets
uclh = False
from patientflow.load import set_file_paths, load_config_file

# Resolve the file locations for the chosen data folder
data_folder_name = 'data-public' if not uclh else 'data-uclh'
data_file_path, media_file_path, model_file_path, config_path = set_file_paths(
    train_dttm=None,
    data_folder_name=data_folder_name,
    uclh=uclh,
    from_notebook=True,
    inference_time=False,
)

# Load the config. NOTE: this rebinds `params`, which earlier held the SQL bind parameters
params = load_config_file(config_path)

# Times of day at which snapshots are taken
snapshot_times = params["prediction_times"]



In [None]:
from datetime import datetime, time, timedelta
import random
import pandas as pd

def get_shifted_snapshot_dates(arrived_after, arrived_before, seed, min_weeks=52, max_weeks=52*2):
    """
    Build the list of snapshot dates, shifted forward by a seeded number of weeks.

    One date per day is generated from `arrived_after` up to `arrived_before`,
    and the last generated day is dropped. Each date is then moved forward by
    a whole number of weeks drawn from [min_weeks, max_weeks] after seeding
    the global `random` generator with `seed`.

    Returns a list of datetime.date objects (empty when the range yields
    fewer than two days).
    """
    # Unshifted days in the window, without the final one
    daily_index = pd.date_range(start=arrived_after, end=arrived_before, freq="D")[:-1]

    # Seeded draw of the shift
    random.seed(seed)
    offset = timedelta(weeks=random.randint(min_weeks, max_weeks))

    return [stamp.date() + offset for stamp in daily_index]

# Build the snapshot dates for the extraction window, shifted with the same
# seed that is passed to shift_dates_inplace for the data
snapshot_dates = get_shifted_snapshot_dates(arrived_after, arrived_before, seed)


## Create snapshots dataset


In [None]:
from datetime import datetime, time
import pandas as pd

def create_snapshots(df, snapshot_times, snapshot_dates):
    """
    Sample location rows at every combination of snapshot date and time.

    For each (date, (hour, minute)) pair, the rows of `df` that are current at
    that moment (location_arrival <= moment < location_departure) are copied
    and tagged with the snapshot date and time.

    Parameters
    ----------
    df : DataFrame with 'location_arrival' and 'location_departure' columns,
        plus the columns dropped at the end (see below).
    snapshot_times : iterable of (hour, minute) pairs.
    snapshot_dates : iterable of datetime.date objects.

    Returns
    -------
    DataFrame with 'snapshot_date' and 'snapshot_time' first, followed by the
    remaining columns of `df` except patient_class, presentation_datetime,
    hospital_arrival, hospital_departure, location_arrival,
    location_departure and first_ed_departure. The column layout is the same
    whether or not any dates/times were supplied.

    Raises
    ------
    KeyError if `df` lacks any of the columns that are dropped.
    """
    snapshot_cols = ['snapshot_date', 'snapshot_time']
    dropped_cols = [
        'patient_class',
        'presentation_datetime',
        'hospital_arrival',
        'hospital_departure',
        'location_arrival',
        'location_departure',
        'first_ed_departure',
    ]

    all_results = []

    # For each combination of date and time
    for date in snapshot_dates:
        for hour, minute in snapshot_times:
            snapshot_datetime = datetime.combine(
                date,
                time(hour=hour, minute=minute)
            )

            # Rows whose stay in a location spans the snapshot moment
            mask = (df['location_arrival'] <= snapshot_datetime) & (df['location_departure'] > snapshot_datetime)
            snapshot_df = df[mask].copy()  # copy to avoid SettingWithCopyWarning

            snapshot_df['snapshot_date'] = date
            # One tuple per row; a bare tuple would be read as a sequence of values
            snapshot_df['snapshot_time'] = [(hour, minute)] * len(snapshot_df)

            all_results.append(snapshot_df)

    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)
    else:
        # No dates or times supplied: build an empty frame with the same
        # columns the populated path produces
        final_df = pd.DataFrame(columns=list(df.columns) + snapshot_cols)

    # Put the snapshot columns first in both cases
    other_cols = [col for col in final_df.columns if col not in snapshot_cols]
    final_df = final_df[snapshot_cols + other_cols]

    return final_df.drop(columns=dropped_cols)

# Sample the ED-only rows at every combination of snapshot date and snapshot time
snapshots_df = create_snapshots(df_final, snapshot_times, snapshot_dates)

In [None]:
# Look up a single encounter by its hashed CSN.
# Author's note: this encounter doesn't appear in the snapshots because the
# patient was whizzed to the stroke unit.
snapshots_df[snapshots_df.csn=='000639d6912b']

In [None]:
# Display the snapshots dataframe (re-enable the trailing .shape to show only its dimensions)
snapshots_df#.shape