# Explore datasets

There are three datasets:
- ED_visits.csv 
- specialty.csv
- yet_to_arrive.csv

See the data dictionaries for more information. 

The ED visits database uses the following concepts:

- visit: a single visit to the ED which may end in admission or discharge
- prediction times: the times in the day at which predictions are to be made (eg 06:00, 09:30, 12:00, 15:30, 22:00)
- visit snapshots: snapshots of visits observed at the prediction times; a visit may have multiple visit snapshots



## Set up the notebook environment


In [None]:
# Reload functions every time
%load_ext autoreload 
%autoreload 2

In [None]:
from pathlib import Path
import sys
import json


# Root of the HyMind directory inside the user's home folder
PROJECT_ROOT = Path.home() / 'HyMind'

# Add the two project code directories to the import search path
sys.path.extend(
    str(PROJECT_ROOT / 'dissemination' / package_dir)
    for package_dir in ('predict4flow', 'functions')
)
from pathlib import Path




## Load parameters

These are set in config.yaml. You can change them for your own purposes, but the times of day will need to match those in the provided dataset if you want to run this notebook successfully.

In [None]:
# Read the notebook configuration from config.yaml
import yaml

config_path = PROJECT_ROOT / 'dissemination' / 'patientflow'

with (config_path / 'config.yaml').open('r') as file:
    config = yaml.safe_load(file)

# The prediction times are stored in the config as lists; turn each into a tuple
prediction_times = list(map(tuple, config['prediction_times']))

# Display the times of day at which predictions will be made
prediction_times

## Load data

In [None]:
from ed_admissions_data_retrieval import ed_admissions_get_data

# Build the path from PROJECT_ROOT, like the other paths in this notebook,
# so that loading the data does not depend on the current working directory.
PATH_ED = str(PROJECT_ROOT / 'dissemination' / 'data-raw' / 'ED_visits.csv')

# Load the ED visits dataset into a DataFrame
df = ed_admissions_get_data(PATH_ED)

In [None]:
# Columns that are not used in training; the plotting loop at the end of
# this notebook skips them.
exclude_from_plot = [
    "visit_number",
    "snapshot_datetime",
    "prediction_time",
    "random_number",
    "snapshot_id",
    "training_validation_test",
]

### Elapsed length of stay

The distribution has a long tail of long visits, and the box plot shows that this tail consists of people who are not admitted.

In [None]:
def plot_numerical(df, column):
    """Plot a numerical column as a histogram and a box plot, split by admission.

    Two figures are drawn and shown one after the other. Each has one row
    per distinct value of ``df['is_admitted']``: the first holds histograms
    of ``column``, the second horizontal box plots of ``column``.

    Args:
        df: DataFrame containing ``column`` and an ``is_admitted`` column.
        column: Name of the column to plot.
    """
    # Imported locally because the notebook-level imports of these libraries
    # sit in a later cell than the first call to this function.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Faceted histogram
    g = sns.FacetGrid(df, row='is_admitted', height=2, aspect=4)
    g.map(sns.histplot, column)
    g.set_titles("{row_name} is_admitted")
    plt.show()

    # Faceted boxplot
    g = sns.FacetGrid(df, row='is_admitted', height=2, aspect=4)
    order = df.is_admitted.unique()
    g.map(sns.boxplot, column, orient='h', order = order)
    g.set_titles("{row_name} is_admitted")
    plt.show()


plot_numerical(df, 'elapsed_los_td')

In [None]:
# Remove visits where LOS is > 3 days (elapsed_los_td is in seconds).
# Take an explicit copy so that columns added to df in later cells are set on
# an independent DataFrame rather than on a slice of the original, which
# would raise pandas' SettingWithCopyWarning.
df = df[df.elapsed_los_td <= 72 * 3600].copy()


In [None]:
# Count the current ED locations of rows that are not admitted and have
# more than 24 hours of elapsed stay
not_admitted_over_24h = (df.is_admitted == False) & (df.elapsed_los_td > 24*60*60)
df.loc[not_admitted_over_24h, 'current_ed_location'].value_counts()

In [None]:
# Visit numbers with at least one row in location 'OTF' after more than
# 24 hours of elapsed stay
over_24h_in_otf = (df.elapsed_los_td > 24*60*60) & (df.current_ed_location == 'OTF')
long_stays_otf = df.loc[over_24h_in_otf, 'visit_number'].unique()

In [None]:
# Express elapsed length of stay in hours, then re-plot without the visits
# collected in long_stays_otf
df['elapsed_los_td_hrs'] = df['elapsed_los_td'] / 3600
not_long_otf_stay = ~df['visit_number'].isin(long_stays_otf)
plot_numerical(df[not_long_otf_stay], 'elapsed_los_td_hrs')

In [None]:
# NOTE(review): a notebook cell only displays its last expression, so the
# result of this first line is discarded — confirm whether the exclusion of
# long_stays_otf was meant to be combined with the filter below.
df[~(df.visit_number.isin(long_stays_otf)) ]
# Rows with more than 24 elapsed hours that are not admitted
df[(df.elapsed_los_td_hrs > 24) & (df.is_admitted == False)]

In [None]:

    
# Plot every column that is not excluded, choosing the plot type by data type.

# Decide once whether to facet based on 'is_admitted'
facet_by = 'is_admitted' if 'is_admitted' in df.columns else None

for column in df.columns:
    # Skip the columns listed in exclude_from_plot (defined earlier in this notebook)
    if column in exclude_from_plot:
        continue

    # Treat text columns and columns with fewer than 20 unique values as categorical
    if df[column].dtype == 'object' or df[column].nunique() < 20:
        # NOTE(review): plot_categorical is not defined anywhere in this
        # notebook; define or import it before running this cell.
        plot_categorical(column, df, facet_by)
    else:
        # plot_numerical takes (df, column) and always facets by is_admitted
        plot_numerical(df, column)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_data(column_name, df, facet_by = True):
    """Plot the distribution of one column, optionally faceted by admission.

    Columns of dtype object, category or bool are drawn as count plots with
    the categories ordered by frequency; all other columns are drawn as
    box plots. The figure is shown before the function returns.

    Args:
        column_name: Name of the column in ``df`` to plot.
        df: DataFrame holding ``column_name`` and, when ``facet_by`` is
            truthy, an ``is_admitted`` column. It is not modified.
        facet_by: If truthy, draw one facet per value of ``is_admitted``;
            otherwise draw a single plot.
    """
    # Boolean columns are plotted as categories. Convert on a copy so that
    # the caller's DataFrame keeps its original dtype.
    if df[column_name].dtype == 'bool':
        df = df.assign(**{column_name: df[column_name].astype('category')})

    if df[column_name].dtype in ['object', 'category']:
        # Categorical plot, most frequent category first
        category_order = df[column_name].value_counts().index
        if facet_by:
            g = sns.FacetGrid(df, col='is_admitted', height=2, aspect=1)
            g.map(sns.countplot, column_name, order=category_order, orient='h')
        else:
            sns.countplot(y=column_name, data=df, order=category_order, orient='h')
    else:
        # Numerical plot (boxplot)
        if facet_by:
            g = sns.FacetGrid(df, row='is_admitted', height=2, aspect=6)
            order = df.is_admitted.unique()
            g.map(sns.boxplot, column_name, data=df, orient='h', order = order)
        else:
            sns.boxplot(y=column_name, data=df, orient='h')
    plt.tight_layout()
    plt.show()


# Plot each categorical column that is not in the exclusion list
for column in df.columns:
    if column in exclude_from_plot:
        continue
    if not (df[column].dtype == 'category'):
        continue
    print(column)
    plot_data(column, df)



