# Predict yet to arrive 

prepare a model that will predict the number of patients yet to arrive.

Inputs
- A series of times in the day at which we want to make these predictions is set 
- A series of dates on which we want to make these predictions
- A time window after the prediction time, within which we are interested in predicting a number of patients (eg 8 hours)

## Set up the notebook environment

In [1]:
# Reload functions every time
%load_ext autoreload 
%autoreload 2

In [4]:
from pathlib import Path
import sys
import json
import pandas as pd
import numpy as np
# import joblib
from joblib import load, dump


PROJECT_ROOT = Path().home() 

# Patient flow package
USER_ROOT = Path().home() / 'work'
sys.path.append(str(USER_ROOT / 'patientflow' / 'src' / 'patientflow'))
sys.path.append(str(USER_ROOT / 'patientflow' / 'functions'))

In [12]:
model_file_path = PROJECT_ROOT /'data' / 'ed-predictor' / 'trained-models'
model_file_path

data_file_path = USER_ROOT / 'ed-predictor' / 'data-raw'
data_file_path



## Load parameters

These are set in config.json. You can change these for your own purposes. But the times of day will need to match those in the provided dataset if you want to run this notebook successfully.

In [5]:
# Load the times of day
import yaml

config_path = Path(USER_ROOT / 'patientflow')

with open(config_path / 'config.yaml', 'r') as file:
    config = yaml.safe_load(file)
    
# Convert list of times of day at which predictions will be made (currently stored as lists) to list of tuples
prediction_times = [tuple(item) for item in config['prediction_times']]
epsilon = float(eval(config['epsilon']))
prediction_window = int(config['prediction_window'])
time_interval = int(config['yta_time_interval'])




## Load data

In [16]:
from ed_admissions_data_retrieval import ed_admissions_get_data
PATH_ED =  str(data_file_path) + '/yet_to_arrive.csv'

df = ed_admissions_get_data(PATH_ED)

In [17]:
df.head()

Unnamed: 0,training_validation_test,admission_datetime,sex,specialty,is_child
0,train,2030-06-13 14:33:22+00:00,F,Oncology,False
1,train,2030-04-03 10:43:56+00:00,F,Oncology,False
2,train,2030-04-12 13:47:06+00:00,F,Oncology,False
3,train,2030-04-12 12:33:22+00:00,M,Haematology,False
4,train,2030-03-29 16:39:00+00:00,F,Urology,False


## Separate into training, validation and test sets

As part of preparing the data, each visit has already been allocated into one of three sets - training, vaidation and test sets. 

In [18]:
df.training_validation_test.value_counts()

training_validation_test
train    14071
test      4919
valid     1684
Name: count, dtype: int64

In [21]:
train_df = df[df.training_validation_test == 'train']#.drop(columns='training_validation_test')
valid_df = df[df.training_validation_test == 'valid']#.drop(columns='training_validation_test')
test_df = df[df.training_validation_test == 'test']#.drop(columns='training_validation_test')

train_df['admission_datetime'] = pd.to_datetime(train_df['admission_datetime'], utc = True)
train_df.set_index('admission_datetime', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['admission_datetime'] = pd.to_datetime(train_df['admission_datetime'], utc = True)


In [22]:
isinstance(train_df.index, pd.DatetimeIndex)

True

## Train the Poisson-Binomial model

In [None]:
config

In [27]:
from predict.emergency_demand.poisson_binomial_predictor import PoissonBinomialPredictor

### Train a model for all admission, irrespective of specialty of admission

In [28]:
from predict.emergency_demand.poisson_binomial_predictor import PoissonBinomialPredictor
from joblib import dump, load

model =  PoissonBinomialPredictor()

model.fit(train_df, prediction_window, time_interval, prediction_times)

MODEL__ED_YETTOARRIVE__NAME = 'ed_yet_to_arrive_all_' + str(int(prediction_window/60)) + '_hours'
full_path = model_file_path / MODEL__ED_YETTOARRIVE__NAME 
full_path = full_path.with_suffix('.joblib')

dump(model, full_path)

Calculating time-varying arrival rates for data provided, which spans 520 days
Poisson Binomial Predictor trained for these times: [(6, 0), (9, 30), (12, 0), (15, 30), (22, 0)]
using prediction window of 480 minutes after the time of prediction
and time interval of 15 minutes within the prediction window.
The error value for prediction will be 1e-07
To see the weights saved by this model, used the get_weights() method


['/home/jovyan/data/ed-predictor/trained-models/ed_yet_to_arrive_all_8_hours.joblib']

In [31]:
weights = model.get_weights()


In [32]:


print(x1)
print(x2)

preds = model.predict(prediction_context, x1, y1, x2, y2)

preds['default'].head(10)#['agg_proba']

NameError: name 'x1' is not defined

In [34]:
prediction_context = {
    'default': {
        'prediction_time': tuple([7, 0])  
    }
}

x1 = float(config['x1'])
y1 = float(config['y1'])
x2 = float(config['x2'])
y2 = float(config['y2'])

MODEL__ED_YETTOARRIVE__NAME = 'ed_yet_to_arrive_all_' + str(int(prediction_window/60)) + '_hours'
full_path = model_file_path / MODEL__ED_YETTOARRIVE__NAME 
full_path = full_path.with_suffix('.joblib')

model = load(full_path)

preds = model.predict(prediction_context, x1, y1, x2, y2)
preds



{'default':      agg_proba
 sum           
 0     0.014505
 1     0.061405
 2     0.129970
 3     0.183398
 4     0.194092
 ..         ...
 220   0.000000
 221   0.000000
 222   0.000000
 223   0.000000
 224   0.000000
 
 [225 rows x 1 columns]}

### Predict within specialty

In [None]:
from predict.emergency_demand.poisson_binomial_predictor import PoissonBinomialPredictor

specialty_filters = {
    'medical': {'observed_specialty': 'medical', 'is_child': False},
    'surgical': {'observed_specialty': 'surgical', 'is_child': False},
    'haem_onc': {'observed_specialty': 'haem_onc', 'is_child': False},
    'paediatric': {'is_child': True}  # Pediatric doesn't filter by observed_specialty
}

model_by_spec =  PoissonBinomialPredictor(filters = specialty_filters)

model_by_spec.fit(train_df, prediction_window, time_interval, prediction_times)


MODEL__ED_YETTOARRIVE__NAME = 'ed_yet_to_arrive_by_spec_' + str(int(prediction_window/60)) + '_hours'
full_path = model_file_path / MODEL__ED_YETTOARRIVE__NAME 
full_path = full_path.with_suffix('.joblib')

dump(model_by_spec, full_path)

In [None]:
MODEL__ED_YETTOARRIVE__NAME = 'ed_yet_to_arrive_by_spec_' + str(int(prediction_window/60)) + '_hours'
full_path = model_file_path / MODEL__ED_YETTOARRIVE__NAME 
full_path = full_path.with_suffix('.joblib')

model_by_spec = load(full_path)

x1 = float(config['x1'])
y1 = float(config['y1'])
x2 = float(config['x2'])
y2 = float(config['y2'])

prediction_context = {
    'medical': {
        'prediction_time': tuple([7, 0])  
    }
}

preds = model_by_spec.predict(prediction_context, x1, y1, x2, y2)
preds['medical']