In [1]:
import pandas as pd

# Project imports
from self_harm_triage_notes.config import interim_data_dir, N_SPLITS
from self_harm_triage_notes.dataset_utils import print_stats

# Toolbox imports
from ml_health_toolbox.dev_utils import split_data, get_cv_strategy

___
# RMH 
### Load original data from 2012 to 2022

In [2]:
# Read the cleaned 2012-2022 presentations from the interim data directory.
rmh_all_years_path = interim_data_dir / "rmh_2012_2022_cleaned.parquet"
df = pd.read_parquet(rmh_all_years_path, engine="pyarrow")

# Bare expression -> rich display of the (truncated) frame.
# NOTE(review): the rendered output includes the free-text `triage_note`
# column — confirm it is de-identified before sharing this notebook.
df

Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,SI_or_SH,audit_case,source_system,quarter,length
0,RMH-1,female,64.0,other,2012-01-08 00:35:00,2012,"SOB for 5/7, been to GP given prednisolone, co...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,140
1,RMH-2,male,31.0,other,2012-01-08 00:41:00,2012,"pt has lac down right forehead, to eyebrow, wi...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,107
2,RMH-3,male,19.0,road ambulance,2012-01-08 00:52:00,2012,"pt expect MBA, trapped for 45mins, #right femu...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,74
3,RMH-4,male,51.0,other,2012-01-08 01:11:00,2012,L) sided flank pain same as previous renal col...,Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,169
4,RMH-5,female,25.0,other,2012-01-08 01:23:00,2012,generalised abdo pain and associated headache ...,Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771334,RMH-2022-42573,male,26.0,other,2022-06-30 23:31:25,2022,"1700hrs dizziness, worse on movement. 1x vomit...",Not self-harm,Not suicidal ideation,Not overdose,0,0.0,EPIC,2022Q2,138
771335,RMH-2022-42574,female,26.0,other,2022-06-30 23:38:35,2022,2/7 PR bleeding with bowel motions. Mild dizzi...,Not self-harm,Not suicidal ideation,Not overdose,0,0.0,EPIC,2022Q2,126
771336,RMH-2022-42575,female,27.0,other,2022-06-30 23:51:53,2022,Worsening back pain on b/ground UTI. Commenced...,Not self-harm,Not suicidal ideation,Not overdose,0,0.0,EPIC,2022Q2,183
771337,RMH-2022-42576,female,61.0,air ambulance,2022-06-30 23:53:59,2022,,Not self-harm,Not suicidal ideation,Not overdose,0,0.0,EPIC,2022Q2,0


In [3]:
# Presentations per calendar year, in chronological order.
df["year"].value_counts().sort_index()

year
2012    59771
2013    61707
2014    65268
2015    65495
2016    71628
2017    75242
2018    77958
2019    82199
2020    86235
2021    83263
2022    42573
Name: count, dtype: int64

### Subset data: 2012-2017 for development and testing; 2018-2022 saved separately

In [4]:
# Rows from 2018 onwards are written to their own file (with a fresh
# 0..n-1 index).
from_2018_onwards = df.year >= 2018
print("Number of presentations from 2018 onwards:", from_2018_onwards.sum())
df[from_2018_onwards].reset_index(drop=True).to_parquet(
    interim_data_dir / "rmh_2018_2022_cleaned.parquet",
    engine="pyarrow",
)

Number of presentations from 2018 onwards: 372228


In [5]:
# Keep only 2012-2017 for development and testing.
# NOTE: `df` is rebound here, so it no longer contains 2018+ rows; re-running
# the 2018+ export cell after this one (without reloading) would write an
# empty frame.
df = df.loc[df.year <= 2017].copy()
print(df.shape)

(399111, 15)


### Create a train/test split

In [6]:
# Split the 2012-2017 data into a development set and a test set, passing the
# SH label column as the target.
# NOTE(review): n_splits is hard-coded to 5 here while the CV cell uses
# N_SPLITS — confirm whether the two are meant to be tied together.
dev_data, test_data = split_data(df, df.SH, n_splits=5)

# Print summary statistics for each subset, separated by a blank line.
for position, (title, subset) in enumerate(
    (("DEVELOPMENT SET", dev_data), ("TEST SET", test_data))
):
    if position > 0:
        print()
    print(title)
    print_stats(subset)

DEVELOPMENT SET
The dataset contains 319288 presentations.

SELF-HARM
Number of presentations:
SH
Not self-harm    314962
Self-harm          4326
Name: count, dtype: int64

Proportion of presentations:
SH
Not self-harm    98.64511
Self-harm         1.35489
Name: proportion, dtype: float64

________________________________________________________________________________
SUICIDAL IDEATION
Number of presentations:
SI
Not suicidal ideation    315457
Suicidal ideation          3831
Name: count, dtype: int64

Proportion of presentations:
SI
Not suicidal ideation    98.800143
Suicidal ideation         1.199857
Name: proportion, dtype: float64
________________________________________________________________________________
AOD overdose
Number of presentations:
AOD_OD
Not overdose    318634
Overdose           654
Name: count, dtype: int64

Proportion of presentations:
AOD_OD
Not overdose    99.795169
Overdose         0.204831
Name: proportion, dtype: float64


TEST SET
The dataset contains 7982

In [7]:
# Allocate every development-set row to a validation fold.
# The fold number (1..N_SPLITS) is stored in a new `val_fold` column.
X = dev_data
y = dev_data.SH

# Initialise CV object
cv = get_cv_strategy(n_splits=N_SPLITS)

# 0 means "not allocated to any validation fold yet".
dev_data['val_fold'] = 0
val_fold_pos = dev_data.columns.get_loc('val_fold')

for i, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=None), start=1):
    print("%d reports allocated to validation fold #%d" % (len(val_idx), i))
    # Assumes cv.split yields *positional* row indices (scikit-learn splitter
    # convention — TODO confirm for get_cv_strategy). Writing them with
    # position-based .iloc rather than label-based .loc keeps the allocation
    # correct even if dev_data does not have a default 0..n-1 index.
    dev_data.iloc[val_idx, val_fold_pos] = i

63858 reports allocated to validation fold #1
63858 reports allocated to validation fold #2
63858 reports allocated to validation fold #3
63857 reports allocated to validation fold #4
63857 reports allocated to validation fold #5


In [8]:
# Persist the development set (including the `val_fold` column, if the fold
# allocation cell has been run) and the test set to the interim data directory.
for split_frame, file_name in (
    (dev_data, "rmh_2012_2017_dev.parquet"),
    (test_data, "rmh_2012_2017_test.parquet"),
):
    split_frame.to_parquet(interim_data_dir / file_name, engine="pyarrow")