In [1]:
import pandas as pd
from self_harm_triage_notes.config import data_interim_dir
from self_harm_triage_notes.dataset import print_stats
from sklearn.model_selection import train_test_split

___
# RMH 
### Load original data from 2012 to 2022

In [2]:
# Read the cleaned RMH presentations file from the interim data directory.
rmh_path = data_interim_dir / "rmh_2012_2022_cleaned.parquet"
df = pd.read_parquet(rmh_path, engine="pyarrow")

# Report (rows, columns), then preview the first rows as the cell output.
# NOTE(review): head() renders triage_note text in the saved output -- confirm
# that is acceptable before sharing the notebook.
print(df.shape)
df.head()

(771339, 14)


Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,audit_case,source_system,quarter,length
0,RMH-1,female,64.0,other,2012-01-08 00:35:00,2012,"SOB for 5/7, been to GP given prednisolone, co...",0,0,0,,Symphony,2012Q1,140
1,RMH-2,male,31.0,other,2012-01-08 00:41:00,2012,"pt has lac down right forehead, to eyebrow, wi...",0,0,0,,Symphony,2012Q1,107
2,RMH-3,male,19.0,road ambulance,2012-01-08 00:52:00,2012,"pt expect MBA, trapped for 45mins, #right femu...",0,0,0,,Symphony,2012Q1,74
3,RMH-4,male,51.0,other,2012-01-08 01:11:00,2012,L) sided flank pain same as previous renal col...,0,0,0,,Symphony,2012Q1,169
4,RMH-5,female,25.0,other,2012-01-08 01:23:00,2012,generalised abdo pain and associated headache ...,0,0,0,,Symphony,2012Q1,196


In [3]:
# Number of presentations per year, ordered by year rather than by frequency.
df["year"].value_counts().sort_index()

year
2012    59771
2013    61707
2014    65268
2015    65495
2016    71628
2017    75242
2018    77958
2019    82199
2020    86235
2021    83263
2022    42573
Name: count, dtype: int64

### Subset data: 2012-2017 for development and testing; 2018-2022 saved separately

In [4]:
# Rows from 2018 onwards are counted and written to their own parquet file.
# This reads the full frame, so it must run before df is narrowed to <= 2017.
from_2018 = df.year >= 2018
print("Number of presentations from 2018 onwards:", from_2018.sum())
df[from_2018].to_parquet(data_interim_dir / "rmh_2018_2022_cleaned.parquet", engine="pyarrow")

Number of presentations from 2018 onwards: 372228


In [5]:
# Keep only presentations up to and including 2017; .copy() detaches the
# result from the full frame.
# NOTE: `df` is rebound here, so the 2018+ rows are no longer reachable through
# it -- re-running the export cell after this one would write an empty file.
up_to_2017 = df.year <= 2017
df = df.loc[up_to_2017].copy()
print(df.shape)

(399111, 14)


### Create a train/test split

In [6]:
# 80/20 split with a fixed seed, stratified on the SH label so both parts keep
# the same self-harm rate.
dev_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["SH"],
)

# Print the summary statistics for the development split, then the test split.
for split in (dev_data, test_data):
    print_stats(split)

The dataset contains 319288 presentations.

SELF-HARM
Number of presentations:
SH
0    314962
1      4326
Name: count, dtype: int64

Proportion of presentations:
SH
0    98.64511
1     1.35489
Name: proportion, dtype: float64

________________________________________________________________________________
SUICIDAL IDEATION
Number of presentations:
SI
0    315475
1      3813
Name: count, dtype: int64

Proportion of presentations:
SI
0    98.80578
1     1.19422
Name: proportion, dtype: float64
________________________________________________________________________________
AOD overdose
Number of presentations:
AOD_OD
0    318655
1       633
Name: count, dtype: int64

Proportion of presentations:
AOD_OD
0    99.801746
1     0.198254
Name: proportion, dtype: float64

The dataset contains 79823 presentations.

SELF-HARM
Number of presentations:
SH
0    78741
1     1082
Name: count, dtype: int64

Proportion of presentations:
SH
0    98.644501
1     1.355499
Name: proportion, dtype: float64


In [8]:
# Write each split to its own parquet file in the interim data directory
# (development split first, then test split).
split_outputs = {
    "rmh_2012_2017_dev.parquet": dev_data,
    "rmh_2012_2017_test.parquet": test_data,
}
for file_name, split_frame in split_outputs.items():
    split_frame.to_parquet(data_interim_dir / file_name, engine="pyarrow")