In [1]:
import pandas as pd

# Project imports
from self_harm_triage_notes.config import *
from self_harm_triage_notes.text_utils import *

In [2]:
# --- Run configuration -------------------------------------------------------
# Name of the ED vocabulary to load (passed to `load_vocab` below).
vocab_filename = "rmh_2012_2017_dev_amt6"

# Name of the misspelling dictionary to load (passed to `load_misspelled_dict`).
spell_filename = "rmh_2012_2017_dev_amt6"

# Stem of the dataset to normalise; ".parquet" is appended when it is read.
# Other datasets this notebook has been pointed at:
#   "rmh_2018_2022_cleaned", "rmh_2012_2017_test"
data_filename = "rmh_2012_2017_dev"

In [3]:
# Load the three lookup resources used by the normalisation steps below.
# The cell output shows one summary line per resource, in this order, so the
# statement order is kept as is.

# Load the ED vocabulary (used by the re-tokenisation step and by the
# in-vocabulary token counts)
vocab = load_vocab(spell_corr_dir, vocab_filename)

# Load the dictionary of corrected misspellings (used by the spelling
# correction step)
misspelled_dict = load_misspelled_dict(spell_corr_dir, spell_filename)

# Load a dictionary of slang terms for medications (used by the slang
# replacement step)
slang_dict = load_slang_dict(resources_dir)

Domain-specific vocabulary contains 20043 words.
Spelling correction available for 43863 words.
Slang available for 20 words.


___
# Normalise unseen triage notes
### Load unseen data

In [4]:
# Read the selected dataset from the interim data directory and display it.
df = pd.read_parquet(interim_data_dir / f"{data_filename}.parquet", engine="pyarrow")
df

Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,SI_or_SH,audit_case,source_system,quarter,length,val_fold
0,RMH-1,female,64.0,other,2012-01-08 00:35:00,2012,"SOB for 5/7, been to GP given prednisolone, co...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,140,4
1,RMH-2,male,31.0,other,2012-01-08 00:41:00,2012,"pt has lac down right forehead, to eyebrow, wi...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,107,1
2,RMH-3,male,19.0,road ambulance,2012-01-08 00:52:00,2012,"pt expect MBA, trapped for 45mins, #right femu...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,74,1
3,RMH-5,female,25.0,other,2012-01-08 01:23:00,2012,generalised abdo pain and associated headache ...,Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,196,5
4,RMH-6,female,18.0,other,2012-01-08 01:37:00,2012,abdo pain associated with constipation. Pt se...,Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2012Q1,134,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319283,RMH-399236,male,34.0,road ambulance,2017-12-31 23:26:00,2017,"Pt expect, hand vs firework, all fingers amput...",Not self-harm,Not suicidal ideation,Not overdose,0,1.0,Symphony,2017Q4,99,2
319284,RMH-399237,male,27.0,self/community/pt,2017-12-31 23:34:00,2017,increasing pain to existing haemaroids. Tx at ...,Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2017Q4,113,5
319285,RMH-399238,female,76.0,self/community/pt,2017-12-31 23:42:00,2017,bleeding since wednsday from stoma. seen at RM...,Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2017Q4,124,3
319286,RMH-399239,female,44.0,road ambulance,2017-12-31 23:48:00,2017,"ETOH and marijuana tonight, pt then fell onto ...",Not self-harm,Not suicidal ideation,Not overdose,0,,Symphony,2017Q4,99,3


In [5]:
# Baseline token statistics on the raw notes: first the default count, then
# the count with valid=True (presumably restricted to valid tokens — confirm
# in text_utils).
print_token_counts(count_tokens(df["triage_note"]))
print_token_counts(count_tokens(df["triage_note"], valid=True))

The corpus contains 227911 unique tokens (6594864 tokens in total).
The corpus contains 212259 unique tokens (6153769 tokens in total).


### Pre-process

In [6]:
# Run `preprocess` on every raw note and keep the result in its own column,
# leaving the original `triage_note` column untouched.
df["preprocessed_triage_note"] = df["triage_note"].apply(preprocess)

# Token statistics after pre-processing (default count, then valid=True).
print_token_counts(count_tokens(df["preprocessed_triage_note"]))
print_token_counts(count_tokens(df["preprocessed_triage_note"], valid=True))

The corpus contains 144936 unique tokens (6761504 tokens in total).
The corpus contains 134722 unique tokens (6238239 tokens in total).


### Tokenise

In [7]:
# First tokenisation pass: `tokenize_step1` receives the whole pre-processed
# column and its result is stored as a new column.
df["tokenized_triage_note"] = tokenize_step1(df["preprocessed_triage_note"])

# Token statistics after tokenisation (default count, then valid=True).
print_token_counts(count_tokens(df["tokenized_triage_note"]))
print_token_counts(count_tokens(df["tokenized_triage_note"], valid=True))

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
  return re.compile(expression)


The corpus contains 102953 unique tokens (7630368 tokens in total).
The corpus contains 95553 unique tokens (6263764 tokens in total).


### Re-tokenise

In [8]:
# Second tokenisation pass, this time given the ED vocabulary. The result
# overwrites the column produced by the first pass.
df["tokenized_triage_note"] = tokenize_step2(df["tokenized_triage_note"], vocab)

# Token statistics after re-tokenisation: default count, valid=True count,
# and the count reported by `count_vocab_tokens_in_data` for the ED vocabulary.
print_token_counts(count_tokens(df["tokenized_triage_note"]))
print_token_counts(count_tokens(df["tokenized_triage_note"], valid=True))
print_token_counts(count_vocab_tokens_in_data(df["tokenized_triage_note"], vocab=vocab))

The corpus contains 77892 unique tokens (7805040 tokens in total).
The corpus contains 70489 unique tokens (6348747 tokens in total).
The corpus contains 20043 unique tokens (6109147 tokens in total).


### Spelling correction

In [9]:
# Apply `correct_tokens` with the misspelling dictionary to each tokenised
# note; the corrected text goes into a new `normalised_triage_note` column.
df["normalised_triage_note"] = df["tokenized_triage_note"].apply(correct_tokens, _dict=misspelled_dict)

# Token statistics after spelling correction (default, valid=True, in-vocabulary).
print_token_counts(count_tokens(df["normalised_triage_note"]))
print_token_counts(count_tokens(df["normalised_triage_note"], valid=True))
print_token_counts(count_vocab_tokens_in_data(df["normalised_triage_note"], vocab=vocab))

The corpus contains 34327 unique tokens (7808350 tokens in total).
The corpus contains 26924 unique tokens (6352057 tokens in total).
The corpus contains 20043 unique tokens (6317699 tokens in total).


### Slang replacement

In [10]:
# Apply `correct_tokens` again, now with the medication slang dictionary.
# The result overwrites `normalised_triage_note` from the spelling step.
df["normalised_triage_note"] = df["normalised_triage_note"].apply(correct_tokens, _dict=slang_dict)

# Token statistics after slang replacement (default, valid=True, in-vocabulary).
print_token_counts(count_tokens(df["normalised_triage_note"]))
print_token_counts(count_tokens(df["normalised_triage_note"], valid=True))
print_token_counts(count_vocab_tokens_in_data(df["normalised_triage_note"], vocab=vocab))

The corpus contains 34319 unique tokens (7817279 tokens in total).
The corpus contains 26916 unique tokens (6360986 tokens in total).
The corpus contains 20031 unique tokens (6310854 tokens in total).


___
# Extract entities

In [11]:
# Build the `entities` column by applying `select_valid_tokens` to each
# normalised note.
df["entities"] = df["normalised_triage_note"].apply(select_valid_tokens)

# Token statistics for the extracted entities (default, valid=True, in-vocabulary).
print_token_counts(count_tokens(df["entities"]))
print_token_counts(count_tokens(df["entities"], valid=True))
print_token_counts(count_vocab_tokens_in_data(df["entities"], vocab=vocab))

The corpus contains 26916 unique tokens (6360986 tokens in total).
The corpus contains 26916 unique tokens (6360986 tokens in total).
The corpus contains 20031 unique tokens (6310854 tokens in total).


### Save dataset

In [12]:
# Build the output file name without reassigning `data_filename`. Overwriting
# the config variable here would make a later re-run of the loading cell look
# for the stripped name (e.g. "rmh_2018_2022") instead of the original input
# file ("rmh_2018_2022_cleaned").
output_name = data_filename.replace('_cleaned', '') + "_normalised.parquet"

# Write the frame, including all intermediate and normalised columns, to the
# processed data directory.
df.to_parquet(proc_data_dir / output_name, engine="pyarrow")