In [1]:
from self_harm_triage_notes.config import data_interim_dir, data_proc_dir
from self_harm_triage_notes.text import *
from self_harm_triage_notes.dev import *
import pandas as pd

In [2]:
# --- Run configuration (file stems, without extension) ---

# Dataset to be normalised in this notebook
data_filename = "rmh_2012_2017_test"

# ED vocabulary passed to load_vocab
vocab_filename = "rmh_2012_2017_dev_amt6"

# Dictionary of misspellings passed to load_misspelled_dict
spell_filename = "rmh_2012_2017_dev_amt6"

In [3]:
# Load the lookup resources used by the normalisation steps below.

# Load the ED vocabulary (used for re-tokenisation and for vocabulary coverage counts)
vocab = load_vocab(vocab_filename)

# Load the dictionary of corrected misspellings (used for spelling correction)
misspelled_dict = load_misspelled_dict(spell_filename)

# Load a dictionary of slang terms for medications (used for slang replacement)
slang_dict = load_slang_dict()

Domain-specific vocabulary contains 20109 words.
Spelling correction available for 43695 words.
Slang available for 20 words.


___
# Normalise unseen triage notes
### Load unseen data

In [4]:
# Read the interim parquet file named by `data_filename` into a DataFrame
interim_path = data_interim_dir / (data_filename + ".parquet")
df = pd.read_parquet(interim_path, engine="pyarrow")

# Report the (rows, columns) shape, then display the first rows
print(df.shape)
df.head()

(79823, 14)


Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,audit_case,source_system,quarter,length
38457,RMH-38460,female,19.0,other,2012-08-26 03:15:00,2012,Spontaneous painful lump in R) hand with numbn...,0,0,0,,Symphony,2012Q3,177
239346,RMH-239404,male,56.0,private ambulance,2015-10-13 17:20:00,2015,Sudden onset generalised weakness post shower....,0,0,0,,Symphony,2015Q4,109
78113,RMH-78123,male,44.0,other,2013-04-19 11:53:00,2013,From GP post fall of push bike. # to 5th Proxi...,0,0,0,,Symphony,2013Q2,58
291404,RMH-291469,male,50.0,road ambulance,2016-07-26 05:09:00,2016,Suicidal ideation,0,1,0,,Symphony,2016Q3,17
120000,RMH-120014,male,57.0,road ambulance,2013-12-23 23:24:00,2013,"3/7 increased lethargy, exertional SOB hx haem...",0,0,0,,Symphony,2013Q4,108


In [5]:
# Token counts for the raw triage notes: first with count_tokens' defaults,
# then with valid=True
raw_notes = df["triage_note"]
for count_kwargs in ({}, {"valid": True}):
    print_token_counts(count_tokens(raw_notes, **count_kwargs))

The corpus contains 95496 unique tokens (1643410 tokens in total).
The corpus contains 88898 unique tokens (1533680 tokens in total).


### Pre-process

In [6]:
# Run `preprocess` on every raw note and store the result in a new column
df["preprocessed_triage_note"] = df["triage_note"].apply(preprocess)

# Token counts after pre-processing: defaults first, then valid=True
preprocessed_notes = df["preprocessed_triage_note"]
for count_kwargs in ({}, {"valid": True}):
    print_token_counts(count_tokens(preprocessed_notes, **count_kwargs))

The corpus contains 62281 unique tokens (1685863 tokens in total).
The corpus contains 57798 unique tokens (1555164 tokens in total).


### Tokenise

In [7]:
# First tokenisation pass over the pre-processed notes, stored in a new column
df["tokenized_triage_note"] = tokenize_step1(df["preprocessed_triage_note"])

# Token counts after tokenisation: defaults first, then valid=True
tokenized_notes = df["tokenized_triage_note"]
for count_kwargs in ({}, {"valid": True}):
    print_token_counts(count_tokens(tokenized_notes, **count_kwargs))

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
  return re.compile(expression)


The corpus contains 43909 unique tokens (1902137 tokens in total).
The corpus contains 40574 unique tokens (1561444 tokens in total).


### Re-tokenise

In [8]:
# Re-tokenise the step-1 output using the ED vocabulary.
# Bracket assignment is used on purpose: attribute-style assignment
# (`df.col = ...`) only updates a column that already exists; otherwise pandas
# sets a plain attribute on the DataFrame instead of creating a column.
# NOTE: this overwrites `tokenized_triage_note`, so re-running this cell alone
# feeds already re-tokenised text back into tokenize_step2.
df["tokenized_triage_note"] = tokenize_step2(df["tokenized_triage_note"], vocab)

# Token counts after re-tokenisation: all tokens, valid=True, and tokens
# counted against the ED vocabulary
print_token_counts(count_tokens(df["tokenized_triage_note"]))
print_token_counts(count_tokens(df["tokenized_triage_note"], valid=True))
print_token_counts(count_vocab_tokens_in_data(df["tokenized_triage_note"], vocab=vocab))

The corpus contains 36098 unique tokens (1945855 tokens in total).
The corpus contains 32763 unique tokens (1582736 tokens in total).
The corpus contains 12465 unique tokens (1521626 tokens in total).


### Spelling correction

In [9]:
# Apply `correct_tokens` with the misspellings dictionary to each tokenised
# note and store the result in a new column
df["normalised_triage_note"] = df["tokenized_triage_note"].apply(
    correct_tokens, _dict=misspelled_dict
)

# Token counts after spelling correction: defaults, valid=True, then tokens
# counted against the ED vocabulary
corrected_notes = df["normalised_triage_note"]
for count_kwargs in ({}, {"valid": True}):
    print_token_counts(count_tokens(corrected_notes, **count_kwargs))
print_token_counts(count_vocab_tokens_in_data(corrected_notes, vocab=vocab))

The corpus contains 26739 unique tokens (1946656 tokens in total).
The corpus contains 23404 unique tokens (1583537 tokens in total).
The corpus contains 13152 unique tokens (1566851 tokens in total).


### Slang replacement

In [10]:
# Replace slang terms for medications using `slang_dict`.
# Bracket assignment is used on purpose: attribute-style assignment
# (`df.col = ...`) only updates a column that already exists; otherwise pandas
# sets a plain attribute on the DataFrame instead of creating a column.
# NOTE: this overwrites `normalised_triage_note`, so re-running this cell alone
# applies the slang replacement to already-replaced text.
df["normalised_triage_note"] = df["normalised_triage_note"].apply(correct_tokens, _dict=slang_dict)

# Token counts after slang replacement: all tokens, valid=True, and tokens
# counted against the ED vocabulary
print_token_counts(count_tokens(df["normalised_triage_note"]))
print_token_counts(count_tokens(df["normalised_triage_note"], valid=True))
print_token_counts(count_vocab_tokens_in_data(df["normalised_triage_note"], vocab=vocab))

The corpus contains 26734 unique tokens (1948931 tokens in total).
The corpus contains 23399 unique tokens (1585812 tokens in total).
The corpus contains 13143 unique tokens (1565061 tokens in total).


___
# Extract entities

In [11]:
# Apply `select_valid_tokens` to each normalised note and store the result
# in a new `entities` column
df["entities"] = df["normalised_triage_note"].apply(select_valid_tokens)

# Token counts for the extracted entities: defaults, valid=True, then tokens
# counted against the ED vocabulary
entity_notes = df["entities"]
for count_kwargs in ({}, {"valid": True}):
    print_token_counts(count_tokens(entity_notes, **count_kwargs))
print_token_counts(count_vocab_tokens_in_data(entity_notes, vocab=vocab))

The corpus contains 23399 unique tokens (1585812 tokens in total).
The corpus contains 23399 unique tokens (1585812 tokens in total).
The corpus contains 13143 unique tokens (1565061 tokens in total).


### Save dataset

In [12]:
# Strip any "_cleaned" marker from the dataset name before building the output name
data_filename = data_filename.replace("_cleaned", "")

# Write the normalised dataset to the processed-data directory as parquet
normalised_path = data_proc_dir / (data_filename + "_normalised.parquet")
df.to_parquet(normalised_path, engine="pyarrow")