In [1]:
import pandas as pd

# Project imports
from self_harm_triage_notes.config import *
from self_harm_triage_notes.text_utils import *

In [2]:
# Name of the dataset to normalise (".parquet" is appended when it is read)
data_filename = "rmh_2012_2017_test"

# Name under which the ED vocabulary was stored
vocab_filename = "rmh_2012_2017_dev_amt6"

# Name under which the dictionary of misspellings was stored
spell_filename = "rmh_2012_2017_dev_amt6"

In [3]:
# Load the ED vocabulary identified by `vocab_filename` from `spell_corr_dir`.
# `vocab` is later passed to `tokenize_step2` and `count_vocab_tokens_in_data`.
# NOTE(review): each loader appears to print the size of what it loaded
# (see the cell output) - confirm in text_utils.
vocab = load_vocab(spell_corr_dir, vocab_filename)

# Load the dictionary of corrected misspellings identified by `spell_filename`
# from `spell_corr_dir`; it is handed to `correct_tokens` in the spelling
# correction step.
misspelled_dict = load_misspelled_dict(spell_corr_dir, spell_filename)

# Load a dictionary of slang terms for medications from `resources_dir`; it is
# handed to `correct_tokens` in the slang replacement step.
slang_dict = load_slang_dict(resources_dir)

Domain-specific vocabulary contains 20043 words.
Spelling correction available for 43863 words.
Slang available for 20 words.


___
# Normalise unseen triage notes
### Load unseen data

In [4]:
# Read the dataset named by `data_filename` from the interim data directory
input_path = interim_data_dir / f"{data_filename}.parquet"
df = pd.read_parquet(input_path, engine="pyarrow")

# Print (rows, columns), then show the first rows as the cell output
print(df.shape)
df.head()

(79823, 14)


Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,audit_case,source_system,quarter,length
0,RMH-4,male,51.0,other,2012-01-08 01:11:00,2012,L) sided flank pain same as previous renal col...,Negative,Negative,Negative,,Symphony,2012Q1,169
1,RMH-12,male,53.0,other,2012-01-08 03:27:00,2012,"Restless legs, tingling in spine, unable to sl...",Negative,Negative,Negative,,Symphony,2012Q1,105
2,RMH-14,male,54.0,road ambulance,2012-01-08 04:42:00,2012,generalised abdo pain radiating to back at 010...,Negative,Negative,Negative,,Symphony,2012Q1,181
3,RMH-17,male,41.0,other,2012-01-08 05:23:00,2012,"SOB feels like something in throat, thick sput...",Negative,Negative,Negative,,Symphony,2012Q1,157
4,RMH-20,male,49.0,other,2012-01-08 05:32:00,2012,prolapsed disc in c-spine due for MRI in 4/7. ...,Negative,Negative,Negative,,Symphony,2012Q1,185


In [5]:
# Token counts for the raw notes: default call first, then with valid=True
raw_notes = df["triage_note"]
print_token_counts(count_tokens(raw_notes))
print_token_counts(count_tokens(raw_notes, valid=True))

The corpus contains 94942 unique tokens (1646449 tokens in total).
The corpus contains 88433 unique tokens (1535539 tokens in total).


### Pre-process

In [6]:
# Apply `preprocess` to every raw note and store the result in a new column
df["preprocessed_triage_note"] = df["triage_note"].apply(preprocess)

# Token counts after pre-processing: default call first, then with valid=True
preprocessed_notes = df["preprocessed_triage_note"]
print_token_counts(count_tokens(preprocessed_notes))
print_token_counts(count_tokens(preprocessed_notes, valid=True))

The corpus contains 62061 unique tokens (1688011 tokens in total).
The corpus contains 57557 unique tokens (1556662 tokens in total).


### Tokenise

In [7]:
# First tokenisation pass: `tokenize_step1` receives the whole column of
# pre-processed notes and its result becomes a new column
df["tokenized_triage_note"] = tokenize_step1(df["preprocessed_triage_note"])

# Token counts after tokenisation: default call first, then with valid=True
tokenized_notes = df["tokenized_triage_note"]
print_token_counts(count_tokens(tokenized_notes))
print_token_counts(count_tokens(tokenized_notes, valid=True))

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
  return re.compile(expression)


The corpus contains 43687 unique tokens (1904918 tokens in total).
The corpus contains 40369 unique tokens (1563008 tokens in total).


### Re-tokenise

In [8]:
# Second tokenisation pass, driven by the ED vocabulary.
# The existing column is overwritten, so re-running this cell feeds the
# already re-tokenised text back into `tokenize_step2`.
df["tokenized_triage_note"] = tokenize_step2(df["tokenized_triage_note"], vocab)

# Token counts after re-tokenisation: default call, valid=True, and the
# vocabulary-based count
retokenized_notes = df["tokenized_triage_note"]
print_token_counts(count_tokens(retokenized_notes))
print_token_counts(count_tokens(retokenized_notes, valid=True))
print_token_counts(count_vocab_tokens_in_data(retokenized_notes, vocab=vocab))

The corpus contains 36030 unique tokens (1948311 tokens in total).
The corpus contains 32711 unique tokens (1584083 tokens in total).
The corpus contains 12447 unique tokens (1522843 tokens in total).


### Spelling correction

In [9]:
# Run `correct_tokens` with the misspelling dictionary over each tokenised note
# and keep the result in a new column
df["normalised_triage_note"] = df["tokenized_triage_note"].apply(
    lambda note: correct_tokens(note, _dict=misspelled_dict)
)

# Token counts after spelling correction: default call, valid=True, and the
# vocabulary-based count
corrected_notes = df["normalised_triage_note"]
print_token_counts(count_tokens(corrected_notes))
print_token_counts(count_tokens(corrected_notes, valid=True))
print_token_counts(count_vocab_tokens_in_data(corrected_notes, vocab=vocab))

The corpus contains 26636 unique tokens (1949128 tokens in total).
The corpus contains 23317 unique tokens (1584900 tokens in total).
The corpus contains 13103 unique tokens (1567889 tokens in total).


### Slang replacement

In [10]:
# Replace slang for medications
df.normalised_triage_note = df.normalised_triage_note.apply(correct_tokens, _dict=slang_dict)

print_token_counts(count_tokens(df.normalised_triage_note))
print_token_counts(count_tokens(df.normalised_triage_note, valid=True))
print_token_counts(count_vocab_tokens_in_data(df.normalised_triage_note, vocab=vocab))

The corpus contains 26631 unique tokens (1951418 tokens in total).
The corpus contains 23312 unique tokens (1587190 tokens in total).
The corpus contains 13094 unique tokens (1566187 tokens in total).


___
# Extract entities

In [11]:
# Store what `select_valid_tokens` returns for each normalised note
df["entities"] = df["normalised_triage_note"].apply(select_valid_tokens)

# Token counts for the extracted entities: default call, valid=True, and the
# vocabulary-based count
entity_notes = df["entities"]
print_token_counts(count_tokens(entity_notes))
print_token_counts(count_tokens(entity_notes, valid=True))
print_token_counts(count_vocab_tokens_in_data(entity_notes, vocab=vocab))

The corpus contains 23312 unique tokens (1587190 tokens in total).
The corpus contains 23312 unique tokens (1587190 tokens in total).
The corpus contains 13094 unique tokens (1566187 tokens in total).


### Save dataset

In [12]:
# Drop any '_cleaned' marker from the dataset name, then write the frame with
# all added columns to the processed data directory
data_filename = data_filename.replace('_cleaned', '')
output_path = proc_data_dir / f"{data_filename}_normalised.parquet"
df.to_parquet(output_path, engine="pyarrow")