In [1]:
from self_harm_triage_notes.config import data_interim_dir, data_proc_dir
from self_harm_triage_notes.text import *
from self_harm_triage_notes.dev import *
import pandas as pd

In [2]:
# The ED vocabulary and the dictionary of misspellings are both
# looked up under the same name
vocab_filename = spell_filename = "rmh_2012_2017_dev_amt6"

# Name of the dataset to normalise (".parquet" is appended when it is read)
data_filename = "lvrh_2012_2022_cleaned"

In [3]:
# Load the ED vocabulary selected by vocab_filename
vocab = load_vocab(vocab_filename)

# Load the dictionary of corrected misspellings selected by spell_filename
misspelled_dict = load_misspelled_dict(spell_filename)

# Load a dictionary of slang terms for medications
slang_dict = load_slang_dict()

Domain-specific vocabulary contains 20109 words.
Spelling correction available for 43695 words.
Slang available for 20 words.


___
# Normalise unseen triage notes
### Load unseen data

In [4]:
# Read the interim dataset selected by data_filename
input_path = data_interim_dir / f"{data_filename}.parquet"
df = pd.read_parquet(input_path, engine="pyarrow")

# Report the dimensions, then preview the first rows
print(df.shape)
df.head()

(335209, 13)


Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,audit_case,quarter,length
0,LVRH2-2012-2,female,53,self/community/pt,2012-01-01 00:30:00,2012,PT PRESENTS WITH ?SOMETHING IN HER LEFT EYE. P...,0,0,0,0,2012Q1,334
1,LVRH2-2012-3,male,24,road ambulance,2012-01-01 00:56:00,2012,BIBA POST WITTNESSED ASSAULT APPROX 2330HRS. H...,0,0,0,0,2012Q1,341
2,LVRH2-2012-4,male,21,self/community/pt,2012-01-01 01:07:00,2012,PAST WEEK HAS BEEN SMOKING MARIJUANA. 2/7 EAR ...,0,0,0,0,2012Q1,257
3,LVRH2-2012-6,female,14,road ambulance,2012-01-01 01:40:00,2012,HAS BEEN DRINKING CRUISERS SINCE 1800HRS TILL ...,0,0,0,0,2012Q1,317
4,LVRH2-2012-7,male,56,self/community/pt,2012-01-01 01:58:00,2012,PT PRESENTS WITH CENTRAL CHEST TIGHTNESS RADIA...,0,0,0,0,2012Q1,364


In [5]:
# Token statistics of the raw notes: all tokens first, then valid tokens only
for counter in (count_tokens, count_valid_tokens):
    print_token_counts(counter(df['triage_note']))

The corpus contains 389538 unique tokens (15398247 tokens in total).
The corpus contains 362946 unique tokens (14623818 tokens in total).


### Pre-process

In [6]:
# Run the pre-processing function over every raw triage note
df['preprocessed_triage_note'] = df['triage_note'].apply(preprocess)

# Token statistics after pre-processing: all tokens, then valid tokens only
for counter in (count_tokens, count_valid_tokens):
    print_token_counts(counter(df['preprocessed_triage_note']))

The corpus contains 189501 unique tokens (15504832 tokens in total).
The corpus contains 170988 unique tokens (14592846 tokens in total).


### Tokenise

In [7]:
# First tokenisation pass over the pre-processed notes
df['tokenized_triage_note'] = tokenize_step1(df['preprocessed_triage_note'])

# Token statistics after the first pass: all tokens, then valid tokens only
for counter in (count_tokens, count_valid_tokens):
    print_token_counts(counter(df['tokenized_triage_note']))

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
  return re.compile(expression)


The corpus contains 136285 unique tokens (17026349 tokens in total).
The corpus contains 121863 unique tokens (14727196 tokens in total).


### Re-tokenise

In [8]:
# Second tokenisation pass: takes the ED vocabulary as well and
# overwrites the column produced by the first pass
df['tokenized_triage_note'] = tokenize_step2(df['tokenized_triage_note'], vocab)

# Token statistics: all tokens, valid tokens, valid tokens found in the vocabulary
retokenized_notes = df['tokenized_triage_note']
print_token_counts(count_tokens(retokenized_notes))
print_token_counts(count_valid_tokens(retokenized_notes))
print_token_counts(count_valid_tokens_in_vocab(retokenized_notes, vocab=vocab))

The corpus contains 111434 unique tokens (17265930 tokens in total).
The corpus contains 97008 unique tokens (14844617 tokens in total).
The corpus contains 17152 unique tokens (14332742 tokens in total).


### Spelling correction

In [9]:
# Apply correct_tokens with the misspelling dictionary to every tokenised note
tokenized_notes = df['tokenized_triage_note']
df['normalised_triage_note'] = tokenized_notes.apply(correct_tokens, _dict=misspelled_dict)

# Token statistics: all tokens, valid tokens, valid tokens found in the vocabulary
spell_corrected_notes = df['normalised_triage_note']
print_token_counts(count_tokens(spell_corrected_notes))
print_token_counts(count_valid_tokens(spell_corrected_notes))
print_token_counts(count_valid_tokens_in_vocab(spell_corrected_notes, vocab=vocab))

The corpus contains 93714 unique tokens (17277148 tokens in total).
The corpus contains 79288 unique tokens (14855835 tokens in total).
The corpus contains 17482 unique tokens (14656885 tokens in total).


### Slang replacement

In [10]:
# Apply correct_tokens again, this time with the medication slang dictionary;
# the result overwrites the spelling-corrected column
df['normalised_triage_note'] = df['normalised_triage_note'].apply(correct_tokens, _dict=slang_dict)

# Token statistics: all tokens, valid tokens, valid tokens found in the vocabulary
slang_replaced_notes = df['normalised_triage_note']
print_token_counts(count_tokens(slang_replaced_notes))
print_token_counts(count_valid_tokens(slang_replaced_notes))
print_token_counts(count_valid_tokens_in_vocab(slang_replaced_notes, vocab=vocab))

The corpus contains 93705 unique tokens (17288509 tokens in total).
The corpus contains 79279 unique tokens (14867196 tokens in total).
The corpus contains 17470 unique tokens (14646517 tokens in total).


___
# Extract entities

In [11]:
def select_valid_tokens(text):
    """
    Keep only the valid tokens of a text.

    The text is split on whitespace, every token for which
    is_valid_token is falsy is dropped, and the surviving tokens are
    joined with single spaces in their original order.
    """
    return ' '.join(filter(is_valid_token, text.split()))

In [12]:
# Build the entities column by passing every normalised note through select_valid_tokens
df['entities'] = df['normalised_triage_note'].apply(select_valid_tokens)

# Token statistics: all tokens, valid tokens, valid tokens found in the vocabulary
entity_notes = df['entities']
print_token_counts(count_tokens(entity_notes))
print_token_counts(count_valid_tokens(entity_notes))
print_token_counts(count_valid_tokens_in_vocab(entity_notes, vocab=vocab))

The corpus contains 79279 unique tokens (14867196 tokens in total).
The corpus contains 79279 unique tokens (14867196 tokens in total).
The corpus contains 17470 unique tokens (14646517 tokens in total).


### Save dataset

In [13]:
# Derive the output name in its own variable rather than overwriting
# data_filename: the loading cell builds its input path from data_filename,
# so it must keep its "_cleaned" suffix if that cell is re-run afterwards
output_filename = data_filename.replace('_cleaned', '') + "_normalised"

# Write the full frame, including the columns added by this notebook
df.to_parquet(data_proc_dir / (output_filename + ".parquet"), engine="pyarrow")