>**Generate a word frequency list**
>
>This notebook loads the vocabulary learned from the MIMIC-III free-text notes and uses it as a starting point to generate a custom word frequency list. We expand the vocabulary by adding names of common drugs (including generic, brand, and slang names) and local mental health organisations.
The word frequency list is generated by parsing the whole dataset and appending to an empty list every word that is known to the vocabulary.

In [1]:
import pandas as pd
import json
from self_harm_triage_notes.config import *
from self_harm_triage_notes.text import *
# import pickle
# import re


# from devutils import *

In [2]:
dev_data_filename = "rmh_2012_2017_dev" # rmh_2012_2017_dev, lvrh_2012_2017_dev

___
# Prepare data
### Load RMH development data

In [None]:
df = pd.read_parquet(data_interim_dir / "rmh_2012_2017_dev.parquet", engine="pyarrow")
print(df.shape)
df.head()

In [None]:
print_token_counts(count_tokens(df.triage_note))
print_token_counts(count_valid_tokens(df.triage_note))

### Pre-process

In [None]:
# Pre-processing
df['preprocessed_triage_note'] = df.triage_note.apply(preprocess)

print_token_counts(count_tokens(df.preprocessed_triage_note))
print_token_counts(count_valid_tokens(df.preprocessed_triage_note))

### Tokenise

In [None]:
# Create tokenised text
df['tokenized_triage_note'] = tokenize_step1(df.preprocessed_triage_note)

print_token_counts(count_tokens(df.tokenized_triage_note))
print_token_counts(count_valid_tokens(df.tokenized_triage_note))

df.to_parquet(data_interim_dir / (dev_data_filename + "_tokenised_step1.parquet"), engine="pyarrow")

In [3]:
df = pd.read_parquet(data_interim_dir / (dev_data_filename + "_tokenised_step1.parquet"), engine="pyarrow")
print_token_counts(count_tokens(df.tokenized_triage_note))
print_token_counts(count_valid_tokens(df.tokenized_triage_note))

The corpus contains 102666 unique tokens (7633149 tokens in total).
The corpus contains 95251 unique tokens (6265328 tokens in total).


___
# Baseline vocabulary

### General English

In [4]:
# List all available files
files_eng = [str(path) for path in scowl_dir.glob("english-words*")]
files_aus = [str(path) for path in scowl_dir.glob("australian-words*")]
files_upp = [str(path) for path in scowl_dir.glob("english-upper*")]
files = files_eng + files_aus + files_upp

# Select word lists <= 70
files = [f for f in files if int(f.split('.')[-1]) <= 70]

# Create an empty list
english_words = []

# Load and append lists of words to the common list
for filename in files:
    with open (filename, 'rb') as f:
        words = f.read().decode(errors='ignore')

    english_words.extend(words.replace('\n', ' ').lower().split())

    print("Total number of words: %d" % len(english_words))

# Convert to a set
english_words = set(english_words)
print("English, australian, and english upper word lists (<= 70) result in a total of %d words." % 
      len(english_words))

Total number of words: 7951
Total number of words: 12324
Total number of words: 36120
Total number of words: 72223
Total number of words: 85661
Total number of words: 118931
Total number of words: 125322
Total number of words: 131555
Total number of words: 131779
Total number of words: 133092
Total number of words: 133314
Total number of words: 133360
Total number of words: 133556
Total number of words: 134104
Total number of words: 134906
Total number of words: 136391
Total number of words: 136877
Total number of words: 143176
Total number of words: 143915
Total number of words: 144133
Total number of words: 154476
Total number of words: 154489
English, australian, and english upper word lists (<= 70) result in a total of 152186 words.


### AMT Australian ED reference set

In [5]:
# Load reference set
df_amt = pd.read_csv(amt_ed_path, sep='\t')
df_amt.columns = df_amt.columns.str.lower().str.replace(' ', '_')

fully_spec = {
    word
    for line in df_amt.fully_specified_name.dropna().str.lower().str.replace('(disorder)', '').str.replace('(finding)', '').apply(lambda x: re.sub("[:,()/]", " ", x).strip()).tolist()
    for word in line.split()
}
print(len(fully_spec), "fully specified terms")

# pref_terms = {
#     word
#     for line in df_amt.preferred_term.dropna().str.lower().apply(lambda x: re.sub("[:,()/]", " ", x).strip()).tolist()
#     for word in line.split()
# }
# print(len(pref_terms), "preferred terms")

# acc_syn = [
#     word
#     for line in df_amt.acceptable_synonyms.dropna().str.lower().str.replace(' - ', '').apply(lambda x: re.sub("[:,()/\\\]", " ", x).strip()).tolist()
#     for word in line.split()
# ]

ed_terms = fully_spec#.union(pref_terms)#set(fully_spec + pref_terms)# + acc_syn)
print("Aus ED ref set contains %d words." % len(ed_terms))

23311 fully specified terms
Aus ED ref set contains 23311 words.


### AMT Medicinal and trade product reference sets

In [6]:
# Medicinal products
df_amt = pd.read_csv(amt_mp_path, sep='\t')
df_amt.columns = df_amt.columns.str.lower().str.replace(' ', '_')

medicinal_products = [
    word
    for line in df_amt.preferred_term.dropna().str.lower().apply(lambda x: re.sub("[+,()]", "", x).strip()).tolist()
    for word in line.split()
    if word.isalpha()
]

# Trade products
df_amt = pd.read_csv(amt_tp_path, sep='\t')
df_amt.columns = df_amt.columns.str.lower().str.replace(' ', '_')

trade_products = [
    word
    for line in df_amt.preferred_term.dropna().str.lower().apply(lambda x: re.sub("[/,()]", " ", x).strip()).tolist()
    for word in line.split()
    if word.isalpha()
]

drug_names = set(medicinal_products + trade_products)
print("AMT medicinal and trade product ref sets contains %d words." % len(drug_names))

AMT medicinal and trade product ref sets contains 7858 words.


### Names of places in Vic

In [7]:
# Load the list of places in Victoria
with open (resources_dir / 'places_in_victoria.txt', 'r') as f:
    vic_places = f.read()
    
vic_places = set(vic_places.replace('\n', ' ').lower().split())
print("List of names of places in Vic contains %d words." % len(vic_places))

List of names of places in Vic contains 1510 words.


### Local medical organisations

In [8]:
# List of local medical organisations
med_orgs = [
    "ecatt", 
    "orygen", 
    "saapu", 
    "vccc", # Victorian Comprehensive Cancer Centre
    "pmcc", # Peter MacCallum Cancer Centre
    "metcall", 
    "wgh", # West Gippsland Hospital
    "rmh", # Royal Melbourne Hospital
    "rwh", # Royal Women's Hospital
    "rdns", # Royal District Nursing Service
    "emh",
    "alfred",
    "epworth",
    "jfh",
 ]

### Other terms

In [9]:
# List of units
units = [
    "km",
    "kms",
    "mins",
    "min",
    "kph",
    "kmh",
    "sec",
    "yr",
    "yrs",
    "mph",
    "mls",
    "mgs",
    "l",
]

# List of highly specific medical terms
med_terms = [
    "headstrike",
    "painfree",
    "section351",
    "penthrane",
    "normotensive",
    "ictal",
    "cspine",
    "laminectomy",
    "holter",
    "unwellness",
    "presyncopal",
    "hemi",
]   
#     "unrousable",
#     "unrecordable",
#     "batcall",
#     "acopia", 
#     "daswest",
#     "neurovasc", 
#     "vasc", 
#     "bibp", 
#     "headstrike", 
#     "n&v", 
#     "v&d",
#     "d&v",
#     "n&v&d",
#     "foosh", 
#     "esrf", 
#     "urti", 
#     "lrti",
#     "cwms", 
#     "lvf", 
#     "stml", 
#     "permacath", 
#     "section351", 
#     "weightbear", 
#     "creps", 
#     "warfarinised", 
#     "midzone", 
#     "burnshield", 
#     "aperient", 
#     "haemoserous"
# ]

# Common incorrectly spelled words contained in med7
incorrect_spelling = [
    "diarrhea",
]
#     'sucidal', 
#     'sucide', 
#     'intermitantly',
#     'intermitent',
#     'intermitently',
#     'intermittant',
#     'intermittantly',
#     'intermittenly', 
#     'spontaenous',
#     'spontan',
#     'spontanious',
#     'spontaniously',
#     'spontanous',
#     'spontanously'

> `soboe` = `sobe` = shortness of breath on exertion
>
> `n&v` = nausea & vomiting
>
> `foosh` = falling on outstretched hand
>
> `cabg` = `cabgs` = coronary artery bypass surgery 
>
> `exac` = exacerbation
>
> `oab` = `oabs` = `abs` = [oral] antibiotics
>
> `hlcnh` = high-level care nursing home
>
> `nv` = `nvasc` = `n'vasc` = `neurovasc` = neuro-vascular
>
> `esrf` = end-stage renal failure 
>
> `esr` = erythrocyte Sedimentation Rate
> 
> `urti` = upper respiratory tract infection 
>
> `lrti` = lower respiratory tract infection
>
> `uti` = urinary tract infection
>
> `fpx4` = `fp4` = focus point at 4mm ?? 
>
> `cwms` = `cms` = circulation [warmth] motion sensation

abbs = {
    'sob': 'shortness of breath',
    'loc': 'loss of consciousness',
    'lac': 'laceration',
    'gtn': 'glyceryl trinitrate',
    'ami': 'acute myocardial infarction',
    'mva': 'motor vehicle accident',
    'nad': 'no abnormality detected',
    'mas': 'motor assessment scale',
    'cva': 'cerebrovascular accident'
}

In [10]:
# Create a baseline vocabulary
base_vocab = english_words

# Add ED-specific terms
base_vocab.update(ed_terms)

# Add drug names
base_vocab.update(drug_names)

# Add places in Vic
base_vocab.update(vic_places)

# Add local medical organisations
base_vocab.update(med_orgs)

# Add highly specific medical terms
base_vocab.update(med_terms)

# Add units
base_vocab.update(units)

# Remove incorrectly spelt words
base_vocab = {v for v in base_vocab if v not in incorrect_spelling}

print("Combined base vocabulary contains %d  words." % len(base_vocab))

Combined base vocabulary contains 172551  words.


### Abbreviations

In [11]:
df_abbs = pd.read_excel(abbr_path, header=0)
df_abbs.columns = ['abbreviation_raw', 'meaning']
df_abbs['abbreviation_lower'] = df_abbs.abbreviation_raw.str.lower()

# df_abbs['all_caps'] = np.where(df_abbs.abbreviation_raw==df_abbs.abbreviation_raw.str.upper(), 
#                                True, False)
# df_abbs['abbreviation'] = df_abbs.apply(lambda x: x.abbreviation_raw if x.all_caps else x.abbreviation_lower, axis=1)

print("List of abbreviations contains %d words." % df_abbs.shape[0])

List of abbreviations contains 2272 words.


___
# Filter vocabulary and abbreviations
### Filter the base vocab

In [12]:
# Create a vocabulary of known tokens
counts = count_valid_tokens(df.tokenized_triage_note)
filtered_vocab = set(k for k in counts.keys() if k in base_vocab)
print(len(filtered_vocab))

19235


### Filter abbreviations

In [13]:
# Filter out abbreviations that do not occur in the data in their raw form
counts = count_valid_tokens(df.triage_note)
filtered_abbs = set(df_abbs.loc[df_abbs.abbreviation_raw.isin(counts), 'abbreviation_lower'])
print(len(filtered_abbs))

1000


### Re-tokenise

In [14]:
# Re-tokenise text
df.tokenized_triage_note = tokenize_step2(df.tokenized_triage_note, filtered_vocab.union(filtered_abbs))

# Update the dictionary of tokens with counts
counts = count_valid_tokens(df.tokenized_triage_note)
word_list = Counter({k:v for k,v in counts.items() if k in base_vocab or k in filtered_abbs})
len(word_list)

20109

In [15]:
# Most common words
sorted(word_list.items(),  key=lambda item: item[1], reverse=True)

[('pain', 176829),
 ('to', 154301),
 ('nil', 125537),
 ('and', 124430),
 ('with', 104017),
 ('on', 77636),
 ('phx', 75137),
 ('at', 64320),
 ('of', 62606),
 ('left', 58498),
 ('for', 57836),
 ('hx', 57619),
 ('in', 54572),
 ('pt', 53137),
 ('right', 51052),
 ('from', 41733),
 ('post', 39118),
 ('chest', 35283),
 ('ago', 32930),
 ('gcs', 32633),
 ('by', 30836),
 ('pmhx', 29824),
 ('gp', 29358),
 ('sob', 28975),
 ('abdo', 28816),
 ('denies', 27290),
 ('rate', 26840),
 ('has', 26816),
 ('today', 26595),
 ('l', 26478),
 ('not', 26174),
 ('nausea', 25932),
 ('states', 25493),
 ('back', 25196),
 ('loc', 24851),
 ('last', 24831),
 ('o/a', 24514),
 ('fall', 23941),
 ('heart', 23638),
 ('no', 23604),
 ('since', 23003),
 ('r', 22800),
 ('now', 21953),
 ('pmh', 21613),
 ('c/o', 21503),
 ('triage', 21379),
 ('this', 20753),
 ('onset', 20028),
 ('av', 19595),
 ('ht', 18004),
 ('swelling', 17812),
 ('had', 17086),
 ('worse', 16985),
 ('blood', 16872),
 ('x', 16716),
 ('hours', 16677),
 ('headache', 

In [16]:
# Most common words with 3 letters
sorted({k: v for k,v in word_list.items() if len(k)==3}.items(),  
       key=lambda item: item[1], reverse=True)

[('nil', 125537),
 ('and', 124430),
 ('phx', 75137),
 ('for', 57836),
 ('ago', 32930),
 ('gcs', 32633),
 ('sob', 28975),
 ('has', 26816),
 ('not', 26174),
 ('loc', 24851),
 ('o/a', 24514),
 ('now', 21953),
 ('pmh', 21613),
 ('c/o', 21503),
 ('had', 17086),
 ('leg', 14532),
 ('arm', 14369),
 ('the', 12355),
 ('lac', 11461),
 ('but', 11175),
 ('htn', 10276),
 ('eye', 8618),
 ('car', 7979),
 ('non', 7723),
 ('pmx', 7469),
 ('gtn', 7352),
 ('was', 7006),
 ('d/c', 6556),
 ('ami', 6303),
 ('due', 6250),
 ('lmo', 5968),
 ('hit', 5815),
 ('per', 5764),
 ('hip', 5627),
 ('rmh', 5352),
 ('mva', 5255),
 ('nad', 5214),
 ('out', 5133),
 ('off', 4824),
 ('ccf', 4389),
 ('mas', 4145),
 ('few', 4089),
 ('cva', 4073),
 ('bsl', 4040),
 ('red', 3938),
 ('ppm', 3885),
 ('uti', 3746),
 ('ihd', 3653),
 ('rom', 3650),
 ('all', 3645),
 ('low', 3623),
 ('see', 3546),
 ('ear', 3398),
 ('hot', 3378),
 ('use', 3321),
 ('jaw', 3259),
 ('ecg', 3195),
 ('dry', 3105),
 ('ccp', 3102),
 ('any', 3096),
 ('bed', 2944),
 

### Save the word list and vocabulary

In [17]:
with open(spell_corr_dir / (dev_data_filename + "_amt6_word_freq_list.json"), 'w') as f:
    json.dump(word_list, f)
with open(spell_corr_dir / (dev_data_filename + "_amt6_vocab.json"), 'w') as f:
    json.dump(list(word_list.keys()), f)

### Save dataset

In [18]:
df.to_parquet(data_interim_dir / (dev_data_filename + "_amt6_nospellcorr.parquet"), engine="pyarrow")