# HealTAC 2023: MiADE (Medical information AI Data Extractor) Tutorial Demo

In [1]:
from pathlib import Path
from typing import List, Dict, Optional

from miade.core import NoteProcessor
from miade.note import Note
from miade.annotators import Annotator
from miade.concept import Concept, Category
from miade.dosageextractor import DosageExtractor
from miade.utils.miade_cat import MiADE_CAT

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Directory handed to NoteProcessor below; it is expected to contain the MedCAT model packs and config.yaml
MODEL_DIRECTORY = "../dev_models/current/"

## Part 1: Extracting concepts and dosages from notes using the full MiADE pipeline

### Configuring the MiADE processor

```NoteProcessor``` is the MiADE core. It is initialised with a model directory path that contains all the MedCAT model zips we would like to use in our pipeline, and a config.yaml file that maps an alias to the model IDs and annotators we would like to use:

```
models:
  problems: af37e525e37cdb1a
  meds/allergies: a146c741501cf1f7
annotators:
  problems: ProblemsAnnotator
  meds/allergies: MedsAllergiesAnnotator
```

In [7]:
miade = NoteProcessor(Path(MODEL_DIRECTORY))

[2023-06-09 16:17:01,921] [INFO] miade.core: Found config file ../dev_models/current/config.yaml
[2023-06-09 16:17:01,923] [INFO] miade.core: Loading MedCAT models from ../dev_models/current
[2023-06-09 16:17:10,241] [INFO] miade.dosageextractor: Loaded drug dosage extractor with model en_core_med7_lg


Once ```NoteProcessor``` is initialised, we can add annotators by the aliases we have specified in ```config.yaml``` to our processor. 

When adding annotators, we have the option to add [NegSpacy](https://spacy.io/universe/project/negspacy) to the MedCAT spaCy pipeline, which implements the NegEx algorithm for negation detection. This allows the models to perform simple rule-based negation detection in the absence of MetaCAT models.

In [8]:
miade.add_annotator("problems", use_negex=True)

[2023-06-09 16:17:32,645] [INFO] miade.core: Added ProblemsAnnotator to processor
[2023-06-09 16:17:32,653] [INFO] miade.core: Added Negex context detection for ProblemsAnnotator


In [5]:
miade.add_annotator("meds/allergies")

[2023-06-09 14:53:56,196] [INFO] miade.core: Added MedsAllergiesAnnotator to processor


In [6]:
# print the MedCAT model cards to check that it's been mapped to the correct annotators
miade.print_model_cards()

ProblemsAnnotator: {
  "Model ID": "af37e525e37cdb1a",
  "Last Modified On": "13 December 2022",
  "History (from least to most recent)": [
    "0b130195e6964e66",
    "353abfe8f57e2009"
  ],
  "Description": "MiADE problems development model unsupervised trained on MIMIC-III. cdb: full_condition_list_MedCAT_cdb.csv; vocab: medcat model 811218c0b819c304 (feb 2022)",
  "Source Ontology": "SNOMEDUK_May2022 MiADE subset",
  "Location": "gae03:gae/miade/models/problems/, miade-dev:vol/models/problems/",
  "MetaCAT models": {},
  "Basic CDB Stats": {
    "Number of concepts": 118431,
    "Number of names": 561230,
    "Number of concepts that received training": 13256,
    "Number of seen training examples in total": 12247485,
    "Average training examples per concept": 923.9201116475558
  },
  "Performance": {
    "ner": {},
    "meta": {}
  },
  "Important Parameters (Partial view, all available in cat.config)": {
    "config.ner.min_name_len": {
      "value": 3,
      "description": "M

### Creating a Note

Next we will create a ```Note``` object, which contains the text we would like to extract concepts and dosages from.

In [8]:
text = """Suspected heart failure

Previous medical hist:
myocardial infarction
Hypothyroidism

Current meds:
Losartan 100mg daily
Atorvastatin 20mg daily
Paracetamol 500mg tablets 2 tabs qds prn

Allergies:
Penicillin - rash

Referred with swollen ankles and shortness of breath since 2 weeks."""

In [9]:
note = Note(text)

### Extracting concepts and dosages

In [10]:
concepts = miade.get_concept_dicts(note, filter_uncategorized=True)

In [11]:
concepts

[{'name': 'hypothyroidism',
  'id': '40930008',
  'category': 'PROBLEM',
  'start': 46,
  'end': 60,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'heart failure',
  'id': '84114007',
  'category': 'PROBLEM',
  'start': 10,
  'end': 23,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'dyspnea',
  'id': '267036007',
  'category': 'PROBLEM',
  'start': 226,
  'end': 245,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'swollen ankle',
  'id': '267039000',
  'category': 'PROBLEM',
  'start': 207,
  'end': 221,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'eruption of skin',
  'id': '271807003',
  'category': 'PROBLEM',
  'start': 187,
  'end': 191,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'losartan-containing product',
  'id': '96309000',
  'category': 'MEDICATION',
  'start': 76,
  'end': 84,
  'dosage': {'dose': {'sou

#### Handling existing records - deduplication

In [12]:
# create a list of concepts that already exist in the patient record;
# it is passed as `record_concepts` below so these are deduplicated against the concepts found in the note
record_concepts = [
    Concept(id="40930008", name="hypothyroidism", category=Category.PROBLEM),
    Concept(id="267039000", name="swollen ankle", category=Category.PROBLEM)
]

In [13]:
miade.get_concept_dicts(note=note, record_concepts=record_concepts, filter_uncategorized=True)

[{'name': 'heart failure',
  'id': '84114007',
  'category': 'PROBLEM',
  'start': 10,
  'end': 23,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'dyspnea',
  'id': '267036007',
  'category': 'PROBLEM',
  'start': 226,
  'end': 245,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'eruption of skin',
  'id': '271807003',
  'category': 'PROBLEM',
  'start': 187,
  'end': 191,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'losartan-containing product',
  'id': '96309000',
  'category': 'MEDICATION',
  'start': 76,
  'end': 84,
  'dosage': {'dose': {'source': '100 mg',
    'value': 100.0,
    'unit': 'mg',
    'low': None,
    'high': None},
   'duration': None,
   'frequency': {'source': 'start 100 mg every day ',
    'value': 1.0,
    'unit': 'd',
    'low': None,
    'high': None,
    'standardDeviation': None,
    'institutionSpecified': False,
    'preconditionAsRequired': False},
   'r

#### Removing annotators

In [14]:
# we can also remove annotators if no longer wanted
miade.remove_annotator("meds/allergies")

[2023-06-09 14:56:39,441] [INFO] miade.core: Removed MedsAllergiesAnnotator from processor


In [15]:
miade.get_concept_dicts(note=note, record_concepts=record_concepts)

[{'name': 'heart failure',
  'id': '84114007',
  'category': 'PROBLEM',
  'start': 10,
  'end': 23,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'dyspnea',
  'id': '267036007',
  'category': 'PROBLEM',
  'start': 226,
  'end': 245,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'eruption of skin',
  'id': '271807003',
  'category': 'PROBLEM',
  'start': 187,
  'end': 191,
  'dosage': None,
  'negex': False,
  'meta': None,
  'debug': None}]

## Part 2: Customising MiADE

### 2.1 Building custom MedCAT models

For more information on **MedCAT**:

- [Github](https://github.com/CogStack/MedCAT)
- [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials)
- [Paper](https://arxiv.org/abs/2010.01165)

In [13]:
from miade.model_builders.cdbbuilder import CDBBuilder
from miade.model_builders.vocabbuilder import VocabBuilder

In [15]:
CDB_DATA_PATH = "../dev_models/test_cdb/meds_allergies_cdb.csv"

In [46]:
TEXT_DATA_PATH = "../tests/examples/wikipedia_sample.txt"

#### Create a CDB (concept database) builder

In [22]:
# Build the CDB from our custom CSV; temp_dir points at a ".temp" folder under the current working directory
cdbbuilder = CDBBuilder(
    custom_data_paths=[Path(CDB_DATA_PATH)],
    temp_dir=Path.cwd().joinpath(".temp"),
)

In [23]:
cdbbuilder.preprocess()
cdb = cdbbuilder.create_cdb()

In [52]:
cdb.vocab

{'acetaminophen': 1,
 '500mg': 3,
 'oral': 4,
 'tablet': 4,
 'paracetamol': 1,
 'ibuprofen': 1,
 '200mg': 1,
 'penicillin': 1,
 'coconut': 1,
 'metformin': 1,
 'hydrochloride': 1,
 'peanut': 1,
 'butter': 1,
 'a': 6,
 'rash': 2,
 'complaining': 1,
 'of': 37,
 'facial': 1,
 'swelling': 1,
 'symptoms': 4,
 'depression': 2,
 'symptom': 2,
 'depressive': 2}

#### Create a Vocab builder

In [47]:
# Read the training text line by line, trimming surrounding whitespace from every line
with open(TEXT_DATA_PATH, encoding="utf-8") as training_data:
    training_data_list = list(map(str.strip, training_data))

In [27]:
training_data_list

['Lung cancer, also known as bronchial carcinoma, since about 98–99% of all lung cancers are carcinomas, is a malignant lung tumor characterized by uncontrolled cell growth in tissues of the lung. Lung carcinomas derive from transformed, malignant cells that originate as epithelial cells, or from tissues composed of epithelial cells. Other lung cancers, such as the rare sarcomas of the lung, are generated by the malignant transformation of connective tissues (i.e. nerve, fat, muscle, bone), which arise from mesenchymal cells. Lymphomas and melanomas (from lymphoid and melanocyte cell lineages) can also rarely result in lung cancer.',
 'In time, this uncontrolled growth can spread beyond the lung – either by direct extension, by entering the lymphatic circulation, or via the hematogenous, bloodborne spread – the process called metastasis – into nearby tissue or other, more distant parts of the body. Most cancers that start in the lung, known as primary lung cancers, are carcinomas. The 

In [42]:
vocab_builder = VocabBuilder()

In [44]:
vocab = vocab_builder.create_new_vocab(
    training_data_list=training_data_list,
    cdb=cdb,
    config=cdb.config
)

In [57]:
vocab.vocab.keys()

dict_keys(['lung', 'cancer', 'also', 'known', 'as', 'bronchial', 'carcinoma', 'since', 'about', '98', '99', 'of', 'all', 'cancers', 'are', 'carcinomas', 'is', 'a', 'malignant', 'tumor', 'characterized', 'by', 'uncontrolled', 'cell', 'growth', 'in', 'tissues', 'the', 'derive', 'from', 'transformed', 'cells', 'that', 'originate', 'epithelial', 'or', 'composed', 'other', 'such', 'rare', 'sarcomas', 'generated', 'transformation', 'connective', 'i', 'e', 'nerve', 'fat', 'muscle', 'bone', 'which', 'arise', 'mesenchymal', 'lymphomas', 'and', 'melanomas', 'lymphoid', 'melanocyte', 'lineages', 'can', 'rarely', 'result', 'time', 'this', 'spread', 'beyond', 'either', 'direct', 'extension', 'entering', 'lymphatic', 'circulation', 'via', 'hematogenous', 'bloodborne', 'process', 'called', 'metastasis', 'into', 'nearby', 'tissue', 'more', 'distant', 'parts', 'body', 'most', 'start', 'primary', 'two', 'main', 'types', 'small', 'sclc', 'non', 'nsclc', 'common', 'symptoms', 'coughing', 'including', 'up'

#### Save the model pack

In [62]:
vocab_builder.make_model_pack(cdb=cdb, save_name="miade_example")

#### Run training from script

MiADE provides scripts for automatically building MedCAT model packs, unsupervised training, supervised training steps, and the creation and training of MetaCAT models. 

The ```--synthetic-data-path``` option allows you to add synthetically generated training data in csv to the supervised and MetaCAT training steps.

In [None]:
# Placeholder inputs for the `miade` command-line cells below -- fill these in before running them.
# Every $NAME referenced by those cells is declared here so none is silently left undefined.
MODEL_PACK_PATH = ""      # used by: train, train-supervised, and the final MetaCAT packaging step
MEDCAT_JSON_EXPORT = ""   # used by: train-supervised, train-metacats
SYNTHETIC_CSV_PATH = ""   # used by: --synthetic-data-path (train-supervised, train-metacats)
VOCAB_MODEL_PATH = ""     # used by: build-model-pack
TOKENIZER_PATH = ""       # used by: create-metacats
CATEGORY_NAMES = ""       # used by: create-metacats
METACAT_MODEL_PATH = ""   # used by: train-metacats and the final MetaCAT packaging step

In [None]:
# Builds a model pack from given CDB in CSV format and a Vocab file from an existing model pack
!miade build-model-pack $CDB_DATA_PATH $VOCAB_MODEL_PATH

In [None]:
# Trains unsupervised training step of MedCAT model
!miade train $MODEL_PACK_PATH $TEXT_DATA_PATH --tag "miade-example"

In [None]:
# Trains supervised training step of MedCAT model
!miade train-supervised $MODEL_PACK_PATH $MEDCAT_JSON_EXPORT --synthetic-data-path $SYNTHETIC_CSV_PATH

In [49]:
# Creates tokenizer for MetaCAT
!miade create-bbpe-tokenizer $TEXT_DATA_PATH

[00:00:00] Pre-processing files (0 Mo)              ░░░░░░░░                  0%
[2K[1B[1A[00:00:00] Pre-processing files (0 Mo)              ████████                100%
[00:00:00] Tokenize words                           ████████ 0        /        0
[2K[1B[1A[00:00:00] Tokenize words                           ████████ 257      /      257

[2K[1B[1A[00:00:00] Count pairs                              ████████ 257      /      257

[2K[1B[1A[00:00:00] Compute merges                           ████████ 299      /      299



In [None]:
# Initialises MetaCAT models to do training on
!miade create-metacats $TOKENIZER_PATH $CATEGORY_NAMES

In [None]:
# Trains the MetaCAT Bi-LSTM models
!miade train-metacats $METACAT_MODEL_PATH $MEDCAT_JSON_EXPORT --synthetic-data-path $SYNTHETIC_CSV_PATH

In [None]:
# Packages MetaCAT models with the main MedCAT model pack
!miade add_metacat_models $MODEL_PACK_PATH $METACAT_MODEL_PATH

#### And more...

**MiADE training dashboard**: A streamlit app that allows you to interactively inspect synthetic data and train MetaCAT models https://github.com/uclh-criu/miade/tree/master/streamlit_app

### 2.2 Creating custom MiADE annotators

We can add custom annotators to MiADE by subclassing ```Annotator``` and initialising ```NoteProcessor``` with the list of custom annotators.

```Annotator``` methods include:

- ```.get_concepts()```: returns MedCAT output as MiADE ```Concepts```
- ```.add_dosages_to_concepts()```: uses ```DosageExtractor``` to add dosages associated with medication concepts
- ```.deduplicate()```: filters duplicate concepts in list 

In [11]:
# subclass Annotator
class CustomAnnotator(Annotator):
    """Example annotator: re-labels selected concepts and optionally attaches dosages."""

    def __init__(self, cat: MiADE_CAT):
        super().__init__(cat)
        # Category.MEDICATION has to be listed in concept_types so the MiADE processor will also extract dosages
        self.concept_types = [Category.MEDICATION, Category.ALLERGY]

    def postprocess(self, concepts: List[Concept]) -> List[Concept]:
        """Re-categorise concepts in place according to their id and return the same list."""
        # some example post-processing code: concept id -> category it should be given
        category_overrides = {
            "271807003": Category.REACTION,
            "764146007": Category.ALLERGY,
        }
        for item in concepts:
            if item.id in category_overrides:
                item.category = category_overrides[item.id]
        return concepts

    def __call__(
        self,
        note: Note,
        record_concepts: Optional[List[Concept]] = None,
        dosage_extractor: Optional[DosageExtractor] = None,
    ):
        """Get concepts from ``note``, post-process them, optionally add dosages, then deduplicate.

        ``record_concepts`` is forwarded to ``deduplicate``; ``dosage_extractor`` is only
        used when it is not ``None``.
        """
        processed = self.postprocess(self.get_concepts(note))
        # run dosage extractor if given
        if dosage_extractor is not None:
            processed = self.add_dosages_to_concepts(dosage_extractor, processed, note)
        return self.deduplicate(processed, record_concepts)
        

In [12]:
miade = NoteProcessor(Path(MODEL_DIRECTORY), custom_annotators=[CustomAnnotator])

[2023-06-09 17:21:40,468] [INFO] miade.core: Found config file ../dev_models/current/config.yaml
[2023-06-09 17:21:40,471] [INFO] miade.core: Loading MedCAT models from ../dev_models/current
[2023-06-09 17:21:48,928] [INFO] miade.dosageextractor: Loaded drug dosage extractor with model en_core_med7_lg


In [16]:
miade.add_annotator("problems")

[2023-06-09 17:29:08,572] [INFO] miade.core: Added ProblemsAnnotator to processor


In [13]:
miade.add_annotator("custom")

[2023-06-09 17:21:57,040] [INFO] miade.core: Added CustomAnnotator to processor


In [17]:
miade.get_concept_dicts(note, filter_uncategorized=True)

[{'name': 'losartan-containing product',
  'id': '96309000',
  'category': 'MEDICATION',
  'start': 76,
  'end': 84,
  'dosage': {'dose': {'source': '100 mg',
    'value': 100.0,
    'unit': 'mg',
    'low': None,
    'high': None},
   'duration': None,
   'frequency': {'source': 'start 100 mg every day ',
    'value': 1.0,
    'unit': 'd',
    'low': None,
    'high': None,
    'standardDeviation': None,
    'institutionSpecified': False,
    'preconditionAsRequired': False},
   'route': None},
  'negex': False,
  'meta': None,
  'debug': None},
 {'name': 'atorvastatin-containing product',
  'id': '108600003',
  'category': 'MEDICATION',
  'start': 97,
  'end': 109,
  'dosage': {'dose': {'source': '20 mg',
    'value': 20.0,
    'unit': 'mg',
    'low': None,
    'high': None},
   'duration': None,
   'frequency': {'source': 'start 20 mg every day ',
    'value': 1.0,
    'unit': 'd',
    'low': None,
    'high': None,
    'standardDeviation': None,
    'institutionSpecified': False,
