In [41]:
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb import CDB
from icd9cms import search

In [7]:
import json
from copy import deepcopy
import pandas as pd

# 1) Load UMLS Concept Databases
- Base Vocab.
- Base UMLS CDB.
- Filtered ICD-10-CM code only CDB

In [11]:
# Create an empty MedCAT vocabulary and populate it from the saved base
# vocabulary file. This one `vocab` object is shared by every CAT instance
# built in this notebook.
vocab = Vocab()
vocab.load_dict('medcat_models/base_vocabulary.dat')

In [5]:
# Full UMLS concept database. In this notebook it is only consulted as a
# reference for which names map to more than one entry in `name2cui`.
full_umls_cdb = CDB()
full_umls_cdb.load_dict('medcat_models/0.2.7+ umls_2m_mimic.dat')

In [None]:
# Concept database loaded from the "ICD10_Only" model file; this is the CDB the
# first MedCAT instance below is built on and the one later marked trainable.
umls_cdb = CDB()
umls_cdb.load_dict('medcat_models/0.2.7+ - umls_base_mimic - ICD10_Only.dat')

In [None]:
# Build the MedCAT annotator from the ICD10-only CDB and the base vocabulary.
# NOTE: the name `cat` is rebound later in this notebook to a CAT built on the
# trainable CDB, so cell execution order matters.
cat = CAT(cdb=umls_cdb, vocab=vocab)

## 2) Run Pre-Trained MedCAT on DD Subsections
- Loads the vocab and pre-trained UMLS based Concept Database into a MedCAT instance.
- Reads the DD subsections from `data/mimic_dd_icd9_sections.csv`.
- Runs MedCAT over all DD subsections and collects the results as JSON.
- Dumps output to file 'mimic_dd_annos.json'

In [25]:
# Annotate every DD subsection with the current `cat` model and collect the
# parsed JSON result for each note.
notes = pd.read_csv('data/mimic_dd_icd9_sections.csv')
objs = [json.loads(cat.get_json(t)) for t in notes.text]

# Write through a context manager so the file handle is flushed and closed
# deterministically; passing a bare open(...) to json.dump leaks the handle
# and can leave a truncated file behind.
with open('mimic_dd_annos.json', 'w') as out_file:
    json.dump(objs, out_file)

## 3) Fine-Tune the MedCAT Model with training samples
- Firstly, ensure all names that are in the ICD-10 CDB are 'Trainable', by telling MedCAT to use training samples for the given concepts during training data annotation.

In [None]:
%%timeit
unique_names = [k for k, v in umls_cdb.name2cui.items() if len(v) == 1]
non_unique_names_from_full_umls_cdb = set([k for k, v in full_umls_cdb.name2cui.items() if len(v) > 1])
new_names = set(umls_cdb.name2cui.keys()) - set(umls_cdb.name_isunique.keys())

for name in new_names:
    if name in non_unique_names_from_full_umls_cdb:
        umls_cdb.name_isunique[name] = False

In [9]:
# Persist the ICD-10 CDB, including the `name_isunique` flags set above, under
# a separate "Trainable" file name so the original model file is left intact.
umls_cdb.save_dict('medcat_models/0.2.7+ - umls_base_mimic - ICD10_Only - Trainable.dat')

- 'Train' the model with collected samples

In [29]:
# Start from a fresh, empty CDB object for the fine-tuning run.
cdb = CDB()

In [30]:
# Load the "Trainable" CDB file that was saved earlier in this notebook.
cdb.load_dict('medcat_models/0.2.7+ - umls_base_mimic - ICD10_Only - Trainable.dat')

In [31]:
# Rebinds `cat` (previously built on `umls_cdb`) to a MedCAT instance backed by
# the trainable CDB, reusing the same base vocabulary.
cat = CAT(cdb=cdb, vocab=vocab)

In [32]:
# NOTE(review): presumably this removes the minimum training-count requirement
# a concept must meet before MedCAT will use it -- confirm against the MedCAT
# source for `MIN_CUI_COUNT_STRICT`.
cat.spacy_cat.MIN_CUI_COUNT_STRICT = 0

In [33]:
# Supervised fine-tuning from the MedCAT export file; with print_stats=True the
# call reports precision / recall / F1 for each of the five epochs.
cat.train_supervised(
    'data/MedCAT_Export_With_Text_Assn_No_Pred.json',
    print_stats=True,
    lr=0.3,
    nepochs=5,
    reset_cdb=True,
    use_filters=True,
    anneal=True,
)

Epoch: 0, Prec: 0.7162162162162162, Rec: 0.06141367323290846, F1: 0.38881494472456235
First 10 out of 786 docs with problems: 153791; 105960; 111263; 143808; 171129; 106006; 151079; 103496; 116879; 138328
Starting epoch: 0
Epoch: 1, Prec: 0.893348623853211, Rec: 0.9026651216685979, F1: 0.8980068727609045
First 10 out of 147 docs with problems: 116726; 142881; 136130; 109386; 186963; 178995; 150592; 167835; 147385; 117806
Starting epoch: 1
Epoch: 2, Prec: 0.8939566704675028, Rec: 0.9084588644264194, F1: 0.9012077674469612
First 10 out of 141 docs with problems: 116726; 142881; 136130; 109386; 186963; 178995; 150592; 167835; 147385; 110674
Starting epoch: 2
Epoch: 3, Prec: 0.8952164009111617, Rec: 0.9107763615295481, F1: 0.902996381220355
First 10 out of 140 docs with problems: 116726; 142881; 136130; 109386; 186963; 178995; 150592; 167835; 147385; 110674
Starting epoch: 3
Epoch: 4, Prec: 0.8965909090909091, Rec: 0.9142526071842411, F1: 0.9054217581375751
First 10 out of 136 docs with pr

In [34]:
# Save the CDB after supervised training under the "Fine Tuned" file name; the
# next section reloads the model from this file.
cdb.save_dict('medcat_models/0.2.7+ - umls_base_mimic - ICD 10 - Fine Tuned.dat')

## Load Fine-Tuned MedCAT Model, Re-Run on DD Subsections and Save the Results

In [35]:
# Reload the fine-tuned CDB from disk into a fresh object and wrap it in its
# own CAT instance (`tuned_cat`), reusing the base vocabulary loaded earlier.
fine_tuned_cdb = CDB()
fine_tuned_cdb.load_dict('medcat_models/0.2.7+ - umls_base_mimic - ICD 10 - Fine Tuned.dat')
tuned_cat = CAT(cdb=fine_tuned_cdb, vocab=vocab)

In [36]:
# Re-annotate every DD subsection, this time with the fine-tuned model, and
# collect the parsed JSON result for each note.
notes = pd.read_csv('data/mimic_dd_icd9_sections.csv')
objs = [json.loads(tuned_cat.get_json(t)) for t in notes.text]

# Write through a context manager so the file handle is flushed and closed
# deterministically; passing a bare open(...) to json.dump leaks the handle
# and can leave a truncated file behind.
with open('data/mimic_dd_annos_fine_tuned_cat.json', 'w') as out_file:
    json.dump(objs, out_file)