In [1]:
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb import CDB

In [2]:
import json
from copy import deepcopy
import pandas as pd

# Load UMLS Base MedCAT
- Filtered to only the top 400 ICD codes.

In [3]:
# Create an empty vocabulary and fill it from the saved base vocabulary file.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
vocab_path = '/Users/tom/phd/cattrainer_models/base_vocabulary.dat'
vocab = Vocab()
vocab.load_dict(vocab_path)

In [4]:
# Create an empty concept database and fill it from the ICD10-only file.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
umls_cdb_path = '/Users/tom/phd/cattrainer_models/0.2.7+ - umls_base_mimic - ICD10_Only.dat'
umls_cdb = CDB()
umls_cdb.load_dict(umls_cdb_path)

In [5]:
# Create a second concept database and fill it from the "umls_2m_mimic" file;
# it is used further down to look up names that map to more than one CUI.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
full_umls_cdb_path = '/Users/tom/phd/cattrainer_models/0.2.7+ umls_2m_mimic.dat'
full_umls_cdb = CDB()
full_umls_cdb.load_dict(full_umls_cdb_path)

In [6]:
# Build the MedCAT annotator from the ICD10-only concept database and the base
# vocabulary loaded above (the full UMLS CDB is not used here).
cat = CAT(cdb=umls_cdb, vocab=vocab)

## Train using the codes that were already predicted and marked correct.

In [9]:
# Load the exported annotation projects. The file is opened in a `with` block
# so the handle is closed as soon as parsing finishes, rather than being left
# for the garbage collector.
with open('MedCAT_Export_With_Text_Pred_Corr.json') as export_file:
    pred_corr = json.load(export_file)

In [None]:
# NOTE(review): this list is never read in the cells visible here, and the name
# looks like a typo of "pred_corr_train" -- confirm whether it can be removed.
pred_core_train = []

In [51]:
def _with_documents(project, documents):
    """Return a copy of `project` whose 'documents' entry is `documents`.

    Every other value is deep-copied, so the result does not share project
    metadata with the original. The existing document list is not copied:
    the previous code deep-copied all documents and then discarded that copy
    by overwriting 'documents'. Key order of the original project is kept.
    """
    return {
        key: documents if key == 'documents' else deepcopy(value)
        for key, value in project.items()
    }


# Hold out the last document of every project that has more than one document
# as the test set; everything else is used for training. Projects with zero or
# one document go to the training set unchanged (and uncopied).
train_set = []
test_set = []
for p in pred_corr['projects']:
    docs = p['documents']
    if len(docs) > 1:
        train_set.append(_with_documents(p, docs[:-1]))
        test_set.append(_with_documents(p, [docs[-1]]))
    else:
        train_set.append(p)

In [52]:
# Wrap both splits in the same top-level {'projects': [...]} layout that the
# loaded export uses.
pred_corr_train_set = dict(projects=train_set)
pred_corr_test_set = dict(projects=test_set)

In [53]:
# Total number of annotations across all documents of the training projects.
sum(
    1
    for project in pred_corr_train_set['projects']
    for document in project['documents']
    for _annotation in document['annotations']
)

1437

In [55]:
# Total number of annotations across all documents of the held-out projects.
sum(
    1
    for project in pred_corr_test_set['projects']
    for document in project['documents']
    for _annotation in document['annotations']
)

171

In [58]:
# Write both splits to disk. Each file is opened in a `with` block so it is
# flushed and closed before the training cells below read it back by filename.
with open('pred_corr_train.json', 'w') as train_file:
    json.dump(pred_corr_train_set, train_file)
with open('pred_corr_test.json', 'w') as test_file:
    json.dump(pred_corr_test_set, test_file)

In [None]:
# Supervised training on the training split, passing the held-out split as
# `test_set`, with stats printing and filters enabled.
cat.train_supervised(
    'pred_corr_train.json',
    lr=0.1,
    print_stats=True,
    test_set='pred_corr_test.json',
    use_filters=True,
)

In [None]:
# Same supervised training call as above, but without a separate test set.
cat.train_supervised(
    'pred_corr_train.json',
    lr=0.1,
    print_stats=True,
    use_filters=True,
)

In [12]:
# Set MIN_CUI_COUNT_STRICT on the annotator's spacy_cat component to 0.
# NOTE(review): presumably this removes the minimum-count requirement for a
# concept to be considered -- confirm against the MedCAT version in use.
cat.spacy_cat.MIN_CUI_COUNT_STRICT = 0

In [138]:
# Load the 'projects' list of the "Assn_No_Pred" export into a DataFrame.
# The file is opened in a `with` block so the handle is closed after parsing.
with open('MedCAT_Export_With_Text_Assn_No_Pred.json') as export_file:
    foo = pd.DataFrame(json.load(export_file)['projects'])

In [97]:
# Supervised run on the cleaned export with `reset_cdb=True`, a higher learning
# rate (0.3) and annealing enabled; per-epoch stats are printed below.
cat.train_supervised(
    'not-fucked.json',
    print_stats=True,
    lr=0.3,
    nepochs=0,
    use_filters=True,
    reset_cdb=True,
    anneal=True,
)

Epoch: 0, Prec: 0.9034564958283671, Rec: 0.8783314020857474, F1: 0.8908939489570573
First 10 out of 157 docs with problems: 125347; 162484; 151592; 131363; 191028; 166261; 111866; 128902; 134125; 123355
Starting epoch: 0
Epoch: 1, Prec: 0.893348623853211, Rec: 0.9026651216685979, F1: 0.8980068727609045
First 10 out of 147 docs with problems: 125347; 162484; 151592; 191028; 111866; 138230; 128902; 134125; 123355; 115842
Starting epoch: 1
Epoch: 2, Prec: 0.8939566704675028, Rec: 0.9084588644264194, F1: 0.9012077674469612
First 10 out of 141 docs with problems: 125347; 162484; 151592; 191028; 111866; 138230; 128902; 134125; 123355; 110674
Starting epoch: 2
Epoch: 3, Prec: 0.8952164009111617, Rec: 0.9107763615295481, F1: 0.902996381220355
First 10 out of 140 docs with problems: 125347; 162484; 151592; 111866; 128902; 134125; 123355; 110674; 115842; 133316
Starting epoch: 3
Epoch: 4, Prec: 0.8965909090909091, Rec: 0.9142526071842411, F1: 0.9054217581375751
First 10 out of 136 docs with prob

In [111]:
# Follow-up supervised run on the same export with `nepochs=1`; `reset_cdb` is
# not passed this time.
cat.train_supervised(
    'not-fucked.json',
    print_stats=True,
    lr=0.3,
    nepochs=1,
    use_filters=True,
    anneal=True,
)

Epoch: 0, Prec: 0.8979591836734694, Rec: 0.9177288528389339, F1: 0.9078440182562016
First 10 out of 134 docs with problems: 151592; 134125; 123355; 110674; 115842; 133316; 187850; 173185; 157828; 162451
Starting epoch: 0
Epoch: 1, Prec: 0.8978433598183881, Rec: 0.9165701042873696, F1: 0.9072067320528789
First 10 out of 135 docs with problems: 151592; 135955; 134125; 123355; 110674; 115842; 133316; 187850; 173185; 157828


In [125]:
# Annotate a short numbered problem list to spot-check the trained model.
# The text is split into adjacent literals, one per source line, for readability;
# the resulting string is unchanged.
sample_text = (
    '\n'
    '1.  Coronary artery disease status post CABG.\n'
    '2.  Sternal wound infection status post rewiring.\n'
    '3.  Revision of PDA graft.\n'
    '4.  Status post living related kidney transplant for chronic\n'
    'renal failure.\n'
)
doc = cat(sample_text)

In [126]:
# Clear the CUI_FILTER setting on the spacy_cat component.
# NOTE(review): presumably this lifts any restriction on which CUIs can be
# returned -- confirm against the MedCAT version in use.
cat.spacy_cat.CUI_FILTER = None

In [129]:
# Show the `cui` extension attribute of the second-to-last entity in `doc`.
doc.ents[-2]._.cui

'C2891269'

In [128]:
# Show all entities found in the annotated sample text.
doc.ents

(Coronary artery disease, CABG, kidney transplant, chronic
 renal failure)

In [18]:
# Look up the value stored for CUI 'C2887101' in the CDB's `cui_count` mapping.
# NOTE(review): presumably the number of training examples seen -- confirm.
cat.cdb.cui_count['C2887101']

25

In [40]:
# Manually overwrite the set of CUIs linked to the name 'sepsis' with these two
# CUIs. This mutates the in-memory CDB only.
cat.cdb.name2cui['sepsis'] = {'C0036690', 'C2887101'}

In [38]:
# Pair every CUI currently linked to the name 'sepsis' with its `cui_count` value.
sepsis_cuis = cat.cdb.name2cui['sepsis']
[(cat.cdb.cui_count[cui], cui) for cui in sepsis_cuis]

[(2, 'C0036690'), (25, 'C2887101'), (48, 'C0243026')]

In [31]:
# Turn off the PREFER_FREQUENT setting on the spacy_cat component.
# NOTE(review): presumably this stops disambiguation from favouring the most
# frequently seen concept -- confirm against the MedCAT version in use.
cat.spacy_cat.PREFER_FREQUENT = False

The steps would be:
- Take all the names you have in the new CDB.
- Check which ones were ambiguous in the old CDB (i.e. have multiple CUIs linked).
- Set `cdb.name_isunique[name] = False` for each of those names.

## Ensure all names that are in the ICD-10 CDB are 'Trainable'

In [7]:
# Names in the ICD-10 CDB that are linked to exactly one CUI.
unique_names = [name for name, cuis in umls_cdb.name2cui.items() if len(cuis) == 1]
# Names that are linked to more than one CUI in the full UMLS CDB.
non_unique_names_from_full_umls_cdb = {
    name for name, cuis in full_umls_cdb.name2cui.items() if len(cuis) > 1
}
# Names in the ICD-10 CDB that have no entry in `name_isunique` yet.
new_names = set(umls_cdb.name2cui.keys()).difference(umls_cdb.name_isunique.keys())

In [8]:
# Mark as non-unique every new name that maps to multiple CUIs in the full
# UMLS CDB; other new names are left without a `name_isunique` entry.
names_to_flag = [
    candidate for candidate in new_names
    if candidate in non_unique_names_from_full_umls_cdb
]
for ambiguous_name in names_to_flag:
    umls_cdb.name_isunique[ambiguous_name] = False

In [9]:
# Save the modified ICD-10 CDB under a new "- Trainable" filename, leaving the
# original file untouched.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
umls_cdb.save_dict('/Users/tom/phd/cattrainer_models/0.2.7+ - umls_base_mimic - ICD10_Only - Trainable.dat')

In [74]:
# Number of entries in the ICD-10 CDB's `cui2names` mapping.
len(umls_cdb.cui2names)

99082

In [73]:
# Load the CUIs stored in 'all_icd9_cuis.json'. The file is opened in a `with`
# block so the handle is closed as soon as parsing finishes.
with open('all_icd9_cuis.json') as cui_file:
    cui_to_keep = json.load(cui_file)

In [36]:
# Build a two-row test dataset from the MIMIC-III notes: the `text` values
# selected by the slice 3:5, each labelled with a fixed episode name, written
# to CSV without the index column.
mimic3 = pd.read_csv('../data/mimic3_with_addendums.csv')
episode_names = ['Test Episode 1', 'Test Episode 2']
episode_texts = mimic3.text[3:5]
df = pd.DataFrame({'name': episode_names, 'text': episode_texts})
df.to_csv('test_dataset.csv', index=False)