In [1]:
# add autoreload
%load_ext autoreload
%autoreload 2
import os
import sys

import numpy as np
import pandas as pd
import scipy as sc

from collections import defaultdict
import re
import deduce

from tqdm import tqdm
import seaborn as sns

from gensim.models import phrases

# Context:
* $100$ K echocardiographic reports available. 
* we want to extract diagnoses regarding the left-ventricle function
* we have $5000$ reports with labeled spans.

# Goal:
Train a "model" that can
1. identify the spans
2. classify the spans

# Approach: MedCAT - MetaCAT

## Two-step approach

* unsupervised training on the documents
* add a single custom entity with a custom identifier
* train a model to identify the custom entities
* supervised training on the meta-annotations of the entities

## One-step approach

* unsupervised training on the documents
* add custom entities based on the spans and their labels
* train a model to identify the custom entities

# Approach: biLSTM/transformer

## Two-step approach

* Train a model to identify the spans: self-supervision by randomly selecting non-span ranges as negative examples
* Train a model to classify the spans: supervised based on the labeled spans 
* Combine the two models into one pipeline

## One-step approach
* Assign a label to each span
* Train a model to identify the spans

## Load MedCAT model pack

In [36]:
from medcat.cat import CAT
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.config import Config
from medcat.meta_cat import MetaCAT

# Root directory holding the MedCAT model packs. It is read from the
# `medcat_pack` environment variable so no machine-specific path is stored
# in the notebook. Fail here, with a clear message, instead of letting a
# `None` value surface later as a TypeError inside os.path.join / string
# concatenation.
medcat_dir = os.getenv('medcat_pack')
if not medcat_dir:
    raise RuntimeError(
        "Environment variable 'medcat_pack' is not set; it must point to the "
        "directory that contains the MedCAT model packs."
    )
# Name of the model pack (folder) used for unsupervised training and loading.
pack_location = 'umls-dutch-v1-10_echo'


# Load texts

In [38]:
# Folder with the exported echo reports.
# NOTE(review): absolute network-drive path; consider reading it from an
# environment variable, as is done for `medcat_dir`.
echo_path = 'T://lab_research/RES-Folder-UPOD/Echo_label/E_ResearchData/2_ResearchData'
# The export is JSON-lines (one JSON object per line); read it into a dataframe.
texts_file = os.path.join(echo_path, 'outdb_140423.jsonl')
texts = pd.read_json(texts_file, lines=True)

## Unsupervised learning for NER+L

In [39]:
# Load the base model pack from <medcat_dir>/<pack_location>.
MCAT = CAT.load_model_pack(os.path.join(medcat_dir, pack_location))

# Train on the raw report texts (the "unsupervised" step named in the section
# header): 3 epochs, with progress reported via `progress_print=10`.
# NOTE(review): `is_resumed=True` is passed without an explicit checkpoint;
# confirm against the MedCAT docs that this is intended for a fresh run.
MCAT.train(texts.text.values, 
            nepochs=3, 
            progress_print=10,  
            is_resumed=True)
# Write the trained pack back under the same folder name as `pack_location`
# (the literal below repeats that name instead of reusing the variable).
# The value echoed below the cell is what create_model_pack returned.
MCAT.create_model_pack(medcat_dir + "/umls-dutch-v1-10_echo")



'medcat_model_pack_0ac8cc93d015ac50'

### Add LVEF diagnosis spans from Prodigy annotations

In [49]:
# Collect the text of every annotated span.
#   span_list: one entry per annotated span (duplicates kept)
#   span_set : the distinct span texts
# Only reports that actually carry annotations (non-null `spans`) are used;
# the filter is evaluated once and shared by both columns.
_annotated = texts[texts.spans.notna()]
span_list = [
    text[_span['start']:_span['end']]   # character offsets into the report text
    for _spans, text in zip(_annotated.spans.values, _annotated.text.values)
    for _span in _spans
]
span_set = set(span_list)

In [57]:
# Register every distinct annotated span text as a name of the single custom
# concept 'LVEF_SPAN' (one call per name, as a positive example).
# NOTE(review): iterating a set of strings has no stable order between kernel
# sessions; if the call order matters for training, iterate sorted(span_set).
for span_text in tqdm(span_set):
    MCAT.add_and_train_concept(
        cui='LVEF_SPAN',
        name=span_text,
        do_add_concept=True,
        negative=False,
    )

100%|██████████| 2432/2432 [00:30<00:00, 78.72it/s]


## Supervised learning for NER+L

In [59]:
# Annotation export used for the supervised NER+L step; it lives inside the
# 'umls-dutch-v1-10_echo' pack folder.
trainer_export_path = os.path.join(medcat_dir,
                                   "umls-dutch-v1-10_echo",
                                   "input/ner_l_anno/trainer_export.json")
MCAT.train_supervised(
    data_path=trainer_export_path,
    nepochs=7,
    print_stats=0,
    use_filters=False,
)
# Persist the result as a new pack ("...echoV2") next to the original one;
# the returned value is echoed as the cell output.
MCAT.create_model_pack(medcat_dir + "/umls-dutch-v1-10_echoV2")

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 0, Prec: 0.6561085972850679, Rec: 0.7512953367875648, F1: 0.7004830917874396

Docs with false positives: 2; 6; 25; 24; 5; 9; 19; 14; 4; 1

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
ademetionine                                                           - C0036002             -          1
diastolische disfunctie                                                - C0520863             -          1
normaal~rv~functie     

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 1, Prec: 0.6535087719298246, Rec: 0.772020725388601, F1: 0.7078384798099762

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
ademetionine                                                           - C0036002             -          1
diastolische disfunctie

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 2, Prec: 0.6535087719298246, Rec: 0.772020725388601, F1: 0.7078384798099762

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
ademetionine                                                           - C0036002             -          1
diastolische disfunctie

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 3, Prec: 0.6563876651982379, Rec: 0.772020725388601, F1: 0.7095238095238094

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
diastolische disfunctie                                                - C0520863             -          1
normaal~rv~functie     

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 4, Prec: 0.6563876651982379, Rec: 0.772020725388601, F1: 0.7095238095238094

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
diastolische disfunctie                                                - C0520863             -          1
normaal~rv~functie     

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 5, Prec: 0.6563876651982379, Rec: 0.772020725388601, F1: 0.7095238095238094

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
diastolische disfunctie                                                - C0520863             -          1
normaal~rv~functie     

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 6, Prec: 0.6563876651982379, Rec: 0.772020725388601, F1: 0.7095238095238094

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
diastolische disfunctie                                                - C0520863             -          1
normaal~rv~functie     

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/31 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: 7, Prec: 0.6563876651982379, Rec: 0.772020725388601, F1: 0.7095238095238094

Docs with false positives: 2; 6; 25; 24; 5; 27; 9; 19; 14; 4

Docs with false negatives: 2; 6; 25; 24; 5; 28; 9; 19; 14; 11



False Positives

ef~4ch~23~bij~matig~beeldkwaliteit                                     - LVEF_SPAN            -         54
collaps                                                                - C0332521             -          7
tapse                                                                  - C3888927             -          3
tricuspidalisinsufficiëntie                                            - C0040961             -          2
insufficiëntie van mitralisklep                                        - C0026266             -          2
lv~met~zeer~slechte~funcdtie                                           - C0080310             -          1
diastolische disfunctie                                                - C0520863             -          1
normaal~rv~functie     

'medcat_model_pack_7ede882a26d86dae'

## Supervised learning of MetaCAT model for "Left-ventricle function"

In [60]:
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, ByteLevelBPETokenizer

In [61]:
# Reuse the byte-level BPE tokenizer of the negation model: it is rebuilt from
# its vocabulary and merges files and wrapped for MetaCAT.
tok_folder = 'T:/laupodteam/AIOS/Bram/language_modeling/Clinical_embeddings/unigrams/with_tokenizer/v2/tokenizer'
emb_folder = 'T:/laupodteam/AIOS/Bram/language_modeling/Clinical_embeddings/unigrams/with_tokenizer/v2/SG'

vocab_file, merges_file = (os.path.join(tok_folder, fname)
                           for fname in ('vocab.json', 'merges.txt'))
tokenizer = ByteLevelBPETokenizer.from_file(vocab_file, merges_file)
wrapped_tokenizer = TokenizerWrapperBPE(hf_tokenizers=tokenizer)

# NOTE(review): the tokenizer is saved under the "..._echo" pack while the
# embeddings and MetaCAT model are written under "..._echoV2" - confirm this
# is intended.
wrapped_tokenizer.save(medcat_dir + "/umls-dutch-v1-10_echo/assets/tokenizer")

In [62]:
from gensim.models import Word2Vec, KeyedVectors
# Path of the saved gensim vectors ('sg' inside `emb_folder`); printed so the
# resolved location is visible in the cell output.
vec_path = os.path.join(emb_folder, 'sg')
print(vec_path)
# Load the vectors; `w2v` is used afterwards for `word in w2v` / `w2v[word]` lookups.
w2v = KeyedVectors.load(vec_path)

T:/laupodteam/AIOS/Bram/language_modeling/Clinical_embeddings/unigrams/with_tokenizer/v2/SG\sg


In [64]:
# Create embedding matrix
embeddings = []
words_not_present = []

for i in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(i)
    if word in w2v:
        embeddings.append(w2v[word])
    else:
        words_not_present.append(i)
        embeddings.append(np.random.random(300))
        
mean_vector = np.mean(embeddings, axis=0)

for i in words_not_present:
    embeddings[i] = mean_vector

# Save the embeddings
embeddings_array = np.array(embeddings)
np.save(open(medcat_dir + "/umls-dutch-v1-10_echoV2/assets/embeddings/embedding.npy", 
             'wb'), embeddings_array)

In [68]:
# MetaCAT configuration for the 'LeftVentricleFunction' meta-annotation task.
config_metacat = ConfigMetaCAT()
config_metacat.general['category_name'] = 'LeftVentricleFunction'

# training settings
# NOTE(review): 'binary' averaging is combined with nclasses = 8 below - confirm
# this is the intended averaging mode for a multi-class task.
for _option, _value in (('nepochs', 25),
                        ('score_average', 'binary')):
    config_metacat.train[_option] = _value

# model settings (applied in the listed order)
for _option, _value in (('hidden_size', 300),
                        ('input_size', 300),
                        ('dropout', 0.25),
                        ('num_layers', 3),
                        ('num_directions', 2),
                        ('nclasses', 8),
                        ('model_name', 'lstm')):
    config_metacat.model[_option] = _value

In [69]:
# Build the MetaCAT model from the wrapped BPE tokenizer, the embedding matrix
# created above from the pretrained vectors, and the configuration object.
meta_cat = MetaCAT(tokenizer=wrapped_tokenizer,
                   embeddings=embeddings_array, 
                   config=config_metacat)

In [70]:
# Input annotations and output folder both live inside the "...echoV2" pack.
pack_v2_dir = os.path.join(medcat_dir, 'umls-dutch-v1-10_echoV2')
train_path = os.path.join(pack_v2_dir, 'input', 'meta_anno', 'metacat_outdb_140423.json')
model_path = os.path.join(pack_v2_dir, 'assets', 'models')

print("Commencing training...")
# The value returned by train() (the report dictionary) is shown as cell output.
meta_cat.train(json_path=train_path,
               save_dir_path=model_path)

Commencing training...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

{'report': {'0': {'precision': 0.9826388888888888,
   'recall': 0.9860627177700348,
   'f1-score': 0.9843478260869565,
   'support': 287},
  '2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
  '3': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3},
  '4': {'precision': 0.9245283018867925,
   'recall': 0.9423076923076923,
   'f1-score': 0.9333333333333333,
   'support': 52},
  '5': {'precision': 1.0,
   'recall': 0.8333333333333334,
   'f1-score': 0.9090909090909091,
   'support': 12},
  '6': {'precision': 1.0,
   'recall': 0.9259259259259259,
   'f1-score': 0.9615384615384615,
   'support': 54},
  '7': {'precision': 0.9568965517241379,
   'recall': 0.9487179487179487,
   'f1-score': 0.9527896995708154,
   'support': 117},
  'accuracy': 0.9619771863117871,
  'macro avg': {'precision': 0.8377233917856884,
   'recall': 0.8051925168649908,
   'f1-score': 0.820157175660068,
   'support': 526},
  'weighted avg': {'precision': 0.9715774322451491,
   'recall

In [40]:
meta_cat.save(save_dir_path=model_path)
# now manually add the model to the model_pack....

## Load  new model pack

In [73]:
# Load the pack that now contains the manually added MetaCAT model.
# NOTE(review): `pack_location` is 'umls-dutch-v1-10_echo', whereas the
# supervised NER+L result and the MetaCAT assets were written under
# 'umls-dutch-v1-10_echoV2' - confirm this loads the intended pack.
MCATnew = CAT.load_model_pack(os.path.join(medcat_dir, pack_location))



## Apply to texts

In [74]:
from spacy import displacy

In [75]:
texts.iloc[232].spans

[{'start': 61,
  'end': 112,
  'token_start': 11,
  'token_end': 17,
  'label': 'lv_sys_func_moderate'}]

In [76]:
texts.text.values[232][61:112]

'Visueel matige tot redelijke systolische LV functie'

In [31]:
i = 232
doc = MCATnew(texts.text.values[i])
displacy.render(doc, style='ent')

In [78]:
doc = MCATnew(texts.text.values[i])
for ent in doc.ents:
    print(ent.text,  ent._.meta_anns)
    print("\n")

Linker ventrikel {'LeftVentricleFunction': {'value': 'lv_sys_func_moderate', 'confidence': 0.9993826150894165, 'name': 'LeftVentricleFunction'}, 'Negation': {'value': 'not negated', 'confidence': 0.9999436140060425, 'name': 'Negation'}}


harttransplantatie {'LeftVentricleFunction': {'value': 'lv_sys_func_normal', 'confidence': 0.999862551689148, 'name': 'LeftVentricleFunction'}, 'Negation': {'value': 'not negated', 'confidence': 0.999977707862854, 'name': 'Negation'}}


aortaklep {'LeftVentricleFunction': {'value': 'lv_sys_func_normal', 'confidence': 0.9999521970748901, 'name': 'LeftVentricleFunction'}, 'Negation': {'value': 'not negated', 'confidence': 0.9999804496765137, 'name': 'Negation'}}


aortaklepinsufficiëntie {'LeftVentricleFunction': {'value': 'lv_sys_func_moderate', 'confidence': 0.9956921935081482, 'name': 'LeftVentricleFunction'}, 'Negation': {'value': 'negated', 'confidence': 0.9999101161956787, 'name': 'Negation'}}


mitralisklepinsufficientie {'LeftVentricleFunction':

In [None]:
# Keep the part of each report that follows the first mention of
# hand(en)/voet(en): capture group 1 of the pattern (column 1 of `extract`).
# NOTE(review): '.' does not match newlines without re.DOTALL, so the capture
# stops at the end of the line - confirm that is intended.
hand_voet_re = re.compile(r'(handen|voeten|hand|voet)(.*)', re.IGNORECASE)
_hv_groups = texts.text.str.extract(hand_voet_re)
# fillna('') applies to every column of `texts`, not only to `text_hv`.
texts = texts.assign(text_hv=_hv_groups[1]).fillna('')

In [None]:
# (position, text) pairs; the position is the key under which the
# multiprocessing results are returned.
documents = list(enumerate(texts.text_hv.values))

#cat.cdb.config.linking['filters']['cuis'] = set()

# make iterator for the documents
def doc_iter(docs):
    """Lazily yield ``(index, text)`` tuples from an iterable of 2-item pairs."""
    for pair in docs:
        doc_id, doc_text = pair
        yield (doc_id, doc_text)
# One-shot generator over the (index, text) pairs; it is exhausted after a
# single pass, so this cell must be re-run before calling multiprocessing again.
docerator = doc_iter(documents)

# Annotate all documents in parallel (16 processes, batches limited by
# character count). `res` is consumed below as a mapping from document index to
# a dict with an 'entities' entry.
# NOTE(review): save_dir_path is presumably where intermediate results are
# written - confirm the folder exists.
res = MCATnew.multiprocessing(docerator, 
                           nproc=16, 
                           batch_size_chars=300000,
                           save_dir_path=medcat_dir + "/tmp")

In [None]:
# go through the entities and check if they are in the inclusion list
# if they are in the inclusion store the entity in defaultdict(float) tmp_dict
# with the float being the mean polarity.

# entity_1   |  entity_2 
# -1         |   1

# Sign applied to the Negation confidence: negated mentions count negative.
neg_map = {'negated': -1, 'not negated': 1}


# One dict per document (CUI -> score); each list becomes a dataframe later on.
prettified_res_negation_max = []
prettified_res_negation = []
prettified_res_progress = []
prettified_res_stable= []

# (cui, pretty_name) pairs seen anywhere, and the document keys in iteration order.
cui_pretty = []
index_list = []

# TODO: change += to [] and finish with max()

for k,v in res.items():
    index_list.append(k)
    # Per-document accumulators; defaultdict(int) starts every CUI at 0.
    tmp_neg = defaultdict(int)
    tmp_neg_max = defaultdict(int)
    tmp_prog = defaultdict(int)
    tmp_stab = defaultdict(int)
    num_ents = len(v['entities'])
    for _, ent in v['entities'].items():
        cui_pretty.append(tuple((ent['cui'], ent['pretty_name'])))
        # Signed confidence divided by the number of entities in the document
        # (all CUIs together), summed per CUI.
        tmp_neg[ent['cui']] += \
             neg_map[ent['meta_anns']['Negation']['value']]*\
                ent['meta_anns']['Negation']['confidence']/num_ents
        # Largest signed confidence per CUI.
        # NOTE(review): the running maximum starts at the defaultdict value 0, so
        # a CUI that is only ever negated is stored as 0, never as a negative value.
        tmp_neg_max[ent['cui']] = \
             max(tmp_neg_max[ent['cui']], neg_map[ent['meta_anns']['Negation']['value']]*\
                    ent['meta_anns']['Negation']['confidence'])
        # NOTE(review): requires a 'Progressive Erosion' meta-annotation on every
        # entity (KeyError otherwise); the entity output printed earlier in this
        # notebook only shows 'LeftVentricleFunction' and 'Negation'.
        if ent['meta_anns']['Progressive Erosion']['value']=='Positive Progressive':
            tmp_prog[ent['cui']] += ent['meta_anns']['Progressive Erosion']['confidence']
        if ent['meta_anns']['Progressive Erosion']['value']=='Positive Stable':
            tmp_stab[ent['cui']] += ent['meta_anns']['Progressive Erosion']['confidence']
    
    prettified_res_negation_max.append(tmp_neg_max)
    prettified_res_negation.append(tmp_neg)
    prettified_res_progress.append(tmp_prog)
    prettified_res_stable.append(tmp_stab)

# CUI -> pretty name. If one CUI occurs with several pretty names, a single one
# of them is kept (which one depends on set iteration order).
cui_pretty_map = dict(set(cui_pretty))   

In [None]:
# One dataframe per score type: rows are documents (index_list), columns are
# CUIs; a CUI that does not occur in a document is NaN.
(res_df_negation_max,
 res_df_negation,
 res_df_progressive,
 res_df_stable) = (
    pd.DataFrame(per_doc_scores, index=index_list)
    for per_doc_scores in (prettified_res_negation_max,
                           prettified_res_negation,
                           prettified_res_progress,
                           prettified_res_stable)
)

# Capture the CUI columns *before* the summary column is added.
cuis = res_df_progressive.columns
# Row totals over all CUIs. Not idempotent: re-running this cell alone would
# include the summary column in its own sum.
res_df_progressive['progression_count_all'] = res_df_progressive.sum(axis=1)
res_df_stable['progression_count_all'] = res_df_stable.sum(axis=1)

In [None]:
# Terms of interest (substrings looked for in the concept names).
toi = ['degeneratie', 'erosie', 'versmalling',
       'ankylose', 'artritis', 'verlies', 'subluxatie',
       'slijtage', 'destructie', 'botappositie', 'vervorming',
       'afwijking', 'progressie', 'reuma']

# CUIs of interest: concepts whose lower-cased pretty name contains a term.
coi = []
for _cui in cuis:
    _pretty = cui_pretty_map[_cui].lower()
    if any(_term in _pretty for _term in toi):
        coi.append(_cui)

In [None]:
cui_pretty_map['C3495832']

In [None]:
# Total "progressive" confidence per concept, largest first, indexed by the
# readable concept name.
tmp = res_df_progressive[cuis].sum().sort_values(ascending=False)
tmp.index = tmp.index.map(cui_pretty_map)

# Progression score restricted to the CUIs of interest that occur in each frame.
for _frame in (res_df_progressive, res_df_stable):
    _coi = [c for c in coi if c in _frame.columns]
    _frame['progression_count_toi'] = _frame[_coi].sum(axis=1)

# Presence scores: over all CUIs, over the CUIs of interest, and over the
# CUIs of interest counting positive evidence only.
for _frame in (res_df_negation, res_df_negation_max):
    _frame['presence_sum_all'] = _frame.sum(axis=1)
    _frame['presence_sum_coi'] = _frame[coi].sum(axis=1)
    _frame['presence_pos_sum_coi'] = _frame[coi].clip(lower=0).sum(axis=1)

In [None]:
res_df_negation_max.loc[4166, coi]

In [None]:
(res_df_progressive.progression_count_all>0).sum(),\
                (res_df_progressive.progression_count_toi>0).sum()

In [None]:
(res_df_stable.progression_count_all>0).sum(),\
                (res_df_stable.progression_count_toi>0).sum()

In [None]:
texts_labeled = texts.join(res_df_progressive[['progression_count_toi','progression_count_all']])

In [None]:
texts_labeled[texts_labeled.progression_count_toi==0].text.values[235]

In [None]:
# Columns that identify one report.
report_keys = ['studyId_RA_hackathon', 'onderznr', 'Onderz_dt']

# Step 1: attach the progression scores to the report identifiers (index join).
progressive_results_step1 = texts[report_keys].merge(
    res_df_progressive[['progression_count_all', 'progression_count_toi']],
    how='inner', left_index=True, right_index=True)

# Step 2: attach the manual annotation.
# NOTE(review): `labels` is not defined anywhere in the visible notebook.
progressive_results_step2 = labels[report_keys + ['manual_annotated']].merge(
    progressive_results_step1,
    how='inner',
    on=report_keys)

In [None]:
import matplotlib.pyplot as plt

def get_scores(threshold, results=None):
    """Score the rule ``progression_count_toi > threshold`` against manual labels.

    Parameters
    ----------
    threshold : float
        Reports with ``progression_count_toi`` strictly above this value are
        treated as predicted positive.
    results : pandas.DataFrame, optional
        Frame with the columns ``progression_count_toi`` and
        ``manual_annotated``. Defaults to the notebook-level
        ``progressive_results_step2``.

    Returns
    -------
    (mean_true, recall)
        ``mean_true`` is the mean of ``manual_annotated`` over the predicted
        positives that carry a label (``manual_annotated > -1``); with 0/1
        labels this is the precision. ``recall`` is the number of those rows
        labelled 1 divided by all rows labelled 1, or NaN when no row is
        labelled 1 (instead of a 0/0 division warning).

    To score the negative class instead, select ``progression_count_toi == 0``
    and compare against ``manual_annotated == 0``.
    """
    if results is None:
        results = progressive_results_step2

    flagged = results.loc[(results.progression_count_toi > threshold) &
                          (results.manual_annotated > -1)]
    mean_true = flagged.manual_annotated.mean()

    n_positive = (results.manual_annotated == 1).sum()
    if n_positive == 0:
        return mean_true, np.nan
    recall = flagged[flagged.manual_annotated == 1].shape[0] / n_positive

    return mean_true, recall

# Sweep the decision threshold and record both scores for each value.
# Note: the 'accuracy' column holds the first value returned by get_scores
# (mean manual label among the flagged reports), 'sensitivity' the second.
threshold = np.arange(0, 3.1, 0.1)


def _score_row(thr):
    """Return one result record for threshold `thr`."""
    acc, sens = get_scores(thr)
    return {'accuracy': acc, 'sensitivity': sens, 'threshold': thr}


_res = [_score_row(thr) for thr in threshold]
res_df = pd.DataFrame(data=_res)

# Sensitivity on the left axis, 'accuracy' (red) on a twin right axis.
ax1 = sns.lineplot(data=res_df, y='sensitivity', x='threshold')
ax2 = ax1.twinx()
sns.lineplot(data=res_df, y='accuracy', x='threshold', ax=ax2, color='red')

In [None]:
progressive_results_step2

# Alternative
 
If the target disease is positively present (or its progression is labelled as stable) AND the report does NOT mention a comparison with an earlier examination, we consider it progressive as well.

In [None]:
def get_regex_label(df, reg, txtcol, colname, conds=None):
    """Add a boolean column that flags rows whose text matches a pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place and also returned.
    reg : str or compiled pattern
        Pattern handed to ``Series.str.contains`` (case-sensitive unless the
        compiled pattern carries its own flags).
    txtcol : str
        Name of the text column that is searched.
    colname : str
        Name of the boolean column to create or overwrite.
    conds : list, numpy array or pandas Series of bool, optional
        Extra row-wise condition; a row is flagged only if the pattern matches
        AND the condition is true. Applied by position, so it must have one
        entry per row of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``colname`` set.
    """
    # na=False: missing text counts as "no match" instead of producing NaN,
    # which cannot be used as a boolean mask.
    matches = np.asarray(df[txtcol].str.contains(reg, case=True, na=False), dtype=bool)
    if conds is not None:
        # Element-wise AND. (Comparing a list with `== True` yields the single
        # value False and would switch every row off.)
        matches = matches & np.asarray(conds, dtype=bool)
    df[colname] = matches
    return df


# Flag reports that state there is NO earlier examination to compare with.
re_geen_eerder_onderzoek = re.compile(
    r'(Geen eerder onderzoek|Geen oud onderzoek|Eerste onderzoek|Eerste bezoek|Geen voorgaande foto|Geen eerdere foto)',
    re.IGNORECASE)
texts['geen_eerder_onderzoek_handen_voeten'] = \
    texts.text_hv.str.contains(re_geen_eerder_onderzoek).astype(int)
# Chronological rank of each report within a study (1 = earliest date).
texts['report_rank'] = texts.groupby('studyId_RA_hackathon').Onderz_dt.rank().astype(int)

# Phrases that indicate a comparison with an earlier examination.
# All patterns are raw strings so that regex escapes such as `\.` are not
# treated as (invalid) Python string escapes.
# Because the combined pattern is compiled with re.IGNORECASE, the explicit
# [Vv]/[Ii]/[Tt]/[Cc] classes and the 'Nieuwe'/'nieuwe' pair are redundant;
# '(veranderd)' also matches inside 'onveranderd'.
comparison_markers = [r'([Vv]ergeleken met)',
                      r'([Vv]ergelijking met)',
                      r'([Ii]n vergelijking)',
                      r'([Ii]n vergelijking met onderzoek)',
                      r'(Ten opzichte)',
                      r'([Tt]\.?o\.?v\.?)',
                      r'(Er kan vergeleken worden met onderzoek DATUM)',
                      r'(Nieuwe)',
                      r'(Ten opzichte voorgaande foto)',
                      r'(Status quo)',
                      r'(Thans)',
                      r'(Recente)',
                      r'(Ter vergelijking DATUM)',
                      r'(wijzigingen)',
                      r'(toename bestaande)',
                      r'(nieuwe)',
                      r'(DATUM ter vergelijk)',
                      r'(onveranderd)',
                      r'(veranderd)',
                      r'(gewijzigd)',
                      r'(ongewijzigd)',
                      r'(conforme?)',
                      r'([Cc]onform datum)',
                      r'([Vv]ergeleken wordt)'
                      ]
comparison_markers_re = re.compile('|'.join(comparison_markers), re.IGNORECASE)
# Text-based marker: True when any comparison phrase occurs in the full text.
texts = get_regex_label(texts, comparison_markers_re,
                        'text', 'comparison_marker')
# Additionally mark reports that are not the first of their study, unless the
# report says there is no earlier examination.
# NOTE(review): rows already flagged by the regex stay True even when the
# "no earlier examination" flag is set - confirm that is intended.
texts.loc[((texts.comparison_marker) | (texts.report_rank > 1)) &
          (texts.geen_eerder_onderzoek_handen_voeten == 0), 'comparison_marker'] = True

In [None]:
texts[texts.comparison_marker==True].text.values[0]

In [None]:
# Columns that identify one report.
report_keys = ['studyId_RA_hackathon', 'onderznr', 'Onderz_dt']

# Step 1: report identifiers + comparison flag, joined on the index with the
# max-based presence scores.
neg_results_step1 = texts[report_keys + ['comparison_marker']].merge(
    res_df_negation_max[['presence_sum_all', 'presence_sum_coi', 'presence_pos_sum_coi']],
    how='inner', left_index=True, right_index=True)

# Step 2: attach the manual annotation.
neg_results_step2 = labels[report_keys + ['manual_annotated']].merge(
    neg_results_step1,
    how='inner',
    on=report_keys)

# Label = 1 when the positive presence score reaches the threshold and the
# report carries no comparison marker; NaN (undecided) otherwise.
threshold = 0.999
conds = (neg_results_step2.presence_pos_sum_coi>=threshold) & (neg_results_step2.comparison_marker==False)
neg_results_step2['medcat_label_new'] = np.where(conds, 1, np.nan)



In [None]:
# Negative-class check: labelled reports without a comparison marker whose
# positive presence score stays below the threshold.
threshold = 1.
_below_threshold = neg_results_step2.presence_pos_sum_coi < threshold
_is_labelled = neg_results_step2.manual_annotated > -1
_no_comparison = neg_results_step2.comparison_marker == False
tmp = neg_results_step2.loc[_below_threshold & _is_labelled & _no_comparison]

# Share of the selected reports that are manually labelled 0 ...
mean_true = 1 - tmp.manual_annotated.mean()
# ... and the fraction of all reports labelled 0 that were selected.
_n_negative_total = (neg_results_step2.manual_annotated == 0).sum()
recall = len(tmp[tmp.manual_annotated == 0]) / _n_negative_total

print(mean_true, recall)

In [None]:
# Columns that identify one report.
report_keys = ['studyId_RA_hackathon', 'onderznr', 'Onderz_dt']

# Step 1: report identifiers + comparison flag, joined on the index with the
# "stable" progression scores.
stable_results_step1 = texts[report_keys + ['comparison_marker']].merge(
    res_df_stable[['progression_count_all', 'progression_count_toi']],
    how='inner', left_index=True, right_index=True)

# Step 2: attach the manual annotation.
stable_results_step2 = labels[report_keys + ['manual_annotated']].merge(
    stable_results_step1,
    how='inner',
    on=report_keys)

# Label = 1 when the stable score reaches the threshold and the report carries
# no comparison marker; NaN (undecided) otherwise.
threshold = 0.5
conds = (stable_results_step2.progression_count_toi>=threshold) & (stable_results_step2.comparison_marker==False)
stable_results_step2['medcat_label_stable_new'] = np.where(conds, 1, np.nan)


In [None]:
# Positive-class check for the "stable without comparison" rule: labelled
# reports without a comparison marker whose stable score exceeds the threshold.
threshold = 0
_above_threshold = stable_results_step2.progression_count_toi > threshold
_is_labelled = stable_results_step2.manual_annotated > -1
_no_comparison = stable_results_step2.comparison_marker == False
tmp = stable_results_step2.loc[_above_threshold & _is_labelled & _no_comparison]

# Mean manual label among the selected reports ...
mean_true = tmp.manual_annotated.mean()
# ... and the fraction of all reports labelled 1 that were selected.
_n_positive_total = (stable_results_step2.manual_annotated == 1).sum()
recall = len(tmp[tmp.manual_annotated == 1]) / _n_positive_total

print(mean_true, recall)

In [None]:
# Give the score column of each result frame its final, model-specific name.
# The frames are renamed in place, so earlier cells that still refer to the
# old column names will fail if they are re-run after this cell.
for _frame, _new_names in (
        (neg_results_step2, {'presence_pos_sum_coi':'medcat_presence_sum'}),
        (progressive_results_step2, {'progression_count_toi':'medcat_progressive_sum'}),
        (stable_results_step2, {'progression_count_toi':'medcat_stable_sum'}),
):
    _frame.rename(columns=_new_names, inplace=True)

In [None]:
# Combine the three score tables into one frame with one row per report.
final_keys = ['studyId_RA_hackathon', 'onderznr', 'Onderz_dt']

medcat_final = (
    neg_results_step2[final_keys + ['comparison_marker', 'medcat_presence_sum']]
    .merge(progressive_results_step2[final_keys + ['medcat_progressive_sum']],
           how='inner', on=final_keys)
    .merge(stable_results_step2[final_keys + ['medcat_stable_sum']],
           how='inner', on=final_keys)
    .rename(columns={'comparison_marker': 'comparing_with_previous'})
    # keep the first row for every report key
    .drop_duplicates(subset=final_keys)
)

In [None]:
# Write the combined labels to disk. The target folder is created first,
# because to_csv does not create missing directories.
medcat_final_path = './collected_labels/inferred/medcat_final.csv'
os.makedirs(os.path.dirname(medcat_final_path), exist_ok=True)
medcat_final.to_csv(medcat_final_path, index=False)