In [1]:
import os
import json
import pandas as pd
import spacy
import numpy as np
from tqdm.notebook import tqdm
from spacy.tokens import DocBin, Doc, Span, SpanGroup
from spacy.vocab import Vocab
from spacy.matcher import Matcher
from spacy.scorer import Scorer, PRFScore
from spacy.training import Example
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt

In [12]:
def load_and_run_regex(base_path, file):
    # Load pattern matcher and test data
    folder = os.path.dirname(os.path.dirname(base_path))
    regex_file = folder + '/regex/' + file + '.txt'
    doc_file = base_path + file + '/test.spacy'
    
    nlp = spacy.blank('nl')    
    doc_bin = DocBin().from_disk(doc_file)
    docs = list(doc_bin.get_docs(nlp.vocab))
    matcher = Matcher(nlp.vocab)

    # Read regex patterns
    patterns = []
    with open(regex_file, encoding = "ISO-8859-1") as f:
        for l in f.readlines():
            patterns.append(json.loads(l)) 

    # And add them to the matcher
    for pat in patterns:
        matcher.add(pat['label'], [pat['pattern']])

    # Run inference
    examples = []
    for doc in docs:
        prediction = nlp(doc.text)
        matches = matcher(prediction)
        spans = []
        for match in matches:
            match_span = prediction[match[1]:match[2]]
            match_span.label_ = prediction.vocab.strings[match[0]]
            spans.append(match_span)
        prediction.spans['sc'] = spans
        example = Example(prediction, doc)
        examples.append(example)
    
    return examples

def _tuple_overlap(tL, tR):
    # tL: tuple(begin, end)
    # tR: tuple(begin, end)
    tLrange = set(range(len(tL)))
    tRrange = set(range(len(tR)))
               
    InterSection = len(tLrange.intersection(tRrange))
    Union = len(tLrange.union(tRrange))

    return InterSection/Union if Union>0 else np.nan


def span_overlap_counter(examples):
    overlaplist = []
    for case in examples:
        span_set_spancat = set()
        span_set_labeled = set()
        
        for span in case.reference.spans['sc']:
            span_set_labeled.add(span)
            
        for span in case.predicted.spans['sc']:
            span_set_spancat.add(span)

        jaccard_indices = []
        for span_l in span_set_labeled:
            _jaccard_indices = []
            for span_s in span_set_spancat:
                _jaccard_indices.append(_tuple_overlap(span_l, span_s))
            try:
                max_ = max(_jaccard_indices)
            except:
                max_ = np.nan
            jaccard_indices.append(max_)
        overlaplist.append(jaccard_indices)
    return overlaplist


def span_overlap_counter_reverse(examples):
    overlaplist = []
    for case in examples:
        span_set_spancat = set()
        span_set_labeled = set()
        
        for span in case.reference.spans['sc']:
            span_set_labeled.add(span)
            
        for span in case.predicted.spans['sc']:
            span_set_spancat.add(span)

        jaccard_indices = []
        for span_s in span_set_spancat:
            _jaccard_indices = []
            for span_l in span_set_labeled:
                _jaccard_indices.append(_tuple_overlap(span_l, span_s))
            try:
                max_ = max(_jaccard_indices)
            except:
                max_ = np.nan
            jaccard_indices.append(max_)
        overlaplist.append(jaccard_indices)
    return overlaplist


def regex_scorer(examples, span_key, **cfg):
    score = PRFScore()
    score_gold = PRFScore() 
    score_per_type = dict()
    score_gold_per_type = dict()
    total_pred = 0
    total_pred_gold = 0
    for example in examples:
        pred_doc = example.predicted
        gold_doc = example.reference

        # Find all labels
        labels = set([k.label_ for k in gold_doc.spans[span_key]])
        
        # If labeled, find all labels in pred
        if len(pred_doc.spans[span_key])>0:
            labels |= set([k.label_ for k in pred_doc.spans[span_key]])
            
        # Set up all labels for per type scoring and prepare gold per type
        gold_per_type: Dict[str, Set] = {label: set() for label in labels}
        
        for label in labels:
            if label not in score_per_type:
                score_per_type[label] = PRFScore()
                score_gold_per_type[label] = PRFScore()
                
        # Find all predidate labels
        gold_spans = set()
        pred_spans = set()
        pred_spans_gold = set()
        for span in gold_doc.spans[span_key]:
            gold_span: Tuple
            gold_span = (span.label_, span.start, span.end - 1)
            gold_spans.add(gold_span)
            gold_per_type[span.label_].add(gold_span)                
        pred_per_type: Dict[str, Set] = {label: set() for label in labels}        
        pred_gold_per_type: Dict[str, Set] = {label: set() for label in labels} 
        if len(pred_doc.spans[span_key])>0:
            for span in pred_doc.spans[span_key]:
                total_pred += 1
                pred_span: Tuple
                pred_span = (span.label_, span.start, span.end - 1)
                pred_spans.add(pred_span)
                pred_per_type[span.label_].add(pred_span)                    
            for span in example.get_aligned_spans_x2y(pred_doc.spans[span_key]):
                total_pred_gold += 1
                pred_span_gold: Tuple
                pred_span_gold = (span.label_, span.start, span.end - 1)
                pred_spans_gold.add(pred_span_gold)  
                pred_gold_per_type[span.label_].add(pred_span)    
        # Scores per label
        for k, v in score_per_type.items():
            if k in pred_per_type:
                v.score_set(pred_per_type[k], gold_per_type[k])
        for k, v in score_gold_per_type.items():
            if k in pred_gold_per_type:
                v.score_set(pred_gold_per_type[k], gold_per_type[k])
        # Score for all labels
        score_gold.score_set(pred_spans_gold, gold_spans)
        score.score_set(pred_spans, gold_spans)
    # Assemble final result
    final_scores: Dict[str, Any] = {
        "pgold": None,
        "rgold": None,
        "fgold": None,
        "p": None,
        "r": None,
        "f": None,
    }
    final_scores["score_per_type"] = None
    final_scores["score_gold_per_type"] = None
    if len(score) > 0:
        final_scores["pgold"] = score_gold.precision
        final_scores["rgold"] = score_gold.recall
        final_scores["fgold"] = score_gold.fscore
        final_scores["fpgold"] = score_gold.fp / len(examples)
        final_scores["p"] = score.precision
        final_scores["r"] = score.recall
        final_scores["f"] = score.fscore
        final_scores["fp"] = score.fp / len(examples)          
        final_scores["score_per_type"] = {
            k: v.to_dict() for k, v in score_per_type.items()
        }      
        final_scores["score_gold_per_type"] = {
            k: v.to_dict() for k, v in score_per_type.items()
        }    
    return final_scores

In [13]:
base_path = '/training/echo/text_mining/spancat_models/reduced_labels/spacy_data/'

df = pd.DataFrame(columns=['entity', 'p_w', 'p_m', 'pgold_w', 'pgold_m', 'r_w', 'r_m', 'rgold_w', 'rgold_m', 'f_w', 'f_m', 'fgold_w', 'fgold_m',
                           'jaccard', 'jaccard_rev', 'fp', 'fpgold'])

files = [x for x in os.listdir(base_path) if not x.startswith('.')]

# Iterate over all abnormalities
for file in tqdm(files):
    if file == 'merged_labels':
        continue
    data = {'entity': file}

    # Load model and data, run inference
    examples = load_and_run_regex(base_path, file)
    
    # Assess PRF for all spans
    scores = regex_scorer(examples, 'sc')
    
    for metric in ['p', 'r', 'f']:    
        # Table 3
        data[f'{metric}gold_w'] = scores[f'{metric}gold'] # Weighted PRF for gold spans
        data[f'{metric}gold_m'] = np.mean([v[f'{metric}'] for _, v in scores['score_gold_per_type'].items()]) # Macro PRF for gold spans        
        
        # Table 4
        data[f'{metric}_w'] = scores[f'{metric}'] # Weighted PRF (identical to PRF reported in meta.json)
        data[f'{metric}_m'] = np.mean([v[f'{metric}'] for _, v in scores['score_per_type'].items()]) # Macro PRF

    # Assess Jaccard index (Table 5)
    OverlapJaccardIndices = span_overlap_counter(examples)
    OverlapJaccardIndicesRev = span_overlap_counter_reverse(examples)
    data[f'jaccard'] = round(np.nanmean([_v for v in OverlapJaccardIndices for _v in v]), 2)
    data[f'jaccard_rev'] = round(np.nanmean([_v for v in OverlapJaccardIndicesRev for _v in v]), 2)

    # Table x - False positive predictions
    data['fp'] = scores['fp']
    data['fpgold'] = scores['fpgold']
    
    # Add data row
    df.loc[len(df)] = data

  0%|          | 0/12 [00:00<?, ?it/s]

In [14]:
df.sort_values('entity', inplace=True)
df.reset_index(drop=True, inplace=True)

In [15]:
df

Unnamed: 0,entity,p_w,p_m,pgold_w,pgold_m,r_w,r_m,rgold_w,rgold_m,f_w,f_m,fgold_w,fgold_m,jaccard,jaccard_rev,fp,fpgold
0,aortic_regurgitation,0.959276,0.930054,0.97136,0.930054,0.867076,0.847638,0.832311,0.847638,0.910849,0.886045,0.896476,0.886045,0.99,0.99,0.016158,0.010772
1,aortic_stenosis,0.844156,0.803139,0.868243,0.803139,0.785498,0.728043,0.776435,0.728043,0.813772,0.760961,0.819777,0.760961,0.96,0.96,0.049536,0.040248
2,diastolic_dysfunction,0.577947,0.620456,0.592742,0.620456,0.522337,0.481922,0.505155,0.481922,0.548736,0.523364,0.545455,0.523364,0.86,0.84,0.114551,0.104231
3,lv_dil,0.693878,0.859709,0.85974,0.859709,0.762332,0.835749,0.742152,0.835749,0.726496,0.845332,0.796631,0.845332,0.96,0.86,0.154799,0.055728
4,lv_syst_func,0.302491,0.331638,0.331897,0.331638,0.086207,0.082655,0.078093,0.082655,0.134175,0.13034,0.126437,0.13034,0.82,0.82,0.20227,0.159959
5,mitral_regurgitation,0.951271,0.933722,0.961712,0.933722,0.873541,0.859109,0.830739,0.859109,0.910751,0.894824,0.891441,0.894824,0.99,0.98,0.023736,0.017544
6,pe,0.691244,0.192308,0.691244,0.192308,0.614754,0.162162,0.614754,0.162162,0.650759,0.175953,0.650759,0.175953,0.87,0.86,0.038886,0.038886
7,rv_dil,0.792576,0.788608,0.763496,0.788608,0.688805,0.662766,0.563567,0.662766,0.737056,0.716867,0.648472,0.716867,0.97,0.92,0.05908,0.057214
8,rv_syst_func,0.845794,0.875899,0.935484,0.875899,0.370902,0.30635,0.356557,0.30635,0.51567,0.439737,0.51632,0.439737,0.98,0.97,0.034056,0.012384
9,tricuspid_regurgitation,0.919075,0.845579,0.924242,0.845579,0.880886,0.822905,0.844875,0.822905,0.899576,0.833812,0.882779,0.833812,0.99,0.99,0.028896,0.0258


In [16]:
for metric in ['f', 'r', 'p']:
    df[metric] = df.apply(lambda x: str(round(x[f'{metric}_w'], 2)) + ' (' + str(round(x[f'{metric}_m'], 2)) + ')', axis=1)
    df[f'{metric}gold'] = df.apply(lambda x: str(round(x[f'{metric}gold_w'], 2)) + ' (' + str(round(x[f'{metric}gold_m'], 2)) + ')', axis=1)

In [17]:
## Table 3 - PRF scores for the exact gold spans
#df[['entity', 'fgold', 'rgold', 'pgold']].to_latex('/training/echo/text_mining/output/table3_regex_pipeline_performance_goldspans.tex', index=False)
df[['entity', 'fgold', 'rgold', 'pgold']]

Unnamed: 0,entity,fgold,rgold,pgold
0,aortic_regurgitation,0.9 (0.89),0.83 (0.85),0.97 (0.93)
1,aortic_stenosis,0.82 (0.76),0.78 (0.73),0.87 (0.8)
2,diastolic_dysfunction,0.55 (0.52),0.51 (0.48),0.59 (0.62)
3,lv_dil,0.8 (0.85),0.74 (0.84),0.86 (0.86)
4,lv_syst_func,0.13 (0.13),0.08 (0.08),0.33 (0.33)
5,mitral_regurgitation,0.89 (0.89),0.83 (0.86),0.96 (0.93)
6,pe,0.65 (0.18),0.61 (0.16),0.69 (0.19)
7,rv_dil,0.65 (0.72),0.56 (0.66),0.76 (0.79)
8,rv_syst_func,0.52 (0.44),0.36 (0.31),0.94 (0.88)
9,tricuspid_regurgitation,0.88 (0.83),0.84 (0.82),0.92 (0.85)


In [18]:
## Table 4 - PRF scores for all predicted spans
#df[['entity', 'f', 'r', 'p']].to_latex('/training/echo/text_mining/output/table4_regex_pipeline_performance.tex', index=False)
df[['entity', 'f', 'r', 'p']]

Unnamed: 0,entity,f,r,p
0,aortic_regurgitation,0.91 (0.89),0.87 (0.85),0.96 (0.93)
1,aortic_stenosis,0.81 (0.76),0.79 (0.73),0.84 (0.8)
2,diastolic_dysfunction,0.55 (0.52),0.52 (0.48),0.58 (0.62)
3,lv_dil,0.73 (0.85),0.76 (0.84),0.69 (0.86)
4,lv_syst_func,0.13 (0.13),0.09 (0.08),0.3 (0.33)
5,mitral_regurgitation,0.91 (0.89),0.87 (0.86),0.95 (0.93)
6,pe,0.65 (0.18),0.61 (0.16),0.69 (0.19)
7,rv_dil,0.74 (0.72),0.69 (0.66),0.79 (0.79)
8,rv_syst_func,0.52 (0.44),0.37 (0.31),0.85 (0.88)
9,tricuspid_regurgitation,0.9 (0.83),0.88 (0.82),0.92 (0.85)


In [19]:
## Table 5 - Jaccard similarity for all gold spans and for all predicted spans 
#df[['entity', 'jaccard', 'jaccard_rev']].to_latex('/training/echo/text_mining/output/table5_regex_jaccard_labeltospan.tex', index=False)
df[['entity', 'jaccard', 'jaccard_rev']]

Unnamed: 0,entity,jaccard,jaccard_rev
0,aortic_regurgitation,0.99,0.99
1,aortic_stenosis,0.96,0.96
2,diastolic_dysfunction,0.86,0.84
3,lv_dil,0.96,0.86
4,lv_syst_func,0.82,0.82
5,mitral_regurgitation,0.99,0.98
6,pe,0.87,0.86
7,rv_dil,0.97,0.92
8,rv_syst_func,0.98,0.97
9,tricuspid_regurgitation,0.99,0.99


In [20]:
# Table x - False positives
df[['entity', 'fp', 'fpgold']]

Unnamed: 0,entity,fp,fpgold
0,aortic_regurgitation,0.016158,0.010772
1,aortic_stenosis,0.049536,0.040248
2,diastolic_dysfunction,0.114551,0.104231
3,lv_dil,0.154799,0.055728
4,lv_syst_func,0.20227,0.159959
5,mitral_regurgitation,0.023736,0.017544
6,pe,0.038886,0.038886
7,rv_dil,0.05908,0.057214
8,rv_syst_func,0.034056,0.012384
9,tricuspid_regurgitation,0.028896,0.0258
