In [1]:
import os
import json
import pandas as pd
import spacy
import numpy as np
from tqdm.notebook import tqdm
from spacy.tokens import DocBin
from spacy.vocab import Vocab
from spacy.scorer import Scorer
from spacy.training import Example
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
def load_and_run_model(base_path, file, model):
    # Load model components and test data
    nlp = spacy.load(base_path + file + '/' + model + '/model-best')
    doc_bin = DocBin().from_disk(base_path + file + '/test.spacy')
    docs = list(doc_bin.get_docs(nlp.vocab))
    tok2vec = nlp.get_pipe('tok2vec')
    spancat = nlp.get_pipe('spancat')
    scorer = Scorer(nlp)
    
    label_rev_map = {v: k for k, v in spancat._label_map.items()}
    label_rev_map[spancat._negative_label_i] = 'no_label'

    # Run inference
    examples = []
    true_spans = []
    pred_spans = []
    for doc in docs:
        # Run pipeline
        prediction = nlp(doc.text)
        example = Example(prediction, doc)
        examples.append(example)

        # Run components separately, to access predictions for specific spans
        doc = tok2vec(doc)
        indices, scores = spancat.predict([doc])
        for span in doc.spans['sc']:
            
            # Find index to corresponding prediction
            target_value = [span.start, span.end]
            span_index = None
            for i, value in enumerate(indices.data):
                if np.array_equal(value, target_value):
                    span_index = i
                    break
                    
            # Access prediction for gold span        
            gold_span_preds = scores[[span_index]]
            predicted_label = label_rev_map[gold_span_preds.argmax()]
            
            # Store span labels for PRF calculations
            true_spans.append(span.label_)
            pred_spans.append(predicted_label)
    
    # Assess performance from inference
    scores = scorer.score(examples)
    return examples, scores, true_spans, pred_spans

def _tuple_overlap(tL, tR):
    # tL: tuple(begin, end)
    # tR: tuple(begin, end)
    tLrange = set(range(len(tL)))
    tRrange = set(range(len(tR)))
               
    InterSection = len(tLrange.intersection(tRrange))
    Union = len(tLrange.union(tRrange))

    return InterSection/Union if Union>0 else np.nan


def span_overlap_counter(examples):
    overlaplist = []
    for case in examples:
        span_set_spancat = set()
        span_set_labeled = set()
        
        for span in case.reference.spans['sc']:
            span_set_labeled.add(span)
            
        for span in case.predicted.spans['sc']:
            span_set_spancat.add(span)

        jaccard_indices = []
        for span_l in span_set_labeled:
            _jaccard_indices = []
            for span_s in span_set_spancat:
                _jaccard_indices.append(_tuple_overlap(span_l, span_s))
            try:
                max_ = max(_jaccard_indices)
            except:
                max_ = np.nan
            jaccard_indices.append(max_)
        overlaplist.append(jaccard_indices)
    return overlaplist


def span_overlap_counter_reverse(examples):
    overlaplist = []
    for case in examples:
        span_set_spancat = set()
        span_set_labeled = set()
        
        for span in case.reference.spans['sc']:
            span_set_labeled.add(span)
            
        for span in case.predicted.spans['sc']:
            span_set_spancat.add(span)

        jaccard_indices = []
        for span_s in span_set_spancat:
            _jaccard_indices = []
            for span_l in span_set_labeled:
                _jaccard_indices.append(_tuple_overlap(span_l, span_s))
            try:
                max_ = max(_jaccard_indices)
            except:
                max_ = np.nan
            jaccard_indices.append(max_)
        overlaplist.append(jaccard_indices)
    return overlaplist


def calculate_false_label_proportion(true_spans, pred_spans):
    fp = 0
    total = len(true_spans)
    for true, pred in zip(true_spans, pred_spans):
        if true != pred and pred != 'no_label':
            fp += 1
    return fp / total

In [4]:
base_path = '/training/echo/text_mining/spancat_models/reduced_labels/spacy_data/'

df = pd.DataFrame(columns=['entity', 
                           'p_w_06', 'p_w_08', 'p_w_10', 'p_m_06', 'p_m_08', 'p_m_10', 
                           'r_w_06', 'r_w_08', 'r_w_10', 'r_m_06', 'r_m_08', 'r_m_10',
                           'f_w_06', 'f_w_08', 'f_w_10', 'f_m_06', 'f_m_08', 'f_m_10',
                           'jaccard_06', 'jaccard_08', 'jaccard_10', 'jaccard_rev_06', 'jaccard_rev_08', 'jaccard_rev_10',
                          'pgold_w_06', 'pgold_w_08', 'pgold_w_10', 'pgold_m_06', 'pgold_m_08', 'pgold_m_10', 'pgold_mi_06', 'pgold_mi_08', 'pgold_mi_10',
                          'rgold_w_06', 'rgold_w_08', 'rgold_w_10', 'rgold_m_06', 'rgold_m_08', 'rgold_m_10', 'rgold_mi_06', 'rgold_mi_08', 'rgold_mi_10',
                          'fgold_w_06', 'fgold_w_08', 'fgold_w_10', 'fgold_m_06', 'fgold_m_08', 'fgold_m_10', 'fgold_mi_06', 'fgold_mi_08', 'fgold_mi_10', 'fp'])

files = [x for x in os.listdir(base_path) if not x.startswith('.')]

# Iterate over all abnormalities
for file in tqdm(files):
    data = {'entity': file}

    # Iterate over model versions
    for model in [x for x in os.listdir(base_path + file) if not x.endswith('.spacy')]:
        
        # Extract model version
        nw = model.split('_')[-1]

        # Load model and data, run inference
        examples, scores, true_spans, pred_spans = load_and_run_model(base_path, file, model)
        
        # Assess PRF for all spans
        for metric in ['p', 'r', 'f']:
            # Table 3
            data[f'{metric}gold_w_{nw}'] = precision_score(true_spans, pred_spans, average='weighted', zero_division=0) # Weighted PRF for gold spans
            data[f'{metric}gold_m_{nw}'] = precision_score(true_spans, pred_spans, average='macro', zero_division=0) # Macro PRF for gold spans
            data[f'{metric}gold_mi_{nw}'] = precision_score(true_spans, pred_spans, average='micro', zero_division=0) # Micro PRF for gold spans
            
            # Table 4
            data[f'{metric}_w_{nw}'] = scores[f'spans_sc_{metric}'] # Weighted PRF (identical to PRF reported in meta.json)
            data[f'{metric}_m_{nw}'] = np.mean([v[f'{metric}'] for _, v in scores['spans_sc_per_type'].items()]) # Macro PRF 

        # Assess Jaccard index (Table 5)
        OverlapJaccardIndices = span_overlap_counter(examples)
        OverlapJaccardIndicesRev = span_overlap_counter_reverse(examples)
        data[f'jaccard_{nw}'] = np.nanmean([_v for v in OverlapJaccardIndices for _v in v])
        data[f'jaccard_rev_{nw}'] = np.nanmean([_v for v in OverlapJaccardIndicesRev for _v in v])

        # Calculate false label proportion (Table ?)
        data['fp'] = calculate_false_label_proportion(true_spans, pred_spans)
        
    # Add data row
    df.loc[len(df)] = data

  0%|          | 0/12 [00:00<?, ?it/s]

In [5]:
df

Unnamed: 0,entity,p_t_06,p_t_08,p_t_10,p_w_06,p_w_08,p_w_10,p_m_06,p_m_08,p_m_10,...,fgold_w_06,fgold_w_08,fgold_w_10,fgold_m_06,fgold_m_08,fgold_m_10,fgold_mi_06,fgold_mi_08,fgold_mi_10,fp
0,rv_dil,,,,0.9375,0.916155,0.931034,0.751793,0.692235,0.726056,...,0.990315,0.966986,0.974694,0.663194,0.615108,0.634728,0.810247,0.83871,0.870968,0.001898
1,aortic_stenosis,,,,0.889286,0.884211,0.890977,0.808844,0.830746,0.624895,...,0.986103,0.992145,0.953035,0.726667,0.76,0.545455,0.752266,0.758308,0.716012,0.009063
2,pe,,,,0.833333,0.903743,0.923529,0.266836,0.313543,0.184706,...,0.846199,0.854098,0.74866,0.316233,0.316667,0.16457,0.733607,0.692623,0.643443,0.020492
3,aortic_regurgitation,,,,0.953162,0.969072,0.914352,0.726851,0.487992,0.544527,...,0.971188,0.922764,0.913879,0.583424,0.4,0.450732,0.831301,0.772358,0.808943,0.010163
4,lv_dil,,,,0.813679,0.841232,0.884422,0.46069,0.694208,0.910864,...,0.897838,0.958001,0.992591,0.399605,0.590929,0.794834,0.730942,0.778027,0.789238,0.065022
5,merged_labels,,,,0.850537,0.850377,0.845899,0.761713,0.76254,0.73296,...,0.799695,0.785019,0.817884,0.769993,0.769095,0.750579,0.627636,0.621046,0.61797,0.230009
6,lv_syst_func,,,,0.806002,0.790202,0.80521,0.774486,0.7592,0.769599,...,0.994016,0.983827,0.990906,0.786203,0.76687,0.782016,0.762918,0.752786,0.721378,0.004053
7,mitral_regurgitation,,,,0.949791,0.946612,0.955752,0.703604,0.716748,0.719067,...,0.948272,0.952052,0.967983,0.564613,0.58146,0.589379,0.857418,0.876686,0.842004,0.026975
8,wma,,,,0.669565,0.652381,0.703125,0.66834,0.663054,0.710041,...,1.0,1.0,1.0,0.666667,0.666667,0.666667,0.589641,0.525896,0.537849,0.0
9,tricuspid_regurgitation,,,,0.925072,0.932515,0.958333,0.858377,0.850428,0.70137,...,0.96224,0.987121,0.934978,0.719666,0.775883,0.562016,0.883657,0.839335,0.828255,0.033241


In [14]:
# Choose entity-specific best model with regards to negative weight
# Based on highest F-score
df['f_w_max'] = np.nanmax(df[['f_w_06', 'f_w_08', 'f_w_10']], axis=1)
df['nw_max'] = np.nanargmax(df[['f_w_06', 'f_w_08', 'f_w_10']], axis=1)
df['nw_max'].replace({0: '06', 1: '08', 2: '10'}, inplace=True)

# Pick corresponding performance metrics
df['jaccard_max'] = df.apply(lambda x: round(x['jaccard_' + str(x['nw_max'])], 2), axis=1)
df['jaccard_rev_max'] = df.apply(lambda x: round(x['jaccard_rev_' + str(x['nw_max'])], 2), axis=1)

for metric in ['p', 'r', 'f']:
    df[f'{metric}'] = df.apply(lambda x: str(round(x[f'{metric}_w_' + str(x['nw_max'])], 2)) + ' (' + str(round(x[f'{metric}_m_' + str(x['nw_max'])], 2)) + ')', axis=1)
    df[f'{metric}gold'] = df.apply(lambda x: str(round(x[f'{metric}gold_w_' + str(x['nw_max'])], 2)) + ' (' + str(round(x[f'{metric}gold_m_' + str(x['nw_max'])], 2)) + ')', axis=1)

df['fp'] = df['fp'].apply(lambda x: round(x, 3))

In [8]:
df = df.sort_values('entity').reset_index(drop=True)

In [9]:
df

Unnamed: 0,entity,p_t_06,p_t_08,p_t_10,p_w_06,p_w_08,p_w_10,p_m_06,p_m_08,p_m_10,...,f_w_max,nw_max,jaccard_max,jaccard_rev_max,p,pgold,r,rgold,f,fgold
0,aortic_regurgitation,,,,0.953162,0.969072,0.914352,0.726851,0.487992,0.544527,...,0.888646,6,0.97,0.98,0.95 (0.73),0.97 (0.58),0.83 (0.59),0.97 (0.58),0.89 (0.65),0.97 (0.58)
1,aortic_stenosis,,,,0.889286,0.884211,0.890977,0.808844,0.830746,0.624895,...,0.818182,8,0.95,0.97,0.88 (0.83),0.99 (0.76),0.76 (0.66),0.99 (0.76),0.82 (0.73),0.99 (0.76)
2,diastolic_dysfunction,,,,0.92549,0.906015,0.916981,0.884612,0.864572,0.884596,...,0.874101,10,0.98,0.99,0.92 (0.88),0.98 (0.78),0.84 (0.8),0.98 (0.78),0.87 (0.84),0.98 (0.78)
3,lv_dil,,,,0.813679,0.841232,0.884422,0.46069,0.694208,0.910864,...,0.834123,10,0.96,0.96,0.88 (0.91),0.99 (0.79),0.79 (0.82),0.99 (0.79),0.83 (0.86),0.99 (0.79)
4,lv_syst_func,,,,0.806002,0.790202,0.80521,0.774486,0.7592,0.769599,...,0.783742,6,0.96,0.95,0.81 (0.77),0.99 (0.79),0.76 (0.73),0.99 (0.79),0.78 (0.75),0.99 (0.79)
5,merged_labels,,,,0.850537,0.850377,0.845899,0.761713,0.76254,0.73296,...,0.722229,6,0.97,0.98,0.85 (0.76),0.8 (0.77),0.63 (0.65),0.8 (0.77),0.72 (0.69),0.8 (0.77)
6,mitral_regurgitation,,,,0.949791,0.946612,0.955752,0.703604,0.716748,0.719067,...,0.921079,8,0.97,0.97,0.95 (0.72),0.95 (0.58),0.9 (0.69),0.95 (0.58),0.92 (0.7),0.95 (0.58)
7,pe,,,,0.833333,0.903743,0.923529,0.266836,0.313543,0.184706,...,0.784223,8,0.97,0.98,0.9 (0.31),0.85 (0.32),0.69 (0.24),0.85 (0.32),0.78 (0.27),0.85 (0.32)
8,rv_dil,,,,0.9375,0.916155,0.931034,0.751793,0.692235,0.726056,...,0.9,10,0.99,0.99,0.93 (0.73),0.97 (0.63),0.87 (0.66),0.97 (0.63),0.9 (0.69),0.97 (0.63)
9,rv_syst_func,,,,0.919214,0.905579,0.931663,0.672436,0.641564,0.667808,...,0.890063,6,0.99,0.99,0.92 (0.67),0.95 (0.57),0.86 (0.63),0.95 (0.57),0.89 (0.65),0.95 (0.57)


In [10]:
## Table 3 - PRF scores for the exact gold spans
#df[['entity', 'fgold', 'rgold', 'pgold']].to_latex('/training/echo/text_mining/output/table3_spancat_pipeline_performance_goldspans.tex', index=False)
df[['entity', 'fgold', 'rgold', 'pgold']]

Unnamed: 0,entity,fgold,rgold,pgold
0,aortic_regurgitation,0.97 (0.58),0.97 (0.58),0.97 (0.58)
1,aortic_stenosis,0.99 (0.76),0.99 (0.76),0.99 (0.76)
2,diastolic_dysfunction,0.98 (0.78),0.98 (0.78),0.98 (0.78)
3,lv_dil,0.99 (0.79),0.99 (0.79),0.99 (0.79)
4,lv_syst_func,0.99 (0.79),0.99 (0.79),0.99 (0.79)
5,merged_labels,0.8 (0.77),0.8 (0.77),0.8 (0.77)
6,mitral_regurgitation,0.95 (0.58),0.95 (0.58),0.95 (0.58)
7,pe,0.85 (0.32),0.85 (0.32),0.85 (0.32)
8,rv_dil,0.97 (0.63),0.97 (0.63),0.97 (0.63)
9,rv_syst_func,0.95 (0.57),0.95 (0.57),0.95 (0.57)


In [11]:
## Table 4 - PRF scores for all predicted spans
#df[['entity', 'f', 'r', 'p']].to_latex('/training/echo/text_mining/output/table4_spancat_pipeline_performance.tex', index=False)
df[['entity', 'f', 'r', 'p']]

Unnamed: 0,entity,f,r,p
0,aortic_regurgitation,0.89 (0.65),0.83 (0.59),0.95 (0.73)
1,aortic_stenosis,0.82 (0.73),0.76 (0.66),0.88 (0.83)
2,diastolic_dysfunction,0.87 (0.84),0.84 (0.8),0.92 (0.88)
3,lv_dil,0.83 (0.86),0.79 (0.82),0.88 (0.91)
4,lv_syst_func,0.78 (0.75),0.76 (0.73),0.81 (0.77)
5,merged_labels,0.72 (0.69),0.63 (0.65),0.85 (0.76)
6,mitral_regurgitation,0.92 (0.7),0.9 (0.69),0.95 (0.72)
7,pe,0.78 (0.27),0.69 (0.24),0.9 (0.31)
8,rv_dil,0.9 (0.69),0.87 (0.66),0.93 (0.73)
9,rv_syst_func,0.89 (0.65),0.86 (0.63),0.92 (0.67)


In [12]:
## Table 5 - Jaccard similarity for all gold spans and for all predicted spans 
#df[['entity', 'jaccard_max', 'jaccard_rev_max']].to_latex('/training/echo/text_mining/output/table5_spancat_jaccard_labeltospan.tex', index=False)
df[['entity', 'jaccard_max', 'jaccard_rev_max']]

Unnamed: 0,entity,jaccard_max,jaccard_rev_max
0,aortic_regurgitation,0.97,0.98
1,aortic_stenosis,0.95,0.97
2,diastolic_dysfunction,0.98,0.99
3,lv_dil,0.96,0.96
4,lv_syst_func,0.96,0.95
5,merged_labels,0.97,0.98
6,mitral_regurgitation,0.97,0.97
7,pe,0.97,0.98
8,rv_dil,0.99,0.99
9,rv_syst_func,0.99,0.99


In [15]:
## Table x - False positives as a percentage of all predicted labels
df[['entity', 'fp']]

Unnamed: 0,entity,fp
0,aortic_regurgitation,0.01
1,aortic_stenosis,0.009
2,diastolic_dysfunction,0.014
3,lv_dil,0.065
4,lv_syst_func,0.004
5,merged_labels,0.23
6,mitral_regurgitation,0.027
7,pe,0.02
8,rv_dil,0.002
9,rv_syst_func,0.025
