In [1]:
import numpy as np
import pandas as pd
import re
import spacy

from sklearn.model_selection import StratifiedGroupKFold
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import *

from evalutils import get_feature_names, evaluate_ner

import matplotlib.pyplot as plt
import seaborn as sns
# Pretty plots
%matplotlib inline
# Seaborn "ticks" theme plus larger fonts for titles, axis labels, ticks and legends
sns.set_style('ticks')
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'axes.titlesize': 22,
    'axes.labelsize': 20,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 12,
    'legend.title_fontsize': 12,
})

# Show up to 100 characters per cell when displaying dataframes
pd.set_option('display.max_colwidth', 100)

___
# Gold standard annotations
### Load gold standard concepts

In [2]:
# Read the gold standard annotations and discard the concept_id, preceding and following columns
true_concepts = (
    pd.read_csv("gold_concepts.csv")
    .drop(columns=['concept_id', 'preceding', 'following'])
)
print(true_concepts.shape)
true_concepts.head()

(1137, 8)


Unnamed: 0,histopathology_id,patient_id,report_no,concept,phrase,position,start_char,end_char
0,754,1,1,SampleType,skin,202 206,202,206
1,214,2,1,FungalDescriptor,Fungal elements,1374 1389,1374,1389
2,214,2,1,Stain,Grocott,1412 1419,1412,1419
3,214,2,1,positive,identified,1394 1404,1394,1404
4,214,2,1,FungalDescriptor,Fungal elements,1593 1608,1593,1608


### Separate discontinuous concepts

In [3]:
# Discontinuous concepts list several spans in `position`, separated by ";".
# Literal match (no regex); rows with a missing position count as "not discontinuous".
is_discont = true_concepts.position.str.contains(";", regex=False, na=False)
idx = true_concepts.index[is_discont]

# Move the discontinuous concepts into a separate dataframe.
# `idx` holds index *labels*, so select with .loc; positional .iloc only agrees
# with it while the frame still has its default RangeIndex.
discont = true_concepts.loc[idx].copy()
true_concepts = true_concepts.drop(index=idx)

In [4]:
# Split every discontinuous concept into one record per contiguous span
span_records = []
for _, concept_row in discont.iterrows():
    pieces = []
    offset = 0
    for pos in concept_row.position.split(';'):
        # Each ;-separated item is "<start_char> <end_char>"
        start_char, end_char = map(int, pos.split())
        len_span = end_char - start_char
        # Assumes the phrase holds the span texts in the listed order; the "+ 1" below
        # skips the single character that separates consecutive span texts.
        pieces.append((start_char, end_char, concept_row.phrase[offset:offset + len_span]))
        offset += len_span + 1

    # Emit the spans of one concept ordered by starting position
    for start_char, end_char, phrase in sorted(pieces, key=lambda piece: piece[0]):
        record = concept_row.to_dict()
        record.update(start_char=start_char, end_char=end_char, phrase=phrase)
        span_records.append(record)

# Append all spans with a single concat: growing the frame row by row is quadratic, and
# going through Series.to_frame().T would turn every column into object dtype.
if span_records:
    span_rows = pd.DataFrame(span_records, columns=true_concepts.columns)
    true_concepts = pd.concat([true_concepts, span_rows], axis=0, ignore_index=True)

# Remove position column (start_char / end_char now describe every row)
true_concepts = true_concepts.drop(columns='position')
true_concepts.shape

(1155, 7)

### Load tokeniser

In [5]:
# spaCy English pipeline, loaded without the NER component
nlp = spacy.load("en_core_web_sm", exclude=['ner'])

# Tokenise every annotated phrase (tokeniser only, no other pipeline component is run)
true_concepts['doc'] = [nlp.tokenizer(phrase) for phrase in true_concepts.phrase]

### Assign BIOES tags

In [6]:
# Concepts whose phrase is a single token start the table of BIOES-tagged concepts.
# (No empty placeholder frame is created first: it was overwritten immediately.)
is_single_token = true_concepts.doc.apply(len) == 1

true_concepts_bioes = (
    true_concepts[is_single_token]
    # The tokenised phrase is not needed in the tagged table
    .drop(columns='doc')
    # Single-token entities get the "S-" tag
    .assign(concept=lambda frame: frame.concept.map(lambda name: "S-" + name))
)

In [7]:
# Multi-token entities: one BIOES-tagged record per non-whitespace token
multi_token_records = []
for _, entity in true_concepts[true_concepts.doc.apply(len) > 1].iterrows():

    # Drop whitespace tokens *before* deciding which token is first/last; otherwise a
    # leading or trailing whitespace token would leave the entity without "B-"/"E-".
    tokens = [token for token in entity.doc if not token.is_space]
    last = len(tokens) - 1

    for position, token in enumerate(tokens):
        if last == 0:
            # Only one real token remains once whitespace is removed
            prefix = "S-"
        elif position == 0:
            prefix = "B-"
        elif position == last:
            prefix = "E-"
        else:
            prefix = "I-"

        # token.idx is the offset inside the phrase; shift it by the phrase start
        start_char = entity.start_char + token.idx

        multi_token_records.append({
            'histopathology_id': entity.histopathology_id,
            'patient_id': entity.patient_id,
            'report_no': entity.report_no,
            'concept': prefix + entity.concept,
            # Store the text (not the spaCy Token) so the column matches the single-token rows
            'phrase': token.text,
            'start_char': start_char,
            'end_char': start_char + len(token),
        })

# Add to the table of concepts with a single concat instead of one concat per token
if multi_token_records:
    true_concepts_bioes = pd.concat([true_concepts_bioes, pd.DataFrame(multi_token_records)],
                                    axis=0, ignore_index=True)

# Sort BIOES tagged concepts by report and position
true_concepts_bioes = true_concepts_bioes.sort_values(by=['histopathology_id', 'start_char'])
true_concepts_bioes.shape

(1649, 7)

___
# Prepare data
### Load reports

In [4]:
# Load the reports to be tokenised, labelled and cross-validated below
# NOTE(review): this was described as the development set, but the file name says "test" — confirm which split this is
df = pd.read_csv("reports_test_justin.csv")
print(df.shape)
df.head()

(52, 5)


Unnamed: 0,histopathology_id,patient_id,report_no,order_results,y
0,214,2,1,"""URNO XXXXXXXX \nLab No XXXXXXXXX Specimen BRUSHINGS \n\n\nCLINICAL NOTES: \n\nLU...",1
1,127,23,1,"""URNO XXXXXXXXX \nLab No XXXXXXXXX Specimen BAL \n\n\nSPECIMEN \n1. Right upper l...",1
2,833,25,1,"""URNO XXXXXXXXX \nLab No XXXXXXXXX Specimen WASHINGS \n\n\nCLINICAL NOTES: \nNo c...",1
3,194,38,1,"""XXXXXXX F XXXXXXXXXX Report (XXXXXXXX)\nMACROSCOPIC DESCRIPTION: Biopsy of cyst"""": Three fr...",1
4,649,48,1,"""XXXXXXX \nM\nXXXXXXXXX\n \nReport (XXXXXXXX)\nCLINICAL NOTES: AML R bronchial thickening on C...",1


### Tokenize

In [9]:
# Load the spaCy English pipeline without the NER component
nlp = spacy.load("en_core_web_sm", exclude=['ner'])

# Enable the sentence segmenter ('senter')
# NOTE(review): the parser stays enabled as well (see the pipe_names output) — confirm both are wanted
nlp.enable_pipe('senter')

# Show the active pipeline components
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer']

In [10]:
# Run the NLP pipeline over every report text
df['doc'] = [nlp(report_text) for report_text in df.order_results]

### Extract features

In [11]:
def get_neighbors(doc, i):
    """
    Return the text of the nearest non-whitespace tokens around position ``i``.

    Any number of consecutive whitespace tokens is skipped in either direction,
    so a whitespace token is never returned as a neighbour.

    Parameters
    ----------
    doc : sequence of tokens
        Indexable sequence whose items expose ``.is_space`` and ``.text``.
    i : int
        Position of the token of interest within ``doc``.

    Returns
    -------
    tuple of (str, str)
        ``(prev_token, next_token)``; an empty string is used when there is no
        non-whitespace token on that side.
    """
    # Walk backwards until a non-whitespace token is found
    prev_token = ""
    for j in range(i - 1, -1, -1):
        if not doc[j].is_space:
            prev_token = doc[j].text
            break

    # Walk forwards until a non-whitespace token is found
    next_token = ""
    for j in range(i + 1, len(doc)):
        if not doc[j].is_space:
            next_token = doc[j].text
            break

    return prev_token, next_token
    
    
def create_features(doc):
    """
    Build one feature dictionary per non-whitespace token of ``doc``.

    Each dictionary holds the token text, its character offsets, casing flags,
    prefixes/suffixes of up to three characters, the neighbouring non-whitespace
    tokens and a few boolean token attributes. Returns the dictionaries as a list
    in document order.
    """
    rows = []

    for tok in (t for t in doc if not t.is_space):
        text = tok.text
        n_chars = len(tok)

        # Nearest non-whitespace tokens on either side
        before, after = get_neighbors(doc, tok.i)

        # Alphabetic token whose first character is already upper case
        starts_upper = tok.is_alpha and (text[0] == text.upper()[0])

        rows.append({
            'phrase': text,
            'start_char': tok.idx,
            'end_char': tok.idx + n_chars,
            'is_capitilized': starts_upper,
            'is_upper': tok.is_upper,
            'is_lower': tok.is_lower,
            # Two- and three-character affixes are left empty for shorter tokens
            'prefix1': text[:1],
            'prefix2': text[:2] if n_chars > 1 else "",
            'prefix3': text[:3] if n_chars > 2 else "",
            'suffix1': text[-1:],
            'suffix2': text[-2:] if n_chars > 1 else "",
            'suffix3': text[-3:] if n_chars > 2 else "",
            'prev_token': before,
            'next_token': after,
            'has_hyphen': '-' in text,
            'is_alpha': tok.is_alpha,
            'is_digit': tok.is_digit,
            'is_sent_start': tok.is_sent_start,
            'is_sent_end': tok.is_sent_end,
            'is_punct': tok.is_punct,
        })

    return rows

In [12]:
# One list of per-token feature dictionaries for every report
df['token_features'] = [create_features(doc) for doc in df.doc]

### Label concepts

In [13]:
def create_labels(x):
    """
    Assign a BIOES label to every non-whitespace token of a report.

    A token receives the gold standard concept whose ``start_char`` equals the
    token's character offset within the same report (matched on
    ``histopathology_id``), or "O" if no annotation starts there.

    Parameters
    ----------
    x : row of the reports dataframe
        Must provide ``histopathology_id`` and ``doc`` (the parsed report).

    Returns
    -------
    list of str
        One label per non-whitespace token, in document order.
    """
    # Restrict the gold standard to this report once, instead of filtering the
    # whole table again for every single token.
    report_concepts = true_concepts_bioes.loc[
        true_concepts_bioes.histopathology_id == x.histopathology_id,
        ['start_char', 'concept']
    ]

    # start offset -> label; setdefault keeps the first annotation when several
    # share the same start position.
    label_at = {}
    for start_char, concept in zip(report_concepts.start_char, report_concepts.concept):
        label_at.setdefault(start_char, concept)

    return [label_at.get(token.idx, "O") for token in x.doc if not token.is_space]

In [14]:
%%time
# One list of per-token labels for every report
df['token_labels'] = [create_labels(report) for _, report in df.iterrows()]

CPU times: user 33 s, sys: 552 ms, total: 33.6 s
Wall time: 36.4 s


### Hyperparameter tuning

In [15]:
%%time
X = df.order_results
y = df.y
groups = df.patient_id

# Regularisation grid; each (c1, c2) pair is passed to CRF(c1=..., c2=...)
param_space = {
    'c1': [0.01, 0.1, 1, 10],
    'c2': [0.01, 0.1, 1, 10],
}
grid = [(c1, c2) for c1 in param_space['c1'] for c2 in param_space['c2']]

best_params, best_score = None, float('-inf')

for c1, c2 in grid:

    # Same seed for every grid point, so all candidates are scored on identical folds
    cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=3)
    f1_score = []

    for train_idx, val_idx in cv.split(X, y, groups):
        train, val = df.loc[train_idx], df.loc[val_idx]

        # Fit a fresh CRF on the training folds
        crf = CRF(algorithm='lbfgs', c1=c1, c2=c2)
        crf.fit(train['token_features'], train['token_labels'])

        # Score the held-out fold with macro-averaged F1
        y_pred = crf.predict(val['token_features'])
        f1_score.append(flat_f1_score(val['token_labels'], y_pred, average='macro'))

    mean_f1, std_f1 = np.mean(f1_score), np.std(f1_score)
    print("With parameters c1=%.2f and c2=%.2f, the model achieves %.2f (+/- %.2f)." %
          (c1, c2, mean_f1, std_f1))

    # Keep the first grid point that reaches the highest mean score
    if mean_f1 > best_score:
        best_score = mean_f1
        best_params = {'c1': c1, 'c2': c2}


print("Best macro F1 score = %.2f. Best hyperparameter values: c1=%.2f and c2=%.2f." %
      (best_score, best_params['c1'], best_params['c2']))

With parameters c1=0.01 and c2=0.01, the model achieves 0.50 (+/- 0.10).
With parameters c1=0.01 and c2=0.10, the model achieves 0.51 (+/- 0.11).
With parameters c1=0.01 and c2=1.00, the model achieves 0.44 (+/- 0.11).
With parameters c1=0.01 and c2=10.00, the model achieves 0.19 (+/- 0.04).
With parameters c1=0.10 and c2=0.01, the model achieves 0.48 (+/- 0.10).
With parameters c1=0.10 and c2=0.10, the model achieves 0.50 (+/- 0.10).
With parameters c1=0.10 and c2=1.00, the model achieves 0.44 (+/- 0.11).
With parameters c1=0.10 and c2=10.00, the model achieves 0.19 (+/- 0.04).
With parameters c1=1.00 and c2=0.01, the model achieves 0.45 (+/- 0.10).
With parameters c1=1.00 and c2=0.10, the model achieves 0.45 (+/- 0.10).
With parameters c1=1.00 and c2=1.00, the model achieves 0.43 (+/- 0.11).
With parameters c1=1.00 and c2=10.00, the model achieves 0.19 (+/- 0.04).
With parameters c1=10.00 and c2=0.01, the model achieves 0.22 (+/- 0.05).
With parameters c1=10.00 and c2=0.10, the model

### Model evaluation

In [None]:
%%time
X = df.order_results
y = df.y
groups = df.patient_id

# One (initially empty) list of predicted labels per report
df['y_pred'] = np.empty((len(df), 0)).tolist()

# Validation fold in which each report was held out (-1 = never held out)
fold_ids = np.full(len(df), -1)

cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=3)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):

    # cv.split yields *positions*; translate them to index labels before using .loc
    train_labels, val_labels = df.index[train_idx], df.index[val_idx]
    fold_ids[val_idx] = fold

    # Initialise CRF object
    # NOTE(review): c1=0.01 / c2=0.1 is the best pair among the tuning results shown, but
    # max_iterations and all_possible_transitions were not part of that grid search.
    crf = CRF(algorithm='lbfgs', c1=0.01, c2=0.1, max_iterations=100, all_possible_transitions=True)

    # Train the model
    crf.fit(df.loc[train_labels, 'token_features'], df.loc[train_labels, 'token_labels'])

    # Make predictions on the validation fold
    df.loc[val_labels, 'y_pred'] = pd.Series(crf.predict(df.loc[val_labels, 'token_features']),
                                             index=val_labels)

# The cells below read `x.fold` and df[['histopathology_id', 'fold']]; the loaded reports
# do not necessarily carry that column, so record the CV fold unless one already exists.
# NOTE(review): folds are numbered from 0 — confirm evaluate_ner does not expect another scheme.
if 'fold' not in df.columns:
    df['fold'] = fold_ids

In [None]:
bioes_columns = ['histopathology_id', 'patient_id', 'report_no', 'fold',
                 'concept', 'phrase', 'start_char', 'end_char']

# Collect one frame of tagged tokens per report and concatenate once at the end
# (concatenating inside the loop copies the growing table on every iteration).
tagged_per_report = []

for _, x in df.iterrows():

    # One row per token: text and character offsets next to the predicted tag
    tokens = pd.concat([pd.DataFrame(x.token_features, columns=['phrase', 'start_char', 'end_char']),
                        pd.Series(x.y_pred, name='concept', dtype=object)],
                       axis=1)

    # Keep tagged tokens only; copy so the metadata columns are added to an independent frame
    tagged = tokens[tokens.concept != 'O'].copy()
    if tagged.empty:
        continue

    # Add metadata
    tagged['histopathology_id'] = x.histopathology_id
    tagged['patient_id'] = x.patient_id
    tagged['report_no'] = x.report_no
    tagged['fold'] = x.fold

    tagged_per_report.append(tagged)

# Table of detected (token-level) concepts
if tagged_per_report:
    detected_concepts_bioes = pd.concat(tagged_per_report, axis=0, ignore_index=True)[bioes_columns]
else:
    detected_concepts_bioes = pd.DataFrame(columns=bioes_columns)

detected_concepts_bioes.shape

In [None]:
def _decode_bioes(tags, token_features, text):
    """
    Collapse a BIOES tag sequence into entity spans.

    Parameters
    ----------
    tags : sequence of str
        Predicted tag per token ("O" or "B-"/"I-"/"E-"/"S-" followed by the concept).
    token_features : sequence of dict
        Per-token dictionaries providing 'start_char' and 'end_char'.
    text : str
        Report text the character offsets refer to.

    Returns
    -------
    dict of lists
        Keys 'concept', 'phrase', 'start_char', 'end_char', one entry per entity.
        "S-" yields a single-token entity. "B-" opens an entity, "I-" continues it
        and "E-" closes it; the concept name is taken from the closing "E-" tag.
        Sequences that are opened but never closed, and "E-" tags without an open
        entity, are dropped.
    """
    ents = {k: [] for k in ('concept', 'phrase', 'start_char', 'end_char')}

    def add_entity(concept, start_char, end_char):
        ents['concept'].append(concept)
        ents['phrase'].append(text[start_char:end_char])
        ents['start_char'].append(start_char)
        ents['end_char'].append(end_char)

    # Start offset of the entity currently being built (None = no open entity)
    open_start = None

    for tag, features in zip(tags, token_features):
        if tag.startswith("S-"):
            add_entity(tag[2:], features['start_char'], features['end_char'])
            open_start = None

        elif tag.startswith("B-"):
            # A new "B-" replaces any entity that was opened but never closed
            open_start = features['start_char']

        elif tag.startswith("I-"):
            # Inside an entity: nothing to record
            continue

        elif tag.startswith("E-"):
            # Close only if a "B-" opened the entity; a dangling "E-" is ignored
            if open_start is not None:
                add_entity(tag[2:], open_start, features['end_char'])
            open_start = None

        else:
            # "O" (or an unexpected tag) interrupts any open entity
            open_start = None

    return ents


concept_columns = ['histopathology_id', 'patient_id', 'report_no', 'fold',
                   'concept', 'phrase', 'start_char', 'end_char']

# Collect one frame of entities per report and concatenate once at the end
entities_per_report = []

for _, x in df.iterrows():

    # Convert the decoded entities to a dataframe
    entities = pd.DataFrame(_decode_bioes(x.y_pred, x.token_features, x.order_results))
    if entities.empty:
        continue

    # Add metadata
    entities['histopathology_id'] = x.histopathology_id
    entities['patient_id'] = x.patient_id
    entities['report_no'] = x.report_no
    entities['fold'] = x.fold

    entities_per_report.append(entities)

# Table of detected (entity-level) concepts
if entities_per_report:
    detected_concepts = pd.concat(entities_per_report, axis=0, ignore_index=True)[concept_columns]
else:
    detected_concepts = pd.DataFrame(columns=concept_columns)

detected_concepts.shape

In [None]:
# Preview the first rows of the entity-level table of detected concepts
detected_concepts.head()

### Evaluate results

In [None]:
# Token-level evaluation: gold standard BIOES tags vs. predicted BIOES tags
# NOTE(review): get_feature_names / evaluate_ner come from the project-local evalutils module;
# presumably the tuple expands each concept name with the B-/I-/E-/S- prefixes — confirm.
feature_names = get_feature_names('concepts', ('B-', 'I-', 'E-', 'S-'))
evaluate_ner(df[['histopathology_id', 'fold']], true_concepts_bioes, detected_concepts_bioes, feature_names)

In [None]:
# Entity-level evaluation: gold standard concept spans vs. spans decoded from the predictions
# NOTE(review): evaluate_ner is project-local; how it matches spans and uses `fold` is not visible here.
feature_names = get_feature_names('concepts')
evaluate_ner(df[['histopathology_id', 'fold']], true_concepts, detected_concepts, feature_names)

### Plot comparison

In [None]:
# Concept names used as the x-axis categories of the comparison plots
feature_names = get_feature_names('concepts')
feature_names

In [None]:
# Hard-coded scores of the dictionary-based approach (nine values per list).
# NOTE(review): presumably copied from a separate run and ordered like `feature_names` — confirm
# the order and keep the values in sync with their source.

# Dictionary-based precision
dict_prec_mean = [0.92, 0.75, 0.82, 0.45, 0.94, 0.15, 0.04, 0.01, 0.14]
dict_prec_std = [0.13, 0.1, 0.3, 0.41, 0.05, 0.03, 0.02, 0.02, 0.04]

# Dictionary-based recall
dict_rec_mean = [0.53, 0.93, 0.92, 0.60, 0.95, 0.86, 0.83, 0.58, 0.98]
dict_rec_std = [0.35, 0.04, 0.15, 0.39, 0.09, 0.1, 0.17, 0.5, 0.05]

In [None]:
# Per-concept precision/recall (mean and std) of both approaches, one row per concept.
# NOTE(review): crf_prec_mean, crf_prec_std, crf_rec_mean and crf_rec_std are not defined in
# the visible part of the notebook — they must exist before this cell is run.
score_columns = {
    'feature_names': feature_names,
    'dict_prec_mean': dict_prec_mean,
    'dict_prec_std': dict_prec_std,
    'crf_prec_mean': crf_prec_mean,
    'crf_prec_std': crf_prec_std,
    'dict_rec_mean': dict_rec_mean,
    'dict_rec_std': dict_rec_std,
    'crf_rec_mean': crf_rec_mean,
    'crf_rec_std': crf_rec_std,
}
scores = pd.DataFrame(score_columns)

In [None]:
plt.rcParams['figure.figsize'] = (10, 4)

# First and fourth colours of the current seaborn palette
palette = sns.color_palette()
colors = (palette[0], palette[3])

# (mean column, std column, legend label, colour) for each approach
precision_series = (
    ('dict_prec_mean', 'dict_prec_std', "Dictionary", colors[0]),
    ('crf_prec_mean', 'crf_prec_std', "CRF", colors[1]),
)

# Mean precision per concept with +/- one standard deviation as error bars
for mean_col, std_col, label, color in precision_series:
    plt.errorbar(x=scores.feature_names, y=scores[mean_col], yerr=scores[std_col],
                 fmt='o', capsize=2, color=color, label=label)

plt.legend();
plt.xticks(rotation=90);
plt.title("Precision CV");
plt.ylim([-0.2, 1.5])

plt.savefig("comparison_precision_cv", dpi=300, bbox_inches="tight");

In [None]:
plt.rcParams['figure.figsize'] = (10, 4)

# First and fourth colours of the current seaborn palette
palette = sns.color_palette()
colors = (palette[0], palette[3])

# (mean column, std column, legend label, colour) for each approach
recall_series = (
    ('dict_rec_mean', 'dict_rec_std', "Dictionary", colors[0]),
    ('crf_rec_mean', 'crf_rec_std', "CRF", colors[1]),
)

# Mean recall per concept with +/- one standard deviation as error bars
for mean_col, std_col, label, color in recall_series:
    plt.errorbar(x=scores.feature_names, y=scores[mean_col], yerr=scores[std_col],
                 fmt='o', capsize=2, color=color, label=label)

plt.legend();
plt.xticks(rotation=90);
plt.title("Recall CV");
plt.ylim([-0.2, 1.5]);

plt.savefig("comparison_recall_cv", dpi=300, bbox_inches="tight");