In [1]:
import numpy as np
import pandas as pd
import re
import spacy

from sklearn.model_selection import StratifiedGroupKFold
from sklearn_crfsuite import CRF

from utils import *
from evalutils import *

import warnings
warnings.filterwarnings("ignore")

___
# Gold standard annotations
### Load gold standard concepts

In [2]:
# Read the gold-standard concept annotations and discard the columns
# that are dropped before any further processing in this notebook
true_concepts = (
    pd.read_csv("../datasets/gold_concepts.csv")
    .drop(columns=['concept_id', 'preceding', 'following'])
)
print(true_concepts.shape)
true_concepts.head()

(1155, 7)


Unnamed: 0,histopathology_id,patient_id,report_no,concept,phrase,start_char,end_char
0,658,13,1,Invasiveness,intravascular spaces,669,689
1,658,13,1,Stain,PAS,715,718
2,658,13,1,Stain,GMS,723,726
3,658,13,1,positive,positive,700,708
4,658,13,1,FungalDescriptor,necrotic fungi,651,665


### Load tokeniser

In [3]:
# Load the spaCy model with the NER component excluded
nlp = spacy.load("en_core_web_sm", exclude=['ner'])

# Tokenise every annotated phrase (tokeniser only, not the whole pipeline)
true_concepts['doc'] = true_concepts['phrase'].apply(nlp.tokenizer)

### Assign BIOES tags

In [4]:
# Start the BIOES table from the single-token entities. Each of them is a
# complete entity on its own, so its concept name receives the "S-" prefix.
# (The empty DataFrame that used to be created first was overwritten
# immediately and has been removed.)
is_single_token = true_concepts.doc.apply(len) == 1
true_concepts_bioes = true_concepts[is_single_token].copy()

# Add the "S" tag
true_concepts_bioes['concept'] = true_concepts_bioes['concept'].apply(lambda name: "S-" + name)

# The tokenised phrase is not part of the BIOES table
true_concepts_bioes = true_concepts_bioes.drop(columns='doc')

In [5]:
# Multi-token entities: expand each annotated phrase into one BIOES-tagged row
# per token and append those rows to `true_concepts_bioes`.
#
# Rows are collected in a list and appended with a single concat instead of
# growing the DataFrame inside the loop, which re-copied the table every time.
multi_token_rows = []

for _, entity in true_concepts[true_concepts.doc.apply(len) > 1].iterrows():

    # Whitespace tokens never get a label. They are removed *before* working
    # out which token is first/last, so that the "B-" and "E-" tags always land
    # on real tokens even when the phrase starts or ends with whitespace.
    tokens = [token for token in entity.doc if not token.is_space]
    last_pos = len(tokens) - 1

    for pos, token in enumerate(tokens):

        if last_pos == 0:
            # Only one real token is left once whitespace is ignored
            prefix = "S-"
        elif pos == 0:
            # First token of the entity
            prefix = "B-"
        elif pos == last_pos:
            # Last token of the entity
            prefix = "E-"
        else:
            # Token in the middle of the entity
            prefix = "I-"

        # token.idx is the character offset of the token inside the phrase,
        # so adding it to the phrase start gives the position in the report
        start_char = entity.start_char + token.idx

        multi_token_rows.append({
            'histopathology_id': entity.histopathology_id,
            'patient_id': entity.patient_id,
            'report_no': entity.report_no,
            'concept': prefix + entity.concept,
            # Store the token text (a str, like the single-token rows),
            # not the spaCy Token object
            'phrase': token.text,
            'start_char': start_char,
            'end_char': start_char + len(token),
        })

# Add to the table of concepts
if multi_token_rows:
    true_concepts_bioes = pd.concat(
        [true_concepts_bioes, pd.DataFrame(multi_token_rows)],
        axis=0,
        ignore_index=True,
    )

# Sort BIOES tagged concepts
true_concepts_bioes = true_concepts_bioes.sort_values(by=['histopathology_id', 'start_char'])
true_concepts_bioes.shape

(1649, 7)

___
# Prepare data
### Load reports

In [6]:
# Read the development-set reports, then show the frame size and a preview
reports_dev_path = "../datasets/reports_dev.csv"
df = pd.read_csv(reports_dev_path)
print(df.shape)
df.head()

(231, 8)


Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset,order_results,y
0,13,1,Positive,658,10.0,development,"""XXXXXX M XXXXXXXXXX Report (XXXXXXXX)\nCLINI...",1
1,14,1,Positive,189,7.0,development,"""URNO XXXXXXXX \nLab No XXXXXXXXX ...",1
2,28,1,Negative,529,8.0,development,"""URNO XXXXXXXXX Lab No XXXXXXXXX ...",0
3,28,2,Positive,325,8.0,development,"""URNO XXXXXXXXX \nLab No XXXXXXXXX ...",1
4,28,3,Negative,559,8.0,development,"""URNO XXXXXXXXX \nLab No XXXXXXXXX ...",0


### Tokenize

In [7]:
# Reload the spaCy model, again with the NER component excluded
nlp = spacy.load("en_core_web_sm", exclude=["ner"])

# Enable the 'senter' sentence-segmentation component
nlp.enable_pipe("senter")

# Show the components that are now active in the pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer']

In [8]:
# Run the text of every report through the spaCy pipeline
df['doc'] = df['order_results'].apply(nlp)

### Extract features

In [9]:
# Build the feature map of each parsed report with `create_features`
df['token_features'] = df['doc'].apply(create_features)

### Label concepts

In [10]:
# Derive the labels of each report (row-wise) from the BIOES-tagged gold concepts
df['token_labels'] = df.apply(create_labels, axis=1, true_concepts=true_concepts_bioes)

___
# Model selection
### Hyperparameter tuning

In [None]:
# %%time
# X = df.order_results
# y = df.y
# groups = df.patient_id

# param_space = {
#     'c1': [0.01, 0.1, 1, 10],
#     'c2': [0.01, 0.1, 1, 10],
# #     'all_possible_states': [False, True],
# #     'all_possible_transitions': [False, True],
# }

# best_params = None
# best_score = float('-inf')

# for c1 in param_space['c1']:
#     for c2 in param_space['c2']:

#         cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=3)
#         f1_score = []

#         for train_idx, val_idx in cv.split(X, y, groups):
            
#             # Initialise CRF object 
#             crf = CRF(algorithm='lbfgs', c1=c1, c2=c2)

#             # Train the model
#             crf.fit(df.loc[train_idx, 'token_features'], df.loc[train_idx, 'token_labels'])

#             # Make predictions on the validation fold
#             y_pred = crf.predict(df.loc[val_idx, 'token_features'])

#             # Calculate macro f1
#             f1_score.append(flat_f1_score(df.loc[val_idx, 'token_labels'], y_pred, average='macro'))
            
#         print("With parameters c1=%.2f and c2=%.2f, the model achieves %.2f (+/- %.2f)." % 
#               (c1, c2, np.mean(f1_score), np.std(f1_score)))

#         if np.mean(f1_score) > best_score:
#             best_score = np.mean(f1_score)
#             best_params = {'c1': c1, 'c2': c2}
                
                
# print("Best macro F1 score = %.2f. Best hyperparameter values: c1=%.2f and c2=%.2f." % 
#       (best_score, best_params['c1'], best_params['c2']))

### Model evaluation

In [11]:
%%time
X = df.order_results
y = df.y
groups = df.patient_id

df['y_pred'] = np.empty((len(df), 0)).tolist()

cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=3)

for train_idx, val_idx in cv.split(X, y, groups):
            
    # Initialise CRF object 
    crf = CRF(algorithm='lbfgs', c1=0.01, c2=0.1, max_iterations=100, all_possible_transitions=True)

    # Train the model
    crf.fit(df.loc[train_idx, 'token_features'], df.loc[train_idx, 'token_labels'])

    # Make predictions on the validation fold
    df.loc[val_idx, 'y_pred'] = pd.Series(crf.predict(df.loc[val_idx, 'token_features']), 
                                          index=val_idx)

CPU times: user 4min 10s, sys: 2.54 s, total: 4min 13s
Wall time: 4min 29s


### Transform predictions

In [12]:
# Turn the cross-validated predictions into concept tables:
# first the BIOES entities, then the composite entities
detected_concepts_bioes = prediction2concept_bioes(df)
detected_concepts = prediction2concept(df)

# Sizes of the two tables
(detected_concepts_bioes.shape, detected_concepts.shape)

((855, 7), (617, 7))

### Calculate precision & recall for CV

In [None]:
# # BIOES entities
# feature_names = get_feature_names('concepts', ('B-', 'I-', 'E-', 'S-'))

# evaluate_ner_cv(df[['histopathology_id', 'val_fold']], 
#                           true_concepts_bioes, 
#                           detected_concepts_bioes, 
#                           feature_names)

In [14]:
# Evaluate the composite entities against the gold standard, per validation fold
feature_names = get_feature_names('concepts')

counts = evaluate_ner_cv(
    df[['histopathology_id', 'val_fold']],
    true_concepts,
    detected_concepts,
    feature_names,
)

                  mean   std
concept                     
ClinicalQuery     0.98  0.08
FungalDescriptor  0.93  0.06
Fungus            0.98  0.05
Invasiveness      0.67  0.58
Stain             0.96  0.06
SampleType        0.59  0.14
positive          0.73  0.37
equivocal          NaN   NaN
negative          0.89  0.09
                  mean   std
concept                     
ClinicalQuery     0.72  0.22
FungalDescriptor  0.80  0.11
Fungus            0.80  0.21
Invasiveness      0.13  0.21
Stain             0.92  0.09
SampleType        0.26  0.11
positive          0.20  0.21
equivocal         0.00  0.00
negative          0.58  0.12


In [15]:
# Move the index levels of `counts` into ordinary columns
counts = counts.reset_index()

In [17]:
# tp / fp / fn totals of the 'equivocal' concept in each validation fold
(
    counts.loc[counts['concept'] == 'equivocal']
    .groupby('val_fold')[['tp', 'fp', 'fn']]
    .sum()
)

Unnamed: 0_level_0,tp,fp,fn
val_fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0,0,0
2.0,0,0,2
3.0,0,0,0
4.0,0,0,2
5.0,0,0,0
6.0,0,0,2
7.0,0,0,1
8.0,0,0,0
9.0,0,0,0
10.0,0,0,1


___
# Test
### Train CRF on the full training set

In [18]:
# Fit the final CRF on the whole development set, with the same settings as
# the cross-validation run.
crf = CRF(algorithm='lbfgs', c1=0.01, c2=0.1, max_iterations=100, all_possible_transitions=True)

# fit() is the last expression of the cell, so the notebook would try to render
# its return value. The trailing semicolon suppresses that display, which is the
# only effect the former dummy `for _ in range(1):` wrapper had (besides
# overwriting `_`).
# NOTE(review): presumably rendering the returned CRF object is what failed
# outside the loop -- confirm against the installed sklearn-crfsuite version.
crf.fit(df.token_features, df.token_labels);

### Load and prepare test data

In [19]:
# Read the test-set reports, then show the frame size and a preview
reports_test_path = "../datasets/reports_test.csv"
df_test = pd.read_csv(reports_test_path)
print(df_test.shape)
df_test.head()

(52, 8)


Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset,order_results,y
0,2,1,Positive,214,,test,"""URNO XXXXXXXX \nLab No XXXXXXXXX ...",1
1,23,1,Positive,127,,test,"""URNO XXXXXXXXX \nLab No XXXXXXXXX ...",1
2,25,1,Positive,833,,test,"""URNO XXXXXXXXX \nLab No XXXXXXXXX ...",1
3,38,1,Positive,194,,test,"""XXXXXXX F XXXXXXXXXX Report (XXXXXXXX)\nMACR...",1
4,48,1,Positive,649,,test,"""XXXXXXX \nM\nXXXXXXXXX\n \nReport (XXXXXXXX)\...",1


### Extract features

In [20]:
# Run the text of every test report through the spaCy pipeline
df_test['doc'] = df_test['order_results'].apply(nlp)

# Build the feature map of each parsed test report
df_test['token_features'] = df_test['doc'].apply(create_features)

### Run the model and evaluate predictions

In [21]:
# Predict a label sequence for every test report with the trained CRF
df_test['y_pred'] = crf.predict(df_test['token_features'])

# Turn the predictions into concept tables:
# first the BIOES entities, then the composite entities
detected_concepts_bioes = prediction2concept_bioes(df_test)
detected_concepts = prediction2concept(df_test)

# Sizes of the two tables
(detected_concepts_bioes.shape, detected_concepts.shape)

((200, 7), (145, 7))

In [None]:
# # BIOES entities
# feature_names = get_feature_names('concepts', ('B-', 'I-', 'E-', 'S-'))

# evaluate_ner(df_test.histopathology_id, 
#              true_concepts_bioes, 
#              detected_concepts_bioes, 
#              feature_names)

In [22]:
# Evaluate the composite entities detected in the test reports against the gold standard
feature_names = get_feature_names('concepts')

counts_test = evaluate_ner(
    df_test.histopathology_id,
    true_concepts,
    detected_concepts,
    feature_names,
)

concept
ClinicalQuery       1.000000
FungalDescriptor    0.934783
Fungus              0.933333
Invasiveness             NaN
Stain               0.970588
SampleType          0.500000
positive            1.000000
equivocal                NaN
negative            0.900000
dtype: float64

concept
ClinicalQuery       1.000000
FungalDescriptor    0.860000
Fungus              0.777778
Invasiveness        0.000000
Stain               0.942857
SampleType          0.325581
positive            0.200000
equivocal                NaN
negative            0.346154
dtype: float64


In [23]:
# Move the index levels of `counts_test` into ordinary columns
counts_test = counts_test.reset_index()

In [25]:
# tp / fp / fn totals of the 'equivocal' concept over the test reports
counts_test.loc[counts_test['concept'] == 'equivocal', ['tp', 'fp', 'fn']].sum()

tp    0
fp    0
fn    0
dtype: int64