In [1]:
import numpy as np
import pandas as pd
import re
import spacy

from sklearn_crfsuite import CRF

import utils, crf_utils, eval_utils

import warnings
# NOTE(review): with no category or module given, this silences every warning for the
# rest of the session, which can also hide genuinely useful warnings from the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings("ignore")

___
# Load CHIFIR data

In [2]:
# Root folder of the CHIFIR dataset, relative to this notebook.
# The metadata CSV, the "reports/" folder and the "annotations/" folder are all
# located by appending to this string, so the trailing slash is required.
path = "../../../Data/CHIFIR/"

In [3]:
# Read the report metadata table that ships with the dataset
df = pd.read_csv(f"{path}chifir_metadata.csv")

# Preview the first rows
df.head()

Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset
0,13,1,Positive,658,10.0,development
1,14,1,Positive,189,7.0,development
2,28,1,Negative,529,8.0,development
3,28,2,Positive,325,8.0,development
4,28,3,Negative,559,8.0,development


In [4]:
# Hold the test reports out up front so that nothing below can leak into model development
is_test = df.dataset == 'test'
df_test = df[is_test].copy()

# Keep only the non-test rows and renumber them from zero
df = df[~is_test].reset_index(drop=True)

In [5]:
# Attach the free-text report to every row (read_report is called once per row)
df['report'] = df.apply(
    utils.read_report,
    axis=1,
    path=f"{path}reports/",
)

In [6]:
# Load the gold-standard annotations for the reports currently in df
concepts = utils.read_annotations(df, path=f"{path}annotations/")

# Preview the first rows
concepts.head()

Unnamed: 0,histopathology_id,patient_id,report_no,concept_id,concept,phrase,start_char,end_char
0,658,13,1,T2,Invasiveness,intravascular spaces,669,689
1,658,13,1,T4,Stain,PAS,715,718
2,658,13,1,T5,Stain,GMS,723,726
3,658,13,1,T9,positive,positive,700,708
4,658,13,1,T3,FungalDescriptor,necrotic fungi,651,665


___
# Prepare gold standard annotations and reports
### Load tokeniser

In [7]:
# Load the small English spaCy model; the NER component is excluded, so it is not loaded at all
nlp = spacy.load("en_core_web_sm", exclude=['ner'])

# Enable the 'senter' pipeline component (spaCy's sentence recogniser)
nlp.enable_pipe('senter')

### Assign BIOES tags

In [8]:
# Tokenise each annotated phrase so single-token and multi-token entities can be told apart
concepts['doc'] = concepts['phrase'].apply(nlp.tokenizer)

# Derive the BIOES-tagged concepts from the tokenised annotations
concepts_bioes = crf_utils.assign_bioes_tags(concepts)

After assigning BIOES tags there are a total of 1366 concepts.


### Apply spacy pipeline to reports

In [9]:
# Run the spaCy pipeline over every report
df['doc'] = df['report'].apply(nlp)

### Extract features

In [10]:
# Build the feature map for each processed report
df['token_features'] = df['doc'].apply(crf_utils.create_features)

### Label concepts

In [11]:
# Build the labels for each report from the BIOES-tagged concepts (one call per row)
df['token_labels'] = df.apply(
    crf_utils.create_labels,
    axis=1,
    concepts=concepts_bioes,
)

___
# Model selection
### Hyperparameter tuning

In [12]:
# %%time
# # NOTE(review): grid search over c1/c2, kept commented out for reference.
# # Before re-enabling, import the metric used below:
# #     from sklearn_crfsuite.metrics import flat_f1_score
# # The target column is `y_report` (there is no `y` column in df).
# X = df.report
# y = df.y_report
# groups = df.patient_id

# param_space = {
#     'c1': [0.01, 0.1, 1, 10],
#     'c2': [0.01, 0.1, 1, 10],
# #     'all_possible_states': [False, True],
# #     'all_possible_transitions': [False, True],
# }

# best_params = None
# best_score = float('-inf')

# for c1 in param_space['c1']:
#     for c2 in param_space['c2']:

#         cv = utils.get_cv_strategy()
#         f1_score = []

#         for train_idx, val_idx in cv.split(X, y, groups):
            
#             # Initialise CRF object 
#             crf = CRF(algorithm='lbfgs', c1=c1, c2=c2)

#             # Train the model
#             crf.fit(df.loc[train_idx, 'token_features'], df.loc[train_idx, 'token_labels'])

#             # Make predictions on the validation fold
#             y_pred = crf.predict(df.loc[val_idx, 'token_features'])

#             # Calculate macro f1
#             f1_score.append(flat_f1_score(df.loc[val_idx, 'token_labels'], y_pred, average='macro'))
            
#         print("With parameters c1=%.2f and c2=%.2f, the model achieves %.2f (+/- %.2f)." % 
#               (c1, c2, np.mean(f1_score), np.std(f1_score)))

#         if np.mean(f1_score) > best_score:
#             best_score = np.mean(f1_score)
#             best_params = {'c1': c1, 'c2': c2}
                
                
# print("Best macro F1 score = %.2f. Best hyperparameter values: c1=%.2f and c2=%.2f." % 
#       (best_score, best_params['c1'], best_params['c2']))

### Model evaluation

In [13]:
# Inputs handed to the cross-validation splitter
X, y, groups = df.report, df.y_report, df.patient_id
cv = utils.get_cv_strategy()

# Start every report with an empty prediction list; each fold fills in its own rows
df['y_pred'] = [[] for _ in range(len(df))]

# NOTE(review): the fold indices are used as row labels with .loc - presumably they
# are positional, which only lines up because df's index was reset to 0..n-1 earlier.
for train_idx, val_idx in cv.split(X, y, groups):

    # A fresh CRF for every fold
    crf = CRF(
        algorithm='lbfgs',
        c1=0.01,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )

    # Fit on the training rows of this fold
    train_rows = df.loc[train_idx]
    crf.fit(train_rows['token_features'], train_rows['token_labels'])

    # Predict the held-out rows and store the result against the same row labels
    fold_pred = crf.predict(df.loc[val_idx, 'token_features'])
    df.loc[val_idx, 'y_pred'] = pd.Series(fold_pred, index=val_idx)

### Evaluate predictions

In [14]:
# Convert the stored predictions into a dataframe of detected concepts
detected_concepts = crf_utils.prediction2concept(df)

# Names of the concept categories to evaluate
feature_names = utils.get_feature_names('concepts')

# Precision and recall across the validation folds
eval_utils.evaluate_ner_cv(
    df[['histopathology_id', 'val_fold']],
    concepts,
    detected_concepts,
    feature_names,
)

                  mean   std
concept                     
ClinicalQuery     0.98  0.08
FungalDescriptor  0.93  0.06
Fungus            0.98  0.05
Invasiveness      0.67  0.58
Stain             0.96  0.06
SampleType        0.59  0.14
positive          0.73  0.37
equivocal          NaN   NaN
negative          0.89  0.09
                  mean   std
concept                     
ClinicalQuery     0.72  0.22
FungalDescriptor  0.80  0.11
Fungus            0.80  0.21
Invasiveness      0.13  0.21
Stain             0.92  0.09
SampleType        0.26  0.11
positive          0.20  0.21
equivocal         0.00  0.00
negative          0.58  0.12


### Evaluate predictions with BIOES tags

In [16]:
# Convert the stored predictions into a dataframe of detected concepts, keeping BIOES tags
detected_concepts = crf_utils.prediction2concept_bioes(df)

# Names of the concept categories, requested together with the four tag prefixes
bioes_prefixes = ('B-', 'I-', 'E-', 'S-')
feature_names = utils.get_feature_names('concepts', bioes_prefixes)

# Precision and recall across the validation folds, against the BIOES-tagged gold standard
eval_utils.evaluate_ner_cv(
    df[['histopathology_id', 'val_fold']],
    concepts_bioes,
    detected_concepts,
    feature_names,
)

                    mean   std
concept                       
B-ClinicalQuery     0.86  0.21
I-ClinicalQuery     0.60  0.27
E-ClinicalQuery     0.77  0.28
S-ClinicalQuery     0.50  0.71
B-FungalDescriptor  0.87  0.09
I-FungalDescriptor  0.83  0.41
E-FungalDescriptor  0.92  0.07
S-FungalDescriptor  0.81  0.18
B-Fungus            1.00  0.00
I-Fungus             NaN   NaN
E-Fungus            1.00  0.00
S-Fungus            0.97  0.07
B-Invasiveness      0.50  0.71
I-Invasiveness      0.33  0.58
E-Invasiveness      0.67  0.58
S-Invasiveness       NaN   NaN
B-Stain             1.00   NaN
I-Stain              NaN   NaN
E-Stain             1.00   NaN
S-Stain             0.92  0.08
B-SampleType        0.68  0.31
I-SampleType         NaN   NaN
E-SampleType        0.68  0.31
S-SampleType        0.59  0.20
B-positive          1.00  0.00
I-positive           NaN   NaN
E-positive          1.00  0.00
S-positive          0.62  0.44
B-equivocal          NaN   NaN
I-equivocal          NaN   NaN
E-equivo

___
# Test
### Train CRF on the full development set

In [17]:
# Fit the final CRF on the whole development set, with the same settings that were
# used in cross-validation.
crf = CRF(algorithm='lbfgs', c1=0.01, c2=0.1, max_iterations=100, all_possible_transitions=True)

# NOTE(review): these statements used to be wrapped in `for _ in range(1):` because
# they reportedly "do not work outside of a for loop". The only thing such a loop
# changes is that Jupyter no longer displays the value of the cell's last expression,
# so the failure presumably came from rendering the object returned by fit() -
# TODO confirm. The trailing semicolon suppresses that display without a dummy loop.
crf.fit(df.token_features, df.token_labels);

### Load and prepare test data, run the model

In [18]:
# Attach the free-text report to every test row (read_report is called once per row)
df_test['report'] = df_test.apply(
    utils.read_report,
    axis=1,
    path=f"{path}reports/",
)

# Run the spaCy pipeline over the test reports
df_test['doc'] = df_test['report'].apply(nlp)

# Build the feature map for each processed test report
df_test['token_features'] = df_test['doc'].apply(crf_utils.create_features)

# Predict with the CRF fitted on the full development set
df_test['y_pred'] = crf.predict(df_test['token_features'])

### Load and prepare gold standard concepts, evaluate predictions

In [19]:
# Load the gold-standard annotations for the test reports.
# NOTE: this rebinds `concepts`, which held the development-set annotations until now.
concepts = utils.read_annotations(df_test, path=f"{path}annotations/")

# Convert the test predictions into a dataframe of detected concepts
detected_concepts = crf_utils.prediction2concept(df_test)

# Names of the concept categories to evaluate
feature_names = utils.get_feature_names('concepts')

# Precision and recall on the test set
eval_utils.evaluate_ner(
    df_test['histopathology_id'],
    concepts,
    detected_concepts,
    feature_names,
)

concept
ClinicalQuery       1.00
FungalDescriptor    0.93
Fungus              0.93
Invasiveness         NaN
Stain               0.97
SampleType          0.50
positive            1.00
equivocal            NaN
negative            0.90
dtype: float64

concept
ClinicalQuery       1.00
FungalDescriptor    0.86
Fungus              0.78
Invasiveness        0.00
Stain               0.94
SampleType          0.33
positive            0.20
equivocal            NaN
negative            0.35
dtype: float64
