In [1]:
import numpy as np
import pandas as pd
import spacy

import utils, dict_utils, eval_utils

import warnings
warnings.filterwarnings("ignore")

___
# Load CHIFIR data

In [2]:
# Path to the CHIFIR dataset root (relative path).
# Keep the trailing slash: the cells below build file and folder locations
# from it by plain string concatenation (path + "...").
path = "../../../Data/CHIFIR/"

In [3]:
# Read the report metadata table and preview its first rows
df = pd.read_csv(f"{path}chifir_metadata.csv")
df.head()

Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset
0,13,1,Positive,658,10.0,development
1,14,1,Positive,189,7.0,development
2,28,1,Negative,529,8.0,development
3,28,2,Positive,325,8.0,development
4,28,3,Negative,559,8.0,development


In [4]:
# Hold out the test reports up front to prevent data leakage
is_test = df['dataset'] == 'test'
df_test = df.loc[is_test].copy()               # keeps the original row labels
df = df.loc[~is_test].reset_index(drop=True)   # remaining rows, renumbered from 0

In [5]:
# Attach the free-text of every report as a new column (one read per row)
df['report'] = df.apply(lambda row: utils.read_report(row, path=f"{path}reports/"), axis=1)

In [6]:
# Read the gold standard annotations from the annotation files and preview them
concepts = utils.read_annotations(df, path=f"{path}annotations/")
concepts.head()

Unnamed: 0,histopathology_id,patient_id,report_no,concept_id,concept,phrase,start_char,end_char
0,658,13,1,T2,Invasiveness,intravascular spaces,669,689
1,658,13,1,T4,Stain,PAS,715,718
2,658,13,1,T5,Stain,GMS,723,726
3,658,13,1,T9,positive,positive,700,708
4,658,13,1,T3,FungalDescriptor,necrotic fungi,651,665


___
# Prepare gold standard annotations and reports
### Clean text

In [7]:
# Run the text pre-processing routine over every report
df['clean_text'] = df['report'].map(dict_utils.clean_text)

### Load tokenizer

In [8]:
# Load the en_core_web_sm spaCy pipeline; the 'ner' component is passed to
# `exclude`, so the pipeline is built without an NER step.
nlp = spacy.load("en_core_web_sm", exclude=['ner'])

### Learn dictionary of concepts

In [9]:
# Inputs handed to the cross-validation splitter: cleaned text, report-level
# labels, and patient ids as the grouping variable.
X = df.clean_text
y = df.y_report
groups = df.patient_id

cv = utils.get_cv_strategy()

for train_idx, val_idx in cv.split(X, y, groups):

    # cv.split presumably follows the scikit-learn convention of yielding
    # *positional* row indices (TODO confirm in utils.get_cv_strategy).
    # Translate them to index labels before using label-based .loc, so the
    # selection does not silently depend on df having a 0..n-1 index.
    train_labels = df.index[train_idx]
    val_labels = df.index[val_idx]

    # Create vocabulary from the gold standard annotations of the training reports only
    vocab = dict_utils.create_vocab(df.loc[train_labels, 'histopathology_id'], concepts, expand=True)

    # Match words in text to detect concepts in the held-out reports of this fold
    df.loc[val_labels, 'doc'] = dict_utils.detect_concepts(df.loc[val_labels, 'clean_text'], nlp, vocab)

### Adjust positions of gold standard annotations

In [10]:
# Map character positions before and after text pre-processing
df['pos_mapping'] = df.report.apply(dict_utils.clean_text, mode='map positions')

# Run the merge + adjustment only once per freshly loaded `concepts` frame.
# Re-running this cell would otherwise merge 'pos_mapping' a second time
# (pandas then renames the clashing columns with _x/_y suffixes) and apply the
# position adjustment to annotations that have already been adjusted.
if 'pos_mapping' not in concepts.columns:

    # Add information about position changes
    concepts = concepts.merge(df[['histopathology_id', 'pos_mapping']], on='histopathology_id')

    # Adjust character positions; each row yields a (start_char, end_char) pair
    concepts[['start_char', 'end_char']] = pd.DataFrame(
        concepts.apply(dict_utils.adjust_position, axis=1).tolist(),
        index=concepts.index,
    )

### Evaluate predictions

In [11]:
# Convert the predictions stored in df into a dataframe of concepts
detected_concepts = dict_utils.doc2concepts(df)

# Feature names requested for 'concepts'
feature_names = utils.get_feature_names('concepts')

# Precision and recall, evaluated with the report ids and their validation folds
eval_utils.evaluate_ner_cv(
    df.loc[:, ['histopathology_id', 'val_fold']],
    concepts,
    detected_concepts,
    feature_names,
)

                  mean   std
concept                     
ClinicalQuery     0.98  0.08
FungalDescriptor  0.73  0.10
Fungus            0.89  0.09
Invasiveness      0.27  0.41
Stain             0.95  0.06
SampleType        0.14  0.03
positive          0.04  0.03
equivocal         0.01  0.01
negative          0.11  0.03
                  mean   std
concept                     
ClinicalQuery     0.49  0.22
FungalDescriptor  0.93  0.05
Fungus            0.93  0.14
Invasiveness      0.35  0.42
Stain             0.96  0.07
SampleType        0.85  0.13
positive          0.73  0.30
equivocal         0.50  0.50
negative          0.97  0.07


___
# Test
### Learn vocabulary from the full development set

In [12]:
# Build the vocabulary from all reports left in df (the development set)
vocab = dict_utils.create_vocab(df['histopathology_id'], concepts, expand=True)

### Load and prepare test data, run the model

In [13]:
# Attach the free-text of every test report (one read per row)
df_test['report'] = df_test.apply(lambda row: utils.read_report(row, path=f"{path}reports/"), axis=1)

# Same text pre-processing as applied to the development reports
df_test['clean_text'] = df_test['report'].map(dict_utils.clean_text)

# Dictionary matching against the vocabulary currently bound to `vocab`
df_test['doc'] = dict_utils.detect_concepts(df_test['clean_text'], nlp, vocab)

### Load and prepare gold standard concepts, evaluate predictions

In [14]:
# NOTE: this cell rebinds `concepts` and `detected_concepts`, replacing the
# development-set frames created earlier. Re-run the earlier annotation cells
# before re-running the cell that learns the vocabulary.

# Gold standard annotations of the test reports
concepts = utils.read_annotations(df_test, path=f"{path}annotations/")

# Character position mapping between raw and pre-processed text
df_test['pos_mapping'] = df_test['report'].apply(dict_utils.clean_text, mode='map positions')

# Bring the mapping of each report next to its annotations
concepts = concepts.merge(df_test.loc[:, ['histopathology_id', 'pos_mapping']], on='histopathology_id')

# Shift the annotated spans; each row yields a (start_char, end_char) pair
adjusted_positions = concepts.apply(dict_utils.adjust_position, axis=1).tolist()
concepts[['start_char', 'end_char']] = pd.DataFrame(adjusted_positions, index=concepts.index)


# Convert the predictions stored in df_test into a dataframe of concepts
detected_concepts = dict_utils.doc2concepts(df_test)

# Feature names requested for 'concepts'
feature_names = utils.get_feature_names('concepts')


# Precision and recall on the test reports
eval_utils.evaluate_ner(
    df_test['histopathology_id'],
    concepts,
    detected_concepts,
    feature_names,
)

concept
ClinicalQuery       0.83
FungalDescriptor    0.80
Fungus              0.94
Invasiveness        0.33
Stain               0.94
SampleType          0.13
positive            0.04
equivocal           0.00
negative            0.10
dtype: float64

concept
ClinicalQuery       0.71
FungalDescriptor    0.98
Fungus              0.94
Invasiveness        0.33
Stain               0.97
SampleType          0.72
positive            0.80
equivocal            NaN
negative            1.00
dtype: float64
