In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np
import pandas as pd
import re
import spacy

from sklearn.model_selection import StratifiedGroupKFold
from sklearn_crfsuite import CRF

from utils import *
from evalutils import *

import warnings
warnings.filterwarnings("ignore")

# Parse BERT prediction files

In [3]:
# Report-level metadata, restricted to the rows whose dataset is 'test'
df = (
    pd.read_csv("../../../data/chifir_metadata.csv")
    .loc[lambda meta: meta.dataset == 'test']
)
print(df.shape)
df.head()



(52, 6)


Unnamed: 0,histopathology_id,patient_id,report_no,y_report,dataset,val_fold
231,214,2,1,Positive,test,
232,127,23,1,Positive,test,
233,833,25,1,Positive,test,
234,194,38,1,Positive,test,
235,649,48,1,Positive,test,


In [4]:
FOLDER='../../datasets/predictions/'

# Parse the predicted annotation file ('ann' format) of every test report.
#
# Rows are collected in plain lists and turned into dataframes once, after the
# loop: growing a dataframe with pd.concat on every annotation line copies the
# whole frame each time (quadratic) and concatenating onto an empty frame is
# deprecated in recent pandas versions.
concept_rows = []
relation_rows = []

for _, x in df.iterrows():
    # Define filename
    filename = get_filename(x.patient_id, x.report_no, file_format='ann')

    # Open and read annotation file
    with open(FOLDER + filename, 'r') as f:
        annotation = f.readlines()

    # Loop over each line of the annotation file (no-op for an empty file)
    for line in annotation:

        # Concept line: "<id>\t<category> <position>\t<text>"
        if line.startswith("T"):
            substrings = line.strip().split('\t')
            concept_id = substrings[0]
            concept, position = substrings[1].split(maxsplit=1)
            # For ';'-separated (discontinuous) positions this keeps the offsets
            # of the last fragment; such rows are expanded in a later cell
            start_char, end_char = re.split(' |;', position)[-2:]
            text = substrings[2]

            concept_rows.append({
                'histopathology_id': x.histopathology_id,
                'patient_id': x.patient_id,
                'report_no': x.report_no,
                'concept_id': concept_id,
                'concept': concept,
                'phrase': text,
                'position': position,
                'start_char': int(start_char),
                'end_char': int(end_char),
            })

        # Relation line: "<id> <type> <role>:<arg1 id> <role>:<arg2 id>"
        elif line.startswith("R"):
            substrings = line.strip().split()

            relation_rows.append({
                'histopathology_id': x.histopathology_id,
                'patient_id': x.patient_id,
                'report_no': x.report_no,
                'relation_id': substrings[0],
                'relation': substrings[1],
                'arg1': substrings[2].split(':')[1],
                'arg2': substrings[3].split(':')[1],
            })

# Build the annotation tables; passing `columns` keeps the schema (and column
# order) intact even when no concept or relation was found
concepts = pd.DataFrame(concept_rows,
                        columns=['histopathology_id', 'patient_id', 'report_no',
                                 'concept_id', 'concept', 'phrase', 'position',
                                 'start_char', 'end_char'])
relations = pd.DataFrame(relation_rows,
                         columns=['histopathology_id', 'patient_id', 'report_no',
                                  'relation_id', 'relation', 'arg1', 'arg2'])

# Convert patient ID and report number to int
concepts[['patient_id', 'report_no']] = concepts[['patient_id', 'report_no']].astype(int)
relations[['patient_id', 'report_no']] = relations[['patient_id', 'report_no']].astype(int)

print("Extracted %d concepts and %d relations." % (concepts.shape[0], relations.shape[0]))

Extracted 191 concepts and 0 relations.


In [5]:
# Discontinuous concepts have ';'-separated positions
idx = concepts.index[concepts.position.str.contains(";")]

# Split discontinuous concepts into a separate dataframe.
# `idx` holds index *labels*, so the rows are selected with .loc (label-based);
# .iloc would only pick the right rows while the index is a default RangeIndex.
discont = concepts.loc[idx].copy()

# Rebind instead of dropping in place so the cell does not mutate its input
concepts = concepts.drop(index=idx)

In [6]:
# Expand every discontinuous concept into one row per contiguous span.
# The rows are gathered in a list and appended with a single concat instead of
# re-allocating `concepts` once per span.
span_rows = []

for _, x in discont.iterrows():
    spans = []
    i = 0  # read offset into x.phrase
    for pos in x.position.split(';'):
        # Extract start and end char positions
        start_char, end_char = map(int, pos.split())
        # Calculate span length
        len_span = end_char - start_char
        # Extract span text
        phrase = x.phrase[i:i+len_span]
        # Add to list of spans
        spans.append((start_char, end_char, phrase))
        # Skip the single character that separates two fragments in the phrase
        i = i + len_span + 1

    # Emit the spans ordered by starting position; each new row is a copy of
    # the original concept with its offsets and text replaced
    for start_char, end_char, phrase in sorted(spans, key=lambda span: span[0]):
        tmp = x.copy()
        tmp['start_char'] = start_char
        tmp['end_char'] = end_char
        tmp['phrase'] = phrase
        span_rows.append(tmp)

# Append extracted spans to the dataframe of concepts
if span_rows:
    concepts = pd.concat([concepts, pd.DataFrame(span_rows)], axis=0, ignore_index=True)

# Remove position column
concepts = concepts.drop(columns='position')
concepts.shape

(191, 8)

In [7]:
def assign_termset(x):
    """Tell whether a concept starts before and/or after the concepts it is related to.

    Looks up, in the module-level ``relations`` table, every relation of the
    same report whose ``arg1`` is this concept, then compares this concept's
    ``start_char`` with the ``start_char`` of the ``arg2`` concepts found in
    the module-level ``concepts`` table.

    Parameters
    ----------
    x : row with ``histopathology_id``, ``concept_id`` and ``start_char``.

    Returns
    -------
    tuple of (bool, bool)
        ``(starts before any related concept, starts after any related concept)``.
        Both are False when the concept has no related concepts.
    """
    # Relations of this report in which the concept is the first argument
    in_report_rel = relations.histopathology_id == x.histopathology_id
    is_arg1 = relations.arg1 == x.concept_id
    linked_ids = relations.loc[in_report_rel & is_arg1, 'arg2']

    # Start offsets of the concepts on the other side of those relations
    in_report_con = concepts.histopathology_id == x.histopathology_id
    is_linked = concepts.concept_id.isin(linked_ids)
    linked_starts = concepts.loc[in_report_con & is_linked, 'start_char']

    starts_before = (x.start_char < linked_starts).any()
    starts_after = (x.start_char > linked_starts).any()
    return starts_before, starts_after

# Only check order for positive and negative cues
cues = concepts[concepts.concept.isin(['positive', 'negative'])]

# Determine if a cue is preceding and/or following.
# The rows are evaluated with an explicit loop: DataFrame.apply(..., axis=1)
# on a frame without rows can hand back a DataFrame, which has no .tolist(),
# so the original expression fails when no cue was predicted.
cue_flags = [assign_termset(cue) for _, cue in cues.iterrows()]

# Assignment aligns on the index, so concepts that are not cues get NaN
concepts[['preceding', 'following']] = pd.DataFrame(cue_flags,
                                                    columns=['preceding', 'following'],
                                                    index=cues.index)

In [8]:
# Persist the parsed predicted concepts (including the preceding/following
# flags) without the index; the file is read back when the predicted concepts
# are loaded for evaluation
concepts.to_csv("../../datasets/bert_pred_concepts.csv", index=False)


# Get gold-standard concepts

In [9]:
# Gold-standard concept annotations, loaded without the concept identifier
# and the preceding/following flags
# (alternative location used previously: "../tmp/gold_concepts.csv")
true_concepts = (
    pd.read_csv("../../datasets/gold_concepts.csv")
    .drop(columns=['concept_id', 'preceding', 'following'])
)
print(true_concepts.shape)
true_concepts.head()

(1155, 7)


Unnamed: 0,histopathology_id,patient_id,report_no,concept,phrase,start_char,end_char
0,658,13,1,Invasiveness,intravascular spaces,669,689
1,658,13,1,Stain,PAS,715,718
2,658,13,1,Stain,GMS,723,726
3,658,13,1,positive,positive,700,708
4,658,13,1,FungalDescriptor,necrotic fungi,651,665


In [10]:
# spaCy English pipeline, loaded without its NER component
nlp = spacy.load("en_core_web_sm", exclude=['ner'])

# Tokenise every gold-standard phrase; only the tokeniser is invoked, the
# remaining pipeline components are not run
true_concepts['doc'] = true_concepts.phrase.map(nlp.tokenizer)

In [11]:
# Start the table of BIOES-tagged gold concepts with the single-token entities.
# (The empty dataframe that used to be created first was overwritten straight
# away, so it has been removed.)
is_single_token = true_concepts.doc.apply(len) == 1

true_concepts_bioes = (
    true_concepts[is_single_token]
    # The tokenised doc is only needed to count tokens
    .drop(columns='doc')
    # Add the "S" tag
    .assign(concept=lambda single: "S-" + single.concept)
)

In [12]:
# Multi-token entities: one row per non-whitespace token, tagged B-/I-/E-.
#
# Rows are accumulated in a list and appended with a single concat instead of
# re-allocating the table once per token. The phrase is stored as the token's
# text so the column holds plain strings, like the single-token rows, rather
# than a mix of str and spaCy Token objects.
multi_token_rows = []

for _, x in true_concepts[true_concepts.doc.apply(len) > 1].iterrows():
    last_position = len(x.doc) - 1

    # Loop over tokens
    for token in x.doc:

        # Skip if whitespace
        if token.is_space:
            continue

        # NOTE(review): positions are counted over all tokens, including the
        # skipped whitespace ones, so a phrase that starts or ends with a
        # whitespace token gets no "B-" / "E-" tag - confirm this is intended.
        if token.i == 0:
            prefix = "B-"    # first token
        elif token.i == last_position:
            prefix = "E-"    # last token
        else:
            prefix = "I-"    # in the middle

        # Adjust start char position: token.idx is the token's offset within
        # the phrase, x.start_char is the phrase's offset within the report
        start_char = x.start_char + token.idx

        multi_token_rows.append({
            'histopathology_id': x.histopathology_id,
            'patient_id': x.patient_id,
            'report_no': x.report_no,
            'concept': prefix + x.concept,
            'phrase': token.text,
            'start_char': start_char,
            'end_char': start_char + len(token),
        })

# Add to the table of concepts
if multi_token_rows:
    true_concepts_bioes = pd.concat([true_concepts_bioes, pd.DataFrame(multi_token_rows)],
                                    axis=0, ignore_index=True)

# Sort BIOES tagged concepts
true_concepts_bioes = true_concepts_bioes.sort_values(by=['histopathology_id', 'start_char'])
true_concepts_bioes.shape

(1649, 7)

# Get predicted concepts

In [13]:
# Predicted concepts (the file written from the parsed BERT predictions),
# loaded without the concept identifier and the preceding/following flags
pred_concepts = (
    pd.read_csv("../../datasets/bert_pred_concepts.csv")
    .drop(columns=['concept_id', 'preceding', 'following'])
)
print(pred_concepts.shape)
pred_concepts.head()

(191, 7)


Unnamed: 0,histopathology_id,patient_id,report_no,concept,phrase,start_char,end_char
0,214,2,1,FungalDescriptor,Fungal elements,1374,1389
1,214,2,1,positive,identified,1394,1404
2,214,2,1,Stain,Grocott,1412,1419
3,214,2,1,FungalDescriptor,Fungal elements,1593,1608
4,127,23,1,Stain,Grocott,345,352


In [14]:
# Apply tokeniser to the predicted annotations.
# The spaCy pipeline `nlp` loaded for the gold-standard concepts (same model,
# NER excluded) is reused here instead of loading an identical copy again.
pred_concepts['doc'] = pred_concepts.phrase.apply(nlp.tokenizer)

In [15]:
# Start the table of BIOES-tagged predicted concepts with the single-token
# entities. (The empty dataframe that used to be created first was overwritten
# straight away, so it has been removed.)
is_single_token = pred_concepts.doc.apply(len) == 1

pred_concepts_bioes = (
    pred_concepts[is_single_token]
    # The tokenised doc is only needed to count tokens
    .drop(columns='doc')
    # Add the "S" tag
    .assign(concept=lambda single: "S-" + single.concept)
)

In [16]:
# Multi-token entities: one row per non-whitespace token, tagged B-/I-/E-.
#
# Rows are accumulated in a list and appended with a single concat instead of
# re-allocating the table once per token. The phrase is stored as the token's
# text so the column holds plain strings, like the single-token rows, rather
# than a mix of str and spaCy Token objects.
multi_token_rows = []

for _, x in pred_concepts[pred_concepts.doc.apply(len) > 1].iterrows():
    last_position = len(x.doc) - 1

    # Loop over tokens
    for token in x.doc:

        # Skip if whitespace
        if token.is_space:
            continue

        # NOTE(review): positions are counted over all tokens, including the
        # skipped whitespace ones, so a phrase that starts or ends with a
        # whitespace token gets no "B-" / "E-" tag - confirm this is intended.
        if token.i == 0:
            prefix = "B-"    # first token
        elif token.i == last_position:
            prefix = "E-"    # last token
        else:
            prefix = "I-"    # in the middle

        # Adjust start char position: token.idx is the token's offset within
        # the phrase, x.start_char is the phrase's offset within the report
        start_char = x.start_char + token.idx

        multi_token_rows.append({
            'histopathology_id': x.histopathology_id,
            'patient_id': x.patient_id,
            'report_no': x.report_no,
            'concept': prefix + x.concept,
            'phrase': token.text,
            'start_char': start_char,
            'end_char': start_char + len(token),
        })

# Add to the table of concepts
if multi_token_rows:
    pred_concepts_bioes = pd.concat([pred_concepts_bioes, pd.DataFrame(multi_token_rows)],
                                    axis=0, ignore_index=True)

# Sort BIOES tagged concepts
pred_concepts_bioes = pred_concepts_bioes.sort_values(by=['histopathology_id', 'start_char'])
pred_concepts_bioes.shape

(246, 7)

In [17]:
# Load the test set of reports; its histopathology_id column is what gets
# passed to evaluate_ner as the first argument
df_test = pd.read_csv("../../datasets/reports_test.csv")
print(df_test.shape)
df_test.head()

(52, 8)


Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset,order_results,y
0,2,1,Positive,214,,test,"""URNO XXXXXXXX \nLab No XXXXXXXXX ...",1
1,23,1,Positive,127,,test,"""URNO XXXXXXXXX \nLab No XXXXXXXXX ...",1
2,25,1,Positive,833,,test,"""URNO XXXXXXXXX \nLab No XXXXXXXXX ...",1
3,38,1,Positive,194,,test,"""XXXXXXX F XXXXXXXXXX Report (XXXXXXXX)\nMACR...",1
4,48,1,Positive,649,,test,"""XXXXXXX \nM\nXXXXXXXXX\n \nReport (XXXXXXXX)\...",1


In [20]:
# Evaluate on composite (whole-phrase) entities: the untagged gold and
# predicted concept tables are compared for the test report ids
feature_names = get_feature_names('concepts')

test_report_ids = df_test.histopathology_id
evaluate_ner(test_report_ids, true_concepts, pred_concepts, feature_names)

concept
ClinicalQuery       0.500000
FungalDescriptor    0.978261
Fungus              0.894737
Invasiveness        0.500000
Stain               0.970588
SampleType          0.647059
positive            0.900000
equivocal                NaN
negative            0.888889
dtype: float64
concept
ClinicalQuery       0.571429
FungalDescriptor    0.900000
Fungus              0.944444
Invasiveness        0.166667
Stain               0.942857
SampleType          0.511628
positive            0.900000
equivocal                NaN
negative            0.923077
dtype: float64
