In [1]:
import numpy as np
import pandas as pd
import re
from utils import get_filename, read_report

___
# Extract gold standard annotations
### Load metadata

In [2]:
# Root directory of the dataset, relative to this notebook. Later cells build
# file paths by plain string concatenation, so the trailing slash is required
path = "../../../Data/CHIFIR/"

# Report-level metadata; each row is later used to locate one annotation file
# via its patient_id and report_no
df = pd.read_csv(path + "chifir_metadata.csv")
# Show the table dimensions followed by the first few rows
print(df.shape)
df.head()

(283, 6)


Unnamed: 0,patient_id,report_no,y_report,histopathology_id,val_fold,dataset
0,13,1,Positive,658,10.0,development
1,14,1,Positive,189,7.0,development
2,28,1,Negative,529,8.0,development
3,28,2,Positive,325,8.0,development
4,28,3,Negative,559,8.0,development


### Parse annotation files

In [3]:
# Column layout of the two annotation tables
concept_columns = ['histopathology_id', 'patient_id', 'report_no',
                   'concept_id', 'concept', 'phrase', 'position', 'start_char', 'end_char']
relation_columns = ['histopathology_id', 'patient_id', 'report_no',
                    'relation_id', 'relation', 'arg1', 'arg2']

# Collect one dict per annotation and build each dataframe once at the end.
# Calling pd.concat inside the loop re-copies the whole table for every
# annotation (quadratic) and concatenating onto an empty frame is deprecated.
concept_records = []
relation_records = []

for _, x in df.iterrows():
    # Define filename
    filename = get_filename(x.patient_id, x.report_no, file_format='ann')

    # Identifiers shared by every annotation found in this report
    report_keys = {
        'histopathology_id': x.histopathology_id,
        'patient_id': x.patient_id,
        'report_no': x.report_no,
    }

    # Open the annotation file and loop over its lines
    with open(path + "annotations/" + filename, 'r') as f:
        for line in f:

            # Concept line: "<id>\t<category> <position>\t<text>"
            if line.startswith("T"):
                substrings = line.strip().split('\t')
                concept, position = substrings[1].split(maxsplit=1)

                # For a discontinuous concept ("s1 e1;s2 e2") this keeps the
                # offsets of the last fragment only; the full position string
                # is stored as well so the fragments can be separated later
                start_char, end_char = re.split(' |;', position)[-2:]

                concept_records.append({
                    **report_keys,
                    'concept_id': substrings[0],
                    'concept': concept,
                    'phrase': substrings[2],
                    'position': position,
                    'start_char': int(start_char),
                    'end_char': int(end_char),
                })

            # Relation line: "<id> <type> <name>:<concept id> <name>:<concept id>"
            elif line.startswith("R"):
                substrings = line.strip().split()

                relation_records.append({
                    **report_keys,
                    'relation_id': substrings[0],
                    'relation': substrings[1],
                    # Keep only the concept ID that follows the colon
                    'arg1': substrings[2].split(':')[1],
                    'arg2': substrings[3].split(':')[1],
                })

            # Any other line type is ignored

# Build the tables in one go; `columns` fixes the column order and keeps the
# expected columns even when no annotation was found
concepts = pd.DataFrame(concept_records, columns=concept_columns)
relations = pd.DataFrame(relation_records, columns=relation_columns)

# Convert patient ID and report number to int
concepts[['patient_id', 'report_no']] = concepts[['patient_id', 'report_no']].astype(int)
relations[['patient_id', 'report_no']] = relations[['patient_id', 'report_no']].astype(int)

print("Extracted %d concepts and %d relations." % (concepts.shape[0], relations.shape[0]))

Extracted 1137 concepts and 606 relations.


### Separate discontinuous concepts

In [4]:
# Discontinuous concepts have ";"-separated positions, e.g. "10 15;20 27"
is_discont = concepts.position.str.contains(";", regex=False)

# Split discontinuous concepts into a separate dataframe. The same boolean mask
# selects both halves, so the split does not depend on the index being a
# default 0..n-1 range (feeding index labels to .iloc would silently pick the
# wrong rows otherwise) and no frame is mutated in place.
discont = concepts[is_discont].copy()
concepts = concepts[~is_discont].copy()

In [5]:
# Turn every discontinuous concept into one row per fragment. Rows are
# collected in a list and appended with a single concat instead of one concat
# per fragment.
span_rows = []

for _, row in discont.iterrows():
    spans = []
    # Offset of the current fragment inside the stored phrase
    offset = 0
    for pos in row.position.split(';'):
        # Extract start and end char positions
        start_char, end_char = map(int, pos.split())
        # Calculate span length
        len_span = end_char - start_char
        # Extract span text
        spans.append((start_char, end_char, row.phrase[offset:offset + len_span]))
        # Skip one extra character: this assumes the fragments are joined by a
        # single separator character in the stored phrase (TODO confirm against
        # the annotation tool's output format)
        offset += len_span + 1

    # Emit the fragments ordered by starting position
    for start_char, end_char, phrase in sorted(spans, key=lambda span: span[0]):
        span_row = row.copy()
        span_row['start_char'] = start_char
        span_row['end_char'] = end_char
        span_row['phrase'] = phrase
        span_rows.append(span_row)

# Append extracted spans to the dataframe with gold standard concepts
if span_rows:
    concepts = pd.concat([concepts, pd.DataFrame(span_rows)], axis=0, ignore_index=True)

# Remove position column
concepts = concepts.drop(columns='position')
concepts.shape

(1155, 8)

### Preceding and following termsets

In [6]:
def assign_termset(x):
    """
    Tell whether a cue comes before and/or after the concepts it is related to.

    Reads the notebook-level `relations` and `concepts` tables. The related
    concepts are the Arg2 of every relation in the same report whose Arg1 is
    this cue.

    Parameters
    ----------
    x : row of `concepts` with `histopathology_id`, `concept_id` and `start_char`

    Returns
    -------
    tuple of two booleans
        First: the cue starts before at least one related concept.
        Second: the cue starts after at least one related concept.
        Both are False when the cue has no related concept.
    """
    # IDs of the concepts this cue points to
    rel_in_report = relations.histopathology_id == x.histopathology_id
    rel_from_cue = relations.arg1 == x.concept_id
    target_ids = relations.loc[rel_in_report & rel_from_cue, 'arg2']

    # Start offsets of those concepts within the same report
    con_in_report = concepts.histopathology_id == x.histopathology_id
    con_is_target = concepts.concept_id.isin(target_ids)
    target_starts = concepts.loc[con_in_report & con_is_target, 'start_char']

    cue_before_target = (target_starts > x.start_char).any()
    cue_after_target = (target_starts < x.start_char).any()
    return cue_before_target, cue_after_target

# The order is only evaluated for positive and negative cues
is_cue = concepts.concept.isin(['positive', 'negative'])
cues = concepts[is_cue]

# One (preceding, following) pair per cue; rows that are not cues are left
# without a value because the assignment aligns on the cue index
termset_flags = cues.apply(assign_termset, axis=1).tolist()
concepts[['preceding', 'following']] = pd.DataFrame(termset_flags, index=cues.index)

### Save the extracted concepts and relations

In [7]:
# Write both gold standard tables to CSV without the index column
for _table, _target in ((concepts, "../datasets/gold_concepts.csv"),
                        (relations, "../datasets/gold_relations.csv")):
    _table.to_csv(_target, index=False)

### Create composite concepts

In [8]:
def add_composite_concepts(concepts, relation, composite_name, relations_df=None):
    """
    Append one composite concept for every relation of the given type.

    For each matching relation, the concept referenced by its Arg2 is copied
    into a new row whose category is `composite_name` followed by the original
    category (e.g. 'affirmed' + 'FungalDescriptor') and whose concept ID is the
    next unused 'T<n>' within the same report.

    Parameters
    ----------
    concepts : pd.DataFrame
        Concept table with at least the columns histopathology_id, concept_id,
        concept, phrase, start_char and end_char. It is not modified.
    relation : str
        Relation type to process, e.g. 'positive-rel'.
    composite_name : str
        Prefix of the composite category, e.g. 'affirmed'.
    relations_df : pd.DataFrame, optional
        Relation table with the columns histopathology_id, patient_id,
        report_no, relation and arg2. Defaults to the notebook-level
        `relations` table.

    Returns
    -------
    pd.DataFrame
        `concepts` followed by the new composite rows, with a fresh 0..n-1
        index. When no relation matches, `concepts` itself is returned.

    Raises
    ------
    IndexError
        If the Arg2 of a relation is not present in `concepts`.
    """
    rel_table = relations if relations_df is None else relations_df

    # Next vacant numeric concept ID per report, computed once per report and
    # then incremented for every composite added to that report
    next_ids = {}
    records = []

    # Loop over the relations of the requested type
    for _, rel in rel_table[rel_table.relation == relation].iterrows():
        in_report = concepts.histopathology_id == rel.histopathology_id

        # Determine the object (Arg2) of the relation. If several rows share
        # that concept ID, only the first one is used.
        targets = concepts[in_report & (concepts.concept_id == rel.arg2)]
        if targets.empty:
            raise IndexError("Concept %s (Arg2 of a %s relation) not found in report %s"
                             % (rel.arg2, relation, rel.histopathology_id))
        target = targets.iloc[0]

        # Concept IDs look like 'T<n>': strip the letter to get the number
        if rel.histopathology_id not in next_ids:
            used_numbers = concepts.loc[in_report, 'concept_id'].str[1:].astype(int)
            next_ids[rel.histopathology_id] = int(used_numbers.max()) + 1
        next_id = next_ids[rel.histopathology_id]
        next_ids[rel.histopathology_id] = next_id + 1

        # Entry with the new concept ID, composite category, position and raw text
        records.append({
            'histopathology_id': rel.histopathology_id,
            'patient_id': rel.patient_id,
            'report_no': rel.report_no,
            'concept_id': 'T' + str(next_id),
            'concept': composite_name + target.concept,
            'phrase': target.phrase,
            'start_char': target.start_char,
            'end_char': target.end_char,
        })

    if not records:
        return concepts

    # Add all composites to the table of concepts with a single concat
    return pd.concat([concepts, pd.DataFrame(records)], axis=0, ignore_index=True)

# The argument of a positive relation is added to the table of concepts as an
# "affirmed" concept, the argument of a negative relation as a "negated" one
for rel_type, prefix in (('positive-rel', 'affirmed'), ('negative-rel', 'negated')):
    concepts = add_composite_concepts(concepts, rel_type, prefix)

# Keep a single row per report, category and start position, which removes
# duplicated composite concepts
concepts = concepts.drop_duplicates(subset=['histopathology_id', 'concept', 'start_char'],
                                    ignore_index=True)

print("Totalling %d concepts and composite concepts." % len(concepts))

Totalling 1497 concepts and composite concepts.


In [9]:
# Display the table of concepts together with the composite concepts added above
concepts

Unnamed: 0,histopathology_id,patient_id,report_no,concept_id,concept,phrase,start_char,end_char,preceding,following
0,658,13,1,T2,Invasiveness,intravascular spaces,669,689,,
1,658,13,1,T4,Stain,PAS,715,718,,
2,658,13,1,T5,Stain,GMS,723,726,,
3,658,13,1,T9,positive,positive,700,708,False,True
4,658,13,1,T3,FungalDescriptor,necrotic fungi,651,665,,
...,...,...,...,...,...,...,...,...,...,...
1492,884,176,2,T10,negatedFungalDescriptor,fungal elements,583,598,,
1493,884,176,2,T11,negatedFungalDescriptor,fungal elements,1073,1088,,
1494,884,176,2,T12,negatedFungalDescriptor,fungal elements,1564,1579,,
1495,729,219,2,T5,negatedFungalDescriptor,fungi,1034,1039,,


### Save the extracted annotations

In [10]:
# Write the concept table, including the affirmed/negated composite concepts, to CSV without the index column
concepts.to_csv("../datasets/gold_composite.csv", index=False)

___
# Prepare datasets
### Load reports

In [11]:
# Call `read_report` (imported from utils) on every metadata row and store its
# return value in `order_results` (presumably the report text read from the
# "reports/" folder; confirm against utils.read_report)
df['order_results'] = df.apply(read_report, path=path + "reports/", axis=1)

### Convert report labels to `int`

In [12]:
# Binary label: 1 where the report label is exactly "Positive", 0 for any other value
df['y'] = np.where(df.y_report=="Positive", 1, 0)

In [13]:
# Write the development and test partitions to separate CSV files
for _split, _target in (('development', "../datasets/reports_dev.csv"),
                        ('test', "../datasets/reports_test.csv")):
    df[df.dataset == _split].to_csv(_target, index=False)