In [1]:
import os
import re

import numpy as np
import pandas as pd

import utils

___
# Extract gold standard annotations
### Load metadata

In [2]:
# Read the table of report IDs and labels, then show its size and first rows
metadata_csv = "../datasets/reports_metadata.csv"
df = pd.read_csv(metadata_csv)
print(df.shape)
df.head()

(283, 4)


Unnamed: 0,report_id,patient_id,report_no,y_report
0,754,1,1,Negative
1,214,2,1,Positive
2,125,3,1,Negative
3,859,4,1,Negative
4,381,5,1,Negative


### Parse annotation files

In [3]:
# Define the folder with annotation files
path = "../datasets/annotations/"

# Column layout of the two tables that store the annotations
concept_columns = ['report_id', 'patient_id', 'report_no',
                   'concept_id', 'concept', 'location', 'phrase']
relation_columns = ['report_id', 'patient_id', 'report_no',
                    'relation_id', 'relation', 'arg1', 'arg2']

# Collect one record per annotation and build each dataframe once at the end.
# Growing a dataframe with pd.concat inside the loop copies the whole table on
# every iteration and concatenates onto an initially empty frame.
concept_rows = []
relation_rows = []

for _, report in df.iterrows():
    # Define filename
    filename = utils.get_filename(report.patient_id, report.report_no, file_format='ann')

    # Read the annotation file
    with open(path + filename, 'r') as f:
        annotation = f.readlines()

    # Loop over each line of the annotation file (no-op for an empty file)
    for line in annotation:

        # Concept: "<id>\t<category> <location>\t<raw text>"
        if line.startswith("T"):
            substrings = line.strip().split('\t')
            # Only the first whitespace separates the category from the
            # location, so the location keeps any inner spaces
            category_and_location = substrings[1].split(maxsplit=1)

            concept_rows.append({
                'report_id': report.report_id,
                'patient_id': report.patient_id,
                'report_no': report.report_no,
                'concept_id': substrings[0],
                'concept': category_and_location[0],
                'location': category_and_location[1],
                'phrase': substrings[2]
            })

        # Relation: "<id> <type> <role>:<concept id> <role>:<concept id>"
        elif line.startswith("R"):
            substrings = line.strip().split()

            relation_rows.append({
                'report_id': report.report_id,
                'patient_id': report.patient_id,
                'report_no': report.report_no,
                'relation_id': substrings[0],
                'relation': substrings[1],
                # Keep only the concept ID that follows the role name
                'arg1': substrings[2].split(':')[1],
                'arg2': substrings[3].split(':')[1]
            })

# Create the tables of concepts and relations in a single step
concepts = pd.DataFrame(concept_rows, columns=concept_columns)
relations = pd.DataFrame(relation_rows, columns=relation_columns)

# Convert patient ID and report number to int
concepts[['patient_id', 'report_no']] = concepts[['patient_id', 'report_no']].astype(int)
relations[['patient_id', 'report_no']] = relations[['patient_id', 'report_no']].astype(int)

print("Extracted %d concepts and %d relations." % (concepts.shape[0], relations.shape[0]))

Extracted 1137 concepts and 606 relations.


### Save the extracted concepts and relations

In [4]:
# Write each extracted table to its own CSV file, without the row index
for table, csv_path in ((concepts, "../datasets/gold_concepts.csv"),
                        (relations, "../datasets/gold_relations.csv")):
    table.to_csv(csv_path, index=False)

### Create composite concepts

In [5]:
def add_composite_concepts(concepts, relation, composite_name, relations_df=None):
    """Append one composite concept per relation of the given type.

    For every relation whose type equals ``relation``, the concept referenced
    by the relation's second argument (``arg2``) is copied into a new row. The
    new row's category is ``composite_name`` prepended to the original category
    and its ID is the next unused ``T<number>`` within the same report.

    Parameters
    ----------
    concepts : pandas.DataFrame
        Table with the columns report_id, patient_id, report_no, concept_id,
        concept, location and phrase. It is not modified.
    relation : str
        Relation type to select, compared against the ``relation`` column.
    composite_name : str
        Prefix put in front of the category of the referenced concept.
    relations_df : pandas.DataFrame, optional
        Table with the columns report_id, patient_id, report_no, relation and
        arg2. When omitted, the notebook-level ``relations`` table is used.

    Returns
    -------
    pandas.DataFrame
        ``concepts`` followed by the new composite rows, with a fresh index.
        If no relation matches, ``concepts`` itself is returned.

    Raises
    ------
    IndexError
        If a selected relation refers to a concept ID that does not exist in
        its report.
    """
    if relations_df is None:
        relations_df = relations

    selected = relations_df[relations_df.relation == relation]
    if selected.empty:
        return concepts

    # Scan the concepts once instead of filtering the table for every relation.
    # Only reports that own a selected relation are indexed.
    involved_reports = set(selected.report_id)
    highest_id = {}   # report_id -> largest numeric concept ID seen so far
    targets = {}      # (report_id, concept_id) -> (category, location, phrase)
    for report_id, concept_id, category, location, phrase in zip(
            concepts.report_id, concepts.concept_id, concepts.concept,
            concepts.location, concepts.phrase):
        if report_id not in involved_reports:
            continue
        number = int(concept_id[1:])
        highest_id[report_id] = max(highest_id.get(report_id, number), number)
        # setdefault keeps the first occurrence if an ID is duplicated
        targets.setdefault((report_id, concept_id), (category, location, phrase))

    new_rows = []
    for _, x in selected.iterrows():
        # Determine the object (Arg2) of a relation
        key = (x.report_id, x.arg2)
        if key not in targets:
            raise IndexError("Concept %s referenced by a '%s' relation was not found in report %s"
                             % (x.arg2, relation, x.report_id))
        category, location, phrase = targets[key]

        # Define the next vacant concept ID and reserve it for this report
        highest_id[x.report_id] += 1
        new_id = 'T' + str(highest_id[x.report_id])

        # Create an entry containing concept ID, composite category, location and the raw text
        new_rows.append({
            'report_id': x.report_id,
            'patient_id': x.patient_id,
            'report_no': x.report_no,
            'concept_id': new_id,
            'concept': composite_name + category,
            'location': location,
            'phrase': phrase
        })

        # A later relation may point at the concept that was just created
        targets.setdefault((x.report_id, new_id), (composite_name + category, location, phrase))

    # Add all new entries to the table of concepts in one concatenation
    return pd.concat([concepts, pd.DataFrame(new_rows)], axis=0, ignore_index=True)

# The argument of a positive relation is added to the table of concepts as an
# "affirmed" concept, then the argument of a negative relation as a "negated" one
for relation_type, prefix in (('positive-rel', 'affirmed'), ('negative-rel', 'negated')):
    concepts = add_composite_concepts(concepts, relation_type, prefix)

print("Totalling %d concepts and composite concepts." % len(concepts))

Totalling 1482 concepts and composite concepts.


### Save the extracted annotations

In [6]:
# Store the concepts together with the composite concepts, without the row index
composite_csv = "../datasets/gold_composite.csv"
concepts.to_csv(composite_csv, index=False)

___
# Generate `.ann` files with composite concepts instead of relations

In [7]:
def write_ann_file(x):
    """Write the concepts of one report to an annotation file.

    ``x`` is a dataframe holding the metadata rows of a single report; only the
    first row's patient_id, report_no and report_id are read. The file is
    created inside the notebook-level ``output_path`` folder and receives one
    line per row of the notebook-level ``concepts`` table that belongs to the
    report, formatted as "<concept_id>\\t<concept> <location>\\t<phrase>".
    """
    # Define filename for the new annotation file
    patient_id = x.patient_id.iloc[0]
    report_no = x.report_no.iloc[0]
    filename = utils.get_filename(patient_id, report_no)

    # Select the concepts that belong to this report
    report_id = x.report_id.iloc[0]
    report_concepts = concepts[concepts.report_id == report_id]

    # Write annotations in the .ann file format, one concept per line
    with open(output_path + filename, 'w') as ann_file:
        for concept_id, category, location, phrase in zip(report_concepts.concept_id,
                                                          report_concepts.concept,
                                                          report_concepts.location,
                                                          report_concepts.phrase):
            ann_file.write(concept_id + '\t' + category + ' ' + location + '\t' + phrase + '\n')
        
# Define the output folder for the new annotation files
output_path = "../datasets/annotations_composite/"

# Make sure the output folder exists before any file is opened for writing
os.makedirs(output_path, exist_ok=True)

# Write one annotation file per report. An explicit loop is used instead of
# groupby().apply() because the call is made only for its side effect, and
# iterating the groups always keeps the report_id column that write_ann_file
# reads from each group.
for _, report_rows in df.groupby('report_id'):
    write_ann_file(report_rows)