In [1]:
import concurrent.futures
import functools
import pathlib
import xml.etree.ElementTree as ET

import pandas as pd
import tqdm

## Format parents for searching

In [2]:
parents_df = pd.read_csv('../data/computed/parents_synonyms.tsv', sep='\t').dropna()

parents_df.head()

Unnamed: 0,parent_concept_id,parent_concept_name,parent_concept_code,concept_synonym_name
0,4039266.0,Dry skin,16386004.0,Anhydrotic skin
1,4039266.0,Dry skin,16386004.0,Dry skin (finding)
2,4039266.0,Dry skin,16386004.0,Dry skin
3,443432.0,Impaired cognition,386806002.0,Cognitive disturbance
4,443432.0,Impaired cognition,386806002.0,Cognitive dysfunction


In [3]:
parent_synonyms_to_parent_concept_id = (
    parents_df
    .assign(
        concept_synonym_name = lambda df: df['concept_synonym_name'].apply(lambda x: x.lower()),
        parent_concept_id = lambda df: df['parent_concept_id'].astype(int),
    )
    .set_index('concept_synonym_name')
    .loc[:, 'parent_concept_id']
    .to_dict()
)

# Parent concept_ids to parent concept names
concept_id_to_name = (
    parents_df
    .assign(
        parent_concept_id = lambda df: df['parent_concept_id'].astype(int),
    )
    .set_index('parent_concept_id')
    .loc[:, 'parent_concept_name']
    .to_dict()
)

## Search files by lines

In [4]:
trials_root = pathlib.Path('../data/AllPublicXML/')

trial_files = list(trials_root.glob('*/*.xml'))

len(trial_files)

320611

In [5]:
def xml_file_to_text(file_path):
    """
    Extract the eligibility criteria string from an XML file
    describing a clinical trial.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    try:
        eligibility_text = (
            root
            .find('eligibility/criteria/textblock')
            .text
            .strip()
        )
    except AttributeError:
        return ''
    return eligibility_text


def extract_concepts_one_file(file_path, concept_string_to_id, concept_id_to_name):
    """
    Extract concepts from a clinical trial's eligibility criteria.
    Extracted concepts are only those which are direct parents of 
    concepts with chosen severity modifiers.
    
    Parameters
    ----------
    file_path : pathlib.Path
        Path to the XML file for a single clinical trial.
    concept_string_to_id : Dict[str, int]
        Many-to-one map between synonyms for a concept and the concept ID.
        These strings are matched in the eligibility criteria text (without
        newlines, spaces, or upper/lowercase distinctions).
    concept_id_to_name : Dict[int, str]
        One-to-one map between a concept's ID and its concept name.
    """
    text = xml_file_to_text(file_path)
    nct_id = file_path.stem
    
    # Get all concepts that are found with full space, newline strips
    matched_parent_concepts = [
        (name, concept_id) 
        for name, concept_id in concept_string_to_id.items() 
        if name.replace(' ', '') in text.lower().replace('\n', '').replace(' ', '')
    ]
    
    outputs = list()
    for name, concept_id in matched_parent_concepts:
        try:
            start_index = text.lower().replace('\n', '').index(name)
        except ValueError:
            try:
                start_index = (text.lower().replace('\n', '').replace('  ', ' ')
                               .index(name.replace('  ', ' ')))
            except ValueError:
                continue
        concept_name = concept_id_to_name[concept_id]
        outputs.append(
            (nct_id, name, text[max(start_index - 100, 0):start_index + 100],
             concept_id, concept_name)
        )
    return outputs

In [6]:
extract_wrapper = functools.partial(
    extract_concepts_one_file,
    concept_string_to_id=parent_synonyms_to_parent_concept_id, 
    concept_id_to_name=concept_id_to_name,
)

with concurrent.futures.ProcessPoolExecutor() as executor:
    output_lists = list(tqdm.tqdm_notebook(
        executor.map(extract_wrapper, trial_files), total=len(trial_files)
    ))

# Flatten list of lists of tuples to list of tuples
outputs = [i for l in output_lists for i in l]


# Above is equivalent to the following:

# outputs = list()
# for file in tqdm.tqdm_notebook(trial_files):
#     concepts = extract_concepts_one_file(file, parent_synonyms_to_parent_concept_id, 
#                                          concept_id_to_name)
#     outputs.extend(concepts)   

HBox(children=(IntProgress(value=0, max=320611), HTML(value='')))




In [7]:
all_trials_df = pd.DataFrame(outputs, columns=['NCT_id', 'matched_string', 
                                               'criteria_string',
                                               'parent_concept_id', 
                                               'parent_concept_name'])

all_trials_df.to_csv('../data/extracted_parents.csv.xz', index=False, 
                     compression='xz')

all_trials_df.head()

Unnamed: 0,NCT_id,matched_string,criteria_string,parent_concept_id,parent_concept_name
0,NCT01828931,psychotic disorder,1. Between the ages of 18 and 70 years (i...,436073,Psychotic disorder
1,NCT01829815,disability,- Resident in one of five study sit...,4052648,Disability
2,NCT01823770,scid,e referred for a structured clinical\n ...,29783,Severe combined immunodeficiency disease
3,NCT01823770,painful,tor.\n\n 4. Subject has addition...,4329041,Pain
4,NCT01823770,pain,tor.\n\n 4. Subject has addition...,4329041,Pain
