# Extract criteria

This notebook extracts the eligibility criteria from every clinical trial XML file and saves them to a single LZMA-compressed (`.xz`) `.tsv` file.
The resulting file (`data/outputs/trial_eligibility_criteria.tsv.xz`) has two columns: `NCT_ID` and `eligibility_criteria`.
Eligibility criteria are normalized using `map_modifiers.normalize.normalize()` and retain spaces within the strings.

In [1]:
import concurrent.futures
import lzma
import pathlib
import xml.etree.ElementTree as ET

import tqdm.notebook

import map_modifiers

In [2]:
def xml_file_to_text(file_path):
    """
    Return the stripped eligibility criteria text of a clinical trial
    XML file.

    The text is read from the ``eligibility/criteria/textblock`` element
    below the document root. An empty string is returned when that
    element is missing or carries no text.
    """
    root = ET.parse(file_path).getroot()
    textblock = root.find('eligibility/criteria/textblock')

    # A missing element and an element without text both map to ''.
    if textblock is None or textblock.text is None:
        return ''
    return textblock.text.strip()


def file_to_final_text(file_path):
    """
    Build the output record for a single clinical trial XML file.

    Returns one tuple ``(trial_id, normalized_criteria)``, where
    ``trial_id`` is the file name without its extension
    (``file_path.stem``) and ``normalized_criteria`` is the extracted
    criteria text passed through ``map_modifiers.normalize.normalize()``.
    """
    criteria_text = xml_file_to_text(file_path)
    nct_id = file_path.stem
    normalized_criteria = map_modifiers.normalize.normalize(criteria_text)
    return (nct_id, normalized_criteria)

In [3]:
trials_root = pathlib.Path('../../data/raw/clinical_trials_gov/')

# Collect every trial XML file one directory level below the root.
# Sorting fixes the processing order (glob order depends on the filesystem),
# so the rows of the output file come out in the same order on every run.
trial_files = sorted(trials_root.glob('*/*.xml'))

len(trial_files)

330113

In [4]:
# Parse and normalize the trial files in parallel worker processes.
# chunksize batches the files handed to each worker; with the default of 1
# every file is submitted as its own inter-process task. The order of the
# results still matches the order of trial_files.
with concurrent.futures.ProcessPoolExecutor() as executor:
    outputs = list(tqdm.notebook.tqdm(
        executor.map(file_to_final_text, trial_files, chunksize=1000),
        total=len(trial_files),
    ))

HBox(children=(FloatProgress(value=0.0, max=330113.0), HTML(value='')))




In [5]:
# Write the (NCT_ID, criteria) pairs to an LZMA-compressed, tab-separated file.
# NOTE(review): fields are written without escaping, so this assumes
# normalize() leaves no tab or newline characters in the text -- TODO confirm.
with lzma.open('../../data/outputs/trial_eligibility_criteria.tsv.xz', 'w') as f:
    header = b'NCT_ID\teligibility_criteria\n'
    f.write(header)
    for line in outputs:
        row = '\t'.join(line) + '\n'
        f.write(row.encode('utf-8'))