In [1]:
import concurrent.futures
import functools
import json
import pathlib
import re
import sys
import xml.etree.ElementTree as ET

import pandas as pd
import tqdm

sys.path.insert(0, '../')
import map_modifiers

## Get parent synonyms to match for step 1

In [2]:
parents_df = (
    pd.read_csv(
        '../data/computed/parents_synonyms.tsv', 
        sep='\t',
        dtype={
            'parent_concept_id': int,
            'parent_concept_name': str,
            'parent_concept_code': int,
            'concept_synonym_name': str,
        }
    )
)

parents_df.head(5)

Unnamed: 0,parent_concept_id,parent_concept_name,parent_concept_code,concept_synonym_name
0,4039266,Dry skin,16386004,Anhydrotic skin
1,4039266,Dry skin,16386004,Dry skin (finding)
2,4039266,Dry skin,16386004,Dry skin
3,443432,Impaired cognition,386806002,Cognitive disturbance
4,443432,Impaired cognition,386806002,Cognitive dysfunction


In [3]:
# Create dictionary of {parent_synonym: parent_snomed_code}
parent_synonyms_to_parent_concept_code = (
    parents_df
    .set_index('concept_synonym_name')
    .loc[:, 'parent_concept_code']
    .to_dict()
)

# Check that all 'abc (finding)' exist also as 'abc'
for synonym in parent_synonyms_to_parent_concept_code:
    if '(finding)' in synonym:
        assert synonym.replace(' (finding)', '') in parent_synonyms_to_parent_concept_code
    elif '(disorder)' in synonym:
        assert synonym.replace(' (disorder)', '') in parent_synonyms_to_parent_concept_code

# Remove synonyms having (finding) or (disorder) and normalize synonyms
print('Concepts before removal: ', len(parent_synonyms_to_parent_concept_code))
parent_synonyms_to_parent_concept_code = {
    map_modifiers.utils.normalize_text(synonym): code 
    for synonym, code in parent_synonyms_to_parent_concept_code.items()
    if '(finding)' not in synonym and '(disorder)' not in synonym
}
print('Concepts after removal: ', len(parent_synonyms_to_parent_concept_code))

############## POSSIBLE FUTURE IMPROVEMENT ######################
# # All 'ABC - Alpha Beta Charlie' should also be present as 
# #  'ABC' and 'Alpha Beta Charlie'
# for synonym, code in list(parent_synonyms_to_parent_concept_code.items()):
#     if ' - ' in synonym:
#         first, second = synonym.split(' - ', maxsplit=1)
#         # If 'Alpha Beta Charlie' not a synonym, ignore this case
#         if second in parent_synonyms_to_parent_concept_code:
#             parent_synonyms_to_parent_concept_code[first] = code
# print('Concepts after adding acronyms: ', len(parent_synonyms_to_parent_concept_code))
##################################################################

# Save map from parent synonym to parent SNOMED CT code for use in package
with open('../map_modifiers/parent_synonyms.json', 'w') as f:
    json.dump(parent_synonyms_to_parent_concept_code, f, sort_keys=True, indent=2)

Concepts before removal:  1539
Concepts after removal:  1080


## Open trials files

In [4]:
def xml_file_to_text(file_path):
    """
    Extract the eligibility criteria string from an XML file
    describing a clinical trial.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    try:
        eligibility_text = (
            root
            .find('eligibility/criteria/textblock')
            .text
            .strip()
        )
    except AttributeError:
        return ''
    return eligibility_text

In [5]:
trials_root = pathlib.Path('../data/AllPublicXML/')

trial_files = list(trials_root.glob('*/*.xml'))

len(trial_files)

320611

In [6]:
all_matches = list()
for trial_file in tqdm.tqdm_notebook(trial_files):
    text = xml_file_to_text(trial_file)
    normalized_text = map_modifiers.utils.normalize_text(text)
    
    matches = map_modifiers.recognize_parents.find_possible_pre_coordination(
        normalized_text, parent_synonyms_to_parent_concept_code, 5
    )
    
    for match in matches:
        match['NCT_id'] = trial_file.stem
    
    all_matches.extend(matches)

HBox(children=(IntProgress(value=0, max=320611), HTML(value='')))




In [7]:
colnames = ['NCT_id', 'criteria_string', 'matched_synonym', 'parent_code']
all_trials_df = pd.DataFrame(all_matches, columns=colnames)

all_trials_df.to_csv('../data/all_trials_word_matches.csv.xz', compression='xz', index=False)

print(all_trials_df.shape[0])

all_trials_df.head(2)

386936


Unnamed: 0,NCT_id,criteria_string,matched_synonym,parent_code
0,NCT01828931,diagnosis of one of the psychotic disorders li...,psychotic disorder,69322001
1,NCT01829815,exclusion criteria - severe cognitive disabili...,disability,21134002
