In [3]:
import xml
import xml.etree.ElementTree as ET
import os
import pandas as pd
from tqdm.notebook import tqdm

In [9]:
dataset_root = "/nas/Datasets/trec_ct"
part = 'ClinicalTrials.2021-04-27.part1'
folder = "NCT0000xxxx/" # dependent on the 'part' folder
file = os.path.join(dataset_root,part,folder,'NCT01607801.xml')

In [13]:
def gen_simple_doc(root : xml.etree.ElementTree.Element) -> dict:
    """
    Columns that comprise a doc:
        'id' using nct_id
        'summary' using brief_summary
        'gender' using eligibility.gender
        'min_age' using eligibility.minimum_age
        'max_age' using eligibility.maximum_age
    """
    try:
        id_ = root.find('id_info').find('nct_id').text
        title = root.find('brief_title').text
        summary = root.find('brief_summary')[0].text
        gender = root.find('eligibility').find('gender').text
        min_age = root.find('eligibility').find('minimum_age').text
        max_age = root.find('eligibility').find('maximum_age').text

        doc = {'id':id_,'title':title, 'summary':summary,'gender':gender,'min_age':min_age,'max_age':max_age}
    except:
        doc = None
    return doc

In [14]:
def create_collection_sample_to_csv(dataset_root, part, save_path, max_collection_size):
    """
    Creates a sample of max_collection_size documents in a csv format.
    Current version is not even a real saple as it generates docs in an orderly manner.
    
    Supported columns are defined in the function <gen_simple_doc>
    """
    docs = []
    ignored_docs = 0

    i = 0
    stop = False
    part_path = os.path.join(dataset_root,part)

    for folder in os.listdir(part_path):
        folder_path = os.path.join(part_path,folder)
        files = os.listdir(folder_path)
        for file in files:
            filepath = os.path.join(folder_path,file)

            # read and parse file
            root = ET.parse(filepath).getroot()
            doc = gen_simple_doc(root)

            if not doc:
                ignored_docs +=1
                continue

            docs.append(doc)
            i +=1

            if i % 1000 == 0:
                print(f'Progress {i}/{max_collection_size}')

            if i == max_collection_size:
                print(folder)
                print(file)
                stop = True
                break


        if stop:
            break
    print('Success!')
    print(f'ignored docs: {ignored_docs}')
    
    df = pd.DataFrame(docs)
    df.to_csv(save_path,index=False)
    return True

create_collection_sample_to_csv(dataset_root, part,'data/5ksample.csv',5000)

Progress 1000/5000
Progress 2000/5000
Progress 3000/5000
Progress 4000/5000
Progress 5000/5000
NCT0000xxxx
NCT00002067.xml
Success!
ignored docs: 0


True