# Meta

This notebook generates the data required for the workshop from the original dataset published by [CSIRO](https://dl.acm.org/doi/abs/10.1145/2911451.2914672). All generated data is stored under `data/`.


In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from trectools import TrecQrel
from trectools.trec_topics import TrecTopics
from bs4 import BeautifulSoup
import codecs
import xml

# Get Patients

In [2]:
topics_path = '/nas/Datasets/csiro_ct/topics-2014_2015-description.topics'
topicsParser = TrecTopics()
# The tag names have to be passed in lower case for the parser to work
# (this seems to be caused by BeautifulSoup).
topicsParser.read_topics_from_file(
    topics_path,
    topic_tag='top',
    numberid_tag='num',
    querytext_tag='title',
    number_attr=False,
)
# Keep a copy of the parsed topics; later cells filter `patients`.
patients = topicsParser.topics.copy()

In [3]:
# Sanity check: report how many patient topics were loaded from the topics file.
print(f'There are {len(patients)} patients in our database')

There are 60 patients in our database


# Get qrels

In [4]:
qrels_file = "/nas/Datasets/csiro_ct/qrels-clinical_trials.txt"

# TrecQrel exposes the parsed judgements as a pandas DataFrame (`qrels_data`);
# take that frame, rename the 'query' column to 'qid' and store the ids as
# strings.
qrels = (
    TrecQrel(qrels_file)
    .qrels_data
    .rename(columns={'query': 'qid'})
    .astype({'qid': str})
)

# Preview the judgements and their dimensions
qrels.head(3)
qrels.shape

Unnamed: 0,qid,q0,docid,rel
0,1158,0,NCT02371057,0
1,1158,0,NCT01102998,0
2,1158,0,NCT00494468,1


(3506, 4)

# Select only the patients that exist in both qrels & topics

In [5]:
# Compute the unique qrels ids once (the original recomputed them for every
# topic) and keep a set so each membership check is a constant-time lookup.
qrels_qids = qrels.qid.unique()
qrels_qid_set = set(qrels_qids)

# Ids that appear on one side only; list order follows the source being iterated.
patients_in_qrels_not_topics = [e for e in qrels_qids if e not in patients]
patients_in_topics_not_qrels = [e for e in patients if e not in qrels_qid_set]

print(f'Patients in qrels but not in topics: {len(patients_in_qrels_not_topics)}')
print(f'Patients in topics but not in qrels: {len(patients_in_topics_not_qrels)}')

Patients in qrels but not in topics: 4
Patients in topics but not in qrels: 9


In [6]:
# Ids present on both sides, listed in the order they first occur in qrels
common_patients = [qid for qid in qrels.qid.unique() if qid in patients]

# Narrow the topics mapping down to those shared ids
patients = {qid: patients[qid] for qid in common_patients}
print(f'We have {len(patients)} common patients!')

We have 51 common patients!


In [7]:
# Apply the same restriction to the judgements: keep only the rows whose
# patient id exists on both sides.
qrels = qrels.loc[qrels['qid'].isin(common_patients)]
qrels.head(3)
qrels.shape

Unnamed: 0,qid,q0,docid,rel
160,20141,0,NCT00000408,0
161,20141,0,NCT00000492,1
162,20141,0,NCT00000501,0


(3346, 4)

# Save patients data

In [8]:
# One record per patient, in a shape pandas can turn directly into rows
data = [
    {'patientid': patient_id, 'description': description}
    for patient_id, description in patients.items()
]
df = pd.DataFrame(data).set_index('patientid')
df.head(2)
df.shape
df.to_csv('../data/patients_sample.csv')

Unnamed: 0_level_0,description
patientid,Unnamed: 1_level_1
20141,A 58-year-old African-American woman presents ...
201410,A physician is called to see a 67-year-old wom...


(51, 1)

# Save qrels data

In [9]:
# Write the filtered judgements without the 'q0' column and without the row index.
(
    qrels
    .drop(columns=['q0'])
    .to_csv('../data/qrels_sample.csv', index=False)
)

# Now onto the documents

Let's read the documents that exist in the qrels file

In [10]:
def gen_simple_doc(root : xml.etree.ElementTree.Element) -> dict:
    """
    Flatten a clinical-trial XML record into a simple dict.

    Columns that comprise a doc:
        'id' using id_info.nct_id
        'title' using brief_title
        'summary' using the text of the first child of brief_summary
        'gender' using eligibility.gender
        'min_age' using eligibility.minimum_age
        'max_age' using eligibility.maximum_age

    Returns:
        The dict described above, or None when any of the required elements
        is missing from `root`.
    """
    try:
        eligibility = root.find('eligibility')
        doc = {
            'id': root.find('id_info').find('nct_id').text,
            'title': root.find('brief_title').text,
            'summary': root.find('brief_summary')[0].text,
            'gender': eligibility.find('gender').text,
            'min_age': eligibility.find('minimum_age').text,
            'max_age': eligibility.find('maximum_age').text,
        }
    except (AttributeError, IndexError, TypeError):
        # find() returned None (AttributeError on the chained access,
        # TypeError on the subscript) or brief_summary has no children
        # (IndexError): the record is incomplete. Any other exception is a
        # genuine error and is deliberately left to propagate.
        doc = None
    return doc

In [11]:
# Every document id that has at least one relevance judgement
documents = qrels.docid.unique().tolist()
documents_dir = '/nas/Datasets/csiro_ct/clinicaltrials.gov-16_dec_2015/'
doc_list = []   # successfully extracted documents
none_list = []  # ids with no XML file, or whose XML lacks a required field
for docid in documents:
    document_file = os.path.join(documents_dir,docid) + '.xml'
    try:
        root = ET.parse(document_file).getroot()
    except FileNotFoundError:
        # A judged document without an XML file on disk must not abort the
        # whole build; report it together with the other unusable ids.
        none_list.append(docid)
        continue
    doc = gen_simple_doc(root)
    if doc is None:
        none_list.append(docid)
    else:
        doc_list.append(doc)

print(f'Docs that are in qrels but not in collection: {none_list}')

# build sample collection
collection = pd.DataFrame(doc_list)
collection.head(3)
collection.shape

Docs that are in qrels but not in collection: ['NCT02006251']


Unnamed: 0,id,title,summary,gender,min_age,max_age
0,NCT00000408,Low Back Pain Patient Education Evaluation,\n Back pain is one of the most common of...,Both,18 Years,
1,NCT00000492,Beta-Blocker Heart Attack Trial (BHAT),\n To determine whether the regular admin...,Both,30 Years,69 Years
2,NCT00000501,Hypertension Prevention Trial (HPT) Feasibilit...,\n To test the feasibility and the effica...,Both,25 Years,49 Years


(3170, 6)

# Save the sample collection of documents

In [12]:
# Write the extracted documents to CSV, leaving out the DataFrame's row index.
collection.to_csv('../data/sample_collection.csv',index=False)