In [1]:
import pandas as pd
import pickle
import numpy as np
import xml.etree.cElementTree as ET
import os
import re
from tqdm import tqdm
import csv
import requests

In [None]:
payment = pickle.load(open('./data/payment_dict.pkl', 'rb'))
trial_list = list(payment.keys())

In [3]:
def get_icd_from_nih(disease_name):
	prefix = 'https://clinicaltables.nlm.nih.gov/api/icd10cm/v3/search?sf=code,name&terms='
	url = prefix + disease_name 
	response = requests.get(url)
	text = response.text 
	if text == '[0,[],null,[]]':
		return [] 
	text = text[1:-1]
	idx1 = text.find('[')
	idx2 = text.find(']')
	codes = text[idx1+1:idx2].split(',')
	codes = [i[1:-1] for i in codes]
	return codes

In [4]:
def process_trial(path):
    cur_trial = ET.ElementTree(file=path)
    cur_dict = {}
    if cur_trial.find('start_date') == None:
        return None
    else:
        cur_str = cur_trial.find('start_date').text
        # Use regex to find year
        cur_dict['start_date'] = pd.to_datetime(cur_str).strftime('%Y-%m-%d')
        
    if cur_trial.find('completion_date') == None:
        cur_dict['completion_date'] = np.nan
    else:
        cur_str = cur_trial.find('completion_date').text
        cur_dict['completion_date'] = pd.to_datetime(cur_str).strftime('%Y-%m-%d')
    
    # phase
    if cur_trial.find('phase') == None:
        cur_dict['phase'] = np.nan
    else:
        cur_dict['phase'] = cur_trial.find('phase').text
    
    cur_cond = []
    for x in cur_trial.findall('condition_browse/mesh_term'):
        cur_cond += get_icd_from_nih(x.text)
        
    cur_dict['condition'] = list(set(cur_cond))
    
    sum_text = cur_trial.find('brief_summary/textblock').text
    # Remove all \r\n, \n and extra spaces
    sum_text = re.sub(r'[\r\n]+', ' ', sum_text)
    sum_text = re.sub(r' +', ' ', sum_text)
    cur_dict['summary'] = sum_text
    
    if cur_trial.find('eligibility/criteria/textblock') == None:
        return None
    else:
        cur_dict['criteria'] = cur_trial.find('eligibility/criteria/textblock').text
    return cur_dict    

In [None]:
trial_dict = {}
for path, subdirs, files in tqdm(os.walk('./data/AllPublicXML/')):
    for name in files:
        if name[:-4] in trial_list:
            cur_trial = os.path.join(path, name)
            cur_res = process_trial(cur_trial)
            if cur_res != None:
                res = process_trial(cur_trial)
                if res != None:
                    trial_dict[name[:-4]] = process_trial(cur_trial)
                else:
                    continue
            else:
                continue
        else:
            continue


In [9]:
trial_dict

{'NCT00004451': {'start_date': '1998-03-01',
  'completion_date': nan,
  'phase': 'N/A',
  'condition': ['G20.B1', 'G20.A1', 'G20.A2', 'G20.B2'],
  'summary': " This study will determine whether glucose facilitates memory in healthy elderly people and those with Parkinson's disease. ",
  'criteria': "\n        Inclusion Criteria:\r\n\r\n          -  Three groups of subjects are eligible for this study: Healthy young adults between\r\n             ages 18 and 30; Healthy elderly adults between ages 60 and 90; AND Anyone diagnosed\r\n             with Parkinson's disease\r\n\r\n        Exclusion Criteria:\r\n\r\n          -  Prior surgery to remove part of the stomach\r\n\r\n          -  Diabetes, Addison's, or Cushing's diseases\r\n      "},
 'NCT00089245': {'start_date': '2004-07-01',
  'completion_date': '2025-07-01',
  'phase': 'Phase 1',
  'condition': ['C46.9',
   'C92.31',
   'C96.A',
   'C92.32',
   'C46.0',
   'C46.2',
   'C96.22'],
  'summary': ' The purpose of this study is to

In [22]:
trial_dict['NCT00089245']

{'start_date': '2004-07-01',
 'completion_date': '2025-07-01',
 'condition': ['C46.9',
  'C92.32',
  'C92.31',
  'C46.2',
  'C96.22',
  'C46.0',
  'C96.A'],
 'summary': ' The purpose of this study is to find a safe dose of a new medicine called antibody 8H9. Antibodies are made by the body to fight infections and in some cases, to fight tumors. The antibody 8H9 is made by mice and can attack many kinds of tumors. 8H9 antibody can have a dose of radiation attached to it called 131-I. 131I-8H9 has been given in the vein to patients to find cancer cells. This is the first study using 131I-8H9 in the fluid in the spine to kill cancer cells. 131-I is a beta emitting isotope used extensively for radiation targeted therapies. ',
 'criteria': '\n        Subject Inclusion Criteria:\r\n\r\n          -  Patients must have a histologically confirmed diagnosis of a malignancy known to be\r\n             8H9 reactive. 8H9 expression must be confirmed by immunohistochemical staining of\r\n           

In [None]:
pickle.dump(trial_dict, open('./data/trial_info.pkl', 'wb'))

In [None]:
# Generate parser input csv
rows = []
for each_trial in trial_dict:
    rows.append([each_trial, '', '', trial_dict[each_trial]['condition'], trial_dict[each_trial]['criteria']])
with open('./data/trial_info.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['#nct_id', 'title', 'has_us_facility', 'conditions', 'eligibility_criteria'])
    writer.writerows(rows)

In [16]:
trial_dict['NCT00089245']

{'start_date': '2004-07-01',
 'completion_date': '2025-07-01',
 'condition': 'Brain and Central Nervous System Tumors',
 'criteria': '\n        Subject Inclusion Criteria:\r\n\r\n          -  Patients must have a histologically confirmed diagnosis of a malignancy known to be\r\n             8H9 reactive. 8H9 expression must be confirmed by immunohistochemical staining of\r\n             tumor and assessed by the Department of Pathology or by immunofluorescence of bone\r\n             marrow except for patients confirmed to have neuroblastoma.\r\n\r\n          -  Patients must have CNS/ leptomeningeal disease which is refractory to conventional\r\n             therapies or for which no conventional therapy exists OR a recurrent brain tumors with\r\n             a predilection for leptomeningeal dissemination (medulloblastoma, PNET, rhabdoid\r\n             tumor).\r\n\r\n          -  Patients must have no rapidly progressing or deteriorating neurologic examination.\r\n\r\n          -  P