In [1]:
import clinical_trial_extractor
import sys
import os
# Put the parent directory on the import path (presumably where SQLConnect
# lives, since it is imported right after this) without adding it twice.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
import SQLConnect

# Pull the clinical-trial records from the extractor module. The extraction
# loop further down reads the keys 'bio', 'aff', 'name', 'name_org', 'title'
# and 'start_date' from every record.
data = clinical_trial_extractor.extract()
# NOTE(review): `query` is not referenced again in the visible cells (the later
# calls pass the string 'UnmergedV1' instead) -- confirm whether it is needed.
query = SQLConnect.UnmergedV1

In [6]:
# Accumulators for everything collected from the extractor.
# Phase 1 upload: stand-alone rows (bio entities, organisations, people, works)
bioentity, org, people, work = [], [], [], []
# Phase 2 upload: rows that link the phase 1 rows to one another
people_org, work_org, work_people, keyword = [], [], [], []

In [7]:
# Helpers for cleaning people names.
# exception_names: all-uppercase strings that the extraction loop still keeps
# as people instead of classifying them as titles.
exception_names = [
    'NADIA NATHALIE HANSEL',
    'BALAMURALI AMBATI',
    'HYO-SUK LEE',
    'RAZELLE KURZROCK',
    'SUNG RAE KIM',
    'MALCOLM K BRENNER',
]
# job_title: title fragments that are not real names; the extraction loop adds
# further entries to this set as it runs.
job_title = {
    'md', 'MMath', 'm', 'MMed', 'Ph.D.', 'Phd', 'ph.D.',
    'MD,PhD', 'DNs', 'MD，PhD', 'PhD，MD', 'MD PhD', 'MBChB', 'DPhil',
}

In [10]:
# Get data from extractor
# Every record contributes rows to the phase 1 lists (bioentity, org, people,
# work) and to the phase 2 link lists (people_org, work_org, work_people,
# keyword).
# NOTE: this cell appends to those lists and adds to job_title, so running it
# twice against the same kernel state duplicates every row; re-run the cells
# that initialise them first.
for record in data:
    name_org = record['name_org']
    # Key used by phase 2 to find the work row again.
    work_key = (record['title'], record['start_date'])
    # Non-empty bio entities and organisations, computed once and reused for
    # both the entity rows and the link rows. Organisations are the
    # affiliations plus, when name_org is present, its second element.
    bio_names = [entity for entity in record['bio'] if entity]
    org_names = [
        entity
        for entity in (record['aff'] + ([name_org[1]] if name_org else []))
        if entity
    ]

    bioentity += [
        {
            'origin_database': 'ClinicalTrials.gov',
            'name': entity
        }
    for entity in bio_names]
    org += [
        {
            'origin_database': 'ClinicalTrials.gov',
            'name': entity,
            'funding': None
        }
    for entity in org_names]

    # Splitting the name from the JSON file may leave behind titles like PhD,
    # MD PhD, etc. A string is treated as a title, not a person, when it is
    # 'PhD', is already recorded in job_title, or is entirely upper-case --
    # unless it is one of the known upper-case real names in exception_names.
    # The job_title lookup matters: seeded titles such as 'MD PhD' or 'Ph.D.'
    # contain lower-case letters, so isupper() alone lets them through.
    new_people = record['name'] + ([name_org[0]] if name_org else [])
    valid_name = set()
    for name in new_people:
        looks_like_title = name == 'PhD' or name in job_title or name.isupper()
        if looks_like_title and name not in exception_names:
            job_title.add(name)
        else:
            valid_name.add(name)
    # Drop empty strings; the same list feeds the people rows and the links.
    people_names = [entity for entity in valid_name if entity]

    people += [
        {
            'origin_database': 'ClinicalTrials.gov',
            'email': None,
            'phone': None,
            'name': entity,
            'first_name': None,
            'last_name': None,
            'middle_name': None,
            'nih_id': None
        }
    for entity in people_names]
    work.append({
        'origin_database': 'ClinicalTrials.gov',
        'title': record['title'],
        'start_date': record['start_date'],
        'end_date': None,
        'type': 'Project',
        'pmid': None
    })
    if name_org:
        # name_org is extended with the year, taken as the text before the
        # first '-' of start_date.
        people_org.append(name_org + (int(record['start_date'].split('-')[0]),))
    work_org += [{'work': work_key, 'org': entity} for entity in org_names]
    work_people += [{'work': work_key, 'people': entity} for entity in people_names]
    keyword += [{'work': work_key, 'bio': entity} for entity in bio_names]

In [21]:
# Phase 1 upload: build one query per collected row, table by table.
bioentity_query, org_query, people_query, work_query = (
    [SQLConnect.insert_query_dict(table, row) for row in rows]
    for table, rows in (
        ('Bioentity', bioentity),
        ('Org', org),
        ('People', people),
        ('Work', work),
    )
)
phase_1_queries = bioentity_query + org_query + people_query + work_query
# Every phase 1 query is tagged as an INSERT.
phase_1_types = ['INSERT'] * len(phase_1_queries)
# The upload itself is left disabled:
# SQLConnect.connect_and_query(phase_1_queries, phase_1_types, "UnmergedV1")

In [33]:
# Get id data
# One SELECT per phase 1 table, restricted to the rows that originate from
# ClinicalTrials.gov; the selected columns are the id plus the natural key.
bioentity_id_req = "SELECT bio_id, name FROM Bioentity WHERE origin_database='ClinicalTrials.gov';"
org_id_req = "SELECT org_id, name FROM Org WHERE origin_database='ClinicalTrials.gov';"
people_id_req = "SELECT people_id, name FROM People WHERE origin_database='ClinicalTrials.gov';"
work_id_req = "SELECT work_id, title, start_date FROM Work WHERE origin_database='ClinicalTrials.gov' AND type = 'Project';"
id_req = [
    bioentity_id_req,
    org_id_req,
    people_id_req,
    work_id_req,
]
# response is indexed below in the same order as id_req.
response = SQLConnect.connect_and_query(id_req, ['SELECT'] * len(id_req), 'UnmergedV1')

Connection to database established
MySQL connection is closed


In [36]:
# Make dictionary mapping data to id
# Rows are indexed as (id, name); if a name occurs more than once the id of
# the last row wins.
bioentity_id_dict = {row[1]: row[0] for row in response[0]}
org_id_dict = {row[1]: row[0] for row in response[1]}
people_id_dict = {row[1]: row[0] for row in response[2]}

# Work rows are indexed as (id, title, start_date). The date object is rendered
# as year-month-day with month and day zero-padded to two digits, so the key
# has the same (title, date-string) shape as the 'work' tuples in the link lists.
work_id_dict = {}
for row in response[3]:
    started = row[2]
    date = '%d-%02d-%02d' % (started.year, started.month, started.day)
    work_id_dict[(row[1], date)] = row[0]

In [37]:
# Entries for queries in phase 2
# Translate each (person, organisation, year) tuple into database ids. Tuples
# whose person or organisation has no known id are collected separately.
people_org_id = []
people_org_exceptions = set()  # failure cases
for link in people_org:
    person, organisation = link[0], link[1]
    if person not in people_id_dict or organisation not in org_id_dict:
        people_org_exceptions.add(link)
        continue
    people_org_id.append(dict(
        people_id=people_id_dict[person],
        org_id=org_id_dict[organisation],
        year=link[2],
    ))


In [38]:
# Translate work/organisation links into id pairs; links with an unknown work
# key or organisation name are kept aside in work_org_exception.
work_org_id = []
work_org_exception = []
for link in work_org:
    work_key = link['work']
    if work_key not in work_id_dict or link['org'] not in org_id_dict:
        work_org_exception.append(link)
        continue
    work_org_id.append(dict(
        work_id=work_id_dict[work_key],
        org_id=org_id_dict[link['org']],
    ))

In [42]:
# Translate work/person links into id pairs; links with an unknown work key or
# person name are kept aside in work_people_exception.
work_people_id = []
work_people_exception = []
for link in work_people:
    work_key = link['work']
    if work_key not in work_id_dict or link['people'] not in people_id_dict:
        work_people_exception.append(link)
        continue
    work_people_id.append(dict(
        work_id=work_id_dict[work_key],
        people_id=people_id_dict[link['people']],
    ))

In [46]:
# Translate work/bio-entity links into id pairs; links with an unknown work key
# or bio-entity name are kept aside in keyword_exception.
keyword_id = []
keyword_exception = []
for link in keyword:
    work_key = link['work']
    if work_key not in work_id_dict or link['bio'] not in bioentity_id_dict:
        keyword_exception.append(link)
        continue
    keyword_id.append(dict(
        work_id=work_id_dict[work_key],
        bio_id=bioentity_id_dict[link['bio']],
    ))

In [50]:
# Make queries for phase 2: one query per resolved link row, table by table.
people_org_queries, work_org_queries, work_people_queries, keyword_queries = (
    [SQLConnect.insert_query_dict(table, row) for row in rows]
    for table, rows in (
        ('PeopleOrg', people_org_id),
        ('WorkOrg', work_org_id),
        ('WorkPeople', work_people_id),
        ('Keyword', keyword_id),
    )
)
phase_2_queries = people_org_queries + work_org_queries + work_people_queries + keyword_queries
# Every phase 2 query is tagged as an INSERT.
phase_2_types = ['INSERT'] * len(phase_2_queries)
# The upload itself is left disabled:
# SQLConnect.connect_and_query(phase_2_queries, phase_2_types, 'UnmergedV1')