In [1]:
import pandas as pd
import numpy as numpy
from importlib import reload
from tqdm import tqdm_notebook as tqdm
import time

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot
%matplotlib inline

  """)


In [2]:
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, tqdm_handler=tqdm)
ss.add_constraint("study_type = 'Interventional'")
ss.show_constraints()
ss.load_studies()

WHERE 1=1 
    AND (study_type = 'Interventional')

252253 studies loaded!


In [3]:
ss.add_dimensions([
    'countries', 
    'conditions', 
    'browse_conditions',
    'keywords', 
    'pending_results', 
    'design_outcomes', 
    'designs', 
    'result_groups',
    'milestones', 
    'baseline_counts', 
    'baseline_measurements', 
    'outcomes', 
    'outcome_analyses', 
    'outcome_analysis_groups', 
    'outcome_counts',
    'outcome_measurements', 
    'browse_interventions', 
    'interventions', 
    'intervention_other_names'
])

Successfuly added these 19 dimensions: ['countries', 'conditions', 'browse_conditions', 'keywords', 'pending_results', 'design_outcomes', 'designs', 'result_groups', 'milestones', 'baseline_counts', 'baseline_measurements', 'outcomes', 'outcome_analyses', 'outcome_analysis_groups', 'outcome_counts', 'outcome_measurements', 'browse_interventions', 'interventions', 'intervention_other_names']
Failed to add these 0 dimensions: []


In [4]:
ss.refresh_dim_data()

HBox(children=(IntProgress(value=0, max=505), HTML(value='')))

Syncing the temp table temp_cur_studies in 505 chunks x 500 records each

Creating index on the temp table
 - Loading dimension countries
 -- Loading raw data
 -- Sorting index
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension keywords
 -- Loading raw data
 -- Sorting index
 - Loading dimension pending_results
 -- Loading raw data
 -- Sorting index
 - Loading dimension design_outcomes
 -- Loading raw data
 -- Sorting index
 - Loading dimension designs
 -- Loading raw data
 -- Sorting index
 - Loading dimension result_groups
 -- Loading raw data
 -- Sorting index
 - Loading dimension milestones
 -- Loading raw data
 -- Sorting index
 - Loading dimension baseline_counts
 -- Loading raw data
 -- Sorting index
 - Loading dimension baseline_measurements
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcomes
 -- Loading raw data
 -- Sorting index
 - Loading di

In [5]:
countries = ss.dimensions['countries'].data.add_prefix('country_')

In [6]:
# Count of studies per country, one study can be associated with multiple countries

countries.reset_index().groupby(['country_name']).agg({'country_id':'count'}).sort_values(
    by='country_id', ascending=False).head()

Unnamed: 0_level_0,country_id
country_name,Unnamed: 1_level_1
United States,108607
Canada,19306
France,18495
Germany,16483
United Kingdom,15397


In [7]:
# Count of countries per study

countries.reset_index().groupby(['nct_id']).agg({'country_name':'nunique'}).sort_values(
    by='country_name', ascending=False).reset_index().groupby(['country_name']).agg({'nct_id':'count'}).sort_values(
    by='nct_id', ascending=False).head()

Unnamed: 0_level_0,nct_id
country_name,Unnamed: 1_level_1
1,203181
2,8778
3,3273
4,2203
5,1757


In [8]:
keywords = ss.dimensions['keywords'].data.add_prefix('keyword_')
keywords['keyword_cancer_related'] = keywords['keyword_downcase_name'].apply(lambda x:'cancer' in x)

In [9]:
# Count of studies with each keyword, one study can be associated with multiple keywords

keywords.reset_index().groupby(['keyword_downcase_name']).agg({'nct_id':'count'}).sort_values(
    by='nct_id', ascending=False).head()

Unnamed: 0_level_0,nct_id
keyword_downcase_name,Unnamed: 1_level_1
pharmacokinetics,2311
safety,2100
exercise,2081
pain,2078
obesity,2056


In [10]:
# Count of all studies related to cancer by keyword, broken down by keywords used

keywords[keywords.keyword_downcase_name.str.contains('cancer')].reset_index().groupby(
    ['keyword_downcase_name']).agg({'nct_id':'count'}).sort_values(by='nct_id', ascending=False).head()

Unnamed: 0_level_0,nct_id
keyword_downcase_name,Unnamed: 1_level_1
breast cancer,1867
cancer,1785
prostate cancer,948
lung cancer,637
colorectal cancer,572


In [11]:
# Count of all studies, related to cancer by keyword or not

keywords.reset_index().groupby(['keyword_cancer_related']).agg({'nct_id':'nunique'})

Unnamed: 0_level_0,nct_id
keyword_cancer_related,Unnamed: 1_level_1
False,165035
True,16526


In [12]:
# Count of studies submitted per year

ss.studies['study_first_submitted_year'] = pd.to_datetime(ss.studies['study_first_submitted_date']).dt.year
ss.studies.reset_index().groupby('study_first_submitted_year').agg({'nct_id':'count'}).head()

Unnamed: 0_level_0,nct_id
study_first_submitted_year,Unnamed: 1_level_1
1999,3169
2000,1236
2001,1100
2002,1374
2003,1442


In [13]:
# Count of studies by result dates, completion dates and last known status

ss.studies['results_first_submitted_date_is_null'] = ss.studies['results_first_submitted_date'].isnull()
ss.studies['results_first_submitted_qc_date_is_null'] = ss.studies['results_first_submitted_qc_date'].isnull()
ss.studies['results_first_posted_date_is_null'] = ss.studies['results_first_posted_date'].isnull()
ss.studies['completion_date_is_null'] = ss.studies['completion_date'].isnull()

ss.studies.reset_index().groupby([
    'last_known_status', 'results_first_submitted_qc_date_is_null', 
    'results_first_submitted_date_is_null', 'results_first_posted_date_is_null', 
    'completion_date_is_null']).agg({'nct_id':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,nct_id
last_known_status,results_first_submitted_qc_date_is_null,results_first_submitted_date_is_null,results_first_posted_date_is_null,completion_date_is_null,Unnamed: 5_level_1
"Active, not recruiting",False,False,False,False,31
"Active, not recruiting",False,False,False,True,15
"Active, not recruiting",True,True,True,False,4236
"Active, not recruiting",True,True,True,True,1075
Enrolling by invitation,True,True,True,False,776
Enrolling by invitation,True,True,True,True,87
Not yet recruiting,True,True,True,False,3734
Not yet recruiting,True,True,True,True,662
Recruiting,True,True,True,False,11517
Recruiting,True,True,True,True,2032


In [130]:
# Interventional studies related to cancer by keyword

df = ss.studies.merge(
    keywords[keywords['keyword_downcase_name'].str.contains('cancer')].reset_index().groupby(['nct_id'])
    .agg({'keyword_id':'count'}), left_index=True, right_index=True, how='inner')

In [131]:
# Count of interventional cancer studies by result dates, completion dates and last known status

df.reset_index().groupby([
    'last_known_status', 'results_first_submitted_qc_date_is_null', 
    'results_first_submitted_date_is_null', 'results_first_posted_date_is_null', 
    'completion_date_is_null']).agg({'nct_id':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,nct_id
last_known_status,results_first_submitted_qc_date_is_null,results_first_submitted_date_is_null,results_first_posted_date_is_null,completion_date_is_null,Unnamed: 5_level_1
"Active, not recruiting",False,False,False,False,7
"Active, not recruiting",False,False,False,True,1
"Active, not recruiting",True,True,True,False,259
"Active, not recruiting",True,True,True,True,322
Enrolling by invitation,True,True,True,False,35
Enrolling by invitation,True,True,True,True,4
Not yet recruiting,True,True,True,False,105
Not yet recruiting,True,True,True,True,23
Recruiting,True,True,True,False,668
Recruiting,True,True,True,True,224


In [132]:
# Count of result groups and studies by result group type

df = df.merge(ss.dimensions['result_groups'].data.add_prefix('result_group_'), 
    left_index=True, right_index=True, how='left')

df['result_group_result_type'] = df['result_group_result_type'].fillna('None')

df.reset_index().groupby(['result_group_result_type']).agg({'nct_id':['count', 'nunique']})

Unnamed: 0_level_0,nct_id,nct_id
Unnamed: 0_level_1,count,nunique
result_group_result_type,Unnamed: 1_level_2,Unnamed: 2_level_2
Baseline,6701,2672
,13854,13854
Outcome,35237,2672
Participant Flow,5492,2672
Reported Event,5434,2670


In [133]:
# Number of result groups per study

df['number_of_groups'] = df['number_of_groups'].fillna('Unknown')
df.reset_index().groupby(['results_first_posted_date_is_null', 'number_of_groups']).agg({'nct_id':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,nct_id
results_first_posted_date_is_null,number_of_groups,Unnamed: 2_level_1
False,Unknown,52864
True,Unknown,13854


In [134]:
# Number of result group id's per study and study type, different from above

df[df.result_group_id.notnull()].reset_index().groupby(['result_group_result_type', 'nct_id']).agg(
    {'result_group_id':'count'}).sort_values(by='result_group_id', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,result_group_id
result_group_result_type,nct_id,Unnamed: 2_level_1
Outcome,NCT01347866,355
Outcome,NCT01449370,346
Outcome,NCT01026402,288
Outcome,NCT00982865,280
Outcome,NCT00141297,263


In [135]:
df[df.result_group_id.notnull()].shape

(52864, 74)

In [136]:
# Joining outcome analysis groups

outcome_analysis_groups = ss.dimensions['outcome_analysis_groups'].data.add_prefix(
    'outcome_analysis_group_')

outcome_analysis_groups.shape

(309051, 2)

In [137]:
df = df[df.result_group_id.notnull()].reset_index().set_index(['nct_id', 'result_group_id']).merge(
    outcome_analysis_groups.reset_index().set_index(['nct_id', 'result_group_id']),
    left_index=True, right_index=True, how='left')
df.shape

(56832, 76)

In [138]:
outcome_analyses = ss.dimensions['outcome_analyses'].data.add_prefix('outcome_analysis_')
outcome_analyses.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,outcome_analysis_id,outcome_analysis_non_inferiority_type,outcome_analysis_non_inferiority_description,outcome_analysis_param_type,outcome_analysis_param_value,outcome_analysis_dispersion_type,outcome_analysis_dispersion_value,outcome_analysis_p_value_modifier,outcome_analysis_p_value,outcome_analysis_ci_n_sides,outcome_analysis_ci_percent,outcome_analysis_ci_lower_limit,outcome_analysis_ci_upper_limit,outcome_analysis_ci_upper_limit_na_comment,outcome_analysis_p_value_description,outcome_analysis_method,outcome_analysis_method_description,outcome_analysis_estimate_description,outcome_analysis_groups_description,outcome_analysis_other_analysis_description
nct_id,outcome_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCT00000378,4572247,2511203,Superiority or Other,,,,,,<,0.05,,,,,,actual calculation,"Regression, Logistic",,,logistic regression and mixed effects model,
NCT00000392,4572185,2511190,Superiority or Other,,,,,,,0.18,,,,,,,ANOVA,,,,
NCT00000620,4572249,2511204,Superiority or Other,,Hazard Ratio (HR),0.94,,,,0.3,2-Sided,95.0,0.85,1.05,,P-value presented is not adjusted for multiple...,"Regression, Cox",Adjustment for the seven clinical center netwo...,,,
NCT00000620,4572250,2511205,Superiority or Other,,Hazard Ratio (HR),0.92,,,,0.32,2-Sided,95.0,0.79,1.08,,P-value is adjusted for interim monitoring. A ...,"Regression, Cox",Adjustment for the seven clinical center netwo...,,Recruitment for the Glycemia Trial was designe...,
NCT00000620,4572251,2511206,Superiority or Other,,Hazard Ratio (HR),0.59,,,,0.01,2-Sided,95.0,0.39,0.89,,P-value presented is not adjusted for multiple...,"Regression, Cox",Adjustment for the seven clinical center netwo...,,,


In [139]:
df = df.reset_index().set_index(['nct_id', 'outcome_analysis_id']).merge(
    outcome_analyses.reset_index().set_index(['nct_id', 'outcome_analysis_id']),
    left_index=True, right_index=True, how='left')
df.shape

(56832, 96)

In [140]:
outcomes = ss.dimensions['outcomes'].data.add_prefix('outcome_')
outcomes.head()

Unnamed: 0_level_0,outcome_id,outcome_outcome_type,outcome_title,outcome_description,outcome_time_frame,outcome_population,outcome_anticipated_posting_date,outcome_anticipated_posting_month_year,outcome_units,outcome_units_analyzed,outcome_dispersion_type,outcome_param_type
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NCT00000125,4572276,Primary,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,"5 yrs (OHTS I, June 2002) and 13.0 yrs (comple...",1636 ocular hypertensive participants were ran...,,,percent of participants,,,Number
NCT00000134,4572275,Primary,Morbidity,"To determine the best therapeutic regimen, usi...","Patients will be seen at baseline, monthly for...",,,,participants,,,Number
NCT00000135,4572274,Primary,Mortality Rate,to evaluate the efficacy of an intravenous hum...,All patients enrolled were followed for a 17 m...,,,,deaths per person-year,,,Number
NCT00000136,4572273,Primary,Mortality,,All patients enrolled will be followed until a...,,,,participants,,,Number
NCT00000142,4572272,Primary,Survival,,All patients enrolled will be followed until a...,,,,participants,,,Number


In [141]:
df = df.reset_index().set_index(['nct_id', 'outcome_id']).merge(
    outcomes.reset_index().set_index(['nct_id', 'outcome_id']), left_index=True, right_index=True, how='left')

df.shape

(56832, 107)

In [142]:
outcome_counts = ss.dimensions['outcome_counts'].data.add_prefix('outcome_count_')
outcome_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome_count_id,outcome_count_ctgov_group_code,outcome_count_scope,outcome_count_units,outcome_count_count
nct_id,result_group_id,outcome_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCT00000125,15120265,4572276,10780426,O2,Measure,Participants,817
NCT00000125,15120266,4572276,10780427,O1,Measure,Participants,819
NCT00000134,15120254,4572275,10780423,O3,Measure,Participants,93
NCT00000134,15120255,4572275,10780424,O2,Measure,Participants,93
NCT00000134,15120256,4572275,10780425,O1,Measure,Participants,88


In [143]:
df = df.reset_index().set_index(['nct_id', 'result_group_id', 'outcome_id']).merge(
    outcome_counts, left_index=True, right_index=True, how='left')

df.shape

(56909, 111)

In [144]:
outcome_measurements = ss.dimensions['outcome_measurements'].data.add_prefix('outcome_measurements_')
outcome_measurements.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome_measurements_id,outcome_measurements_ctgov_group_code,outcome_measurements_classification,outcome_measurements_category,outcome_measurements_title,outcome_measurements_description,outcome_measurements_units,outcome_measurements_param_type,outcome_measurements_param_value,outcome_measurements_param_value_num,outcome_measurements_dispersion_type,outcome_measurements_dispersion_value,outcome_measurements_dispersion_value_num,outcome_measurements_dispersion_lower_limit,outcome_measurements_dispersion_upper_limit,outcome_measurements_explanation_of_na
nct_id,result_group_id,outcome_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
NCT00000125,15120265,4572276,34193878,O2,"Incidence of glaucoma 13 yr (OHTS II, March 2009)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,14.1,14.1,,,,,,
NCT00000125,15120265,4572276,34193880,O2,"Incidence of glaucoma at 5 yr (OHTS I, June 2002)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,4.4,4.4,,,,,,
NCT00000125,15120266,4572276,34193879,O1,"Incidence of glaucoma 13 yr (OHTS II, March 2009)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,20.0,20.0,,,,,,
NCT00000125,15120266,4572276,34193881,O1,"Incidence of glaucoma at 5 yr (OHTS I, June 2002)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,9.5,9.5,,,,,,
NCT00000134,15120254,4572275,34193875,O3,,,Morbidity,"To determine the best therapeutic regimen, usi...",participants,Number,93.0,93.0,,,,,,


In [145]:
df = df.merge(outcome_measurements, left_index=True, right_index=True, how='left')
df.shape

(229873, 127)

In [157]:
df.reset_index().groupby('outcome_measurements_title').agg({'nct_id':'count'}).sort_values(
    by='nct_id', ascending=False)

Unnamed: 0_level_0,nct_id
outcome_measurements_title,Unnamed: 1_level_1
Change From Baseline in European Organization for the Research and Treatment of Cancer Quality of Life Questionnaire (EORTC QLQ-C30) Score at Day 1 of Every Cycle and End of Study,66825
Change From Baseline in 26-item Pancreatic Cancer-specific Quality of Life Questionnaire (QLQ-PAN26) Score at Day 1 of Every Cycle and End of Study,54810
Change From Baseline in Functional Assessment of Cancer Treatment – Colorectal (FACT-C) Score,10800
OS in Subgroups That Were Defined by Germline PDGFRB Polymorphisms,4992
PFS in Subgroups That Were Defined by Germline PDGFRB Polymorphisms,4992
Changes in Immune Cell Subsets in Peripheral Blood Mononuclear Cells (PBMC),3468
"Randomized Participants With Non-missing pCR & Biomarker Expression (GENE [Probe Set]) to Explore Whether Gene Expression Patterns for GTSE1, Isoforms of β-tubulin, Kallikreins 5, 6, 10 Are Differentially Predictive of pCR/RCB1",2700
"Randomized Participants With Non-missing pCR & Biomarker Expression (GENE [Probe Set]), to Explore Whether Gene Expression Patterns for GTSE1, Isoforms of β-tubulin, Kallikreins 5, 6, 10 Are Differentially Predictive of pCR",2700
EORTC-QLQ-C30,2655
PFS in Subgroups That Were Defined by RNA Expression Profile,2592


In [None]:
# What kinds of baseline measurements (params and dispersion types) are associated with each study and 
# result group type?

df[df.result_group_id.notnull()].reset_index().set_index(
    ['nct_id', 'result_group_id']).merge(ss.dimensions['baseline_measurements'].data.reset_index().set_index(
    ['nct_id', 'result_group_id']), left_index=True, right_index=True, how='left').reset_index().groupby(
    ['result_group_result_type', 'param_type', 'dispersion_type']).agg(
    {'nct_id':['count', 'nunique'], 'result_group_id':['count', 'nunique']})

In [None]:
interventional_cancer_studies_rg[interventional_cancer_studies_rg.result_group_id.notnull()].reset_index().set_index(
    ['nct_id', 'result_group_id']).merge(ss.dimensions['baseline_measurements'].data.add_prefix(
    'baseline_measurement_').reset_index().set_index(['nct_id', 'result_group_id']), 
    left_index=True, right_index=True, how='left').reset_index().columns

In [None]:
# Join outcome analysis groups to result groups
interventional_cancer_studies_rg_oag = interventional_cancer_studies_rg[interventional_cancer_studies_rg.result_group_id.notnull()
    ].reset_index().set_index(['nct_id', 'result_group_id']).merge(
    ss.dimensions['outcome_analysis_groups'].data.add_prefix('outcome_analysis_group_').reset_index().set_index(
    ['nct_id', 'result_group_id']), left_index=True, right_index=True, how='left').reset_index()

In [None]:
# Count of studies and result groups with outcome analysis groups

interventional_cancer_studies_rg_oag['outcome_analysis_group_id_is_null'] = interventional_cancer_studies_rg_oag['outcome_analysis_group_id'].isnull()
interventional_cancer_studies_rg_oag['outcome_analysis_id_is_null'] = interventional_cancer_studies_rg_oag['outcome_analysis_id'].isnull()

interventional_cancer_studies_rg_oag.groupby(
    ['outcome_analysis_group_id_is_null', 'outcome_analysis_id_is_null']).agg(
    {'nct_id':['count', 'nunique'], 'result_group_id':['count', 'nunique']})

In [None]:
ss.dimensions['outcome_analyses'].data.add_prefix('outcome_analysis_').reset_index().set_index(['nct_id', 'outcome_analysis_id'])

In [None]:
# Join outcome analyses to outcome analysis groups

interventional_cancer_studies_rg_oag.columns

In [None]:
ss.dimensions['outcome_analyses'].data.add_prefix('outcome_analysis_').reset_index().set_index(['nct_id', 'outcome_analysis_id'])