This project seeks to evaluate the replicability of trial findings.
1. For a handful of diseases collect drug names, collect panel data on trials related to these drugs.
2. Evaluate predictive power of early studies on later studies (R^2 and corr coeff can be thought of as replicability measure)
3. Predict predictive power on trial observables (strength of the results, sample size, funder, balance, trial design)

An app: create drug studies ratings

In [1]:
import pandas as pd
import numpy as numpy
from importlib import reload
from tqdm import tqdm_notebook as tqdm
import time

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot
%matplotlib inline

pd.set_option('display.max_columns', 999)

  """)


In [2]:
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, tqdm_handler=tqdm)
ss.add_constraint("study_type = 'Interventional'")
ss.show_constraints()
ss.load_studies()

WHERE 1=1 
    AND (study_type = 'Interventional')

253162 studies loaded!


#### Adding dimensions to filter dataset and add data related to results and outcomes

In [3]:
ss.add_dimensions([
    'countries', 
    'conditions', 
    'browse_conditions',
    'keywords', 
    'pending_results', 
    'design_outcomes', 
    'designs', 
    'result_groups',
    'milestones', 
    'baseline_counts', 
    'baseline_measurements', 
    'outcomes', 
    'outcome_analyses', 
    'outcome_analysis_groups', 
    'outcome_counts',
    'outcome_measurements', 
    'browse_interventions', 
    'interventions', 
    'intervention_other_names'
])

Successfuly added these 19 dimensions: ['countries', 'conditions', 'browse_conditions', 'keywords', 'pending_results', 'design_outcomes', 'designs', 'result_groups', 'milestones', 'baseline_counts', 'baseline_measurements', 'outcomes', 'outcome_analyses', 'outcome_analysis_groups', 'outcome_counts', 'outcome_measurements', 'browse_interventions', 'interventions', 'intervention_other_names']
Failed to add these 0 dimensions: []


In [4]:
ss.refresh_dim_data()

HBox(children=(IntProgress(value=0, max=507), HTML(value='')))

Syncing the temp table temp_cur_studies in 507 chunks x 500 records each

Creating index on the temp table
 - Loading dimension countries
 -- Loading raw data
 -- Sorting index
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension keywords
 -- Loading raw data
 -- Sorting index
 - Loading dimension pending_results
 -- Loading raw data
 -- Sorting index
 - Loading dimension design_outcomes
 -- Loading raw data
 -- Sorting index
 - Loading dimension designs
 -- Loading raw data
 -- Sorting index
 - Loading dimension result_groups
 -- Loading raw data
 -- Sorting index
 - Loading dimension milestones
 -- Loading raw data
 -- Sorting index
 - Loading dimension baseline_counts
 -- Loading raw data
 -- Sorting index
 - Loading dimension baseline_measurements
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcomes
 -- Loading raw data
 -- Sorting index
 - Loading di

#### Studies with result groups

In [5]:
df = ss.studies.reset_index().merge(ss.dimensions['result_groups'].data.add_prefix('result_group_'), 
    left_on='nct_id', right_index=True, how='inner')

df['result_group_result_type'] = df['result_group_result_type'].fillna('None')

df.reset_index().groupby(['result_group_result_type']).agg({'nct_id':['count', 'nunique']})

Unnamed: 0_level_0,nct_id,nct_id
Unnamed: 0_level_1,count,nunique
result_group_result_type,Unnamed: 1_level_2,Unnamed: 2_level_2
Baseline,107767,37348
Outcome,678680,37348
Participant Flow,88348,37348
Reported Event,89364,37151


In [6]:
df.reset_index()['nct_id'].nunique()

37348

In [7]:
df.shape

(964159, 69)

#### Joining Interventions

In [8]:
interventions = ss.dimensions['interventions'].data.add_prefix('intervention_')
interventions.head()

df = df.set_index('nct_id').merge(interventions, left_index=True, right_index=True, how='left')
print(df.shape)
df.reset_index()[['nct_id','result_group_id', 'intervention_id']].nunique()

(2647355, 72)


nct_id              37348
result_group_id    964159
intervention_id     86762
dtype: int64

#### Associating studies with the drugs they are testing

In [9]:
## Sample list of 2019 FDA approved drugs

drugs = [
'certolizumab pegol',
 'esketamine',
 'solriamfetol',
 'risankizumab-rzaa',
 'brexanolone',
 'rimabotulinumtoxinB',
 'oral semaglutide',
 'romosozumab-aqqg',
 'pretomanid tablets',
 'tenapanor',
 'pembrolizumab',
 'apremilast',
 'lasmiditan',
 'amifampridine',
 'pexidartinib',
 'relebactam',
 'onasemnogene abeparvovec-xioi',
 'caplacizumab-yhdp',
 'cladribine',
 'ruxolitinib',
 'fedratinib',
 'upadacitinib',
 'bremelanotide',
 'atezolizumab',
 'dengue tetravalent vaccine',
 'hyaluronidase-oysk',
 'nintedanib',
 'trifarotene',
 'alpelisib',
 'midazolam',
 'selinexor',
 'erdafitinib',
 'polatuzumab vedotin-piiq',
 'ramucirumab',
 'istradefylline',
 'ixekizumab',
 'siponimod',
 'sumatriptan',
 'halobetasol propionate and tazarotene',
 'entrectinib',
 'darolutamide',
 'prabotulinumtoxinA-xvfs',
 'testosterone undecanoate',
 'trastuzumab',
 'dupilumab',
 'pitolisant',
 'triclabendazole',
 'lefamulin',
 'brolucizumab-dbll'
]

In [10]:
df[['intervention_description', 'intervention_name', 'official_title']] = \
    df[['intervention_description', 'intervention_name', 'official_title']].fillna('None')

In [11]:
print(df.shape)
df.reset_index()[['nct_id','result_group_id', 'intervention_id']].nunique()

(2647355, 72)


nct_id              37348
result_group_id    964159
intervention_id     86762
dtype: int64

In [12]:
# This assumes every intervention can only be associated with one drug

for drug in drugs:
    df.loc[df.intervention_description.str.contains(drug), 'drug_name'] = drug

In [13]:
df.groupby('drug_name').agg({'intervention_id':'count'})\
    .sort_values(by='intervention_id', ascending=False).head()

Unnamed: 0_level_0,intervention_id
drug_name,Unnamed: 1_level_1
midazolam,2093
trastuzumab,1538
esketamine,888
certolizumab pegol,866
sumatriptan,717


In [14]:
df.reset_index().groupby(['drug_name', 'phase']).agg({'nct_id':'nunique'})\
    .reset_index().pivot(index='drug_name', columns='phase', values='nct_id')\
    .fillna(0)

phase,N/A,Phase 1,Phase 1/Phase 2,Phase 2,Phase 2/Phase 3,Phase 3,Phase 4
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apremilast,0.0,0.0,0.0,2.0,0.0,3.0,2.0
atezolizumab,0.0,0.0,1.0,2.0,0.0,1.0,0.0
bremelanotide,0.0,0.0,0.0,1.0,0.0,0.0,0.0
certolizumab pegol,0.0,1.0,0.0,1.0,0.0,2.0,1.0
cladribine,0.0,1.0,1.0,1.0,0.0,3.0,0.0
esketamine,0.0,0.0,0.0,1.0,0.0,4.0,0.0
istradefylline,0.0,0.0,0.0,0.0,0.0,1.0,0.0
lasmiditan,0.0,0.0,0.0,0.0,0.0,1.0,0.0
midazolam,16.0,9.0,1.0,4.0,1.0,3.0,18.0
nintedanib,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
df.head()

Unnamed: 0_level_0,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,result_group_id,result_group_ctgov_group_code,result_group_result_type,result_group_title,result_group_description,intervention_id,intervention_intervention_type,intervention_name,intervention_description,drug_name
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
NCT00000125,ClinicalTrials.gov processed this data on Octo...,1999-09-23,2015-03-06,,2019-09-06,1999-09-23,1999-09-24,Estimate,2015-03-06,2015-03-19,Estimate,,,,2019-09-06,2019-09-17,Actual,February 1994,Actual,1994-02-28,September 2019,2019-09-30,March 2020,Anticipated,2020-03-31,June 2002,Actual,2002-06-30,,Interventional,OHTS,1636 ocular hypertensive patients were randomi...,Ocular Hypertension Treatment Study (OHTS),Ocular Hypertension Treatment Study (OHTS),"Active, not recruiting",,Phase 3,1636.0,Actual,Washington University School of Medicine,,2.0,,,False,,,,True,,,,,,,,,,,,,2019-11-01 21:13:15.921729,2019-11-01 21:13:15.921729,16234355,B3,Baseline,Total,Total of all reporting groups,10239365,Drug,Topical ocular hypotensive eye drops.,Topical ocular hypotensive eye drops.,
NCT00000125,ClinicalTrials.gov processed this data on Octo...,1999-09-23,2015-03-06,,2019-09-06,1999-09-23,1999-09-24,Estimate,2015-03-06,2015-03-19,Estimate,,,,2019-09-06,2019-09-17,Actual,February 1994,Actual,1994-02-28,September 2019,2019-09-30,March 2020,Anticipated,2020-03-31,June 2002,Actual,2002-06-30,,Interventional,OHTS,1636 ocular hypertensive patients were randomi...,Ocular Hypertension Treatment Study (OHTS),Ocular Hypertension Treatment Study (OHTS),"Active, not recruiting",,Phase 3,1636.0,Actual,Washington University School of Medicine,,2.0,,,False,,,,True,,,,,,,,,,,,,2019-11-01 21:13:15.921729,2019-11-01 21:13:15.921729,16234356,B2,Baseline,Treatment,Topical Antiglaucoma Agents: Topical Antiglauc...,10239365,Drug,Topical ocular hypotensive eye drops.,Topical ocular hypotensive eye drops.,
NCT00000125,ClinicalTrials.gov processed this data on Octo...,1999-09-23,2015-03-06,,2019-09-06,1999-09-23,1999-09-24,Estimate,2015-03-06,2015-03-19,Estimate,,,,2019-09-06,2019-09-17,Actual,February 1994,Actual,1994-02-28,September 2019,2019-09-30,March 2020,Anticipated,2020-03-31,June 2002,Actual,2002-06-30,,Interventional,OHTS,1636 ocular hypertensive patients were randomi...,Ocular Hypertension Treatment Study (OHTS),Ocular Hypertension Treatment Study (OHTS),"Active, not recruiting",,Phase 3,1636.0,Actual,Washington University School of Medicine,,2.0,,,False,,,,True,,,,,,,,,,,,,2019-11-01 21:13:15.921729,2019-11-01 21:13:15.921729,16234357,B1,Baseline,Observation,Observation only,10239365,Drug,Topical ocular hypotensive eye drops.,Topical ocular hypotensive eye drops.,
NCT00000125,ClinicalTrials.gov processed this data on Octo...,1999-09-23,2015-03-06,,2019-09-06,1999-09-23,1999-09-24,Estimate,2015-03-06,2015-03-19,Estimate,,,,2019-09-06,2019-09-17,Actual,February 1994,Actual,1994-02-28,September 2019,2019-09-30,March 2020,Anticipated,2020-03-31,June 2002,Actual,2002-06-30,,Interventional,OHTS,1636 ocular hypertensive patients were randomi...,Ocular Hypertension Treatment Study (OHTS),Ocular Hypertension Treatment Study (OHTS),"Active, not recruiting",,Phase 3,1636.0,Actual,Washington University School of Medicine,,2.0,,,False,,,,True,,,,,,,,,,,,,2019-11-01 21:13:15.921729,2019-11-01 21:13:15.921729,16234358,P2,Participant Flow,Treatment,Topical ocular hypotensive eye drops.,10239365,Drug,Topical ocular hypotensive eye drops.,Topical ocular hypotensive eye drops.,
NCT00000125,ClinicalTrials.gov processed this data on Octo...,1999-09-23,2015-03-06,,2019-09-06,1999-09-23,1999-09-24,Estimate,2015-03-06,2015-03-19,Estimate,,,,2019-09-06,2019-09-17,Actual,February 1994,Actual,1994-02-28,September 2019,2019-09-30,March 2020,Anticipated,2020-03-31,June 2002,Actual,2002-06-30,,Interventional,OHTS,1636 ocular hypertensive patients were randomi...,Ocular Hypertension Treatment Study (OHTS),Ocular Hypertension Treatment Study (OHTS),"Active, not recruiting",,Phase 3,1636.0,Actual,Washington University School of Medicine,,2.0,,,False,,,,True,,,,,,,,,,,,,2019-11-01 21:13:15.921729,2019-11-01 21:13:15.921729,16234359,P1,Participant Flow,Observation,Close Observation,10239365,Drug,Topical ocular hypotensive eye drops.,Topical ocular hypotensive eye drops.,


In [16]:
df[df.result_group_id.notnull()].shape

(2647355, 73)

In [17]:
df.reset_index()[['nct_id', 'result_group_id', 'intervention_id']].nunique()

nct_id              37348
result_group_id    964159
intervention_id     86762
dtype: int64

#### Distinct studies per drug

In [18]:
df.reset_index().groupby(['nct_id', 'drug_name']).agg({'intervention_id':'count'})\
    .reset_index().groupby('drug_name').agg({'nct_id':'nunique'}).sort_values(by='nct_id', ascending=False).head()

Unnamed: 0_level_0,nct_id
drug_name,Unnamed: 1_level_1
midazolam,52
trastuzumab,47
sumatriptan,18
ramucirumab,9
apremilast,7


#### Distinct drugs per study

In [19]:
df.groupby(['nct_id', 'drug_name']).agg({'intervention_id':'count'})\
    .reset_index().groupby('nct_id').agg({'drug_name':'nunique'}).sort_values(by='drug_name', ascending=False).head()

Unnamed: 0_level_0,drug_name
nct_id,Unnamed: 1_level_1
NCT00140140,1
NCT00182793,1
NCT01567163,1
NCT01632904,1
NCT01641939,1


#### Limit to only studies related to sample 2019 drugs and apply drug name to entire study

In [20]:
df[df.drug_name.notnull()].reset_index().groupby(['nct_id', 'drug_name']).\
    agg({'intervention_id':'count'}).reset_index()[['nct_id', 'drug_name']].set_index('nct_id').shape

(176, 1)

In [21]:
df = df.reset_index().set_index('nct_id').merge(df[df.drug_name.notnull()].reset_index().groupby(['nct_id', 'drug_name']).\
    agg({'intervention_id':'count'}).reset_index()[['nct_id', 'drug_name']].set_index('nct_id'), left_index=True,
                                          right_index=True, how='inner')

In [22]:
df.head()

Unnamed: 0_level_0,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,result_group_id,result_group_ctgov_group_code,result_group_result_type,result_group_title,result_group_description,intervention_id,intervention_intervention_type,intervention_name,intervention_description,drug_name_x,drug_name_y
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
NCT00140140,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,16200343,B4,Baseline,Total,Total of all reporting groups,10209955,Drug,ABI-007,Weekly intravenous infusions over 30 minutes.,,trastuzumab
NCT00140140,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,16200343,B4,Baseline,Total,Total of all reporting groups,10209956,Drug,vinorelbine,Weekly intravenous infusions over 10-30 minute...,,trastuzumab
NCT00140140,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,16200343,B4,Baseline,Total,Total of all reporting groups,10209957,Drug,Trastuzumab,Trastuzumab was administered to participants w...,trastuzumab,trastuzumab
NCT00140140,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,16200343,B4,Baseline,Total,Total of all reporting groups,10209958,Biological,G-CSF,"During Part 1, participants followed a dosing ...",,trastuzumab
NCT00140140,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,16200344,B3,Baseline,Part 2: 90 mg ABI-007 + 20 mg Vinorelbine,Weekly intravenous infusion of 90 mg/m^2 ABI-0...,10209955,Drug,ABI-007,Weekly intravenous infusions over 30 minutes.,,trastuzumab


In [23]:
df.reset_index()[['nct_id']].nunique()

nct_id    176
dtype: int64

In [24]:
df.shape

(17383, 74)

#### Outcome analysis groups

Identifies the comparison groups that were involved with each outcome analysis

In [25]:
outcome_analysis_groups = ss.dimensions['outcome_analysis_groups'].data.add_prefix(
    'outcome_analysis_group_')

outcome_analysis_groups.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome_analysis_group_id,outcome_analysis_group_ctgov_group_code
nct_id,result_group_id,outcome_analysis_id,Unnamed: 3_level_1,Unnamed: 4_level_1
NCT00000378,16234181,2700782,5224736,O2
NCT00000378,16234182,2700782,5224737,O1
NCT00000392,16233928,2700769,5224710,O2
NCT00000392,16233929,2700769,5224711,O1
NCT00000620,16234205,2700783,5224738,O2


In [26]:
df = df[df.result_group_id.notnull()].reset_index().set_index(['nct_id', 'result_group_id']).merge(
    outcome_analysis_groups.reset_index().set_index(['nct_id', 'result_group_id']),
    left_index=True, right_index=True, how='left')
df.shape

(18358, 76)

In [27]:
df.reset_index()[['nct_id', 'result_group_id', 'intervention_id', 'outcome_analysis_group_id']].nunique()

nct_id                        176
result_group_id              5305
intervention_id               504
outcome_analysis_group_id     961
dtype: int64

#### Outcome analyses

Results of scientifically appropriate statistical analyses performed on primary and 
secondary study outcomes. Includes results for treatment effect estimates, 
confidence intervals and othe rmeasures of dispersion, and p-values.

In [28]:
outcome_analyses = ss.dimensions['outcome_analyses'].data.add_prefix('outcome_analysis_')
outcome_analyses.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,outcome_analysis_id,outcome_analysis_non_inferiority_type,outcome_analysis_non_inferiority_description,outcome_analysis_param_type,outcome_analysis_param_value,outcome_analysis_dispersion_type,outcome_analysis_dispersion_value,outcome_analysis_p_value_modifier,outcome_analysis_p_value,outcome_analysis_ci_n_sides,outcome_analysis_ci_percent,outcome_analysis_ci_lower_limit,outcome_analysis_ci_upper_limit,outcome_analysis_ci_upper_limit_na_comment,outcome_analysis_p_value_description,outcome_analysis_method,outcome_analysis_method_description,outcome_analysis_estimate_description,outcome_analysis_groups_description,outcome_analysis_other_analysis_description
nct_id,outcome_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCT00000378,4909014,2700782,Superiority or Other,,,,,,<,0.05,,,,,,actual calculation,"Regression, Logistic",,,logistic regression and mixed effects model,
NCT00000392,4908952,2700769,Superiority or Other,,,,,,,0.18,,,,,,,ANOVA,,,,
NCT00000620,4909016,2700783,Superiority or Other,,Hazard Ratio (HR),0.94,,,,0.3,2-Sided,95.0,0.85,1.05,,P-value presented is not adjusted for multiple...,"Regression, Cox",Adjustment for the seven clinical center netwo...,,,
NCT00000620,4909017,2700784,Superiority or Other,,Hazard Ratio (HR),0.92,,,,0.32,2-Sided,95.0,0.79,1.08,,P-value is adjusted for interim monitoring. A ...,"Regression, Cox",Adjustment for the seven clinical center netwo...,,Recruitment for the Glycemia Trial was designe...,
NCT00000620,4909018,2700785,Superiority or Other,,Hazard Ratio (HR),0.59,,,,0.01,2-Sided,95.0,0.39,0.89,,P-value presented is not adjusted for multiple...,"Regression, Cox",Adjustment for the seven clinical center netwo...,,,


In [29]:
df = df.reset_index().set_index(['nct_id', 'outcome_analysis_id']).merge(
    outcome_analyses.reset_index().set_index(['nct_id', 'outcome_analysis_id']),
    left_index=True, right_index=True, how='left')
df.shape

(18358, 96)

In [30]:
df.reset_index()[['nct_id', 'result_group_id', 'intervention_id', 'outcome_analysis_group_id', 'outcome_analysis_id']].nunique()

nct_id                        176
result_group_id              5305
intervention_id               504
outcome_analysis_group_id     961
outcome_analysis_id           468
dtype: int64

#### Outcomes

Descriptions of outcomes, or observation that were measured to determine patterns
of diseases or traits, or associations with exposures, risk factors, or treatment. 
Includes information such as time frame, population and units. 
(Specific measurement results are stored in the Outcome_Measurements table.)

In [31]:
outcomes = ss.dimensions['outcomes'].data.add_prefix('outcome_')
outcomes.head()

Unnamed: 0_level_0,outcome_id,outcome_outcome_type,outcome_title,outcome_description,outcome_time_frame,outcome_population,outcome_anticipated_posting_date,outcome_anticipated_posting_month_year,outcome_units,outcome_units_analyzed,outcome_dispersion_type,outcome_param_type
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NCT00000125,4909043,Primary,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,"5 yrs (OHTS I, June 2002) and 13.0 yrs (comple...",1636 ocular hypertensive participants were ran...,,,percent of participants,,,Number
NCT00000134,4909042,Primary,Morbidity,"To determine the best therapeutic regimen, usi...","Patients will be seen at baseline, monthly for...",,,,participants,,,Number
NCT00000135,4909041,Primary,Mortality Rate,to evaluate the efficacy of an intravenous hum...,All patients enrolled were followed for a 17 m...,,,,deaths per person-year,,,Number
NCT00000136,4909040,Primary,Mortality,,All patients enrolled will be followed until a...,,,,participants,,,Number
NCT00000142,4909039,Primary,Survival,,All patients enrolled will be followed until a...,,,,participants,,,Number


In [32]:
df = df.reset_index().set_index(['nct_id', 'outcome_id']).merge(
    outcomes.reset_index().set_index(['nct_id', 'outcome_id']), left_index=True, right_index=True, how='left')

df.shape

(18358, 107)

In [33]:
df.reset_index()[['nct_id', 'result_group_id', 'intervention_id', 'outcome_analysis_group_id', \
                  'outcome_analysis_id', 'outcome_id']].nunique()

nct_id                        176
result_group_id              5305
intervention_id               504
outcome_analysis_group_id     961
outcome_analysis_id           468
outcome_id                    287
dtype: int64

#### Outcome counts

Sample size included in analysis for each outcome for each study group; 
usually participants but can represent other units of measure such as eyes 'lesions', etc.

In [34]:
outcome_counts = ss.dimensions['outcome_counts'].data.add_prefix('outcome_count_')
outcome_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome_count_id,outcome_count_ctgov_group_code,outcome_count_scope,outcome_count_units,outcome_count_count
nct_id,result_group_id,outcome_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCT00000125,16234360,4909043,11576970,O2,Measure,Participants,817
NCT00000125,16234361,4909043,11576971,O1,Measure,Participants,819
NCT00000134,16234349,4909042,11576967,O3,Measure,Participants,93
NCT00000134,16234350,4909042,11576968,O2,Measure,Participants,93
NCT00000134,16234351,4909042,11576969,O1,Measure,Participants,88


In [35]:
df = df.reset_index().set_index(['nct_id', 'result_group_id', 'outcome_id']).merge(
    outcome_counts, left_index=True, right_index=True, how='left')

df.shape

(18370, 111)

In [36]:
df.reset_index()[['nct_id', 'result_group_id', 'intervention_id', 'outcome_analysis_group_id', 
                  'outcome_id', 'outcome_count_id']].nunique()

nct_id                        176
result_group_id              5305
intervention_id               504
outcome_analysis_group_id     961
outcome_id                    287
outcome_count_id              678
dtype: int64

#### Outcome measurements

Summary data for primary and secondary outcome measures for each study group. 
Includes parameter estimates and measures of dispersion/precision.

In [37]:
outcome_measurements = ss.dimensions['outcome_measurements'].data.add_prefix('outcome_measurement_')
outcome_measurements.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome_measurement_id,outcome_measurement_ctgov_group_code,outcome_measurement_classification,outcome_measurement_category,outcome_measurement_title,outcome_measurement_description,outcome_measurement_units,outcome_measurement_param_type,outcome_measurement_param_value,outcome_measurement_param_value_num,outcome_measurement_dispersion_type,outcome_measurement_dispersion_value,outcome_measurement_dispersion_value_num,outcome_measurement_dispersion_lower_limit,outcome_measurement_dispersion_upper_limit,outcome_measurement_explanation_of_na
nct_id,result_group_id,outcome_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
NCT00000125,16234360,4909043,36764756,O2,"Incidence of glaucoma 13 yr (OHTS II, March 2009)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,14.1,14.1,,,,,,
NCT00000125,16234360,4909043,36764758,O2,"Incidence of glaucoma at 5 yr (OHTS I, June 2002)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,4.4,4.4,,,,,,
NCT00000125,16234361,4909043,36764757,O1,"Incidence of glaucoma 13 yr (OHTS II, March 2009)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,20.0,20.0,,,,,,
NCT00000125,16234361,4909043,36764759,O1,"Incidence of glaucoma at 5 yr (OHTS I, June 2002)",,Incidence of Primary Open-Angle Glaucoma in Hy...,Comparison of the cumulative proportion of par...,percent of participants,Number,9.5,9.5,,,,,,
NCT00000134,16234349,4909042,36764753,O3,,,Morbidity,"To determine the best therapeutic regimen, usi...",participants,Number,93.0,93.0,,,,,,


In [38]:
df = df.merge(outcome_measurements, left_index=True, right_index=True, how='left')
df.shape

(19871, 127)

In [39]:
df.reset_index()[['nct_id','result_group_id', 'outcome_analysis_group_id', 'outcome_id', 'intervention_id', \
                  'outcome_count_id', 'outcome_measurement_id']].nunique()

nct_id                        176
result_group_id              5305
outcome_analysis_group_id     961
outcome_id                    287
intervention_id               504
outcome_count_id              678
outcome_measurement_id        907
dtype: int64

In [40]:
df.reset_index().head()

Unnamed: 0,nct_id,result_group_id,outcome_id,outcome_analysis_id,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,result_group_ctgov_group_code,result_group_result_type,result_group_title,result_group_description,intervention_id,intervention_intervention_type,intervention_name,intervention_description,drug_name_x,drug_name_y,outcome_analysis_group_id,outcome_analysis_group_ctgov_group_code,outcome_analysis_non_inferiority_type,outcome_analysis_non_inferiority_description,outcome_analysis_param_type,outcome_analysis_param_value,outcome_analysis_dispersion_type,outcome_analysis_dispersion_value,outcome_analysis_p_value_modifier,outcome_analysis_p_value,outcome_analysis_ci_n_sides,outcome_analysis_ci_percent,outcome_analysis_ci_lower_limit,outcome_analysis_ci_upper_limit,outcome_analysis_ci_upper_limit_na_comment,outcome_analysis_p_value_description,outcome_analysis_method,outcome_analysis_method_description,outcome_analysis_estimate_description,outcome_analysis_groups_description,outcome_analysis_other_analysis_description,outcome_outcome_type,outcome_title,outcome_description,outcome_time_frame,outcome_population,outcome_anticipated_posting_date,outcome_anticipated_posting_month_year,outcome_units,outcome_units_analyzed,outcome_dispersion_type,outcome_param_type,outcome_count_id,outcome_count_ctgov_group_code,outcome_count_scope,outcome_count_units,outcome_count_count,outcome_measurement_id,outcome_measurement_ctgov_group_code,outcome_measurement_classification,outcome_measurement_category,outcome_measurement_title,outcome_measurement_description,outcome_measurement_units,outcome_measurement_param_type,outcome_measurement_param_value,outcome_measurement_param_value_num,outcome_measurement_dispersion_type,outcome_measurement_dispersion_value,outcome_measurement_dispersion_value_num,outcome_measurement_dispersion_lower_limit,outcome_measurement_dispersion_upper_limit,outcome_measurement_explanation_of_na
0,NCT00140140,16200343,,,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,B4,Baseline,Total,Total of all reporting groups,10209958,Biological,G-CSF,"During Part 1, participants followed a dosing ...",,trastuzumab,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,NCT00140140,16200343,,,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,B4,Baseline,Total,Total of all reporting groups,10209957,Drug,Trastuzumab,Trastuzumab was administered to participants w...,trastuzumab,trastuzumab,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,NCT00140140,16200343,,,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,B4,Baseline,Total,Total of all reporting groups,10209955,Drug,ABI-007,Weekly intravenous infusions over 30 minutes.,,trastuzumab,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,NCT00140140,16200343,,,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,B4,Baseline,Total,Total of all reporting groups,10209956,Drug,vinorelbine,Weekly intravenous infusions over 10-30 minute...,,trastuzumab,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,NCT00140140,16200344,,,ClinicalTrials.gov processed this data on Octo...,2005-08-30,2013-08-20,,2013-08-20,2005-08-30,2005-09-01,Estimate,2013-08-20,2013-10-24,Estimate,,,,2013-08-20,2013-10-24,Estimate,August 2005,,2005-08-31,August 2013,2013-08-31,February 2008,Actual,2008-02-29,February 2008,Actual,2008-02-29,,Interventional,,,"A Phase I/II Study of ABI-007 (Abraxane®, Nab®...",An Open-Label Phase I/II Study of Weekly ABI-0...,Terminated,,Phase 1/Phase 2,16.0,Actual,Celgene,,3.0,,Unable to determine the optimum tolerated dose,False,,,,False,,,,,,,,,,,,,2019-11-01 20:28:32.458523,2019-11-01 20:28:32.458523,B3,Baseline,Part 2: 90 mg ABI-007 + 20 mg Vinorelbine,Weekly intravenous infusion of 90 mg/m^2 ABI-0...,10209957,Drug,Trastuzumab,Trastuzumab was administered to participants w...,trastuzumab,trastuzumab,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


#### Rename drug_name columns

In [41]:
df = df.rename(columns={'drug_name_x':'drug_name_intervention', 'drug_name_y':'drug_name_study'})

In [42]:
df.reset_index().groupby(['nct_id'])\
    .agg({'drug_name_intervention':'count', 'drug_name_study':'count', 'result_group_id':'count'}).head()

Unnamed: 0_level_0,drug_name_intervention,drug_name_study,result_group_id
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00140140,42,168,168
NCT00182793,15,45,45
NCT00199381,4,4,4
NCT00213135,69,69,69
NCT00232479,5,5,5


In [43]:
df.shape

(19871, 127)

In [44]:
df = df.reset_index()

#### Finding drugs with multiple studies

In [45]:
df = df.set_index('drug_name_study')\
    .merge(df.groupby(['drug_name_study']).agg({'nct_id':'nunique'})\
           .rename(columns={'nct_id':'studies_related_to_drug'}), left_index=True, right_index=True, how='inner')

In [46]:
df = df[df['studies_related_to_drug'] > 1]

In [47]:
df.shape

(18462, 130)

In [None]:
# 'outcome_measurement_classification',
# 'outcome_measurement_category',
# 'outcome_measurement_title',
# 'outcome_measurement_description',
# 'outcome_measurement_units',
# 'outcome_measurement_param_type',
# 'outcome_measurement_param_value_num',
# 'outcome_measurement_dispersion_type',
# 'outcome_measurement_dispersion_value',
# 'outcome_measurement_dispersion_lower_limit',
# 'outcome_measurement_dispersion_upper_limit']].head()

In [58]:
df[df.outcome_measurement_classification!=''].groupby([
    'drug_name_study', 'outcome_measurement_classification']).agg({'nct_id':'nunique'})\
    .sort_values(by='nct_id', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nct_id
drug_name_study,outcome_measurement_classification,Unnamed: 2_level_1
apremilast,Complete Response,1
midazolam,midazolam control AUC,1
sumatriptan,"Diastolic, Change from Baseline, n=47, 36",1
sumatriptan,"Diastolic, Change from Baseline, n=47",1
sumatriptan,"Diastolic, Change from Baseline, n=41, 36",1


In [66]:
df[df.outcome_measurement_category!=''].groupby(['drug_name_study', 'outcome_measurement_category'])\
    .agg({'nct_id':'nunique'}).sort_values(by='nct_id', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,nct_id
drug_name_study,outcome_measurement_category,Unnamed: 2_level_1
midazolam,Satisfactory mask induction,1
midazolam,memory of mask induction-no,1
midazolam,memory of mask induction-yes,1
midazolam,satisfactory wake up behavior,1
midazolam,unsatisfactory mask induction,1
midazolam,unsatisfactory wake up behavior,1


In [73]:
df[df.outcome_measurement_title!=''].groupby(['drug_name_study', 'outcome_measurement_title'])\
    .agg({'nct_id':'nunique'}).reset_index().sort_values(by='nct_id', ascending=False).head(20)

Unnamed: 0,drug_name_study,outcome_measurement_title,nct_id
155,pembrolizumab,Overall Survival (OS),3
211,trastuzumab,Overall Survival (OS),3
167,ramucirumab,Progression-Free Survival (PFS),2
194,sumatriptan,Sustained Freedom From Migraine Pain Between 2...,2
51,atezolizumab,Overall Survival (OS),2
160,ramucirumab,Overall Survival (OS),2
226,trastuzumab,Progression-Free Survival (PFS),2
158,pembrolizumab,Progression-free Survival (PFS) by Response Ev...,1
157,pembrolizumab,Progression-free Survival (PFS),1
156,pembrolizumab,Progression Free Survival (PFS) Assessed by Cl...,1


In [70]:
df[df.outcome_measurement_description!=''].groupby(['drug_name_study', 'outcome_measurement_description'])\
    .agg({'nct_id':'nunique'}).reset_index().sort_values(by='nct_id', ascending=False).head()

Unnamed: 0,drug_name_study,outcome_measurement_description,nct_id
159,sumatriptan,Sustained freedom from migraine pain was defin...,2
60,esketamine,MADRS is clinician-rated scale designed to mea...,2
0,apremilast,A 100-mm VAS pain scale for oral ulcers was co...,1
139,ramucirumab,PFS was defined as time from randomization unt...,1
130,pembrolizumab,Progression free survival was defined as the t...,1


In [72]:
df[df.outcome_measurement_units!=''].groupby(['drug_name_study', 'outcome_measurement_units'])\
    .agg({'nct_id':'nunique'}).reset_index().sort_values(by='nct_id', ascending=False).head(20)

Unnamed: 0,drug_name_study,outcome_measurement_units,nct_id
41,midazolam,participants,5
77,trastuzumab,months,4
35,midazolam,minutes,4
47,midazolam,units on a scale,4
0,apremilast,Percentage of Participants,3
74,trastuzumab,Months,3
8,apremilast,percentage of participants,3
64,sumatriptan,Participants,3
11,apremilast,units on a scale,3
49,pembrolizumab,Months,3
