In [1]:
import sys
if "../../" not in sys.path:
    sys.path.append("../../")

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

In [62]:
data = pd.read_pickle('withphase.p')
valid_idx = data.index
print(len(data))

2042


In [4]:
# loading all interventional studies
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, 
                               tqdm_handler=tqdm)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

259683 studies loaded!


In [5]:
# constraining to our interested studies
ss.studies = ss.studies.loc[valid_idx]

In [6]:
ss.studies.shape

(2042, 63)

### Example NCT IDs

In [None]:
data.head()

## Geographic Breakdown

In [None]:
# adding and loading dimensional data
ss.add_dimensions(['countries'])
ss.refresh_dim_data()

In [None]:
df = ss.dimensions['countries'].data
df = df[df['removed'] != True]

In [None]:
pct_countries = df.groupby('name').size().sort_values(ascending=False) / df.shape[0]

In [None]:
pct_countries.to_csv('countries.csv')

In [None]:
# getting top 5
pct_dict = {}
for c in pct_countries[:5].index:
    pct_dict[c] = pct_countries[c]
    
pct_dict['Other'] = pct_countries[5:].sum()
df_countries = pd.DataFrame(pct_dict, index=['pct']).T
df_countries.head()

## Phase Breakdown

In [30]:
df = ss.studies

In [35]:
df_phase = df.groupby('phase').size().sort_values(ascending=False)
df_phase / df_phase.sum()

phase
N/A                0.346719
Phase 4            0.234574
Phase 3            0.206660
Phase 2            0.146915
Phase 2/Phase 3    0.031832
Phase 1/Phase 2    0.015671
Phase 1            0.015181
Early Phase 1      0.002449
dtype: float64

In [40]:
data_phase = data.join(df[['phase']])
data_phase.head()

Unnamed: 0,g1m,g1f,g2m,g2f,g1_mean,g2_mean,num_parts1,num_parts2,num_measures1,num_measures2,g1t,g2t,g1_sd_sample,g2_sd_sample,imbal,effect,phase
NCT00114127,6.0,7.0,11.0,4.0,67.9,53.7,13,15,13,15,13.0,15.0,27.40219,30.596568,0.271795,0.140169,Phase 3
NCT00180479,469.0,200.0,218.0,114.0,0.14,0.28,301,134,301,134,669.0,332.0,0.41,0.48,0.04442,0.019668,Phase 3
NCT00405275,101.0,77.0,89.0,85.0,-2.12,-2.29,154,155,154,155,178.0,174.0,1.28,1.3,0.055921,0.010684,
NCT00446654,8.0,12.0,6.0,12.0,-0.01,0.08,21,21,21,21,20.0,18.0,0.21,0.25,0.066667,0.091983,Phase 1
NCT00728754,15.0,14.0,17.0,15.0,-0.744,-0.8445,24,28,65,68,29.0,32.0,1.516511,1.18663,0.014009,0.0082,


In [45]:
CUTOFF = 50

data_phase['totp'] = data_phase['g1t'] + data_phase['g2t']
small_data = data_phase[data_phase['totp'] < CUTOFF]
print(small_data.shape[0])

small_data.groupby('phase').size()

613


phase
Early Phase 1        3
N/A                251
Phase 1             21
Phase 1/Phase 2     22
Phase 2            111
Phase 2/Phase 3     23
Phase 3             50
Phase 4            132
dtype: int64

In [47]:
data_phase.to_pickle('withphase.p')

## Attrition

In [11]:
# adding and loading dimensional data
ss.add_dimensions(['milestones'])
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['milestones']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Syncing the temp table temp_cur_studies in 5 chunks x 500 records each

Creating index on the temp table
 - Loading dimension participant_flows
 -- Loading raw data
 -- Sorting index
 - Loading dimension milestones
 -- Loading raw data
 -- Sorting index


In [12]:
ms = ss.dimensions['milestones'].data
ms.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,ctgov_group_code,title,period,description,count
nct_id,result_group_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCT00000371,4233000,1727630,P2,NOT COMPLETED,Overall Study,,16
NCT00000371,4233000,1727632,P2,COMPLETED,Overall Study,,12
NCT00000371,4233000,1727634,P2,STARTED,Overall Study,,28
NCT00000371,4233001,1727631,P1,NOT COMPLETED,Overall Study,,13
NCT00000371,4233001,1727633,P1,COMPLETED,Overall Study,,14


In [18]:
ms_counts = ms.reset_index().groupby(['nct_id', 'title'])[['count']].sum().reset_index()
ms_counts.head()

Unnamed: 0,nct_id,title,count
0,NCT00000371,COMPLETED,26
1,NCT00000371,NOT COMPLETED,29
2,NCT00000371,STARTED,55
3,NCT00000392,COMPLETED,143
4,NCT00000392,NOT COMPLETED,72


In [24]:
ms_counts = ms_counts[ms_counts['title'].isin(['COMPLETED', 'NOT COMPLETED', 'STARTED'])]

In [27]:
msp = ms_counts.pivot(index='nct_id', columns='title', values='count')
msp.head()

title,COMPLETED,NOT COMPLETED,STARTED
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000371,26,29,55
NCT00000392,143,72,215
NCT00001596,8,27,35
NCT00001723,171,29,200
NCT00004500,69,0,69


In [28]:
msp['attrition'] = msp['COMPLETED'] / msp['STARTED'] - 1

In [29]:
msp['attrition'].describe()

count    2042.000000
mean       -0.131295
std         0.149146
min        -1.000000
25%        -0.189170
50%        -0.090000
75%        -0.025398
max         0.000000
Name: attrition, dtype: float64

### Notes:
- mean attrition = 13.1%
- 25% of studies have 19% or more attrition
- 25% of studies have 3% or less attrition

## Sponsor Breakdown

In [48]:
# adding and loading dimensional data
ss.add_dimensions(['sponsors'])
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['sponsors']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Syncing the temp table temp_cur_studies in 5 chunks x 500 records each

Creating index on the temp table
 - Loading dimension participant_flows
 -- Loading raw data
 -- Sorting index
 - Loading dimension milestones
 -- Loading raw data
 -- Sorting index
 - Loading dimension sponsors
 -- Loading raw data
 -- Sorting index


In [51]:
sp = ss.dimensions['sponsors'].data
sp

Unnamed: 0_level_0,id,agency_class,lead_or_collaborator,name
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NCT00000371,2230942,Other,lead,Massachusetts General Hospital
NCT00000392,2230360,NIH,lead,National Institute of Mental Health (NIMH)
NCT00001596,2230537,NIH,collaborator,National Human Genome Research Institute (NHGRI)
NCT00001596,2230536,NIH,lead,"William Gahl, M.D."
NCT00001723,2230467,Industry,collaborator,Roche Pharma AG
...,...,...,...,...
NCT03831451,1760582,Other,lead,University of Michigan
NCT04109703,1725661,Other,collaborator,Northern California Research Corporation
NCT04109703,1725662,Other,collaborator,University of Washington
NCT04109703,1725660,Industry,lead,"Carewave Medical, Inc."


In [54]:
sp_lead = sp[sp['lead_or_collaborator'] == 'lead']

In [60]:
sp_lead.groupby('agency_class').size() / sp_lead.groupby('agency_class').size().sum()

agency_class
Industry    0.402547
NIH         0.012243
Other       0.546523
U.S. Fed    0.038688
dtype: float64

In [56]:
# Industry
sp_ind = sp_lead[sp_lead['agency_class'] == 'Industry']
sp_oth = sp_lead[sp_lead['agency_class'] == 'Other']

In [58]:
sp_ind.groupby('name').size().sort_values(ascending=False)

name
GlaxoSmithKline                            45
Novartis Pharmaceuticals                   45
Novo Nordisk A/S                           44
Alcon Research                             37
AstraZeneca                                28
                                           ..
Healthpoint                                 1
Halyard Health                              1
Pascoe Pharmazeutische Praeparate GmbH      1
Philips Respironics                         1
Johnson & Johnson Consumer Inc. (J&JCI)     1
Length: 273, dtype: int64

In [59]:
sp_oth.groupby('name').size().sort_values(ascending=False)

name
Massachusetts General Hospital    36
Mayo Clinic                       32
Duke University                   21
Brigham and Women's Hospital      17
University of Colorado, Denver    16
                                  ..
NxStage Medical                    1
Ohio University                    1
Oregon State University            1
Oslo University Hospital           1
AIDS Clinical Trials Group         1
Length: 447, dtype: int64

In [71]:
data = data.join(sp_lead[['agency_class', 'name']].add_prefix('sp_'))

## Pivoting Imbalance Metrics

In [72]:
# relative imbalance
def relative_imbalance(x):
    g1m, g1t, g2m, g2t = x
    s1, s2 = g1m/g1t, g2m/g2t
    return max(s1, s2)/min(s1, s2) -1

data['relative_imbalance'] = data[['g1m', 'g1t', 'g2m', 'g2t']].apply(
    relative_imbalance, axis=1
)
data.head()

Unnamed: 0,g1m,g1f,g2m,g2f,g1_mean,g2_mean,num_parts1,num_parts2,num_measures1,num_measures2,...,g2t,g1_sd_sample,g2_sd_sample,imbal,effect,phase,totp,relative_imbalance,sp_agency_class,sp_name
NCT00114127,6.0,7.0,11.0,4.0,67.9,53.7,13,15,13,15,...,15.0,27.40219,30.596568,0.271795,0.140169,Phase 3,28.0,0.588889,Other,Massachusetts General Hospital
NCT00180479,469.0,200.0,218.0,114.0,0.14,0.28,301,134,301,134,...,332.0,0.41,0.48,0.04442,0.019668,Phase 3,1001.0,0.067649,Industry,Abbott Medical Devices
NCT00405275,101.0,77.0,89.0,85.0,-2.12,-2.29,154,155,154,155,...,174.0,1.28,1.3,0.055921,0.010684,,352.0,0.10933,U.S. Fed,US Department of Veterans Affairs
NCT00446654,8.0,12.0,6.0,12.0,-0.01,0.08,21,21,21,21,...,18.0,0.21,0.25,0.066667,0.091983,Phase 1,38.0,0.2,Industry,Progen Pharmaceuticals
NCT00728754,15.0,14.0,17.0,15.0,-0.744,-0.8445,24,28,65,68,...,32.0,1.516511,1.18663,0.014009,0.0082,,61.0,0.027083,Industry,Zimmer Biomet


### imbalance by agency class = Industry

In [75]:
data[data['sp_agency_class'] == 'Industry']['imbal'].describe()

count    822.000000
mean       0.064316
std        0.062978
min        0.000000
25%        0.020873
50%        0.046488
75%        0.088106
max        0.439408
Name: imbal, dtype: float64

In [76]:
data[data['sp_agency_class'] == 'Industry']['relative_imbalance'].describe()

count    822.000000
mean       0.193564
std        0.301672
min        0.000000
25%        0.041715
50%        0.105263
75%        0.227745
max        3.860465
Name: relative_imbalance, dtype: float64

### Agency Class = Other

In [77]:
data[data['sp_agency_class'] == 'Other']['imbal'].describe()

count    1116.000000
mean        0.084752
std         0.080248
min         0.000000
25%         0.026975
50%         0.062500
75%         0.120000
max         0.576190
Name: imbal, dtype: float64

In [78]:
data[data['sp_agency_class'] == 'Other']['relative_imbalance'].describe()

count    1116.000000
mean        0.276243
std         0.464226
min         0.000000
25%         0.053102
50%         0.133333
75%         0.305073
max         8.000000
Name: relative_imbalance, dtype: float64

### Phase

In [80]:
data['phase'].unique()

array(['Phase 3', 'N/A', 'Phase 1', 'Phase 2', 'Phase 2/Phase 3',
       'Phase 4', 'Phase 1/Phase 2', 'Early Phase 1'], dtype=object)

In [82]:
data[data['phase'].isin(['Phase 2', 'Phase 1/Phase 2'])]['relative_imbalance'].describe()

count    332.000000
mean       0.246867
std        0.351698
min        0.000000
25%        0.047209
50%        0.121941
75%        0.287233
max        2.304348
Name: relative_imbalance, dtype: float64

In [84]:
data[data['phase'].isin(['Phase 3'])]['relative_imbalance'].describe()

count    422.000000
mean       0.173255
std        0.262841
min        0.000000
25%        0.040581
50%        0.106017
75%        0.206888
max        3.000000
Name: relative_imbalance, dtype: float64

In [85]:
data.to_pickle('with_add_data.p')