In [1]:
import pandas as pd
import numpy as numpy
from importlib import reload
from tqdm import tqdm_notebook as tqdm
import time
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm

import pdaactconn as pc
from trialexplorer import AACTStudySet

In [2]:
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, 
                               tqdm_handler=tqdm)
ss.add_constraint("study_type = 'Interventional'")
ss.add_constraint("results_first_submitted_date is not null")
ss.add_constraint("enrollment_type = 'Actual'")
ss.add_constraint("enrollment >= 10")
ss.add_constraint("enrollment <= 500")
ss.add_constraint("phase = 'Phase 2'")
ss.load_studies()

8751 studies loaded!


In [3]:
ss.add_dimensions(['baseline_measurements', 'result_groups', 'outcomes', 'outcome_measurements'])
ss.refresh_dim_data()
rg = ss.dimensions['result_groups']
bm = ss.dimensions['baseline_measurements']
om = ss.dimensions['outcome_measurements']
out = ss.dimensions['outcomes']

Successfuly added these 4 dimensions: ['baseline_measurements', 'result_groups', 'outcomes', 'outcome_measurements']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

Syncing the temp table temp_cur_studies in 18 chunks x 500 records each

Creating index on the temp table
 - Loading dimension baseline_measurements
 -- Loading raw data
 -- Sorting index
 - Loading dimension result_groups
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcomes
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcome_measurements
 -- Loading raw data
 -- Sorting index


In [4]:
#Get studies with exactly two treatment groups besides "Total" and grab necessary columns
combined_measures = pd.merge(rg.data, bm.data, left_on = ['nct_id', 'id'], right_on = ['nct_id', 'result_group_id'])
combined_measures = combined_measures[combined_measures.title_x != 'Total']
num_groups = combined_measures.groupby('nct_id').ctgov_group_code_x.nunique()
num_groups = pd.DataFrame(num_groups)
num_groups.columns.values[0] = 'n_groups'
combined_measures = combined_measures.merge(num_groups, on=['nct_id'])
combined_measures = combined_measures[combined_measures.n_groups==2]
study_balance_dat = combined_measures[['ctgov_group_code_x', 'classification', 'category', 'title_y',
                                       'param_type', 'param_value_num',
                                       'dispersion_type', 'dispersion_value_num']]

In [5]:
study_balance_dat.title_y.value_counts().head(10)
#We'll go with age and sex for now

Sex: Female, Male                                               11782
Age                                                             11719
Race (NIH/OMB)                                                  10955
Region of Enrollment                                             5002
Race/Ethnicity, Customized                                       3714
Ethnicity (NIH/OMB)                                              3459
Age, Customized                                                  1504
Gender                                                            408
Eastern Cooperative Oncology Group (ECOG) Performance Status      240
Weight                                                            150
Name: title_y, dtype: int64

In [6]:
sb_age = study_balance_dat[study_balance_dat.title_y == 'Age']
sb_age_cat = sb_age[sb_age.param_type=='Count of Participants']
sb_age_con = sb_age[sb_age.param_type=='Mean']
print(sb_age_cat.index.nunique())
print(sb_age_con.index.nunique())
#We'll choose the continuous version for now, since it has more

1051
2205


In [7]:
#Sex is always a count variable; only variability is capitalization of "category" field
#Get rid of rows that aren't sex or age
age_condition1 = study_balance_dat.title_y == 'Age'
age_condition2 = study_balance_dat.param_type == 'Mean'
sex_condition = study_balance_dat.title_y == 'Sex: Female, Male'
full_condition = (age_condition1 & age_condition2) | sex_condition
small_balance_dat = study_balance_dat[full_condition]
small_balance_dat.head(20)

Unnamed: 0_level_0,ctgov_group_code_x,classification,category,title_y,param_type,param_value_num,dispersion_type,dispersion_value_num
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NCT00000392,B2,,Male,"Sex: Female, Male",Count of Participants,104.0,,
NCT00000392,B2,,Female,"Sex: Female, Male",Count of Participants,5.0,,
NCT00000392,B1,,Male,"Sex: Female, Male",Count of Participants,101.0,,
NCT00000392,B1,,Female,"Sex: Female, Male",Count of Participants,5.0,,
NCT00001304,B2,,Male,"Sex: Female, Male",Count of Participants,4.0,,
NCT00001304,B2,,Female,"Sex: Female, Male",Count of Participants,10.0,,
NCT00001304,B1,,Male,"Sex: Female, Male",Count of Participants,6.0,,
NCT00001304,B1,,Female,"Sex: Female, Male",Count of Participants,7.0,,
NCT00001586,B2,,Male,"Sex: Female, Male",Count of Participants,35.0,,
NCT00001586,B2,,Female,"Sex: Female, Male",Count of Participants,21.0,,


In [8]:
#get studies that have both measures we're using
num_measures = small_balance_dat.groupby('nct_id').title_y.nunique()
num_measures = pd.DataFrame(num_measures)
num_measures.columns.values[0] = 'n_measures'
small_balance_dat = small_balance_dat.merge(num_measures, on=['nct_id'])
small_balance_dat = small_balance_dat[small_balance_dat.n_measures==2]

#assert that each study has 6 rows now--2 sex and 1 age for each of the 2 arms
num_rows = small_balance_dat.groupby('nct_id').title_y.count()
num_rows = pd.DataFrame(num_rows)
num_rows.columns.values[0] = 'n_rows'
small_balance_dat = small_balance_dat.merge(num_rows, on=['nct_id'])
small_balance_dat = small_balance_dat[small_balance_dat.n_rows == 6]

In [9]:
#handle aforementioned capitalization issue
small_balance_dat.category = small_balance_dat.category.str.lower()
small_balance_dat.head()

Unnamed: 0_level_0,ctgov_group_code_x,classification,category,title_y,param_type,param_value_num,dispersion_type,dispersion_value_num,n_measures,n_rows
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NCT00001586,B2,,male,"Sex: Female, Male",Count of Participants,35.0,,,2,6
NCT00001586,B2,,female,"Sex: Female, Male",Count of Participants,21.0,,,2,6
NCT00001586,B2,,,Age,Mean,58.42,Standard Deviation,11.32,2,6
NCT00001586,B1,,male,"Sex: Female, Male",Count of Participants,30.0,,,2,6
NCT00001586,B1,,female,"Sex: Female, Male",Count of Participants,19.0,,,2,6


In [10]:
def calculate_imbalance(study_frame):
    nctid = study_frame.index[0]
    group_codes = study_frame.ctgov_group_code_x.unique()
    group1 = study_frame[study_frame.ctgov_group_code_x == group_codes[0]]
    group2 = study_frame[study_frame.ctgov_group_code_x == group_codes[1]]
    
    sex_imbalance, group_size_imbalance = calculate_sex_imbalance(group1, group2)
    age_imbalance = calculate_age_imbalance(group1, group2)
    
    return([nctid, sex_imbalance, age_imbalance, group_size_imbalance])

def calculate_sex_imbalance(arm1, arm2):
    arm1_sex = arm1[arm1.title_y == 'Sex: Female, Male']
    arm2_sex = arm2[arm2.title_y == 'Sex: Female, Male']
    
    arm1_size = arm1_sex.param_value_num.sum()
    arm2_size = arm2_sex.param_value_num.sum()
    
    arm1_men = arm1_sex[arm1_sex.category == 'male'].param_value_num[0]
    arm2_men = arm2_sex[arm2_sex.category == 'male'].param_value_num[0]
    
    arm1_pct_men = arm1_men / arm1_size
    arm2_pct_men = arm2_men / arm2_size
    
    sex_imbalance = abs(arm1_pct_men - arm2_pct_men)
    group_size_imbalance = abs(0.5 - (arm1_size) / (arm1_size + arm2_size))
    return((sex_imbalance, group_size_imbalance))

def calculate_age_imbalance(arm1, arm2):
    arm1_mean_age = arm1[arm1.title_y == 'Age'].param_value_num[0]
    arm2_mean_age = arm2[arm2.title_y == 'Age'].param_value_num[0]
    
    age_imbalance = 12 * abs(arm1_mean_age - arm2_mean_age) 
    return(age_imbalance)

In [11]:
#Time to calculate imbalances
imbalance_dat = []
study_ids = small_balance_dat.index.unique()
n_studies = study_ids.shape[0]
print('Calculating imbalance for ' + str(n_studies) + ' studies')
for i in range(n_studies):
    current_id = study_ids[i]
    current_study = small_balance_dat[small_balance_dat.index == current_id]
    current_imbalances = calculate_imbalance(current_study)
    imbalance_dat.append(current_imbalances)
    if (i + 1) % 1000 == 0:
        print('Finished with ' + str(i + 1) + ' studies.')
imbalance_frame = pd.DataFrame(imbalance_dat, columns = ['nct_id', 'sex_imbalance', 'age_imbalance', 'group_size_imbalance'])
imbalance_frame.head()

Calculating imbalance for 2099 studies
Finished with 1000 studies.
Finished with 2000 studies.


Unnamed: 0,nct_id,sex_imbalance,age_imbalance,group_size_imbalance
0,NCT00001586,0.012755,26.64,0.033333
1,NCT00001596,0.152174,53.28,0.157143
2,NCT00001723,0.01,1.56,0.0
3,NCT00003222,0.217033,24.0,0.182927
4,NCT00004980,0.014493,4.8,0.04


In [12]:
om.data.reset_index(inplace=True)
out.data.reset_index(inplace=True)
rg.data.reset_index(inplace=True)
out.data = out.data.add_prefix('out_')
om.data = om.data.add_prefix('om_')
rg.data = rg.data.add_prefix('rg_')

In [13]:
out.data.head()

Unnamed: 0,out_nct_id,out_id,out_outcome_type,out_title,out_description,out_time_frame,out_population,out_anticipated_posting_date,out_anticipated_posting_month_year,out_units,out_units_analyzed,out_dispersion_type,out_param_type
0,NCT00000392,4908951,Secondary,Change in Neurocognitive Performance Domain z ...,Higher values for change in z-score represent ...,Baseline and 6 months,,,,z score,,Standard Error,Mean
1,NCT00000392,4908952,Primary,Change in Global Neurocognitive Performance z ...,Higher values for change in z-score represent ...,Baseline and 6 months,,,,z score,,Standard Error,Mean
2,NCT00001213,4909011,Primary,Number of Eyes With a Corneal Cystine Crystal ...,Response is defined as a decrease from baselin...,Any Time Point Up to 19 Years,One hundred sixty-one (161) participants were ...,,,eyes,Participants,,Number
3,NCT00001213,4909012,Primary,Number of Participants With Serious and Non-Se...,Since efficacy of ophthalmic cysteamine was es...,Any Time Point up to 27 Years,,,,participants,,,Number
4,NCT00001304,4908997,Primary,Urine Calcium Excretion Level,Measurements were taken1 hour before the morni...,3 years,All patients on the study,,,mmol/24 h,,Standard Deviation,Mean


In [14]:
combined_outcomes = om.data.merge(out.data, left_on = ['om_nct_id', 'om_outcome_id'], 
                                  right_on = ['out_nct_id', 'out_id'])

combined_outcomes = combined_outcomes.merge(rg.data, left_on = ['om_nct_id','om_result_group_id'], 
                                            right_on = ['rg_nct_id','rg_id'])
primary_outcomes = combined_outcomes[combined_outcomes.out_outcome_type == 'Primary']

In [15]:
primary_outcomes.om_param_type.value_counts()

Number                          52755
Count of Participants           47526
Mean                            38024
Median                           7240
Geometric Mean                   5289
Least Squares Mean               3659
Geometric Least Squares Mean      299
Log Mean                           42
Count of Units                     21
                                    3
Name: om_param_type, dtype: int64

In [16]:
primary_outcomes = primary_outcomes[primary_outcomes.om_param_type == 'Mean']
primary_outcomes.shape

(38024, 38)

In [17]:
primary_outcomes = primary_outcomes[primary_outcomes.om_param_value != 'NA']
primary_outcomes.shape

(37737, 38)

In [18]:
num_groups = primary_outcomes.groupby(['om_nct_id', 'om_outcome_id']).om_result_group_id.nunique()
num_groups = pd.DataFrame(num_groups)
num_groups.columns.values[0] = 'n_groups'
primary_outcomes = primary_outcomes.merge(num_groups, on=['om_nct_id', 'om_outcome_id'])
primary_outcomes = primary_outcomes[primary_outcomes.n_groups==2]
primary_outcomes.shape

(10313, 39)

In [19]:
study_means = []
primary_ids = primary_outcomes.om_nct_id.unique()
for pid in primary_ids:
    temp_frame = primary_outcomes[primary_outcomes.om_nct_id == pid]
    temp_outcome_ids = temp_frame.om_outcome_id.unique()
    sampled_index = numpy.random.choice(temp_outcome_ids.shape[0], 1, replace=False)
    chosen_outcome = temp_frame[temp_frame.om_outcome_id == temp_outcome_ids[sampled_index][0]].reset_index()
    chosen_means = chosen_outcome.om_param_value_num
    study_means.append([pid, chosen_means[0], chosen_means[1]])
study_means = pd.DataFrame(study_means, columns = ['nct_id', 'mean1', 'mean2'])

In [20]:
study_means['abs_mean_dif'] = abs(study_means.mean1 - study_means.mean2)
study_means['abs_mean_sum'] = abs(study_means.mean1 + study_means.mean2)
study_means['alt_test_statistic'] = study_means.abs_mean_dif / study_means.abs_mean_sum
study_means.shape

(1163, 6)

In [29]:
study_means.alt_test_statistic.describe([.05, .1, .25, .50, .75, .90, .95])

count    1162.000000
mean        0.813635
std         6.937002
min         0.000000
5%          0.006670
10%         0.015241
25%         0.048364
50%         0.169957
75%         0.503150
90%         1.008272
95%         1.927022
max       217.333333
Name: alt_test_statistic, dtype: float64

In [30]:
condition1 = study_means.alt_test_statistic > 0
condition2 = study_means.alt_test_statistic < 2
full_condition = condition1 & condition2
study_means = study_means[full_condition]
study_means.shape

(1084, 6)

In [31]:
regression_data = study_means.merge(imbalance_frame, on = ['nct_id'])
regression_data.shape
#This is about 2x as many studies as the old way

(726, 9)

In [32]:
regression_data.head()

Unnamed: 0,nct_id,mean1,mean2,abs_mean_dif,abs_mean_sum,alt_test_statistic,sex_imbalance,age_imbalance,group_size_imbalance
0,NCT00001596,-20.93,-23.52,2.59,44.45,0.058268,0.152174,53.28,0.157143
1,NCT00001723,-0.06,-0.12,0.06,0.18,0.333333,0.01,1.56,0.0
2,NCT00004980,8.61,5.85,2.76,14.46,0.190871,0.014493,4.8,0.04
3,NCT00005669,-0.07,-0.11,0.04,0.18,0.222222,0.07226,3.6,0.03
4,NCT00005879,-0.8,-1.1,0.3,1.9,0.157895,0.0,2.4,0.007538


In [33]:
linmod = smf.ols('alt_test_statistic ~ age_imbalance + sex_imbalance', data=regression_data).fit()
print(linmod.summary())

                            OLS Regression Results                            
Dep. Variable:     alt_test_statistic   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                   0.08790
Date:                Fri, 22 Nov 2019   Prob (F-statistic):              0.916
Time:                        01:03:11   Log-Likelihood:                -307.79
No. Observations:                 726   AIC:                             621.6
Df Residuals:                     723   BIC:                             635.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.3142      0.021     14.820