In [1]:
import sys
if "../../" not in sys.path:
    sys.path.append("../../")

In [2]:
import pandas as pd
import numpy as np
from importlib import reload
from tqdm import tqdm_notebook as tqdm
import time

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot
%matplotlib inline

In [3]:
# loading all interventional studies
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, 
                               tqdm_handler=tqdm)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

258163 studies loaded!


In [6]:
# adding and loading dimensional data
ss.add_dimensions(['baseline_measurements',
                   'result_groups',
                   'eligibilities',
                   'outcomes',
                   'outcome_measurements'])
ss.refresh_dim_data()

Successfuly added these 5 dimensions: ['baseline_measurements', 'result_groups', 'eligibilities', 'outcomes', 'outcome_measurements']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=517), HTML(value='')))

Syncing the temp table temp_cur_studies in 517 chunks x 500 records each

Creating index on the temp table
 - Loading dimension baseline_measurements
 -- Loading raw data
 -- Sorting index
 - Loading dimension result_groups
 -- Loading raw data
 -- Sorting index
 - Loading dimension eligibilities
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcomes
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcome_measurements
 -- Loading raw data
 -- Sorting index


In [7]:
num_all = ss.studies.shape[0]
num_all

258163

The total number of studies is 258k

In [9]:
# setting all the dimensional data to variables
bm = ss.dimensions['baseline_measurements'].data
rg = ss.dimensions['result_groups'].data
el = ss.dimensions['eligibilities'].data
out = ss.dimensions['outcomes'].data
om = ss.dimensions['outcome_measurements'].data

In [11]:
bm_all = bm.index.get_level_values(0).unique().shape[0]
bm_all, bm_all / num_all

(38368, 0.14861928316606174)

only 38k studies have baseline measurements, which represents 15% of the total number of studies

## Looking in the "classification" field of baseline measurements

In [14]:
possible_classes = [x for x in bm['classification'].unique() if 'male' in x.lower()]
dfc = bm[bm['classification'].isin(possible_classes)]
dfc.groupby('classification').size().sort_values(ascending=False)[:10]

classification
Female                                 1866
Male                                   1848
Female of Childbearing Potential        113
Females                                  57
Males                                    40
N/A (Subject is Male)                    30
Female of childbearing potential         30
Females of Child-bearing Potential       26
Sterilized Females                       26
Males (Excluded from this analysis)      26
dtype: int64

looks like if we just used Female, Male and their plurals we can cover most of the cases

In [37]:
used_classes = ['female', 'male', 'females', 'males']
used_df_class = dfc[dfc['classification'].apply(lambda x: x.lower()).isin(used_classes)].copy()
selected_classes = used_df_class['classification'].unique()
print("Found these classes in the 'classification' field that matched used_classes: \n%s\n" % selected_classes)

res_studies_dfc = used_df_class.index.get_level_values(0).unique().shape[0]
print("This applies to %s studies, or %.1f%% of the those that report baseline measurements " %
      (res_studies_dfc, res_studies_dfc / bm_all * 100))

Found these classes in the 'classification' field that matched used_classes: 
['Male' 'Female' 'male' 'Males' 'Females' 'female' 'FEMALE' 'MALE'
 'females' 'males']

This applies to 681 studies, or 1.8% of the those that report baseline measurements 


## Looking in the "category" field of baseline measurements

In [24]:
possible_class_cat = [x for x in bm['category'].unique() if 'male' in x.lower()]
dfcat = bm[bm['category'].isin(possible_class_cat)]
dfcat.groupby('category').size().sort_values(ascending=False)[:10]

category
Male                                  108202
Female                                108184
FEMALE                                    68
MALE                                      68
Female of childbearing potential          18
Not Applicable (Male)                      5
Male <8                                    4
Male donor / Male Recipient                4
Female donor / Male Recipient              4
Not applicable (male participants)         4
dtype: int64

if can use male and female, in any caps

In [36]:
used_cats = ['male', 'female']
used_df_cat = bm[bm['category'].apply(lambda x: x.lower()).isin(used_cats)].copy()
selected_cats = used_df_cat['category'].unique()
print("Found these categories in the 'category' field that matched used_cats: \n%s\n" % selected_cats)

res_studies_dfcat = used_df_cat.index.get_level_values(0).unique().shape[0]
print("This applies to %s studies, or %.1f%% of the those that report baseline measurements " %
      (res_studies_dfcat, res_studies_dfcat / bm_all * 100))

Found these categories in the 'category' field that matched used_cats: 
['Male' 'Female' 'male' 'female' 'MALE' 'FEMALE']

This applies to 37542 studies, or 97.8% of the those that report baseline measurements 


### We will transform the classfication and category raw fields into just 2 boolean fields:
- is_male_class
- is_male_cat

In [38]:
used_df_class['classification'].unique()

array(['Male', 'Female', 'male', 'Males', 'Females', 'female', 'FEMALE',
       'MALE', 'females', 'males'], dtype=object)

In [40]:
used_df_class['is_male_class'] = used_df_class['classification'].apply(lambda x: 
                                                                       False if 'female' in x.lower() 
                                                                       else True)

In [42]:
used_df_cat['category'].unique()

array(['Male', 'Female', 'male', 'female', 'MALE', 'FEMALE'], dtype=object)

In [43]:
used_df_cat['is_male_cat'] = used_df_cat['category'].apply(lambda x: 
                                                           False if 'female' in x.lower() 
                                                           else True)

#### This seems a majority of our studies, is there overlap between identifying gender using classification vs using category?

### Let's analyse their intersections

In [45]:
used_df_class.groupby(['is_male_class', 'category']).size()

is_male_class  category               
False                                     1943
               Female                        4
               Male                          4
True                                      1906
               <=18 years                    4
               >=65 years                    4
               Between 18 and 65 years       4
               Male                          3
               Transfemale                   3
dtype: int64

In [52]:
used_df_cat.groupby(['is_male_cat', 'classification']).size().sort_values(ascending=False)[:20]

is_male_cat  classification             
True                                        107707
False                                       107695
True         Part 2                             30
False        Part 2                             30
             Sex                                29
True         Sex                                29
             Gender                             25
False        Gender                             25
             Part A                             17
True         Part A                             17
             Part 1                             16
False        Part 1                             16
True         Part 1b                            13
             Part 1a                            13
False        Part 1b                            13
             Part 1a                            13
True         Part B                             11
False        Part B                             11
             All patients and clinicians 

### we wouldn't miss much if we required: 
- the category be "" for records from used_df_class
- the classification be "" for record from used_df_cat

In [81]:
print(used_df_class.shape)
df_class_no_collide = used_df_class[used_df_class['category'] == ""]
print(used_df_class_final.shape)

(3875, 17)
(3849, 17)


In [82]:
print(used_df_cat.shape)
df_cat_no_collide = used_df_cat[used_df_cat['classification'] == ""]
print(used_df_cat_final.shape)

(216528, 17)
(215402, 17)


In [83]:
i1 = df_class_no_collide.index.get_level_values(0).unique()
i2 = df_cat_no_collide.index.get_level_values(0).unique()

i1_overlap = [x for x in i1 if x in i2]
tot_uniq = len(i1) + len(i2) - len(i1_overlap)
print("Combined, we have gender for %s studies, or %.1f%% of the those that report baseline measurements " %
      (tot_uniq, tot_uniq / bm_all * 100))

Combined, we have gender for 38083 studies, or 99.3% of the those that report baseline measurements 


### Pretty good!

### Some examples:
- gender in "category"
    - https://clinicaltrials.gov/ct2/show/results/NCT00000125
- gender in "classification"
    - https://clinicaltrials.gov/ct2/show/results/NCT00001213

### Only some studies are eligible since they are designed to capture both genders

In [63]:
# note that el is unique on its index of nct_id
df_all_gender_idx = el[el['gender'] == 'All'][[]]
len(df_all_gender_idx)

219613

In [84]:
df_class_final = df_class_no_collide.loc[df_all_gender_idx.index]
df_cat_final = df_cat_no_collide.loc[df_all_gender_idx.index]

In [88]:
# how do our counts go after we've excluded studies that are uni-gendered?
i1 = df_class_final.index.get_level_values(0).unique()
i2 = df_cat_final.index.get_level_values(0).unique()

i1_overlap = [x for x in i1 if x in i2]
tot_uniq = len(i1) + len(i2) - len(i1_overlap)
print("After excluding single gender studies, we have gender data for %s studies, or %.1f%% of the those that report baseline measurements " %
      (tot_uniq, tot_uniq / bm_all * 100))

After excluding single gender studies, we have gender data for 33261 studies, or 86.7% of the those that report baseline measurements 


## Joining the dataframes and making 1 is_male column

In [90]:
used_df_both = pd.concat([df_class_final, df_cat_final], sort=False)
used_df_both.shape

(192324, 18)

In [92]:
def is_male(row):
    cur_class = row['classification']
    cur_cat = row['category']
    if 'female' in cur_class.lower() or 'female' in cur_cat.lower():
        return False
    return True

used_df_both['is_male'] = used_df_both.apply(is_male, axis=1)

In [103]:
len(used_df_both.index.get_level_values(0).unique())

33261

### Only some param_types / unit combinations are valid for our purposes to count the number of people:
- param_type = Count of Participants
- param_type = Number

In [93]:
used_df_both.groupby('param_type').size()

param_type
                             31
Count of Participants    189456
Count of Units              123
Mean                         74
Median                        6
Number                     2634
dtype: int64

In [100]:
used_df_both[used_df_both['param_type'] == 'Count of Participants']['units'].unique()

array(['Participants'], dtype=object)

In [99]:
used_df_both[used_df_both['param_type'] == 'Number']['units'].unique()

array(['participants', 'Participants', 'participants in follow up study',
       'percent', 'Patients', 'Number', 'Subjects', 'Participant',
       'participant', 'hips', 'infants', 'percentage of participants',
       'subjects', 'partipants', 'participants with data available',
       'Paricipants', 'eyes', 'male/female', 'Percentage of participants',
       'blood samples', 'Count of participants', 'patients'], dtype=object)

In [114]:
# keep only those param_types and no "percent" units
used_df_both = used_df_both[(used_df_both['param_type'].isin(['Count of Participants','Number'])) &
                            (used_df_both['units'].apply(lambda x: False if 'percent' in x.lower() else True))]
len(used_df_both.index.get_level_values(0).unique())

33222

## Joining to result groups to get the group labels:

In [115]:
dfm = used_df_both.reset_index().merge(rg.reset_index()[['nct_id', 'ctgov_group_code', 'title']], 
                                       on=['nct_id', 'ctgov_group_code'], 
                                       how='inner', 
                                       suffixes=('', '_rg'))

In [116]:
dfm.groupby('title_rg').size().sort_values(ascending=False)[:50].index

Index(['Total', 'Placebo', 'Control', 'Control Group', 'All Participants',
       'All Study Participants', 'Usual Care', 'Entire Study Population',
       'Overall Study', 'Treatment', 'Placebo Group', 'Intervention',
       'Group 1', 'Standard of Care', 'Overall', 'Arm 1', 'Vehicle', 'Group 2',
       'Sugar Pill', 'All Subjects', 'Arm 2', 'Cohort 1', 'Cohort 2', 'Arm I',
       'Total Title', 'Group B', 'All Patients', 'Standard Care', 'Group A',
       'Control Arm', 'Sitagliptin', 'Pregabalin', 'Varenicline',
       'Intervention Group', 'Normal Saline', 'Experimental', 'Duloxetine',
       'Cohort 3', 'Treatment Arm', 'Sham', 'Etanercept', 'Saline',
       'Metformin', 'Arm A', 'Arm B', 'Erlotinib', 'Active', 'Pioglitazone',
       'Treatment Group', 'Rituximab'],
      dtype='object', name='title_rg')

In [117]:
total_strings = ['Total', 
                 'All Participants', 
                 'All Study Participants', 
                 'Entire Study Population',
                 'Overall Study',
                 'Overall', 
                 'Total Title']

In [122]:
dfm_nontot = dfm[~dfm['title_rg'].isin(total_strings)]
len(dfm_nontot['nct_id'].unique())

32220

## only keep groups that have non-zero number of participants

In [139]:
df_nonzero = dfm_nontot[dfm_nontot['param_value_num'] > 0]
len(df_nonzero['nct_id'].unique())

32173

## What is the distribution of total participants by study?
- where can we set an appropriate cutoff?

In [140]:
dftotals = df_nonzero.groupby('nct_id')[['param_value_num']].sum()
dftotals.head()

Unnamed: 0_level_0,param_value_num
nct_id,Unnamed: 1_level_1
NCT00000125,1636.0
NCT00000134,279.0
NCT00000135,209.0
NCT00000136,234.0
NCT00000142,64.0


In [141]:
df_percentile_totals = dftotals.describe(np.arange(0,1,0.01))
df_percentile_totals.index.name='percentile'
df_percentile_totals.reset_index(inplace=True)
df_percentile_totals = df_percentile_totals[df_percentile_totals['percentile'].apply(lambda x: 
                                                                                     True if '%' in x 
                                                                                     else False)]
df_percentile_totals[15:25]

Unnamed: 0,percentile,param_value_num
19,15%,17.0
20,16%,18.0
21,17%,19.0
22,18%,20.0
23,19%,20.0
24,20%,21.0
25,21%,23.0
26,22%,24.0
27,23%,25.0
28,24%,26.0


### We picked to keep 80% of the data and exclude any studies such the total <= 20

In [142]:
df_totals_used = dftotals[dftotals['param_value_num'] > 20]
df_totals_used.shape

(25991, 1)

In [144]:
# excluding studies with < 30 total participants
dfm_large = df_nonzero[df_nonzero['nct_id'].isin(df_totals_used.index)]
len(dfm_large['nct_id'].unique())

25991

In [146]:
dfm_large.columns

Index(['nct_id', 'result_group_id', 'id', 'ctgov_group_code', 'classification',
       'category', 'title', 'description', 'units', 'param_type',
       'param_value', 'param_value_num', 'dispersion_type', 'dispersion_value',
       'dispersion_value_num', 'dispersion_lower_limit',
       'dispersion_upper_limit', 'explanation_of_na', 'is_male_class',
       'is_male_cat', 'is_male', 'title_rg'],
      dtype='object')

## Constricting to the set of studies with only 2 groups and 4 gender boxes

In [188]:
# 2 arms
df_nonzero_groups = dfm_large.groupby(['nct_id'])['ctgov_group_code'].nunique()
two_grp_ids = df_nonzero_groups[df_nonzero_groups == 2].index
dfm_large_two_arm = dfm_large[dfm_large['nct_id'].isin(two_grp_ids)]

# 4 buckets
num_arm_genders = dfm_large_two_arm_i.groupby('nct_id').size()
keeponly4 = num_arm_genders[num_arm_genders == 4].index

dfbm_final = dfm_large_two_arm[dfm_large_two_arm['nct_id'].isin(keeponly4)]
len(dfbm_final['nct_id'].unique())

3262

## On the outcome measurement side, we need:
- out_param_type = Mean
- out_outcome_type = Primary

In [153]:
omj = om.reset_index().add_prefix('om_')
outj = out.reset_index().add_prefix('out_')

In [154]:
dfj = omj.merge(outj, 
                how='inner',
                left_on=['om_nct_id', 'om_outcome_id'],
                right_on=['out_nct_id', 'out_id'],)

In [158]:
# studies from the baseline measurement gender requirements above
studies_w_gender_info = dfm_large_two_arm['nct_id'].unique()
len(studies_w_gender_info)

13041

In [160]:
# studies with 1 primary outcome
outj_primary = outj[outj['out_outcome_type'] == 'Primary']
num_outcomes_per_study = outj_primary.groupby('out_nct_id')['out_id'].size()
studies_w_1_outcome = num_outcomes_per_study[num_outcomes_per_study == 1].index
len(num_outcomes_per_study)

38531

In [161]:
dfj_used = dfj[(dfj['om_nct_id'].isin(studies_w_gender_info)) &
               (dfj['om_nct_id'].isin(studies_w_1_outcome))]

In [202]:
dfom_prim = dfj_used[(dfj_used['out_param_type'] == 'Mean') & (dfj_used['out_outcome_type'] == 'Primary')]
len(dfom_prim['om_nct_id'].unique())

3363

### Some studies report more than 1 set of outcomes in their primar results:
- for example: NCT00000378
- we can exclude these as well

In [203]:
num_group_codes = dfom_prim.groupby('om_nct_id')['om_ctgov_group_code'].size()
with_2_g_codes = num_group_codes[num_group_codes == 2].index

In [205]:
dfom_final = dfom_prim[dfom_prim['om_nct_id'].isin(with_2_g_codes)]
len(dfom_final['om_nct_id'].unique())

2423

## Taking the intersection of the 2 datasets

In [206]:
dfbm_final_i = dfbm_final[dfbm_final['nct_id'].isin(dfom_final['om_nct_id'].unique())]
dfom_final_i = dfom_final[dfom_final['om_nct_id'].isin(dfbm_final['nct_id'].unique())]

In [207]:
dfm_final_i.shape

(13048, 22)

In [208]:
dfbm_final_i.columns

Index(['nct_id', 'result_group_id', 'id', 'ctgov_group_code', 'classification',
       'category', 'title', 'description', 'units', 'param_type',
       'param_value', 'param_value_num', 'dispersion_type', 'dispersion_value',
       'dispersion_value_num', 'dispersion_lower_limit',
       'dispersion_upper_limit', 'explanation_of_na', 'is_male_class',
       'is_male_cat', 'is_male', 'title_rg'],
      dtype='object')

In [209]:
dfom_final_i.columns

Index(['om_nct_id', 'om_result_group_id', 'om_outcome_id', 'om_id',
       'om_ctgov_group_code', 'om_classification', 'om_category', 'om_title',
       'om_description', 'om_units', 'om_param_type', 'om_param_value',
       'om_param_value_num', 'om_dispersion_type', 'om_dispersion_value',
       'om_dispersion_value_num', 'om_dispersion_lower_limit',
       'om_dispersion_upper_limit', 'om_explanation_of_na', 'out_nct_id',
       'out_id', 'out_outcome_type', 'out_title', 'out_description',
       'out_time_frame', 'out_population', 'out_anticipated_posting_date',
       'out_anticipated_posting_month_year', 'out_units', 'out_units_analyzed',
       'out_dispersion_type', 'out_param_type'],
      dtype='object')

In [227]:
dfbm_final_i.to_pickle('bm.p')
dfom_final_i.to_pickle('om.p')

## Computing the imbalance from the bm dataset

In [225]:
imbal_dict = {}
for nct_id in tqdm(dfbm_final_i['nct_id'].unique()):
    sub_df = dfbm_final_i[dfbm_final_i['nct_id'] == nct_id]
    df_tot = sub_df.groupby('ctgov_group_code')[['param_value_num']].sum()
    df_male = sub_df[sub_df['is_male']][['ctgov_group_code', 'param_value_num']].set_index('ctgov_group_code')
    dfj = df_male.join(df_tot, rsuffix='_tot')
    dfj['pct_male'] = dfj['param_value_num'] / dfj['param_value_num_tot']
    imbal = abs(dfj['pct_male'].values[1] - dfj['pct_male'].values[0])
    imbal_dict[nct_id] = imbal

HBox(children=(IntProgress(value=0, max=2347), HTML(value='')))




In [223]:
# this is the dataframe where we can calculate an imbalance score
df_imbal = pd.DataFrame(imbal_dict, index=['imbal']).T
df_imbal.head()

Unnamed: 0,imbal
NCT00114127,0.271795
NCT00180479,0.04442
NCT00405275,0.055921
NCT00446654,0.066667
NCT00728754,0.014009


## Computing the effect size from the om dataset

In [217]:
dfom_final_i[['om_ctgov_group_code', 'om_param_value_num']].head()

Unnamed: 0,om_ctgov_group_code,om_param_value_num
22,O2,-0.41
23,O1,-0.46
42,O2,0.16
43,O1,0.24
173,O2,-20.93


In [218]:
effect_dict = {}
for nct_id in tqdm(dfom_final_i['om_nct_id'].unique()):
    sub_df = dfom_final_i[dfom_final_i['om_nct_id'] == nct_id]
    m1 = sub_df['om_param_value_num'].values[0]
    m2 = sub_df['om_param_value_num'].values[1]
    
    if m1 * m2 > 0:
    
        abs_sum = abs(m1) + abs(m2)
        abs_diff = abs(m1 - m2)
        abs_effect = abs_diff / abs_sum

        effect_dict[nct_id] = abs_effect

HBox(children=(IntProgress(value=0, max=2347), HTML(value='')))




In [224]:
# this is the dataframe where we can calculate an effect score
df_effect = pd.DataFrame(effect_dict, index=['effect']).T
df_effect.head()

Unnamed: 0,effect
NCT00000371,0.057471
NCT00000392,0.2
NCT00001596,0.058268
NCT00001723,0.333333
NCT00004500,0.114754
