In [1]:
import sys
if "../../" not in sys.path:
    sys.path.append("../../")

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from importlib import reload
from tqdm import tqdm_notebook as tqdm
from fuzzywuzzy import fuzz
import time

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# loading all interventional studies
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, 
                               tqdm_handler=tqdm)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

266207 studies loaded!


In [5]:
# adding and loading dimensional data
ss.add_dimensions(['baseline_measurements',
                   'result_groups',
                   'eligibilities',
                   'outcomes',
                   'outcome_counts',
                   'outcome_measurements'])
ss.refresh_dim_data()

Successfuly added these 6 dimensions: ['baseline_measurements', 'result_groups', 'eligibilities', 'outcomes', 'outcome_counts', 'outcome_measurements']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=533), HTML(value='')))

Syncing the temp table temp_cur_studies in 533 chunks x 500 records each

Creating index on the temp table
 - Loading dimension baseline_measurements
 -- Loading raw data
 -- Sorting index
 - Loading dimension result_groups
 -- Loading raw data
 -- Sorting index
 - Loading dimension eligibilities
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcomes
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcome_counts
 -- Loading raw data
 -- Sorting index
 - Loading dimension outcome_measurements
 -- Loading raw data
 -- Sorting index


In [6]:
num_all = ss.studies.shape[0]
num_all

266207

The total number of studies is 258k

In [6]:
# setting all the dimensional data to variables
bm = ss.dimensions['baseline_measurements'].data
rg = ss.dimensions['result_groups'].data
el = ss.dimensions['eligibilities'].data
out = ss.dimensions['outcomes'].data
oc = ss.dimensions['outcome_counts'].data
om = ss.dimensions['outcome_measurements'].data


In [9]:
bm_all = bm.index.get_level_values(0).unique().shape[0]
bm_all, bm_all / num_all

(38564, 0.14884575451489268)

only 38k studies have baseline measurements, which represents 15% of the total number of studies

## Looking in the "classification" field of baseline measurements

In [10]:
possible_classes = [x for x in bm['classification'].unique() if 'male' in x.lower()]
dfc = bm[bm['classification'].isin(possible_classes)]
dfc.groupby('classification').size().sort_values(ascending=False)[:10]

classification
Female                                 1872
Male                                   1854
Female of Childbearing Potential        113
Females                                  57
Males                                    40
N/A (Subject is Male)                    30
Female of childbearing potential         30
Females of Child-bearing Potential       26
Sterilized Females                       26
Males (Excluded from this analysis)      26
dtype: int64

looks like if we just used Female, Male and their plurals we can cover most of the cases

In [11]:
used_classes = ['female', 'male', 'females', 'males']
used_df_class = dfc[dfc['classification'].apply(lambda x: x.lower()).isin(used_classes)].copy()
selected_classes = used_df_class['classification'].unique()
print("Found these classes in the 'classification' field that matched used_classes: \n%s\n" % selected_classes)

res_studies_dfc = used_df_class.index.get_level_values(0).unique().shape[0]
print("This applies to %s studies, or %.1f%% of the those that report baseline measurements " %
      (res_studies_dfc, res_studies_dfc / bm_all * 100))

Found these classes in the 'classification' field that matched used_classes: 
['Male' 'Female' 'male' 'Males' 'Females' 'female' 'FEMALE' 'MALE'
 'females' 'males']

This applies to 682 studies, or 1.8% of the those that report baseline measurements 


## Looking in the "category" field of baseline measurements

In [12]:
possible_class_cat = [x for x in bm['category'].unique() if 'male' in x.lower()]
dfcat = bm[bm['category'].isin(possible_class_cat)]
dfcat.groupby('category').size().sort_values(ascending=False)[:10]

category
Male                                108795
Female                              108777
FEMALE                                  68
MALE                                    68
Female of childbearing potential        18
Female of Childbearing Potential         8
Participant is a male                    5
Not Applicable (Male)                    5
Male ≥8                                  4
Female <4                                4
dtype: int64

if can use male and female, in any caps

In [13]:
used_cats = ['male', 'female']
used_df_cat = bm[bm['category'].apply(lambda x: x.lower()).isin(used_cats)].copy()
selected_cats = used_df_cat['category'].unique()
print("Found these categories in the 'category' field that matched used_cats: \n%s\n" % selected_cats)

res_studies_dfcat = used_df_cat.index.get_level_values(0).unique().shape[0]
print("This applies to %s studies, or %.1f%% of the those that report baseline measurements " %
      (res_studies_dfcat, res_studies_dfcat / bm_all * 100))

Found these categories in the 'category' field that matched used_cats: 
['Male' 'Female' 'male' 'female' 'MALE' 'FEMALE']

This applies to 37737 studies, or 97.9% of the those that report baseline measurements 


### We will transform the classfication and category raw fields into just 2 boolean fields:
- is_male_class
- is_male_cat

In [14]:
used_df_class['classification'].unique()

array(['Male', 'Female', 'male', 'Males', 'Females', 'female', 'FEMALE',
       'MALE', 'females', 'males'], dtype=object)

In [15]:
used_df_class['is_male_class'] = used_df_class['classification'].apply(lambda x: 
                                                                       False if 'female' in x.lower() 
                                                                       else True)

In [16]:
used_df_cat['category'].unique()

array(['Male', 'Female', 'male', 'female', 'MALE', 'FEMALE'], dtype=object)

In [17]:
used_df_cat['is_male_cat'] = used_df_cat['category'].apply(lambda x: 
                                                           False if 'female' in x.lower() 
                                                           else True)

#### This seems a majority of our studies, is there overlap between identifying gender using classification vs using category?

### Let's analyse their intersections

In [18]:
used_df_class.groupby(['is_male_class', 'category']).size()

is_male_class  category                           
False                                                 1943
               Female                                    4
               MGI less than or equal to [<=] 2.00       3
               MGI more than [>] 2.00                    3
               Male                                      4
True                                                  1906
               <=18 years                                4
               >=65 years                                4
               Between 18 and 65 years                   4
               MGI less than or equal to [<=] 2.00       3
               MGI more than [>] 2.00                    3
               Male                                      3
               Transfemale                               3
dtype: int64

In [19]:
used_df_cat.groupby(['is_male_cat', 'classification']).size().sort_values(ascending=False)[:20]

is_male_cat  classification             
True                                        108290
False                                       108278
             Sex                                33
True         Sex                                33
             Part 2                             30
False        Part 2                             30
True         Gender                             28
False        Gender                             28
             Part A                             17
True         Part A                             17
             Part 1                             16
False        Part 1                             16
True         Part 1b                            13
             Part 1a                            13
False        Part 1b                            13
             Part 1a                            13
True         Part B                             11
False        Part B                             11
             All patients and clinicians 

### we wouldn't miss much if we required: 
- the category be "" for records from used_df_class
- the classification be "" for record from used_df_cat

In [20]:
print(used_df_class.shape)
df_class_no_collide = used_df_class[used_df_class['category'] == ""]
print(df_class_no_collide.shape)

(3887, 17)
(3849, 17)


In [21]:
print(used_df_cat.shape)
df_cat_no_collide = used_df_cat[used_df_cat['classification'] == ""]
print(df_cat_no_collide.shape)

(217714, 17)
(216568, 17)


In [22]:
i1 = df_class_no_collide.index.get_level_values(0).unique()
i2 = df_cat_no_collide.index.get_level_values(0).unique()

i1_overlap = [x for x in i1 if x in i2]
tot_uniq = len(i1) + len(i2) - len(i1_overlap)
print("Combined, we have gender for %s studies, or %.1f%% of the those that report baseline measurements " %
      (tot_uniq, tot_uniq / bm_all * 100))

Combined, we have gender for 38275 studies, or 99.3% of the those that report baseline measurements 


### Pretty good!

### Some examples:
- gender in "category"
    - https://clinicaltrials.gov/ct2/show/results/NCT00000125
- gender in "classification"
    - https://clinicaltrials.gov/ct2/show/results/NCT00001213

### Only some studies are eligible since they are designed to capture both genders

In [26]:
# note that el is unique on its index of nct_id
df_all_gender_idx = el[el['gender'] == 'All'][[]]
len(df_all_gender_idx)

220406

In [27]:
df_class_final = df_class_no_collide.loc[df_all_gender_idx.index]
df_cat_final = df_cat_no_collide.loc[df_all_gender_idx.index]

In [28]:
# how do our counts go after we've excluded studies that are uni-gendered?
i1 = df_class_final.index.get_level_values(0).unique()
i2 = df_cat_final.index.get_level_values(0).unique()

i1_overlap = [x for x in i1 if x in i2]
tot_uniq = len(i1) + len(i2) - len(i1_overlap)
print("After excluding single gender studies, we have gender data for %s studies, or %.1f%% of the those that report baseline measurements " %
      (tot_uniq, tot_uniq / bm_all * 100))

After excluding single gender studies, we have gender data for 33439 studies, or 86.7% of the those that report baseline measurements 


## Joining the dataframes and making 1 is_male column

In [29]:
used_df_both = pd.concat([df_class_final, df_cat_final], sort=False)
used_df_both.shape

(193410, 18)

In [30]:
def is_male(row):
    cur_class = row['classification']
    cur_cat = row['category']
    if 'female' in cur_class.lower() or 'female' in cur_cat.lower():
        return False
    return True

used_df_both['is_male'] = used_df_both.apply(is_male, axis=1)

In [31]:
len(used_df_both.index.get_level_values(0).unique())

33439

### Only some param_types / unit combinations are valid for our purposes to count the number of people:
- param_type = Count of Participants
- param_type = Number

In [32]:
used_df_both.groupby('param_type').size()

param_type
                             31
Count of Participants    190542
Count of Units              123
Mean                         74
Median                        6
Number                     2634
dtype: int64

In [33]:
used_df_both[used_df_both['param_type'] == 'Count of Participants']['units'].unique()

array(['Participants'], dtype=object)

In [34]:
used_df_both[used_df_both['param_type'] == 'Number']['units'].unique()

array(['participants', 'Participants', 'participants in follow up study',
       'percent', 'Patients', 'Number', 'Subjects', 'Participant',
       'participant', 'hips', 'infants', 'percentage of participants',
       'subjects', 'partipants', 'participants with data available',
       'Paricipants', 'eyes', 'male/female', 'Percentage of participants',
       'blood samples', 'Count of participants', 'patients'], dtype=object)

In [35]:
# keep only those param_types and no "percent" units
used_df_both = used_df_both[(used_df_both['param_type'].isin(['Count of Participants','Number'])) &
                            (used_df_both['units'].apply(lambda x: False if 'percent' in x.lower() else True))]
len(used_df_both.index.get_level_values(0).unique())

33400

## Joining to result groups to get the group labels:

In [36]:
dfm = used_df_both.reset_index().merge(rg.reset_index()[['nct_id', 'ctgov_group_code', 'title']], 
                                       on=['nct_id', 'ctgov_group_code'], 
                                       how='inner', 
                                       suffixes=('', '_rg'))

In [37]:
dfm.groupby('title_rg').size().sort_values(ascending=False)[:50].index

Index(['Total', 'Placebo', 'Control', 'All Participants', 'Control Group',
       'All Study Participants', 'Usual Care', 'Entire Study Population',
       'Overall Study', 'Treatment', 'Intervention', 'Placebo Group',
       'Group 1', 'Standard of Care', 'Overall', 'Arm 1', 'Vehicle', 'Group 2',
       'Sugar Pill', 'All Subjects', 'Arm 2', 'Cohort 1', 'Cohort 2', 'Arm I',
       'Total Title', 'All Patients', 'Group B', 'Standard Care', 'Group A',
       'Control Arm', 'Pregabalin', 'Sitagliptin', 'Varenicline',
       'Intervention Group', 'Experimental', 'Normal Saline', 'Duloxetine',
       'Cohort 3', 'Saline', 'Treatment Arm', 'Sham', 'Etanercept',
       'Metformin', 'Arm A', 'Active', 'Erlotinib', 'Arm B', 'Ketamine',
       'Rituximab', 'Treatment Group'],
      dtype='object', name='title_rg')

In [38]:
total_strings = ['Total', 
                 'All Participants', 
                 'All Study Participants', 
                 'Entire Study Population',
                 'Overall Study',
                 'Overall', 
                 'Total Title']

In [39]:
dfm_nontot = dfm[~dfm['title_rg'].isin(total_strings)]
len(dfm_nontot['nct_id'].unique())

32389

## only keep groups that have non-zero number of participants

In [40]:
df_nonzero = dfm_nontot[dfm_nontot['param_value_num'] > 0]
len(df_nonzero['nct_id'].unique())

32342

## What is the distribution of total participants by study?
- where can we set an appropriate cutoff?

In [41]:
dftotals = df_nonzero.groupby('nct_id')[['param_value_num']].sum()
dftotals.head()

Unnamed: 0_level_0,param_value_num
nct_id,Unnamed: 1_level_1
NCT00000125,1636.0
NCT00000134,279.0
NCT00000135,209.0
NCT00000136,234.0
NCT00000142,64.0


In [42]:
df_percentile_totals = dftotals.describe(np.arange(0,1,0.01))
df_percentile_totals.index.name='percentile'
df_percentile_totals.reset_index(inplace=True)
df_percentile_totals = df_percentile_totals[df_percentile_totals['percentile'].apply(lambda x: 
                                                                                     True if '%' in x 
                                                                                     else False)]
df_percentile_totals[15:25]

Unnamed: 0,percentile,param_value_num
19,15%,17.0
20,16%,18.0
21,17%,19.0
22,18%,20.0
23,19%,20.0
24,20%,21.0
25,21%,23.0
26,22%,24.0
27,23%,25.0
28,24%,26.0


### We picked to keep 80% of the data and exclude any studies such the total <= 20

In [43]:
df_totals_used = dftotals[dftotals['param_value_num'] > 20]
df_totals_used.shape

(26121, 1)

In [44]:
# excluding studies with < 30 total participants
dfm_large = df_nonzero[df_nonzero['nct_id'].isin(df_totals_used.index)]
len(dfm_large['nct_id'].unique())

26121

In [45]:
dfm_large.columns

Index(['nct_id', 'result_group_id', 'id', 'ctgov_group_code', 'classification',
       'category', 'title', 'description', 'units', 'param_type',
       'param_value', 'param_value_num', 'dispersion_type', 'dispersion_value',
       'dispersion_value_num', 'dispersion_lower_limit',
       'dispersion_upper_limit', 'explanation_of_na', 'is_male_class',
       'is_male_cat', 'is_male', 'title_rg'],
      dtype='object')

## Constricting to the set of studies with only 2 groups and 4 gender boxes

In [46]:
# 2 arms
df_nonzero_groups = dfm_large.groupby(['nct_id'])['ctgov_group_code'].nunique()
two_grp_ids = df_nonzero_groups[df_nonzero_groups == 2].index
dfm_large_two_arm = dfm_large[dfm_large['nct_id'].isin(two_grp_ids)]

# 4 buckets
num_arm_genders = dfm_large_two_arm.groupby('nct_id').size()
keeponly4 = num_arm_genders[num_arm_genders == 4].index

dfbm_final = dfm_large_two_arm[dfm_large_two_arm['nct_id'].isin(keeponly4)]
len(dfbm_final['nct_id'].unique())

12665

## On the outcome measurement side, we need:
- out_param_type = Mean
- out_outcome_type = Primary

In [47]:
omj = om.reset_index().add_prefix('om_')
outj = out.reset_index().add_prefix('out_')

In [48]:
dfj = omj.merge(outj, 
                how='inner',
                left_on=['om_nct_id', 'om_outcome_id'],
                right_on=['out_nct_id', 'out_id'],)

In [49]:
# studies from the baseline measurement gender requirements above
studies_w_gender_info = dfm_large_two_arm['nct_id'].unique()
len(studies_w_gender_info)

13114

In [50]:
# studies with 1 primary outcome
outj_primary = outj[outj['out_outcome_type'] == 'Primary']
num_outcomes_per_study = outj_primary.groupby('out_nct_id')['out_id'].size()
studies_w_1_outcome = num_outcomes_per_study[num_outcomes_per_study == 1].index
len(num_outcomes_per_study)

38728

In [51]:
dfj_used = dfj[(dfj['om_nct_id'].isin(studies_w_gender_info)) &
               (dfj['om_nct_id'].isin(studies_w_1_outcome))]

In [52]:
dfom_prim = dfj_used[(dfj_used['out_param_type'] == 'Mean') & (dfj_used['out_outcome_type'] == 'Primary')]
len(dfom_prim['om_nct_id'].unique())

3380

### Some studies report more than 1 set of outcomes in their primar results:
- for example: NCT00000378
- we can exclude these as well

In [53]:
# needs to have exactly 2 unique group codes
num_group_codes = dfom_prim.groupby('om_nct_id')['om_ctgov_group_code'].size()
with_2_g_codes = num_group_codes[num_group_codes == 2].index

num_uniq_group_codes = dfom_prim.groupby('om_nct_id')['om_ctgov_group_code'].nunique()
with_2_uniq_g_codes = num_uniq_group_codes[num_uniq_group_codes == 2].index

In [54]:
dfom_all_types = dfom_prim[(dfom_prim['om_nct_id'].isin(with_2_g_codes)) &
                       (dfom_prim['om_nct_id'].isin(with_2_uniq_g_codes))]
len(dfom_all_types['om_nct_id'].unique())

2435

upon study, only the dispersion types Standard Deviation and Standard Error have good coverage for a dispersion metric, so we will only keep those studies

In [55]:
dfom_all_types.groupby('om_dispersion_type').size()

om_dispersion_type
90% Confidence Interval         2
95% Confidence Interval       504
97.5% Confidence Interval       2
99% Confidence Interval         2
Full Range                    114
Inter-Quartile Range           26
Standard Deviation           3556
Standard Error                664
dtype: int64

In [56]:
used_dispersion_types = ['Standard Deviation', 'Standard Error']
dfom_final = dfom_all_types[dfom_all_types['om_dispersion_type'].isin(used_dispersion_types)]
len(dfom_final['om_nct_id'].unique())

2110

## On the outcome counts, getting the primary counts

In [57]:
ocj = oc.reset_index().add_prefix('oc_')
out_ocj = out[['id', 'outcome_type']].reset_index().add_prefix('oco_')

In [58]:
ocj.columns

Index(['oc_nct_id', 'oc_result_group_id', 'oc_outcome_id', 'oc_id',
       'oc_ctgov_group_code', 'oc_scope', 'oc_units', 'oc_count'],
      dtype='object')

In [59]:
out_ocj.columns

Index(['oco_nct_id', 'oco_id', 'oco_outcome_type'], dtype='object')

In [60]:
ocoj = ocj.merge(out_ocj,
                 how='inner', 
                 left_on=['oc_nct_id', 'oc_outcome_id'],
                 right_on=['oco_nct_id', 'oco_id'])
ocoj_in_set = ocoj[ocoj['oc_nct_id'].isin(dfom_final['om_nct_id'].unique())]

In [61]:
# only keep counts on primary out comes
ocoj_primary = ocoj_in_set[ocoj_in_set['oco_outcome_type'] == 'Primary']

In [62]:
ocoj_primary[ocoj_primary['oc_nct_id'] == 'NCT00993473']

Unnamed: 0,oc_nct_id,oc_result_group_id,oc_outcome_id,oc_id,oc_ctgov_group_code,oc_scope,oc_units,oc_count,oco_nct_id,oco_id,oco_outcome_type
272264,NCT00993473,3839331,1157565,2750701,O2,Measure,Participants,64,NCT00993473,1157565,Primary
272265,NCT00993473,3839332,1157565,2750702,O1,Measure,Participants,61,NCT00993473,1157565,Primary


Notes: After manually investigating the above cases, we need to compute the standard error divisor using the non-"Participant" row for oc_units

In [63]:
# extracting the out_come counts:
res_dict = {}

for nct_id in tqdm(ocoj_primary['oc_nct_id'].unique()):
    #print(nct_id)
    sub_df = ocoj_primary[ocoj_primary['oc_nct_id'] == nct_id]
    for gcode in sub_df['oc_ctgov_group_code'].unique():
        sub_df_gc = sub_df[sub_df['oc_ctgov_group_code'] == gcode]
        type_units = sub_df_gc['oc_units'].unique()

        if len(type_units) == 1:
            num_participants = sub_df_gc['oc_count'].values[0]
            num_measures = sub_df_gc['oc_count'].values[0]  # same as participants
        elif len(type_units) == 2:
            num_participants = sub_df_gc[sub_df_gc['oc_units'] == 'Participants']['oc_count'].values[0]
            num_measures = sub_df_gc[sub_df_gc['oc_units'] != 'Participants']['oc_count'].values[0]

        res_dict[(nct_id, gcode)] = {
            'num_participants': num_participants,
            'num_measures': num_measures,
        }

HBox(children=(IntProgress(value=0, max=2110), HTML(value='')))




In [64]:
df_counts = pd.DataFrame(res_dict).T
df_counts.index.names = ['nct_id', 'ctgov_group_code']
df_counts = df_counts.reset_index().add_prefix('ct_')
df_counts

Unnamed: 0,ct_nct_id,ct_ctgov_group_code,ct_num_participants,ct_num_measures
0,NCT00000371,O2,27,27
1,NCT00000371,O1,28,28
2,NCT00000392,O2,77,77
3,NCT00000392,O1,66,66
4,NCT00001596,O2,3,3
...,...,...,...,...
4219,NCT03831451,O1,74,74
4220,NCT04109703,O2,51,51
4221,NCT04109703,O1,49,49
4222,NCT04123665,O2,48,48


for each record in dfom_final_i, we have to associate it with a "num_participants" and "num_measured"

## Joining the counts to dfom_final_i

In [65]:
dfom_final.shape

(4220, 32)

In [66]:
df_om_final_w_counts = dfom_final.merge(df_counts,
                                        how='inner',
                                        left_on=['om_nct_id', 'om_ctgov_group_code'],
                                        right_on=['ct_nct_id', 'ct_ctgov_group_code'])
df_om_final_w_counts.shape

(4220, 36)

## Taking the intersection of the 2 datasets

In [67]:
dfbm_final_i = dfbm_final[dfbm_final['nct_id'].isin(df_om_final_w_counts['om_nct_id'].unique())]
dfbm_final_i = dfbm_final_i.add_prefix('bm_')
dfom_final_i = df_om_final_w_counts[df_om_final_w_counts['om_nct_id'].isin(dfbm_final['nct_id'].unique())]

In [68]:
dfbm_final_i.shape, dfom_final_i.shape

((8168, 22), (4084, 36))

In [69]:
# checkpoint outputting data pickels
dfbm_final_i.to_pickle('bm.p')
dfom_final_i.to_pickle('om.p')

## Incorporate result group data into the output measurements 

In [70]:
# joining the result groups to dfom
rg_omj = rg.reset_index().add_prefix("rgom_")
dfom_final_ij = dfom_final_i.merge(rg_omj,
                                   how='inner',
                                   left_on=['om_nct_id', 'om_result_group_id'],
                                   right_on=['rgom_nct_id', 'rgom_id'])
dfom_final_ij.shape

(4084, 42)

In [71]:
# joining the result groups to dfbm
rg_bmj = rg.reset_index().add_prefix("rgbm_")
dfbm_final_ij = dfbm_final_i.merge(rg_bmj,
                                   how='inner',
                                   left_on=['bm_nct_id', 'bm_result_group_id'],
                                   right_on=['rgbm_nct_id', 'rgbm_id'])
dfbm_final_ij.shape

(8168, 28)

In [72]:
dfom_final_ij.columns

Index(['om_nct_id', 'om_result_group_id', 'om_outcome_id', 'om_id',
       'om_ctgov_group_code', 'om_classification', 'om_category', 'om_title',
       'om_description', 'om_units', 'om_param_type', 'om_param_value',
       'om_param_value_num', 'om_dispersion_type', 'om_dispersion_value',
       'om_dispersion_value_num', 'om_dispersion_lower_limit',
       'om_dispersion_upper_limit', 'om_explanation_of_na', 'out_nct_id',
       'out_id', 'out_outcome_type', 'out_title', 'out_description',
       'out_time_frame', 'out_population', 'out_anticipated_posting_date',
       'out_anticipated_posting_month_year', 'out_units', 'out_units_analyzed',
       'out_dispersion_type', 'out_param_type', 'ct_nct_id',
       'ct_ctgov_group_code', 'ct_num_participants', 'ct_num_measures',
       'rgom_nct_id', 'rgom_id', 'rgom_ctgov_group_code', 'rgom_result_type',
       'rgom_title', 'rgom_description'],
      dtype='object')

## Using the group_code suffix to identify groups across bm and om

In [73]:
dfom_final_ij['grp_num'] = dfom_final_ij['om_ctgov_group_code'].apply(lambda x: int(x[1]))
dfbm_final_ij['grp_num'] = dfbm_final_ij['bm_ctgov_group_code'].apply(lambda x: int(x[1]))

## Pivotting both datasets so that there is only 1 row per study

In [74]:
dfbm_final_ij.columns

Index(['bm_nct_id', 'bm_result_group_id', 'bm_id', 'bm_ctgov_group_code',
       'bm_classification', 'bm_category', 'bm_title', 'bm_description',
       'bm_units', 'bm_param_type', 'bm_param_value', 'bm_param_value_num',
       'bm_dispersion_type', 'bm_dispersion_value', 'bm_dispersion_value_num',
       'bm_dispersion_lower_limit', 'bm_dispersion_upper_limit',
       'bm_explanation_of_na', 'bm_is_male_class', 'bm_is_male_cat',
       'bm_is_male', 'bm_title_rg', 'rgbm_nct_id', 'rgbm_id',
       'rgbm_ctgov_group_code', 'rgbm_result_type', 'rgbm_title',
       'rgbm_description', 'grp_num'],
      dtype='object')

In [75]:
res_dict = {}
for nct_id in tqdm(dfbm_final_ij['bm_nct_id'].unique()):
    sub_df = dfbm_final_ij[dfbm_final_ij['bm_nct_id'] == nct_id]
    g1m = sub_df[(sub_df['bm_is_male']) & (sub_df['grp_num'] == 1)]['bm_param_value_num'].values[0]
    g1f = sub_df[(~sub_df['bm_is_male']) & (sub_df['grp_num'] == 1)]['bm_param_value_num'].values[0]
    g2m = sub_df[(sub_df['bm_is_male']) & (sub_df['grp_num'] == 2)]['bm_param_value_num'].values[0]
    g2f = sub_df[(~sub_df['bm_is_male']) & (sub_df['grp_num'] == 2)]['bm_param_value_num'].values[0]
    
    g1title = sub_df[(sub_df['grp_num'] == 1)]['rgbm_title'].values[0]
    g1des = sub_df[(sub_df['grp_num'] == 1)]['rgbm_description'].values[0]
    g2title = sub_df[(sub_df['grp_num'] == 2)]['rgbm_title'].values[0]
    g2des = sub_df[(sub_df['grp_num'] == 2)]['rgbm_description'].values[0]
    
    res_dict[nct_id] = {
        'g1m': g1m,
        'g1f': g1f,
        'g2m': g2m,
        'g2f': g2f,
        'g1title': g1title,
        'g1des': g1des,
        'g2title': g2title,
        'g2des': g2des,
    }

HBox(children=(IntProgress(value=0, max=2042), HTML(value='')))




In [76]:
df_pivot_bm = pd.DataFrame(res_dict).T
float_type_cols = ['g1m', 'g1f', 'g2m', 'g2f']
for col in float_type_cols:
    df_pivot_bm[col] = df_pivot_bm[col].astype(float)
df_pivot_bm.head()

Unnamed: 0,g1m,g1f,g2m,g2f,g1title,g1des,g2title,g2des
NCT00114127,6.0,7.0,11.0,4.0,Duloxetine 60mg/Day + Placebo for 18 Weeks(Pha...,Participants who did not achieve remission at ...,Duloxetine 120mg/Day for 18 Weeks (Phase 2),Participants who did not achieve remission at ...
NCT00180479,469.0,200.0,218.0,114.0,XIENCE V® EECSS,XIENCE V® Everolimus Eluting Coronary Stent Sy...,TAXUS® EXPRESS2™ ECSS,TAXUS® EXPRESS2™ Paclitaxel Eluting Coronary S...
NCT00405275,101.0,77.0,89.0,85.0,Triple,"Hydroxychloroquine, sulfasalazine and methotre...",Etanercept,Etanercept and Methotrexate
NCT00446654,8.0,12.0,6.0,12.0,CGC-11047 Once Every 2 Weeks,16.5 mg CGC-11047 as a subconjunctival injecti...,CGC-11047 Once Every Four Weeks,16.5 mg CGC-11047 as a subconjunctival injecti...
NCT00728754,15.0,14.0,17.0,15.0,Osseotite Certain Prevail Implant,Patients with dental implant with internal con...,Osseotite Certain Implant,Patients with dental implant with internal con...


In [77]:
dfom_final_ij.columns

Index(['om_nct_id', 'om_result_group_id', 'om_outcome_id', 'om_id',
       'om_ctgov_group_code', 'om_classification', 'om_category', 'om_title',
       'om_description', 'om_units', 'om_param_type', 'om_param_value',
       'om_param_value_num', 'om_dispersion_type', 'om_dispersion_value',
       'om_dispersion_value_num', 'om_dispersion_lower_limit',
       'om_dispersion_upper_limit', 'om_explanation_of_na', 'out_nct_id',
       'out_id', 'out_outcome_type', 'out_title', 'out_description',
       'out_time_frame', 'out_population', 'out_anticipated_posting_date',
       'out_anticipated_posting_month_year', 'out_units', 'out_units_analyzed',
       'out_dispersion_type', 'out_param_type', 'ct_nct_id',
       'ct_ctgov_group_code', 'ct_num_participants', 'ct_num_measures',
       'rgom_nct_id', 'rgom_id', 'rgom_ctgov_group_code', 'rgom_result_type',
       'rgom_title', 'rgom_description', 'grp_num'],
      dtype='object')

In [78]:
res_dict = {}
for nct_id in tqdm(dfom_final_ij['om_nct_id'].unique()):
    sub_df = dfom_final_ij[dfom_final_ij['om_nct_id'] == nct_id]
    
    g1_mean = sub_df[sub_df['grp_num'] == 1]['om_param_value_num'].values[0]
    g2_mean = sub_df[sub_df['grp_num'] == 2]['om_param_value_num'].values[0]
    
    g1_dtype = sub_df[sub_df['grp_num'] == 1]['om_dispersion_type'].values[0]
    g2_dtype = sub_df[sub_df['grp_num'] == 2]['om_dispersion_type'].values[0]
    
    g1_sd = sub_df[sub_df['grp_num'] == 1]['om_dispersion_value_num'].values[0]
    g2_sd = sub_df[sub_df['grp_num'] == 2]['om_dispersion_value_num'].values[0]
    
    g1title_om = sub_df[sub_df['grp_num'] == 1]['rgom_title'].values[0]
    g1des_om = sub_df[sub_df['grp_num'] == 1]['rgom_description'].values[0]
    g2title_om = sub_df[sub_df['grp_num'] == 2]['rgom_title'].values[0]
    g2des_om = sub_df[sub_df['grp_num'] == 2]['rgom_description'].values[0]
    
    num_parts1 = sub_df[sub_df['grp_num'] == 1]['ct_num_participants'].values[0]
    num_parts2 = sub_df[sub_df['grp_num'] == 2]['ct_num_participants'].values[0]
    num_measures1 = sub_df[sub_df['grp_num'] == 1]['ct_num_measures'].values[0]
    num_measures2 = sub_df[sub_df['grp_num'] == 2]['ct_num_measures'].values[0]
    
    res_dict[nct_id] = {
        'g1_mean': g1_mean,
        'g2_mean': g2_mean,
        'g1_dtype': g1_dtype,
        'g2_dtype': g2_dtype,
        'g1_sd': g1_sd,
        'g2_sd': g2_sd,
        'g1title_om': g1title_om,
        'g1des_om': g1des_om,
        'g2title_om': g2title_om,
        'g2des_om': g2des_om,
        'num_parts1': num_parts1,
        'num_parts2': num_parts2,
        'num_measures1': num_measures1,
        'num_measures2': num_measures2,
    }
    

HBox(children=(IntProgress(value=0, max=2042), HTML(value='')))




In [79]:
df_pivot_om = pd.DataFrame(res_dict).T
float_type_cols = ['g1_mean', 'g2_mean', 'g1_sd', 'g2_sd']
for col in float_type_cols:
    df_pivot_om[col] = df_pivot_om[col].astype(float)
    
int_type_cols = ['num_parts1', 'num_parts2', 'num_measures1', 'num_measures2']
for col in int_type_cols:
    df_pivot_om[col] = df_pivot_om[col].astype(int)
    
df_pivot_om.head()

Unnamed: 0,g1_mean,g2_mean,g1_dtype,g2_dtype,g1_sd,g2_sd,g1title_om,g1des_om,g2title_om,g2des_om,num_parts1,num_parts2,num_measures1,num_measures2
NCT00000371,-0.46,-0.41,Standard Error,Standard Error,0.29,0.31,D-Cycloserine,Patients were given 50 mg of D-Cycloserine dai...,Placebo,Patients were given 50 mg of placebo daily in ...,28,27,28,27
NCT00000392,0.24,0.16,Standard Error,Standard Error,0.05,0.03,Peptide T,Peptide T given intranasally at a dosage of 2m...,Placebo,Placebo given intranasally at a dosage of 2mg ...,66,77,66,77
NCT00001596,-23.52,-20.93,Standard Deviation,Standard Deviation,24.32,25.59,Pirfenidone,Subjects received pirfenidone 801 mg (3 pills ...,Placebo,"Subjects received placebo (3 pills), three tim...",11,3,11,3
NCT00001723,-0.12,-0.06,Standard Error,Standard Error,0.02,0.02,Orlistat,Orlistat 120 mg TID for 6 months plus a behavi...,Placebo,Matching placebo 120 mg TID x 6 months plus a ...,100,100,100,100
NCT00004500,10.2,8.1,Standard Deviation,Standard Deviation,9.96,8.52,Lucinactant,Lucinactant via bronchoaveolar lavage,Standard Care,"Standard Care included the use of oxygen, CMV,...",38,31,38,31


## Joining both pivoted datasets

In [1]:
38564 - 2042

36522

In [80]:
dfj = df_pivot_bm.join(df_pivot_om, how='inner')
dfj.shape

(2042, 22)

## let's make sure the groups didn't get mixed up

In [81]:
dfj[[x for x in dfj.columns if 'title' in x]]

Unnamed: 0,g1title,g2title,g1title_om,g2title_om
NCT00114127,Duloxetine 60mg/Day + Placebo for 18 Weeks(Pha...,Duloxetine 120mg/Day for 18 Weeks (Phase 2),Duloxetine 60mg/Day + Placebo for 18 Weeks (Ph...,Duloxetine 120mg/Day for 18 Weeks (Phase 2)
NCT00180479,XIENCE V® EECSS,TAXUS® EXPRESS2™ ECSS,XIENCE V® EECSS,TAXUS® EXPRESS2™ ECSS
NCT00405275,Triple,Etanercept,Triple,Etanercept
NCT00446654,CGC-11047 Once Every 2 Weeks,CGC-11047 Once Every Four Weeks,CGC-11047 Once Every 2 Weeks,CGC-11047 Once Every Four Weeks
NCT00728754,Osseotite Certain Prevail Implant,Osseotite Certain Implant,Osseotite Certain Prevail Implant,Osseotite Certain Implant
...,...,...,...,...
NCT03663179,Active TMS,Sham TMS,Active TMS,Sham TMS
NCT03739242,Nutraceutical Combination,Placebo,Nutraceutical Combination,Placebo
NCT03831451,Stroke Preparedness Intervention,Healthy Lifestyle Intervention,Stroke Preparedness Intervention,Healthy Lifestyle Intervention
NCT04109703,High Level Pulsed Heat,Low Level Steady Heat,High Level Pulsed Heat,Low Level Steady Heat


In [82]:
dfj[dfj['g1title'] != dfj['g1title_om']][[x for x in dfj.columns if 'title' in x]]

Unnamed: 0,g1title,g2title,g1title_om,g2title_om
NCT00114127,Duloxetine 60mg/Day + Placebo for 18 Weeks(Pha...,Duloxetine 120mg/Day for 18 Weeks (Phase 2),Duloxetine 60mg/Day + Placebo for 18 Weeks (Ph...,Duloxetine 120mg/Day for 18 Weeks (Phase 2)
NCT00909779,Placebo Comparator: Placebo Twice Daily,Experimental: Arformoterol 15 Mcg Twice Daily,Experimental: Arformoterol 15 Mcg Twice Daily,Placebo
NCT03698591,"Transcranial Magnetic Stimulation (TMS), Then ...","Sham TMS, Then Transcranial Magnetic Stimulation",Transcranial Magnetic Stimulation (TMS),Sham TMS
NCT00064792,Placebo Followed by Simvastatin,Simvastatin Followed by Placebo,Not Simvastatin,Simvastatin
NCT00116844,"Sequence 1: VALTREX 1 g Once Daily, Placebo","Sequence 2: Placebo, VALTREX 1 g Once Daily",VALTREX 1 g Once Daily,Placebo
...,...,...,...,...
NCT02986139,Sequence AB,Sequence BA,Etanercept Commercial Formulation,Etanercept New Formulation
NCT02997904,Resultz Lice and Egg Removal Kit,Sham Lice and Egg Removal Kit,Resultz Lice and Egg Elimination Kit,Sham Lice and Egg Elimination Kit
NCT03001778,Community Usability Testing,ILF Usability Testing,Community Residents,ILF Resident Usability Testing
NCT03141086,Group 1,Group 2,LML134 5mg,Placebo


Notes: we can tell that some of groups did get mixed up:

for example, NCT00909779, group 1 from bm is the placebo, where as group 2 from the om is the placebo


##### Since imbalance is calculated only from BM measures, and effect only from OM measures, we don't need to match them, but if we were to use this dataset for other purposes we'd have to complete the work below to link the groups

### Correcting for the SD

In [83]:
drop_cols = ['g1title', 'g1des', 'g2title', 'g2des', 
             'g1title_om', 'g1des_om', 'g2title_om', 'g2des_om']
dfj = dfj.drop(drop_cols, axis=1)

In [84]:
dfj.columns

Index(['g1m', 'g1f', 'g2m', 'g2f', 'g1_mean', 'g2_mean', 'g1_dtype',
       'g2_dtype', 'g1_sd', 'g2_sd', 'num_parts1', 'num_parts2',
       'num_measures1', 'num_measures2'],
      dtype='object')

In [85]:
def generate_sd_fn(sdcol, tcol, dtypecol):
    def standardize_sd(row):
        g_sd = row[sdcol]
        gt = row[tcol]
        if row[dtypecol] == 'Standard Error':
            g_sd = g_sd * np.sqrt(gt)
        return g_sd
    return standardize_sd

dfj['g1t'] = dfj['g1f'] + dfj['g1m']
dfj['g2t'] = dfj['g2f'] + dfj['g2m']

# use num_measures as the unit to scale the listed sd
dfj['g1_sd_sample'] = dfj.apply(generate_sd_fn('g1_sd', 'num_measures1', 'g1_dtype'), axis=1)
dfj['g2_sd_sample'] = dfj.apply(generate_sd_fn('g2_sd', 'num_measures2', 'g2_dtype'), axis=1)

dfj = dfj.drop(['g1_sd', 'g2_sd', 'g1_dtype', 'g2_dtype'], axis=1)

## Imbalance

In [86]:
dfj['imbal'] = np.abs((dfj['g1m'] / dfj['g1t']) - (dfj['g2m'] / dfj['g2t']))

## Effect

In [87]:
numer = np.abs(dfj['g1_mean'] - dfj['g2_mean'])
denom = np.sqrt((dfj['num_measures1'] * dfj['g1_sd_sample'] ** 2)  + \
                 (dfj['num_measures2'] * dfj['g2_sd_sample'] ** 2) / 
                 (dfj['num_measures1'] + dfj['num_measures2']))

dfj['effect'] = numer / denom


In [88]:
dfj.to_pickle('effect_imbal_enhanced.p')

In [90]:
dfj[dfj['num_parts1'] > dfj['num_measures1']]

Unnamed: 0,g1m,g1f,g2m,g2f,g1_mean,g2_mean,num_parts1,num_parts2,num_measures1,num_measures2,g1t,g2t,g1_sd_sample,g2_sd_sample,imbal,effect
NCT00909779,243.0,178.0,236.0,183.0,155.0,171.7,420,421,63,40,421.0,419.0,91.2,98.7,0.013951,0.022987
NCT01481935,63.0,30.0,56.0,41.0,20.5,17.2,97,93,55,49,93.0,97.0,15.574017,17.5,0.1001,0.028418
NCT02004977,209.0,254.0,204.0,237.0,8.3,0.9,463,441,8,8,463.0,441.0,9.8,1.6,0.011181,0.266747


In [91]:
dfj[dfj['num_parts2'] > dfj['num_measures2']]

Unnamed: 0,g1m,g1f,g2m,g2f,g1_mean,g2_mean,num_parts1,num_parts2,num_measures1,num_measures2,g1t,g2t,g1_sd_sample,g2_sd_sample,imbal,effect
NCT00909779,243.0,178.0,236.0,183.0,155.0,171.7,420,421,63,40,421.0,419.0,91.2,98.7,0.013951,0.022987
NCT01481935,63.0,30.0,56.0,41.0,20.5,17.2,97,93,55,49,93.0,97.0,15.574017,17.5,0.1001,0.028418
NCT02004977,209.0,254.0,204.0,237.0,8.3,0.9,463,441,8,8,463.0,441.0,9.8,1.6,0.011181,0.266747
