In [4]:
import pandas as pd
import numpy as np

In [5]:
import matplotlib.pyplot as plt

In [6]:
import pickle

In [7]:
from tqdm import tqdm

In [8]:
from sklearn import preprocessing

In [9]:
# data_path:  directory the raw adrd_v3b CSV extracts are read from.
# save_path:  directory the final pickles are written to.
# save_path2: directory the intermediate main/rx/dx CSVs are written to.
data_path, save_path, save_path2 = (
    '../../adrd_v3b/',
    '../Processed/final/',
    '../Processed/intermediated/',
)

In [10]:
# Master table: the recorded outputs show one row per patid (row count equals
# the number of distinct patids), with outcome and demographic columns.
main = pd.read_csv(data_path + 'adrd_v3b_master.csv')

In [11]:
# Drug records: columns patid, gnrc_nm (drug code) and drug_start_date.
rx = pd.read_csv(data_path + 'adrd_v3b_drugs.csv')

In [12]:
# Diagnosis records: columns patid, fst_dt, icd_flag, diag and phecode.
dx = pd.read_csv(data_path + 'adrd_v3b_dx_phecode.csv')

In [11]:
# Preview the first row of the master table.
main.head(1)

Unnamed: 0,patid,adrd_dt,adrd,yrdob,race,gdr_cd,death_dt,state,age_onset,first_svc_dt,...,deleted,head_injury,heart_disease,diabetes,vascular_disease,obesity,hypertension,anxiety,depression,hyperlipidemia
0,33003315152,2007-08-24 00:00:00,1,1930,W,M,2012-09-01,MO,77,2007-02-21 00:00:00,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Preview the first row of the drug table.
rx.head(1)

Unnamed: 0,patid,gnrc_nm,drug_start_date
0,33012860824,DB00381,2007-07-08


In [13]:
# Preview the first row of the diagnosis table.
dx.head(1)

Unnamed: 0,patid,fst_dt,icd_flag,diag,phecode
0,33042074081,2017-03-31,10,I10,401.1


In [18]:
# Row count of main next to the number of distinct patients in rx and in dx.
len(main), len(rx['patid'].unique()), len(dx['patid'].unique())

(2327858, 2326341, 2326953)

In [14]:
# Year of each patient's last service date: the text before the first '-'
# of last_svc_dt, converted to int.
# NOTE(review): end_yr is not referenced again in the visible notebook.
end_yr = main['last_svc_dt'].map(lambda stamp: int(str(stamp).partition('-')[0]))

In [15]:
# Distinct values of the race column (missing values show up as nan).
main['race'].unique()

array(['W', 'A', 'B', 'H', nan], dtype=object)

In [16]:
# Number of distinct patient IDs in main.
len(main['patid'].unique())

2327858

# Anti-asthma categories

In [13]:
# Drug codes grouped as agonists (first entry of drug_prior).
# DB09076 and DB00277 are also listed under anticholinergic and xanthine.
agonist = [
    'DB01001', 'DB13139', 'DB09082', 'DB09076', 'DB01274',
    'DB15784', 'DB05039', 'DB00816', 'DB12846', 'DB00938',
    'DB00871', 'DB00277', 'DB00983',
]

In [14]:
# Drug codes grouped as corticosteroids (second entry of drug_prior).
corticosteroids = [
    'DB13867',
    'DB01222',
    'DB00764',
    'DB00394',
    'DB00180',
    'DB01410',
]

In [15]:
# Drug codes grouped as anticholinergics (third entry of drug_prior).
# DB09076 also appears in the agonist list.
anticholinergic = [
    'DB00332',
    'DB09076',
]

In [16]:
# Drug codes grouped as xanthines (fourth entry of drug_prior).
# DB00277 also appears in the agonist list.
xanthine = [
    'DB00277',
    'DB01303',
    'DB01223',
]

In [17]:
# Drug codes grouped under the leukotriene category (fifth entry of drug_prior).
leukotriene = [
    'DB00471',
    'DB00744',
    'DB00549',
    'DB01411',
]

In [18]:
# Category lists in label order: the position in this list (starting at 1)
# becomes the integer category stored in drug_dict.
drug_prior = [
    agonist,
    corticosteroids,
    anticholinergic,
    xanthine,
    leukotriene,
]

In [19]:
# Map every drug code to the 1-based position of its category in drug_prior.
# A code listed in more than one category keeps the label of the LAST category
# that contains it (the recorded mapping shows DB09076 -> 3 and DB00277 -> 4).
drug_dict = {
    code: category
    for category, codes in enumerate(drug_prior, start=1)
    for code in codes
}

In [20]:
# Display the drug code -> category mapping.
drug_dict

{'DB01001': 1,
 'DB13139': 1,
 'DB09082': 1,
 'DB09076': 3,
 'DB01274': 1,
 'DB15784': 1,
 'DB05039': 1,
 'DB00816': 1,
 'DB12846': 1,
 'DB00938': 1,
 'DB00871': 1,
 'DB00277': 4,
 'DB00983': 1,
 'DB13867': 2,
 'DB01222': 2,
 'DB00764': 2,
 'DB00394': 2,
 'DB00180': 2,
 'DB01410': 2,
 'DB00332': 3,
 'DB01303': 4,
 'DB01223': 4,
 'DB00471': 5,
 'DB00744': 5,
 'DB00549': 5,
 'DB01411': 5}

In [21]:
# Label each drug record with its anti-asthma category; codes that are not in
# drug_dict get 0.
rx['antiasthma'] = rx['gnrc_nm'].map(drug_dict).fillna(0).astype('int64')

In [22]:
# Records per (patient, anti-asthma category); the gnrc_nm count column is
# renamed to 'count'.
# NOTE(review): rx_new is reassigned from rx in the exclusion section further
# down before this result is used anywhere in the visible notebook.
rx_new = (
    rx.groupby(['patid', 'antiasthma'])
    .count()
    .reset_index()
    .rename(columns={'gnrc_nm': 'count'})
)

# Exclude patients with some specific conditions

- Depression: 296.2
- Schizophrenia: 295
- Parkinson disease: 332
- Stroke: 434.21
- ICP: 377 (note: the exclusion list `rm_lst` below uses 337 — confirm which code is intended)

In [23]:
# Phecodes whose presence excludes a patient; labels follow the markdown list
# above this cell.
rm_lst = [
    296.2,   # Depression
    295,     # Schizophrenia
    332,     # Parkinson disease
    434.21,  # Stroke
    337,     # NOTE(review): the markdown lists ICP as 377, not 337 — confirm
]

In [24]:
# Distinct patient IDs of each table, then the IDs present in all three.
dx_id, main_id, rx_id = (
    set(frame['patid'].unique()) for frame in (dx, main, rx)
)
final_list = list(main_id.intersection(dx_id, rx_id))

In [25]:
# Number of patients present in main, dx and rx.
len(final_list)

2325438

In [26]:
# Keep only the rows of each table that belong to patients in final_list.
dx_new, main_new, rx_new = (
    frame[frame['patid'].isin(final_list)] for frame in (dx, main, rx)
)

In [27]:
# Give each filtered table a fresh 0..n-1 integer index.
for frame in (dx_new, main_new, rx_new):
    frame.index = np.arange(len(frame))

In [28]:
# Diagnosis rows whose phecode is one of the exclusion codes in rm_lst.
has_excluded_code = dx_new['phecode'].isin(rm_lst)
rm_dx = dx_new[has_excluded_code]

In [29]:
# Distinct patients that have at least one exclusion diagnosis.
rm_id = list(pd.unique(rm_dx['patid']))

In [30]:
# Number of patients flagged for exclusion.
len(rm_id)

393306

In [31]:
# Drop the excluded patients from the master table.
is_excluded = main_new['patid'].isin(rm_id)
main_new = main_new[~is_excluded]

In [32]:
# Drop the excluded patients from the diagnosis and drug tables as well.
dx_new, rx_new = (
    frame[~frame['patid'].isin(rm_id)] for frame in (dx_new, rx_new)
)

In [33]:
# Reset all three tables to a 0..n-1 integer index after the exclusion.
for frame in (main_new, dx_new, rx_new):
    frame.index = np.arange(len(frame))

In [34]:
# Number of distinct patients left in main_new after the exclusion.
len(main_new['patid'].unique())

1932132

# Multiple treatments for one patient 

In [35]:
# Preview the filtered drug records.
rx_new.head()

Unnamed: 0,patid,gnrc_nm,drug_start_date,antiasthma
0,33012860824,DB00381,2007-07-08,0
1,33029969374,DB00904,2019-10-13,0
2,33028924187,DB00641,2011-09-19,0
3,33037338588,DB00264,2017-08-08,0
4,33005719004,DB00722,2012-05-09,0


In [36]:
# One group of drug records per patient.
rx_group = rx_new.groupby(by=['patid'])

In [37]:
def uniq_antiasthma(x):
    """Count the distinct non-zero ``antiasthma`` labels in a frame.

    Parameters
    ----------
    x : DataFrame with an ``antiasthma`` column; rows labelled 0 are ignored.

    Returns
    -------
    int
        Number of different non-zero labels (0 when every row is labelled 0).
    """
    treated = x.loc[x['antiasthma'] != 0, 'antiasthma']
    # dropna=False keeps the count equal to len(treated.unique()).
    return treated.nunique(dropna=False)

In [38]:
# Per patient: how many different anti-asthma categories were prescribed.
uni_a = rx_group.apply(uniq_antiasthma)

In [39]:
# Patients with at most one anti-asthma category (none or exactly one).
at_most_one = uni_a.le(1)
id_list = list(uni_a.loc[at_most_one].index)

In [40]:
# Number of patients with at most one anti-asthma category.
len(id_list)

1643647

In [41]:
# Drug records of the patients in id_list, as an independent copy.
in_id_list = rx_new['patid'].isin(id_list)
uniq_rx1 = rx_new[in_id_list].copy()

In [42]:
# Fresh 0..n-1 integer index for the filtered records.
n_records = len(uniq_rx1)
uniq_rx1.index = np.arange(n_records)

In [43]:
# Preview the records of the retained patients.
uniq_rx1.head()

Unnamed: 0,patid,gnrc_nm,drug_start_date,antiasthma
0,33012860824,DB00381,2007-07-08,0
1,33028924187,DB00641,2011-09-19,0
2,33037338588,DB00264,2017-08-08,0
3,33005719004,DB00722,2012-05-09,0
4,33037527113,DB01085,2014-04-18,0


In [44]:
# Number of drug records per (patient, anti-asthma category) pair.
per_patient_category = uniq_rx1.groupby(['patid', 'antiasthma'])
rx_count = per_patient_category.count()

In [45]:
# Turn the (patid, antiasthma) group keys back into ordinary columns.
rx_count = rx_count.reset_index(drop=False)

In [46]:
# Rows for an anti-asthma category (label != 0), then the number of patients
# with such a row and the number of patients without one.
rx_treatment = rx_count[rx_count['antiasthma'].ne(0)]
n_treated = len(rx_treatment['patid'].unique())
n_treated, len(rx_count['patid'].unique()) - n_treated

(336069, 1307578)

In [47]:
# Natural log of the per-(patient, category) record count.
# NOTE(review): a count of 1 gives log_count 0, which is a zero weight in the
# weighted sampling that follows; the dx section uses log(count + 1) instead.
# Confirm the difference is intended.
record_counts = rx_count['gnrc_nm']
rx_count['log_count'] = np.log(record_counts)

In [48]:
# Display the per-(patient, category) counts together with log_count.
rx_count

Unnamed: 0,patid,antiasthma,gnrc_nm,drug_start_date,log_count
0,33003282085,0,260,260,5.560682
1,33003282095,0,270,270,5.598422
2,33003282135,0,379,379,5.937536
3,33003282158,0,29,29,3.367296
4,33003282187,0,45,45,3.806662
...,...,...,...,...,...
1979332,33249744392,0,52,52,3.951244
1979333,33249744392,1,2,2,0.693147
1979334,33250788842,0,40,40,3.688879
1979335,33250967065,0,23,23,3.135494


In [49]:
# For every anti-asthma category (label != 0) draw 3000 rows with replacement,
# with probability proportional to log_count. Rows whose log_count is 0 have
# zero weight and cannot be drawn.
# random_state pins the draw so the sampled cohort is the same on every run;
# the original call was unseeded and produced a different cohort each time.
rx_sampled = (
    rx_count[rx_count['antiasthma'] != 0]
    .groupby('antiasthma')
    .apply(
        lambda g: g.sample(
            n=3000,
            replace=True,
            weights=g['log_count'],
            random_state=42,
        )
    )
)

In [50]:
# Sampling was done with replacement, so collapse repeated rows.
rx_sampled = rx_sampled.drop_duplicates()

In [51]:
# Rows per anti-asthma category after de-duplication.
rx_sampled['antiasthma'].value_counts()

1    2955
5    2740
3    2542
2    2341
4     707
Name: antiasthma, dtype: int64

In [52]:
# Keep only the patient ID, the category label and the log-count.
kept_columns = ['patid', 'antiasthma', 'log_count']
rx_sampled = rx_sampled[kept_columns]

In [53]:
# Preview the sampled rows.
rx_sampled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,patid,antiasthma,log_count
antiasthma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1272738,33052186267,1,1.609438
1,966100,33041559432,1,2.484907
1,693283,33037745354,1,1.609438
1,438444,33029742991,1,2.564949
1,1541123,33116159034,1,2.484907


# Dx code

In [54]:
# Preview the diagnosis table before building features from it.
dx.head(1)

Unnamed: 0,patid,fst_dt,icd_flag,diag,phecode
0,33042074081,2017-03-31,10,I10,401.1


In [10]:
# Parent phecode: the text before the first '.' of the phecode (401.1 -> '401').
# A missing phecode would become the string 'nan'.
# NOTE(review): this works on the full dx table, not the exclusion-filtered
# dx_new; dx is only narrowed to the sampled patients later on.
dx['phecode3'] = dx['phecode'].map(lambda code: str(code).partition('.')[0])

In [11]:
# Collapse dx to one row per (patient, parent phecode); the 'count' column is
# the number of non-null phecode entries in that group.
dx = (
    dx.groupby(['patid', 'phecode3'])
    .count()
    .reset_index()
    .rename(columns={'phecode': 'count'})
)

In [12]:
# log(count + 1) of the per-(patient, phecode3) diagnosis count.
dx['log_count'] = dx['count'].map(lambda n_codes: np.log(n_codes + 1))

In [13]:
# Cap log_count at its 95th percentile: values above the cap are set to the
# cap, everything else is left unchanged.
outlier = np.percentile(dx['log_count'], 95)
dx['log_count'] = dx['log_count'].clip(upper=outlier)

In [14]:
# Parent phecodes whose row count (dx has one row per patient and phecode3
# here) exceeds 5% of the number of distinct patients in dx.
rows_per_code = dx.groupby('phecode3')['patid'].count()
min_rows = dx['patid'].nunique() * 0.05
is_frequent = rows_per_code > min_rows
top_phecode = is_frequent[is_frequent].index

In [15]:
# Keep only the rows of the frequent parent phecodes.
is_top_code = dx['phecode3'].isin(top_phecode)
dx = dx[is_top_code]

In [16]:
# Parent phecodes whose log_count variance across rows exceeds 0.2.
# NOTE(review): high_var is only measured with len() afterwards; the visible
# notebook never filters dx with it — confirm whether a filter was intended.
code_variance = dx.groupby('phecode3')['log_count'].var()
exceeds_threshold = code_variance > 0.2
high_var = exceeds_threshold[exceeds_threshold].index

In [17]:
# Number of parent phecodes whose log_count variance exceeds 0.2.
len(high_var)

222

In [62]:
# Patients that were sampled AND still have rows in dx.
sampled_ids = set(rx_sampled['patid'].unique())
patids = list(sampled_ids.intersection(dx['patid'].unique()))

# Encoder

In [63]:
# Alias only: no further sub-sampling happens here, the full list is used.
patids_sample=patids

In [64]:
# Encoder that maps the retained raw patient IDs to consecutive integers.
le_patid = preprocessing.LabelEncoder()
le_patid.fit(patids_sample)

LabelEncoder()

In [65]:
# Restrict the sampled rows to the encoder's patients, then replace the raw
# patient ID with its integer code.
known_patient = rx_sampled['patid'].isin(le_patid.classes_)
rx_sampled = rx_sampled[known_patient].copy()
rx_sampled['patid'] = le_patid.transform(rx_sampled['patid'])

In [66]:
# Restrict dx to the encoder's patients, then replace the raw patient ID with
# its integer code.
known_patient = dx['patid'].isin(le_patid.classes_)
dx = dx[known_patient].copy()
dx['patid'] = le_patid.transform(dx['patid'])

In [67]:
# Encode the remaining parent phecodes as consecutive integers.
le_dx = preprocessing.LabelEncoder().fit(dx['phecode3'].unique())
dx['phecode3'] = le_dx.transform(dx['phecode3'])

In [68]:
# Treatment table indexed and ordered by encoded patient ID.
# From here on `rx` no longer refers to the raw drug records loaded at the top.
rx = rx_sampled.set_index('patid')
rx = rx.sort_index()

In [69]:
# Preview the treatment table.
rx.head()

Unnamed: 0_level_0,antiasthma,log_count
patid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,2.890372
1,2,1.94591
2,2,2.772589
3,1,2.397895
4,3,1.791759


# Demographics + time

In [70]:
# Row count of the full master table.
len(main)

2327858

In [71]:
# Number of patients known to the patient-ID encoder.
len(le_patid.classes_)

11285

In [72]:
# Index main by raw patient ID while keeping patid as a column.
main = main.set_index('patid', drop=False)

In [73]:
# Binary sex indicator: 1 when gdr_cd is 'F', 0 for anything else (missing
# values included); the source column is dropped afterwards.
main['female'] = (main['gdr_cd'] == 'F').astype('int64')
main = main.drop(columns='gdr_cd')

In [74]:
# Replace missing race values with 'U'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is a chained in-place update, which recent pandas
# versions deprecate and which has no effect under Copy-on-Write.
main['race'] = main['race'].fillna('U')

In [75]:
# Check the race values after filling the missing entries.
main['race'].unique()

array(['W', 'A', 'B', 'H', 'U'], dtype=object)

In [16]:
# One-hot encode race. With prefix 'race_' and the default '_' separator the
# new columns are named 'race__<value>'; the original race column is dropped.
# NOTE(review): the recorded preview of the reloaded main table still shows a
# 'race' column — confirm this cell ran before the CSV export.
race_dummies = pd.get_dummies(main['race'], prefix='race_')
main = main.merge(race_dummies, left_index=True, right_index=True)
main = main.drop(columns='race')

In [76]:
# Select the encoder's patients in the encoder's class order, then swap the
# raw-ID index for the encoded IDs.
cohort_ids = le_patid.classes_
main = main.loc[cohort_ids]
main.index = le_patid.transform(main.index)

In [77]:
# Write the three intermediate tables (index included) to save_path2.
for frame, fname in ((main, 'main.csv'), (rx, 'rx.csv'), (dx, 'dx.csv')):
    frame.to_csv(save_path2 + fname)

# Split dataset 

In [19]:
from sklearn.model_selection import train_test_split
import random

In [20]:
# Reload the intermediate tables. The index saved with each CSV comes back as
# an ordinary column.
main, rx, dx = (
    pd.read_csv(save_path2 + fname)
    for fname in ('main.csv', 'rx.csv', 'dx.csv')
)

In [10]:
# Preview the reloaded diagnosis table.
dx.head()

Unnamed: 0.1,Unnamed: 0,patid,phecode3,fst_dt,icd_flag,diag,count,log_count
0,12709,0,7,15,15,15,15,2.772589
1,12710,0,8,5,5,5,5,1.791759
2,12711,0,11,1,1,1,1,0.693147
3,12712,0,14,2,2,2,2,1.098612
4,12715,0,15,2,2,2,2,1.098612


In [10]:
# Keep only the columns used downstream.
dx_columns = ['patid', 'phecode3', 'count', 'log_count']
dx = dx[dx_columns]

In [11]:
# Preview the reloaded treatment table.
rx.head()

Unnamed: 0,patid,antiasthma,log_count
0,0,2,2.890372
1,1,2,1.94591
2,2,2,2.772589
3,3,1,2.397895
4,4,3,1.791759


In [12]:
# Preview the first row of the reloaded master table.
main.head(1)

Unnamed: 0.1,Unnamed: 0,patid,adrd_dt,adrd,yrdob,race,death_dt,state,age_onset,first_svc_dt,...,head_injury,heart_disease,diabetes,vascular_disease,obesity,hypertension,anxiety,depression,hyperlipidemia,female
0,0,33003288277,2020-02-12 00:00:00,0,1934,U,,OK,86,2007-01-03 00:00:00,...,0,1,0,1,0,1,0,0,0,1


### Train test split
Tables:
- main
- dx
- rx

Outputs:
- X: Selected features (the `selected_features` columns of main)
- T: Treatment (rx)
- Y: ADRD
- W: dx

In [21]:
# 60 / 20 / 20 split of the row positions of main into train, validation and
# test index lists.
# Changes from the original cell:
#   * a seeded, local random.Random makes the split reproducible without
#     touching the global random state;
#   * membership tests use sets, so building the lists is linear rather than
#     quadratic in the number of patients.
# train_idx holds positions in range(len(main)) while total_idx holds index
# labels; the two agree because main has a default 0..n-1 index after reload.
split_rng = random.Random(42)
total_idx = list(main.index)
train_idx = split_rng.sample(range(len(main)), int(0.6*len(main)))
train_lookup = set(train_idx)
rest_idx = [idx for idx in total_idx if idx not in train_lookup]
val_idx = split_rng.sample(rest_idx, int(0.5*len(rest_idx)))
val_lookup = set(val_idx)
test_idx = [idx for idx in rest_idx if idx not in val_lookup]

In [23]:
# Sizes of the train / validation / test index lists.
len(train_idx), len(val_idx), len(test_idx)

(6771, 2257, 2257)

In [13]:
# Covariate columns: age at onset, the female indicator and the race dummies.
race_levels = ['A', 'B', 'H', 'U', 'W']
selected_features = ['age_onset', 'female', *('race__' + level for level in race_levels)]

In [26]:
# Preview the first row of the treatment table.
rx.head(1)

Unnamed: 0,patid,antiasthma,log_count
0,0,2,2.890372


In [27]:
# Preview the first row of the diagnosis feature table.
dx.head(1)

Unnamed: 0,patid,phecode3,count,log_count
0,0,7,15,2.772589


In [28]:
# Drop the column that holds the index saved with the CSV, then show the
# index that remains.
main = main.drop(columns=['Unnamed: 0'])
main.index

RangeIndex(start=0, stop=11285, step=1)

In [18]:
# X: covariate columns; Y: the adrd column.
X = main.loc[:, selected_features]
Y = main.loc[:, 'adrd']

In [29]:
# Row count of the treatment table.
len(rx)

11285

In [30]:
# Slice every table into train / validation / test parts.
# X, Y and rx are sliced by row POSITION; dx is filtered by matching its
# patid column against the same index lists.
# NOTE(review): the positional slices assume rx rows are in the same order as
# main rows — confirm before reusing this cell with differently built tables.

# Train
X_train, Y_train, T_train = X.iloc[train_idx], Y.iloc[train_idx], rx.iloc[train_idx]
W_train = dx[dx['patid'].isin(train_idx)]

# Validation
X_val, Y_val, T_val = X.iloc[val_idx], Y.iloc[val_idx], rx.iloc[val_idx]
W_val = dx[dx['patid'].isin(val_idx)]

# Test
X_test, Y_test, T_test = X.iloc[test_idx], Y.iloc[test_idx], rx.iloc[test_idx]
W_test = dx[dx['patid'].isin(test_idx)]

In [31]:
# Persist each split as a (Y, T, X, W) tuple.
# The files are opened in `with` blocks so every handle is flushed and closed;
# the original passed a bare open(...) to pickle.dump and never closed it.
with open(save_path+'YTXW_train.pkl','wb') as fh:
    pickle.dump((Y_train, T_train, X_train, W_train), fh)
with open(save_path+'YTXW_val.pkl','wb') as fh:
    pickle.dump((Y_val, T_val, X_val, W_val), fh)
with open(save_path+'YTXW_test.pkl','wb') as fh:
    pickle.dump((Y_test, T_test, X_test, W_test), fh)

In [None]:
# Persist the patient-ID and phecode encoders.
# Files are opened in `with` blocks so the handles are closed after writing.
# NOTE(review): le_patid and le_dx are created in the encoder section and are
# not reloaded with the CSVs — this cell needs them to still be in memory.
with open(save_path+'le_patid.pkl','wb') as fh:
    pickle.dump(le_patid, fh)
with open(save_path+'le_dx.pkl','wb') as fh:
    pickle.dump(le_dx, fh)

In [120]:
# Persist the drug code -> category mapping.
# The file is opened in a `with` block so the handle is closed after writing.
with open(save_path+'drug_dict.pkl', 'wb') as fh:
    pickle.dump(drug_dict, fh)