In [6]:
from datetime import datetime
from math import ceil

import numpy as np
import pandas as pd

In [2]:
# ICD-10 code prefixes for each diabetes group (T1DM, T2DM, other DM).
# A later cell matches them against subject codes with str.startswith.
icd10_t1dm, icd10_t2dm = ['E10'], ['E11']

# NOTE: this name is spelled `icd1_...` (not `icd10_...`); a later cell refers to it
# under exactly this spelling, so it is kept as-is.
icd1_other_dm = ['E12', 'E13', 'E14']

In [17]:
# Self-reported diabetes codes, one list per group
# (T1DM, T2DM, other DM, and diabetes reported without a specific type).
sr_t1dm, sr_t2dm = [1222], [1223]
sr_other_dm = [1221, 1521]
sr_alltype_dm = [1220]

In [4]:
# Load the ICD-10 codes, the self-reported non-cancer codes, and the matching
# time tables for all subjects. The files are read in the order listed.
df_icd, df_icd_times, df_noncancer, df_noncancer_times = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        'data/main_data_icd10.csv',
        'data/time_icd10.csv',
        'data/self_reported_noncancer.csv',
        'data/self_reported_noncancer_times.csv',
    )
)

In [5]:
# Later cells pair rows of the code tables and time tables purely by position,
# so the tables must describe the same subjects in the same order.
eid_aligned = df_icd['Eid'].equals(df_noncancer['Eid'])
print(eid_aligned)
# Fail loudly instead of only printing: a mismatch would silently attach
# diagnoses to the wrong subject in every downstream file.
assert eid_aligned, 'Eid columns of df_icd and df_noncancer differ; rows cannot be paired by position'
# Each *_times table is indexed with the same row number as its code table.
assert len(df_icd_times) == len(df_icd), 'time_icd10.csv and main_data_icd10.csv have different row counts'
assert len(df_noncancer_times) == len(df_noncancer), (
    'self_reported_noncancer_times.csv and self_reported_noncancer.csv have different row counts'
)

True


In [15]:
def _without_first_column(frame):
    """Return every column of `frame` except the first one as a numpy array."""
    # presumably the first column is the subject id (Eid) -- confirm against the CSV headers
    return frame.iloc[:, 1:].to_numpy()


# code / time tables as plain arrays, one row per subject
icd_codes = _without_first_column(df_icd)
icd_code_times = _without_first_column(df_icd_times)
noncancers_codes = _without_first_column(df_noncancer)
noncancers_code_times = _without_first_column(df_noncancer_times)

# Per-subject disease marker and disease date, both initialised to 0.
# The four columns are later labelled t1dm, t2dm, other_dm, all_dm.
n_subjects = len(icd_codes)
dis_marker = np.zeros((n_subjects, 4), dtype=np.int32)
dis_date = np.zeros((n_subjects, 4), dtype=float)

In [16]:
# Search every subject's ICD-10 codes for T1DM, T2DM and other DM.
# `datetime` is imported in the notebook's import cell.

# Prefix groups in dis_marker / dis_date column order: 0 = t1dm, 1 = t2dm, 2 = other dm.
# str.startswith accepts a tuple, so a single call tests every prefix of a group.
icd_prefix_groups = (tuple(icd10_t1dm), tuple(icd10_t2dm), tuple(icd1_other_dm))

for i in range(len(icd_codes)):
    for t, icd_code in enumerate(icd_codes[i]):
        # a non-string (NaN) cell means there are no more codes for this subject
        if not isinstance(icd_code, str):
            break
        for col, prefixes in enumerate(icd_prefix_groups):
            if not icd_code.startswith(prefixes):
                continue
            dis_marker[i][col] = 1
            raw_date = icd_code_times[i][t]
            if str(raw_date) == 'nan':
                # diagnosis recorded without a date: keep the marker, leave the date untouched
                continue
            # NOTE: strptime yields a naive datetime, so timestamp() interprets it in the
            # local timezone of the machine running the notebook.
            ts = datetime.strptime(raw_date, '%d/%m/%Y').timestamp()
            # A subject can carry several codes of the same group with different dates.
            # Keep the earliest one instead of letting the last match overwrite it.
            # 0 is the "no date yet" value left by the np.zeros initialisation.
            if dis_date[i][col] == 0 or ts < dis_date[i][col]:
                dis_date[i][col] = ts

In [18]:
# Search every subject's self-reported non-cancer codes for T1DM, T2DM, other DM
# and all-type DM. `ceil` and `datetime` are imported in the notebook's import cell.


def _self_report_timestamp(interp_year):
    """Convert a self-reported fractional year (e.g. 2005.5) to a POSIX timestamp.

    Returns 0.0 (the "no date" value used by dis_date) when the value is NaN,
    equals -1, or its year part is not later than 1970. Otherwise returns the
    timestamp of the first day of the month derived from the fractional part.
    """
    if np.isnan(interp_year) or interp_year == -1:
        return 0.
    yr = int(interp_year)
    if yr <= 1970:
        return 0.
    # Round the fractional year UP to a month number in 1..12. The previous
    # `ceil(int(frac * 12))` truncated before rounding, which made `ceil` a no-op
    # and meant December could never be produced.
    mon = max(1, ceil((interp_year - yr) * 12))
    # NOTE: naive datetime, so timestamp() is interpreted in the local timezone.
    return datetime(year=yr, month=mon, day=1).timestamp()


# Code groups in dis_marker / dis_date column order:
# 0 = t1dm, 1 = t2dm, 2 = other dm, 3 = all-type dm.
sr_code_groups = (sr_t1dm, sr_t2dm, sr_other_dm, sr_alltype_dm)

for i in range(len(icd_codes)):
    for t, code in enumerate(noncancers_codes[i]):
        # `code == np.nan` is always False (NaN never equals itself); test explicitly
        if pd.isna(code):
            continue
        for col, sr_codes in enumerate(sr_code_groups):
            # Only fill groups that are still unmarked: a marker already set (by the
            # ICD search or by an earlier self-reported entry) keeps its date.
            if dis_marker[i][col] == 0 and code in sr_codes:
                dis_marker[i][col] = 1
                dis_date[i][col] = _self_report_timestamp(noncancers_code_times[i][t])

In [19]:
# Wrap the marker / date arrays in DataFrames that share the same group columns
dm_columns = ['t1dm', 't2dm', 'other_dm', 'all_dm']
df_dis_marker = pd.DataFrame(dis_marker, columns=dm_columns)
df_dis_date = pd.DataFrame(dis_date, columns=dm_columns)

# put the subject id in front of both tables
for dm_frame in (df_dis_marker, df_dis_date):
    dm_frame.insert(0, 'Eid', df_icd['Eid'])

In [20]:
# Number of subjects flagged in each group (column order of dis_marker)
for col, label in enumerate(('t1dm: ', 't2dm: ', 'other dm: ', 'all dm: ')):
    print(label, np.sum(dis_marker[:, col]))

t1dm:  5029
t2dm:  41783
other dm:  6433
all dm:  22966


In [24]:
# Write the full marker and date tables to data/data_preprocessed
for dm_frame, out_path in (
    (df_dis_marker, 'data/data_preprocessed/diabetes_marker.csv'),
    (df_dis_date, 'data/data_preprocessed/diabetes_date.csv'),
):
    dm_frame.to_csv(out_path, index=False)

In [26]:
# Collect the subjects to exclude: anyone flagged as other DM, plus anyone who
# is flagged all_dm but neither t1dm nor t2dm.
has_other_dm = df_dis_marker['other_dm'] == 1
untyped_dm_only = (
    (df_dis_marker['all_dm'] == 1)
    & (df_dis_marker['t1dm'] == 0)
    & (df_dis_marker['t2dm'] == 0)
)
subj_other_dm = df_dis_marker.loc[has_other_dm | untyped_dm_only, 'Eid'].to_numpy()
print(len(subj_other_dm))

9781


In [28]:
# Drop the excluded subjects (subj_other_dm) from both tables
marker_keep = ~df_dis_marker['Eid'].isin(subj_other_dm)
date_keep = ~df_dis_date['Eid'].isin(subj_other_dm)
df_dis_marker_removed_other_dm = df_dis_marker[marker_keep]
df_dis_date_removed_other_dm = df_dis_date[date_keep]

# remaining counts per group
for label, column in (('t1dm: ', 't1dm'), ('t2dm: ', 't2dm'), ('other dm: ', 'other_dm')):
    print(label, np.sum(df_dis_marker_removed_other_dm[column]))

t1dm:  3781
t2dm:  36899
other dm:  0


In [29]:
# Write the tables without other-DM subjects to data/data_preprocessed
for dm_frame, out_path in (
    (df_dis_marker_removed_other_dm, 'data/data_preprocessed/diabetes_marker_removed_other_dm.csv'),
    (df_dis_date_removed_other_dm, 'data/data_preprocessed/diabetes_date_removed_other_dm.csv'),
):
    dm_frame.to_csv(out_path, index=False)

In [30]:
# Collect the subjects flagged as both t1dm and t2dm
flagged_t1dm = df_dis_marker_removed_other_dm['t1dm'] == 1
flagged_t2dm = df_dis_marker_removed_other_dm['t2dm'] == 1
subj_both_dm = df_dis_marker_removed_other_dm.loc[flagged_t1dm & flagged_t2dm, 'Eid'].to_numpy()
print(len(subj_both_dm))

2804


In [31]:
# Drop the subjects flagged as both t1dm and t2dm (subj_both_dm) from both tables
marker_keep = ~df_dis_marker_removed_other_dm['Eid'].isin(subj_both_dm)
date_keep = ~df_dis_date_removed_other_dm['Eid'].isin(subj_both_dm)
df_dis_marker_removed_both_dm = df_dis_marker_removed_other_dm[marker_keep]
df_dis_date_removed_both_dm = df_dis_date_removed_other_dm[date_keep]

# remaining counts for t1dm and t2dm
for label, column in (('t1dm: ', 't1dm'), ('t2dm: ', 't2dm')):
    print(label, np.sum(df_dis_marker_removed_both_dm[column]))

t1dm:  977
t2dm:  34095


In [32]:
# Write the tables without both-type subjects to data/data_preprocessed
for dm_frame, out_path in (
    (df_dis_marker_removed_both_dm, 'data/data_preprocessed/diabetes_marker_removed_both_dm.csv'),
    (df_dis_date_removed_both_dm, 'data/data_preprocessed/diabetes_date_removed_both_dm.csv'),
):
    dm_frame.to_csv(out_path, index=False)

In [33]:
# Load the table of subjects that have brain IDPs
idps_csv = 'data/data_587idp_v2.csv'
df_idps = pd.read_csv(idps_csv)

In [41]:
# Keep only subjects present in both the filtered marker table and the IDP table
df_dis_with_idps = df_dis_marker_removed_both_dm.merge(df_idps, on='Eid', how='inner')
print(len(df_dis_with_idps))

42802


In [42]:
# Counts of t1dm / t2dm subjects that also have brain IDPs.
# other_dm is not reported: every subject with other_dm == 1 was removed in an earlier cell.
for label, column in (('t1dm with idps: ', 't1dm'), ('t2dm with idps: ', 't2dm')):
    print(label, np.sum(df_dis_with_idps[column]))

t1dm:  78
t2dm:  1752
other dm:  0
