In [15]:
import pandas as pd
import os
import numpy as np

In [16]:
column_map = {
    'eid': 'eid', 
    'p30750_i0': 'HbA1c', 
    'p30740_i0': 'Random_Glucose', 
    'p41270': 'Diagnoses_ICD10', 
    'p41271': 'Diagnoses_ICD9',
    'p20003_i0': 'Medication', 
    'p21022': 'Age_at_recruitment', 
    'p31': 'Sex',
    'p189': 'Townsend_deprivation_index_at_recruitment', 
    'p22009_a1': 'Genetic_PC1', 
    'p22009_a2': 'Genetic_PC2', 
    'p22009_a3': 'Genetic_PC3', 
    'p22009_a4': 'Genetic_PC4', 
    'p22009_a5': 'Genetic_PC5',
    'p22009_a6': 'Genetic_PC6', 
    'p22009_a7': 'Genetic_PC7', 
    'p22009_a8': 'Genetic_PC8', 
    'p22009_a9': 'Genetic_PC9', 
    'p22009_a10': 'Genetic_PC10',
    'p2986_i0': 'Started_insulin_within_one_year_of_diagnosis', 
    'p6177_i0': 'Medication_for_cholesterol_bp_diabetes', 
    'p6153_i0': 'Medication_for_cholesterol_bp_diabetes_or_exogenous_hormones', 
    'p21001_i0': 'BMI', 
    'p42040': 'GP_clinical_event_record', 
    'p42016': 'Date_of_COPD',
    'p42017': 'Source_of_COPD', 
    'p20002_i0': 'Non_cancer_illness_code_self_reported',
    'p21000_i0': 'Ethnic_background',
    'p40006_i0': 'Type_of_cancer_ICD10',
    'p40013_i0': 'Type_of_cancer_ICD9',
}
self_reported_codes = {
    'T2D': '1223', 
    'T1D':'1222', 
    'Gestational diabetes':'1221'
}
glucose_med_codes = {
    "Insulin product": "140883066",
    "Metformin": "1140884600",
    "gliclazide": "1140874744",
    "pioglitazone": "1141171646",
    "rosiglitazone": "1141177600",
    "glimepiride": "1141152590",
    "glucophage 500mg tablet": "1140874686",
    "rosiglitazone 1mg / metformin 500mg tablet": "1141189090",
    "avandamet 1mg / 500mg tablet": "1141189094",
    "glibenclamide": "1140874718",
    "glipizide": "1140874646",
    "actos 15mg tablet": "1141171652",
    "repaglinide": "1141168660",
    "glyclizide": "1140910566",
    "avandia 4mg tablet": "1141177606",
    "diamicron 80mg tablet": "1140874746",
    "tolbutamide": "1140874674",
    "acarbose": "1140868902",
    "amaryl 1mg tablet": "1141156984",
    "nateglinide": "1141173882"
}
ICD10_codes = {
    "T2D": "E11",
    "T1D": "E10",
    "Gestational diabetes": "O24",
    "Renal agenesis and other reduction defects of kidney": "Q60",
    "Cystic kidney disease": "Q61",
    "Other congenital malformations of kidney": "Q63",
    "Unspecified contracted kidney": "N26",
    "Small kidney of unknown cause": "N27",
    "Calculus of kidney and ureter": "N20",
    "Other disorders of kidney and ureter, not elsewhere classified": "N28",
    "Other disorders of kidney and ureter in diseases classified elsewhere": "N29",
    "Hypertensive heart disease": "I11",
    "Hypertensive renal disease": "I12",
    "Hypertensive renal and heart disease": "I13",
    "Heart failure": "I50",
    "Other acute ischaemic heart disease": "I24",
    "Chronic ischaemic heart diseases": "I25",
    "Systemic lupus Erythematosus": "M32",
    "Other chronic obstructive pulmonary disease": "J44",
    "Fibrosis and cirrhosis of liver": "K74",
    "Hepatic failure, not elsewhere classified": "K72"
}

In [17]:
os.chdir('/mnt/sdh/upamanyu/GWANN/T2D_v2/')
ukb_data = pd.read_csv('GWANN_T2Dv2_Variables_participant.tsv', sep='\t')
for c in ['p41270', 'p20003_i0', 'p20002_i0']:
    ukb_data[c] = ukb_data[c].apply(lambda x: x.split('|') if isinstance(x, str) else [])
# Keep only the main ICD10 code e.g. E11.9 -> E11
ukb_data['p41270'] = ukb_data['p41270'].apply(lambda x: [xi[:-1] for xi in x])
ukb_data.head()

Unnamed: 0,eid,p30750_i0,p30740_i0,p41270,p20003_i0,p21022,p31,p189,p22009_a1,p22009_a2,...,p6177_i0,p6153_i0,p21001_i0,p42016,p42017,p20002_i0,p21000_i0,p40006_i0,p41271,p40013_i0
0,3686618,24.6,,"[D12, E88, I1, J43, K63, K64, Z09, Z87]",[2038460150],50.0,0,1.2331,-8.62607,5.48782,...,,-7.0,21.227,2021-06-25,21.0,[],1001.0,,,
1,1620979,36.1,4.852,[],"[1140876136, 1140883504]",41.0,0,1.09023,-12.7236,5.57298,...,,-7.0,31.5501,,,[],1001.0,,,
2,3509529,45.4,9.964,"[E11, E83, M16, M199, M257, M861]","[1140868226, 1140874744, 1140861958]",56.0,1,-4.27277,-13.7558,5.64773,...,1,,22.663,,,"[1220, 99999, 1465]",1001.0,C435,V252,
3,4426623,39.7,5.236,[],[1140868226],60.0,1,-2.06036,-7.38569,3.7055,...,-7,,23.6377,,,"[1413, 1405, 1417]",1001.0,,,
4,3585192,39.9,5.612,"[E78, E87, I1, I24, I25, I48, I95, J45, K59, R...","[1140879802, 1140860806, 1140861958]",65.0,1,-3.14689,-11.2248,5.42737,...,1|2,,24.9653,,,"[1065, 1111, 1608, 1473, 1452]",1001.0,,,


In [18]:
filt_df = ukb_data.copy()
print(f'{filt_df.shape[0]} participants')

502399 participants


# White british (UKB code 1001) with less than 0.125 relatedness

In [19]:
geno_ids = pd.read_csv('geno_ids.csv')['ID1'].to_list()
filt_df = filt_df[filt_df['eid'].isin(geno_ids)]
assert filt_df['p21000_i0'].unique() == [1001]
print(f'{filt_df.shape[0]} participants remaining after filtering for white british with less than 0.125 relatedness')

333455 participants remaining after filtering for white british with less than 0.125 relatedness


# Exclusion

In [20]:
print(f'{filt_df.shape[0]} participants remaining')

exclusion_diseases = diseases = [
    "Hypertensive heart disease",
    "Hypertensive renal disease",
    "Hypertensive renal and heart disease",
    "Heart failure",
    "Other acute ischaemic heart disease",
    "Chronic ischaemic heart diseases",
    "Systemic lupus Erythematosus",
    "Other chronic obstructive pulmonary disease",
    "Fibrosis and cirrhosis of liver",
    "Hepatic failure, not elsewhere classified"
]
exclusion_ICD10 = {ICD10_codes[d] for d in exclusion_diseases}
exclusion_ICD10 = filt_df['p41270'].apply(lambda x: len(set(x).intersection(exclusion_ICD10)) > 0).values
exclusion_diseases = "\n\t" + "\n\t".join(exclusion_diseases)
print(f'{np.count_nonzero(exclusion_ICD10)} participants with ICD10 for: {exclusion_diseases}')

exclusion_COPD = filt_df['p42017'].apply(lambda x: x in [0, 1, 2, 11, 12, 21, 22]).values
print(f'{np.count_nonzero(exclusion_COPD)} participants with COPD (algorithmically defined)')

exclusion_array = exclusion_ICD10 | exclusion_COPD
excluded = filt_df.loc[exclusion_array]['eid'].to_list()
print(f'{np.count_nonzero(~exclusion_array)} participants remaining after removing exclusions')

333455 participants remaining
47705 participants with ICD10 for: 
	Hypertensive heart disease
	Hypertensive renal disease
	Hypertensive renal and heart disease
	Heart failure
	Other acute ischaemic heart disease
	Chronic ischaemic heart diseases
	Systemic lupus Erythematosus
	Other chronic obstructive pulmonary disease
	Fibrosis and cirrhosis of liver
	Hepatic failure, not elsewhere classified
17553 participants with COPD (algorithmically defined)
282833 participants remaining after removing exclusions


# Case

In [21]:
case_ICD10 = filt_df['p41270'].apply(lambda x: ICD10_codes['T2D'] in set(x)).values
print(f'{np.count_nonzero(case_ICD10)} with T2D ICD10 code E11')

case_HbA1c = filt_df['p30750_i0'] > 47.5
print(f'{np.count_nonzero(case_HbA1c)} with HbA1c > 47.5')

case_selfreport = filt_df['p20002_i0'].apply(lambda x: self_reported_codes['T2D'] in x).values
print(f'{np.count_nonzero(case_selfreport)} with self-reported T2D')

case_medications = set(glucose_med_codes.values())
case_medications = filt_df['p20003_i0'].apply(lambda x: len(set(x).intersection(case_medications)) > 0).values
print(f'{np.count_nonzero(case_medications)} with insulin or glucose medications')

case_array = case_ICD10 | case_HbA1c | case_selfreport | case_medications
print(f'{np.count_nonzero(case_array)} participants in case group')

cases = filt_df.loc[case_array]['eid'].to_list()

24636 with T2D ICD10 code E11
11206 with HbA1c > 47.5
2105 with self-reported T2D
9537 with insulin or glucose medications
27595 participants in case group


# Control

In [22]:
diabetes_disease = ['T1D', 'T2D', 'Gestational diabetes']
diabetes_ICD10 = {ICD10_codes[d] for d in diabetes_disease}
control_ICD10 = filt_df['p41270'].apply(lambda x: len(set(x).intersection(diabetes_ICD10)) == 0).values
diabetes_disease = "\n\t" + "\n\t".join(diabetes_disease)
print(f'{np.count_nonzero(control_ICD10)} without ICD10 for: {diabetes_disease}')

control_HbA1c = filt_df['p30750_i0'] < 38.8
print(f'{np.count_nonzero(control_HbA1c)} with HbA1c < 38.8')

diabetes_disease = ['T1D', 'T2D', 'Gestational diabetes']
diabetes_selfreport = {self_reported_codes[d] for d in diabetes_disease}
control_selfreport = filt_df['p20002_i0'].apply(lambda x: len(set(x).intersection(diabetes_selfreport)) == 0).values
diabetes_disease = "\n\t" + "\n\t".join(diabetes_disease)
print(f'{np.count_nonzero(control_selfreport)} without self-reported: {diabetes_disease}')

control_glucose = filt_df['p30740_i0'] < 11.1
print(f'{np.count_nonzero(control_glucose)} with Random Glucose < 11.1')

kidney_diseases = ["Renal agenesis and other reduction defects of kidney", "Cystic kidney disease", 
                    "Other congenital malformations of kidney", "Unspecified contracted kidney", 
                    "Small kidney of unknown cause", "Calculus of kidney and ureter", 
                    "Other disorders of kidney and ureter, not elsewhere classified", 
                    "Other disorders of kidney and ureter in diseases classified elsewhere"]
kidney_ICD10 = {ICD10_codes[d] for d in kidney_diseases}
control_kidney = filt_df['p41270'].apply(lambda x: len(set(x).intersection(kidney_ICD10)) == 0).values
kidney_diseases = "\n\t" + "\n\t".join(kidney_diseases)
print(f'{np.count_nonzero(control_kidney)} without ICD10 for: {kidney_diseases}')

control_array = control_ICD10 & control_HbA1c & control_selfreport & control_glucose & control_kidney
print(f'{np.count_nonzero(control_array)} participants in control group')

controls = filt_df.loc[control_array]['eid'].to_list()

307990 without ICD10 for: 
	T1D
	T2D
	Gestational diabetes
260118 with HbA1c < 38.8
330930 without self-reported: 
	T1D
	T2D
	Gestational diabetes
288515 with Random Glucose < 11.1
322731 without ICD10 for: 
	Renal agenesis and other reduction defects of kidney
	Cystic kidney disease
	Other congenital malformations of kidney
	Unspecified contracted kidney
	Small kidney of unknown cause
	Calculus of kidney and ureter
	Other disorders of kidney and ureter, not elsewhere classified
	Other disorders of kidney and ureter in diseases classified elsewhere
216210 participants in control group


In [28]:
total = set(excluded).union(set(cases)).union(set(controls))
print(f'{"Total:":<10}{len(total)}')
print()
for e in [set(excluded), set()]:
    cos = set(controls).difference(e)
    cas = set(cases).difference(e.union(cos))
    print(f'{"Excluded:":<10}{len(e)}')
    print(f'{"Cases:":<10}{len(cas)}')
    print(f'{"Controls:":<10}{len(cos)}')
    print()

Total:    258577

Excluded: 50622
Cases:    16617
Controls: 191338

Excluded: 0
Cases:    27498
Controls: 216210

