In [5]:
import numpy as np
import pandas as pd

In [6]:
# Load the results table, keep the rows flagged t2dm == 1, and show how
# many of those rows fall into each Subtype.
df = pd.read_csv('data/sustain_results_biomarkers.csv')
t2dm_rows = df['t2dm'] == 1
df_t2dm = df[t2dm_rows]
t2dm_subtype_counts = df_t2dm['Subtype'].value_counts()
print(t2dm_subtype_counts)

Subtype
Subtype 2    10903
Subtype 1     9402
Name: count, dtype: int64


In [7]:
# Read the three auxiliary tables (brain disorders, cognitive measures,
# common diseases) in one pass; the unpacking order matches the path order.
df_brain_disease, df_cognitive, df_comm_disease = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/diseases/data_brain_disorder_all.csv',
        'data/cognitive/data_cognitive_all.csv',
        'data/diseases/common_disease.csv',
    )
)

In [8]:
# Restrict the results table to the identifier, covariate and
# subtype/stage columns used from here on.
keep_columns = [
    'Eid',
    't2dm',
    'Sex',
    'Age',
    'Drinking_status',
    'Smoking_status',
    'Income',
    'Education',
    'Subtype',
    'Stage',
]
df = df[keep_columns]

In [9]:
# merge the data
# Attach the brain-disorder, cognitive and common-disease columns to df on
# the shared 'Eid' key. how='left' keeps every row of df.
# validate='many_to_one' makes pandas raise MergeError if an Eid is repeated
# in one of the right-hand tables; without it such a repeat would silently
# duplicate rows of df and inflate every count computed from it.
for extra_table in (df_brain_disease, df_cognitive, df_comm_disease):
    df = df.merge(extra_table, on='Eid', how='left', validate='many_to_one')
print(df['Subtype'].value_counts())

Subtype
Subtype 1    20773
Subtype 2    19837
Name: count, dtype: int64


In [10]:
# subtype if df['t2dm'] == 0 then 'control' else subtype
# Series.mask overwrites the entries where the condition is True and leaves
# the rest untouched, so rows with t2dm == 0 become 'control' and all other
# rows keep their existing Subtype. This is the vectorised equivalent of the
# former row-wise df.apply(..., axis=1) and avoids one Python call per row.
df['Subtype'] = df['Subtype'].mask(df['t2dm'] == 0, 'control')

In [11]:
# Show the number of rows per Subtype label currently in df.
subtype_counts = df['Subtype'].value_counts()
print(subtype_counts)

Subtype
control      20305
Subtype 2    10903
Subtype 1     9402
Name: count, dtype: int64


In [12]:
# Write the merged table to disk without the DataFrame index column.
df.to_csv(path_or_buf='data/data_subtype.csv', index=False)

In [13]:
# idp data
# Read the IDP table from its CSV file.
df_idp = pd.read_csv(filepath_or_buffer='data/idp/data_587idp_v2.csv')

In [14]:
# Inner join on 'Eid': only rows whose Eid appears in both df and df_idp
# are kept.
df_idp_sub = pd.merge(df, df_idp, on='Eid', how='inner')

In [15]:
# Write the joined table (df columns plus IDP columns) to disk without the
# DataFrame index column.
df_idp_sub.to_csv(path_or_buf='data/data_subtype_idp.csv', index=False)

In [17]:
# Print the column index of df, then the per-Subtype row counts.
for summary in (df.columns, df['Subtype'].value_counts()):
    print(summary)

Index(['Eid', 't2dm', 'Sex', 'Age', 'Drinking_status', 'Smoking_status',
       'Income', 'Education', 'Subtype', 'Stage', 'Alzheimers_disease',
       'Anorexia_nervosa', 'Anxiety_disorder', 'Bipolar_disorder',
       'Depression_disorder', 'Epilepsy', 'Multiple_sclerosis',
       'Obsessive_compulsive_disorder', 'Parkinsons_disease',
       'post_traumatic_stress_disorder', 'Schizophrenia', 'Sleep_disorder',
       'Stroke', 'Maximum_digits_remembered_correctly',
       'Number_of_symbol_digit_matches_made_correctly',
       'Mean_time_to_correctly_identify_matches', 'Fluid_intelligence_score',
       'Prospective_memory_result',
       'Errors_before_selecting_correct_item_in_alphanumeric_path',
       'Duration_to_complete_alphanumeric_path', 'hypertension',
       'renal_failure', 'heart_failure', 'obesity', 'hyperlipidemia',
       'ischemic_heart_disease', 'retinopathy', 'asthma',
       'acute_kidney_failure', 'chronic_kidney_disease', 'cardiac_arrhythmias',
       'glomerulus_

In [19]:
# Disease flag columns to tabulate, in output order. The two sub-lists are a
# reading aid only; the grouping follows the column order printed for df
# (presumably brain-disorder table first, then common-disease table; verify
# against the source CSVs).
brain_disorders = [
    'Alzheimers_disease',
    'Anorexia_nervosa',
    'Anxiety_disorder',
    'Bipolar_disorder',
    'Depression_disorder',
    'Epilepsy',
    'Multiple_sclerosis',
    'Obsessive_compulsive_disorder',
    'Parkinsons_disease',
    'Schizophrenia',
    'Sleep_disorder',
    'Stroke',
]
common_diseases = [
    'hypertension',
    'heart_failure',
    'ischemic_heart_disease',
    'cardiac_arrhythmias',
    'acute_kidney_failure',
    'chronic_kidney_disease',
    'glomerulus_nephritis',
    'obesity',
    # 'hyperlipidemia',
    'retinopathy',
    'asthma',
]
diseases = brain_disorders + common_diseases

In [24]:
# diseases cases in each subtype
# For every column named in `diseases`, count the rows whose flag equals 1
# within each group. `cases` has one row per disease (same order as
# `diseases`) and one column per group, ordered control, Subtype 1, Subtype 2.
import numpy as np
groups = ['control', 'Subtype 1', 'Subtype 2']
# Group membership does not depend on the disease, so the masks are built
# once here rather than re-filtering the whole frame three times per disease.
group_masks = [df['Subtype'] == group for group in groups]
cases = np.zeros((len(diseases), len(groups)), dtype=int)
for i, disease in enumerate(diseases):
    print(disease)
    has_disease = df[disease] == 1
    # Summing a boolean mask counts the matching rows without materialising
    # a filtered copy of df; missing values compare False and are not counted.
    cases[i, :] = [int((has_disease & mask).sum()) for mask in group_masks]

Alzheimers_disease
Anorexia_nervosa
Anxiety_disorder
Bipolar_disorder
Depression_disorder
Epilepsy
Multiple_sclerosis
Obsessive_compulsive_disorder
Parkinsons_disease
Schizophrenia
Sleep_disorder
Stroke
hypertension
heart_failure
ischemic_heart_disease
cardiac_arrhythmias
acute_kidney_failure
chronic_kidney_disease
glomerulus_nephritis
obesity
retinopathy
asthma


In [25]:
# save the cases
# Wrap the count matrix in a labelled frame (rows = diseases, columns =
# groups) and write it out; the index is kept so the disease names are saved.
group_columns = ['control', 'Subtype 1', 'Subtype 2']
df_cases = pd.DataFrame(data=cases, index=diseases, columns=group_columns)
df_cases.to_csv(path_or_buf='data/disease_cases.csv')