In [None]:
import os
import pandas as pd

In [None]:
# Load the sample table (presumably a PLINK .fam: no header row, six columns).
# Split on any run of whitespace rather than a single space so that tab- or
# multi-space-delimited files parse the same way as single-space ones.
df_fam = pd.read_csv('data/allchr.fam', sep=r'\s+', header=None)
# Name the six positional columns; this raises if the file does not have
# exactly six fields per row.
df_fam.columns = ['FID', 'IID', 'PID', 'MID', 'SEX', 'PHENO']

In [None]:
# Per-sample subtype table; later cells read its 'Eid', 't2dm', 'Subtype'
# and covariate columns.
df_subtype = pd.read_csv('data/sustain_results_biomarkers_rmstage0.csv')

# Make sure one output folder exists per comparison (no error if already there).
for out_dir in ('results/subtype1', 'results/subtype2', 'results/subtype1_subtype2'):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Split by diabetes status: t2dm == 0 are controls, t2dm == 1 are T2DM cases.
df_control = df_subtype.loc[df_subtype['t2dm'].eq(0)]
df_t2dm = df_subtype.loc[df_subtype['t2dm'].eq(1)]
# Within the T2DM cases, separate the rows labelled 'Subtype 1' and 'Subtype 2'.
df_subtype1 = df_t2dm.loc[df_t2dm['Subtype'].eq('Subtype 1')]
df_subtype2 = df_t2dm.loc[df_t2dm['Subtype'].eq('Subtype 2')]

In [None]:
# Stack the groups row-wise to form the three comparison cohorts.
# controls + subtype 1 cases
df_control_subtype1 = pd.concat((df_control, df_subtype1), axis=0)
# controls + subtype 2 cases
df_control_subtype2 = pd.concat((df_control, df_subtype2), axis=0)
# subtype 1 cases + subtype 2 cases (no controls)
df_subtype1_subtype2 = pd.concat((df_subtype1, df_subtype2), axis=0)

In [None]:
# For each cohort, keep only the .fam rows whose IID appears among the
# cohort's Eid values (row order of df_fam is preserved).
df_control_subtype1_fam, df_control_subtype2_fam, df_subtype1_subtype2_fam = (
    df_fam[df_fam['IID'].isin(cohort['Eid'])]
    for cohort in (df_control_subtype1, df_control_subtype2, df_subtype1_subtype2)
)

In [None]:
# Write each cohort's .fam subset: space-delimited, no header or index,
# missing values spelled 'NA'.
fam_outputs = {
    'results/subtype1/subtype1.fam': df_control_subtype1_fam,
    'results/subtype2/subtype2.fam': df_control_subtype2_fam,
    'results/subtype1_subtype2/subtype.fam': df_subtype1_subtype2_fam,
}
for fam_path, fam_frame in fam_outputs.items():
    fam_frame.to_csv(fam_path, sep=' ', index=False, header=False, na_rep='NA')

In [None]:
# Columns of the subtype table carried into the phenotype files as covariates.
covariates = [
    'Sex',
    'Age',
    'Drinking_status',
    'Smoking_status',
    'Education',
    'Income',
]
# Columns of the subtype table carried into the phenotype files as outcomes.
pheno = [
    't2dm',
    'Subtype',
]

In [None]:
# Save the (FID, IID) pairs of each cohort as a two-column, space-delimited
# file with no header row.
fid_outputs = {
    'results/subtype1/subtype1.fid': df_control_subtype1_fam,
    'results/subtype2/subtype2.fid': df_control_subtype2_fam,
    'results/subtype1_subtype2/subtype.fid': df_subtype1_subtype2_fam,
}
for fid_path, fam_frame in fid_outputs.items():
    id_pairs = fam_frame[['FID', 'IID']]
    id_pairs.to_csv(fid_path, sep=' ', index=False, header=False, na_rep='NA')

In [None]:
# PHENO data
def _build_pheno_table(fam, cohort):
    """Attach phenotype and covariate columns to the matching .fam rows.

    Inner-joins ``fam`` (key ``IID``) with ``cohort`` (key ``Eid``), keeping
    only the ``pheno`` and ``covariates`` columns from ``cohort``, then drops
    the now-redundant ``Eid`` key.

    Args:
        fam: .fam rows for one cohort (must contain ``IID``).
        cohort: subtype-table rows for the same cohort (must contain ``Eid``
            plus every column named in ``pheno`` and ``covariates``).

    Returns:
        A new DataFrame: the .fam columns followed by ``pheno + covariates``.
    """
    wanted = ['Eid'] + pheno + covariates
    merged = pd.merge(fam, cohort[wanted], left_on='IID', right_on='Eid')
    return merged.drop(columns=['Eid'])


df_control_subtype1_pheno = _build_pheno_table(df_control_subtype1_fam, df_control_subtype1)
df_control_subtype2_pheno = _build_pheno_table(df_control_subtype2_fam, df_control_subtype2)
df_subtype1_subtype2_pheno = _build_pheno_table(df_subtype1_subtype2_fam, df_subtype1_subtype2)

# Shift the 0/1 status to 1/2 coding: 1 = control, 2 = T2DM case.
df_control_subtype1_pheno['t2dm'] = df_control_subtype1_pheno['t2dm'].astype(int) + 1
df_control_subtype2_pheno['t2dm'] = df_control_subtype2_pheno['t2dm'].astype(int) + 1
# pheno for subtype1_subtype2: 1: subtype1, 2: subtype2
# (anything that is not exactly 'Subtype 1' is coded 2, as before)
df_subtype1_subtype2_pheno['t2dm'] = (
    df_subtype1_subtype2_pheno['Subtype'].eq('Subtype 1').map({True: 1, False: 2})
)

# The 'Subtype' labels contain a space ('Subtype 1'), but these tables are
# written as space-delimited text. pandas would quote the field, and any
# whitespace tokenizer would then see an extra column and shift every
# covariate after it. Replace whitespace with '_' so each value is one token.
# This must happen after the t2dm coding above, which compares the raw labels.
for pheno_table in (
    df_control_subtype1_pheno,
    df_control_subtype2_pheno,
    df_subtype1_subtype2_pheno,
):
    pheno_table['Subtype'] = pheno_table['Subtype'].replace(r'\s+', '_', regex=True)

print(df_subtype1_subtype2_pheno['t2dm'].value_counts())

In [None]:
# Write each phenotype table as space-delimited text with a header row;
# missing values are spelled 'NA'.
pheno_outputs = {
    'results/subtype1/subtype1.pheno': df_control_subtype1_pheno,
    'results/subtype2/subtype2.pheno': df_control_subtype2_pheno,
    'results/subtype1_subtype2/subtype.pheno': df_subtype1_subtype2_pheno,
}
for pheno_path, pheno_frame in pheno_outputs.items():
    pheno_frame.to_csv(pheno_path, sep=' ', index=False, header=True, na_rep='NA')

In [None]:
# Sanity check: counts per 't2dm' code in each control-vs-subtype table.
for pheno_frame in (df_control_subtype1_pheno, df_control_subtype2_pheno):
    print(pheno_frame['t2dm'].value_counts())

In [None]:
# Sanity check: counts per 't2dm' code in the subtype-1-vs-subtype-2 table.
subtype_code_counts = df_subtype1_subtype2_pheno['t2dm'].value_counts()
print(subtype_code_counts)

In [None]:
# Read the existing space-delimited '.phenos' tables (header row expected).
# NOTE(review): no cell above writes '.phenos' files (only '.pheno'), so these
# are presumably produced by an external step -- confirm they exist before
# running this cell on a fresh checkout.
df_subtype1_pheno, df_subtype2_pheno = (
    pd.read_csv(phenos_path, sep=' ')
    for phenos_path in (
        'results/subtype1/subtype1.phenos',
        'results/subtype2/subtype2.phenos',
    )
)

In [None]:
# Restrict each '.phenos' table to samples whose IID is in the matching
# control-plus-subtype cohort.
in_cohort1 = df_subtype1_pheno['IID'].isin(df_control_subtype1['Eid'])
df_subtype1_pheno = df_subtype1_pheno[in_cohort1]

in_cohort2 = df_subtype2_pheno['IID'].isin(df_control_subtype2['Eid'])
df_subtype2_pheno = df_subtype2_pheno[in_cohort2]

In [None]:
# Write the filtered tables back to the same '.phenos' paths they were read
# from; this overwrites the input files in place.
phenos_outputs = {
    'results/subtype1/subtype1.phenos': df_subtype1_pheno,
    'results/subtype2/subtype2.phenos': df_subtype2_pheno,
}
for phenos_path, phenos_frame in phenos_outputs.items():
    phenos_frame.to_csv(phenos_path, sep=' ', index=False, header=True, na_rep='NA')