In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/data_preprocessed/diabetes_marker.csv')

In [3]:
df['t2dm'].value_counts()

t2dm
0    460628
1     41783
Name: count, dtype: int64

In [4]:
# remove other dm, only keep t2dm
df2 = pd.read_csv('data/data_preprocessed/diabetes_marker_removed_other_dm.csv')
df2 = df2[df2['t1dm'] == 0]
print('removed number of other dm', len(df) - len(df2))
print(df2['t2dm'].value_counts())

removed number of other dm 13562
t2dm
0    454754
1     34095
Name: count, dtype: int64


In [5]:
df_dm_biomarker = pd.read_csv('data/imputed/data_t2dm_biomarker_imputed.csv')
print(df_dm_biomarker['t2dm'].value_counts())

t2dm
0    454754
1     34095
Name: count, dtype: int64


In [6]:
# remove subjects with insulin medication
df_ins = pd.read_csv('data/medication/insulin.csv')
eid_used_ins = df_ins[df_ins['insulin'] == 1]['Eid'].tolist()
df_dm_biomarker1 = df_dm_biomarker[~df_dm_biomarker['Eid'].isin(eid_used_ins)]
print('number of subjects removed: ', len(df_dm_biomarker) - len(df_dm_biomarker1))
print('number of subjects after removing insulin medication: ', len(df_dm_biomarker))
print(df_dm_biomarker1['t2dm'].value_counts())

number of subjects removed:  1622
number of subjects after removing insulin medication:  488849
t2dm
0    454628
1     32599
Name: count, dtype: int64


In [7]:
df_genetic = pd.read_csv('data/allchr.fam', sep=' ', header=None)
df_genetic.columns = ['FID', 'IID', 'PID', 'MID', 'SEX', 'PHENOTYPE']
df_dm_biomarker2 = df_dm_biomarker1[df_dm_biomarker1['Eid'].isin(df_genetic['IID'])]
print('number of subjects removed: ', len(df_dm_biomarker1) - len(df_dm_biomarker2))
print('number of subjects after removing subjects without genetic data: ', len(df_dm_biomarker2))

number of subjects removed:  14721
number of subjects after removing subjects without genetic data:  472506


In [8]:
# remove outliers for each biomarker, values > 5 standard deviation
biomarkers = df_dm_biomarker.columns[11:].tolist()
df_dm_biomarker3 = df_dm_biomarker2.copy()
for biomarker in biomarkers:
    mean = df_dm_biomarker3[biomarker].mean()
    std = df_dm_biomarker3[biomarker].std()
    df_dm_biomarker3 = df_dm_biomarker3[abs(df_dm_biomarker3[biomarker] - mean) < 5 * std]
print('number of subjects removed: ', len(df_dm_biomarker2) - len(df_dm_biomarker3))
print('number of subjects after removing outliers: ', len(df_dm_biomarker3))

number of subjects removed:  39829
number of subjects after removing outliers:  432677


In [9]:
df_time_diff = pd.read_csv('data/time/data_t2dm_time_diff.csv')
# keep subjects with dm_type and controls
Eids_dm_type = df_time_diff[df_time_diff['dm_type'].isin(['control', 'pre'])]['Eid'].tolist()
df_dm_biomarker4 = df_dm_biomarker3[df_dm_biomarker3['Eid'].isin(Eids_dm_type)]
print('number of subjects removed: ', len(df_dm_biomarker3) - len(df_dm_biomarker4))
print('number of subjects after keeping dm_type: ', len(df_dm_biomarker4))
# removed subjects Eid
Eids_removed = df_dm_biomarker3[~df_dm_biomarker3['Eid'].isin(Eids_dm_type)]['Eid'].tolist()
print('len removed subjects Eid: ', len(Eids_removed))
df_removed = df_dm_biomarker3[~df_dm_biomarker3['Eid'].isin(Eids_dm_type)]
df_removed.to_csv('results/removed_subjects.csv', index=False)

number of subjects removed:  3844
number of subjects after keeping dm_type:  428833
len removed subjects Eid:  3844


In [10]:
df_dm_biomarker4['HbA1c_prec'] = (df_dm_biomarker4['HbA1c'] * 0.09148) + 2.152
df_dm_biomarker5 = df_dm_biomarker4[~((df_dm_biomarker4['t2dm'] == 0) & (df_dm_biomarker4['HbA1c_prec'] >= 6.5))]
df_dm_biomarker5 = df_dm_biomarker5[~((df_dm_biomarker4['t2dm'] == 0) & (df_dm_biomarker5['Glucose'] >= 7.0))]
print('number of subjects removed: ', len(df_dm_biomarker4) - len(df_dm_biomarker5))
print('number of subjects after removing subjects with HbA1c >= 6.5 and Glucose >= 7.0: ', len(df_dm_biomarker5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dm_biomarker4['HbA1c_prec'] = (df_dm_biomarker4['HbA1c'] * 0.09148) + 2.152
  df_dm_biomarker5 = df_dm_biomarker5[~((df_dm_biomarker4['t2dm'] == 0) & (df_dm_biomarker5['Glucose'] >= 7.0))]


number of subjects removed:  5993
number of subjects after removing subjects with HbA1c >= 6.5 and Glucose >= 7.0:  422840


In [11]:
print(df_dm_biomarker5['t2dm'].value_counts())

t2dm
0    402535
1     20305
Name: count, dtype: int64


In [12]:
df_anemia = pd.read_csv('data/anemia.csv')
eids_with_anemia = df_anemia[df_anemia['anemia_before_baseline'] == 1]['Eid'].tolist()
df_dm_biomarker6 = df_dm_biomarker5[~df_dm_biomarker5['Eid'].isin(eids_with_anemia)]
print('number of subjects removed: ', len(df_dm_biomarker5) - len(df_dm_biomarker6))
print('number of subjects after removing subjects anemia: ', len(df_dm_biomarker6))

number of subjects removed:  226
number of subjects after removing subjects anemia:  422614


In [13]:
print(df_dm_biomarker6['t2dm'].value_counts())

t2dm
0    402337
1     20277
Name: count, dtype: int64


In [31]:
# data with Brain MRI IDPs
df_brian = pd.read_csv('results/sustain_results_idps.csv')
# CMR
df_cmr = pd.read_csv('results/sustain_results_cmr.csv')

In [32]:
df_brian['t2dm'].value_counts()

t2dm
0    1591
1    1134
Name: count, dtype: int64

In [33]:
df_cmr['t2dm'].value_counts()

t2dm
0    1192
1     855
Name: count, dtype: int64