In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import statsmodels.api as sm
from scipy.stats import t

In [11]:
dm_type = 'pre'
df = pd.read_csv(f'data/data_balanced/data_t2dm_{dm_type}_balanced_no_outliers.csv')
# df = pd.read_csv('data/data_balanced/data_t2dm_biomarker_imputed.csv')
df_bio_names = pd.read_csv('data/biomarker_field.csv')
df_bio_names = df_bio_names[['Biomarker', 'Field', 'Units']]

In [12]:
print(df['t2dm'].value_counts())

t2dm
1    20277
0    20277
Name: count, dtype: int64


In [20]:
biomarkers = df.columns[11:len(df.columns) - 2]
print(biomarkers)

Index(['BMI', 'SBP', 'DBP', 'WBC', 'RBC', 'Hgb', 'Hct', 'MCV', 'MCH', 'MCHC',
       'RDW', 'PLT', 'PCT', 'MPV', 'PDW', 'LYM', 'MONO', 'NEUT', 'EOS', 'BASO',
       'NRBC', 'LYMP', 'MONOP', 'NEUTP', 'EOSP', 'BASOP', 'NRBCP', 'RETP',
       'RET', 'MRV', 'MSCV', 'IRF', 'HLS_ReticP', 'HLS_Retic',
       'Urine_creatinine', 'Urine_potassium', 'Urine_sodium', 'Albumin', 'ALP',
       'ALT', 'ApoA', 'ApoB', 'AST', 'DBIL', 'Urea', 'Calcium', 'CHOL',
       'Creatinine', 'CRP', 'CysC', 'Glucose', 'HbA1c', 'HDLc', 'IGF_1',
       'LDLc', 'Lpa', 'Phosphate', 'TBIL', 'Testosterone', 'TP', 'TG', 'Urate',
       'Vitamin_D', 'TyG'],
      dtype='object')


In [21]:
# z-score normalization
from utils import normalize_by_controls
df_zscored = normalize_by_controls(df, 't2dm', biomarkers)

In [22]:
# cross validation 10-fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [28]:
beta = np.zeros((len(biomarkers), 10))
p_values = np.ones((len(biomarkers), 10))
i = 0
for tr_idx, te_idx in skf.split(df_zscored[biomarkers], df_zscored['t2dm']):
    df_data = df_zscored.iloc[te_idx]
    for j, biomarker in enumerate(biomarkers):
        bio = biomarkers[j]
        X = np.asarray(df_data[bio])
        y = np.asarray(df_data['t2dm'])
        X = sm.add_constant(X.reshape(-1, 1))
        print(f'fold {i + 1}, biomarker = {bio}')
        try:
            model = sm.Logit(y, X)
            result = model.fit()
            p_values[j, i] = result.pvalues[1]
            beta[j, i] = result.params[1]
        except Exception as e:
            print(e)
            # except:
            #     print('Error')
            #     p_values[j, i] = 1
            #     beta[j, i] = np.nan
    i += 1

fold 1, biomarker = BMI
Optimization terminated successfully.
         Current function value: 0.611467
         Iterations 5
fold 1, biomarker = SBP
Optimization terminated successfully.
         Current function value: 0.686797
         Iterations 4
fold 1, biomarker = DBP
Optimization terminated successfully.
         Current function value: 0.688616
         Iterations 4
fold 1, biomarker = WBC
Optimization terminated successfully.
         Current function value: 0.671352
         Iterations 4
fold 1, biomarker = RBC
Optimization terminated successfully.
         Current function value: 0.686851
         Iterations 4
fold 1, biomarker = Hgb
Optimization terminated successfully.
         Current function value: 0.692545
         Iterations 3
fold 1, biomarker = Hct
Optimization terminated successfully.
         Current function value: 0.692040
         Iterations 3
fold 1, biomarker = MCV
Optimization terminated successfully.
         Current function value: 0.685448
         Itera

In [32]:
# meta beta, se, p-value
beta_cv = np.mean(beta, axis=1)
se_cv = np.std(beta, axis=1) / np.sqrt(10)
# t-value
t_stats = beta_cv / (se_cv + 1e-9)
# p-value
p_values_cv = 2 * (1 - t.cdf(np.abs(t_stats), 10 - 1))

In [36]:
# save results
df_logit_cv = pd.DataFrame({
    'Biomarker': biomarkers, 
    'Beta': beta_cv, 
    'SE': se_cv, 
    't': t_stats, 
    'p-value': p_values_cv,
    'abs_beta': np.abs(beta_cv)
}).sort_values(by='abs_beta', ascending=False)
df_logit_cv = df_logit_cv.merge(df_bio_names, on='Biomarker', how='left')
df_logit_cv.to_csv(f'data/data_balanced/logistic/biomarker_logit_cv_{dm_type}.csv', index=False)