In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import statsmodels.api as sm
from scipy.stats import t

In [None]:
# Cohort selector; it is interpolated into the input file name below.
dm_type = 'pre'

# Main analysis table for the selected cohort.
# (Alternative input kept for reference: 'data/data_balanced/data_t2dm_biomarker_imputed.csv')
df = pd.read_csv(f'data/data_balanced/data_t2dm_{dm_type}_balanced_no_outliers.csv')

# Biomarker lookup table, reduced to the three columns that are kept.
df_bio_names = pd.read_csv('data/biomarker_field.csv').loc[:, ['Biomarker', 'Field', 'Units']]

In [None]:
# Class balance check: number of rows for each value of the 't2dm' column.
label_counts = df['t2dm'].value_counts()
print(label_counts)

In [None]:
# Every column from position 11 onward is treated as a biomarker.
# NOTE(review): presumably the first 11 columns are non-biomarker fields
# (ids, covariates, label) — confirm against the CSV header.
biomarkers = df.columns[11:]
print(biomarkers)

In [None]:
# z-score normalization
# NOTE(review): normalize_by_controls is a project helper from utils.py; presumably it
# standardizes each biomarker column using statistics of the control rows
# (identified through the 't2dm' column) — confirm against utils.py.
from utils import normalize_by_controls
df_zscored = normalize_by_controls(df, 't2dm', biomarkers)

In [None]:
# 10-fold stratified splitter; shuffling uses a fixed seed so the folds are
# the same on every run.
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=10,
)

In [None]:
# Univariate logistic regression of 't2dm' on each biomarker, fitted once per fold.
# Results are stored as (biomarker, fold) matrices:
#   beta[j, i]     slope of biomarker j in fold i
#   p_values[j, i] p-value of that slope
n_folds = skf.get_n_splits()

# beta starts as NaN so that a failed fit stays visibly missing instead of being
# counted as a slope of exactly 0 (which would bias any average toward zero).
# A plain np.mean over folds then yields NaN for such a biomarker.
beta = np.full((len(biomarkers), n_folds), np.nan)
# A failed fit keeps the non-significant default p-value of 1.
p_values = np.ones((len(biomarkers), n_folds))

# NOTE(review): each model is fitted on the held-out part of the split (te_idx),
# i.e. on 10 disjoint subsets; the training indices are not used. Confirm this is
# intended rather than fitting on the training part.
for i, (_, te_idx) in enumerate(skf.split(df_zscored[biomarkers], df_zscored['t2dm'])):
    df_data = df_zscored.iloc[te_idx]
    # The outcome is the same for every biomarker in this fold.
    y = np.asarray(df_data['t2dm'])
    for j, bio in enumerate(biomarkers):
        # Design matrix: intercept column + the single biomarker column.
        X = sm.add_constant(np.asarray(df_data[bio]).reshape(-1, 1))
        print(f'fold {i + 1}, biomarker = {bio}')
        try:
            model = sm.Logit(y, X)
            # disp=0 silences the optimizer's per-fit convergence printout.
            result = model.fit(disp=0)
            # Index 1 is the biomarker slope (index 0 is the intercept).
            p_values[j, i] = result.pvalues[1]
            beta[j, i] = result.params[1]
        except Exception as e:
            # Best effort: keep going, but say which fit failed and leave its
            # slope as NaN so the failure is not mistaken for a zero effect.
            print(f'fold {i + 1}, biomarker = {bio}: fit failed ({e})')

In [None]:
# Pool the per-fold slopes into one estimate per biomarker and test it against
# zero with a one-sample t-test across folds.

# Number of folds with a usable (non-NaN) slope for each biomarker.
n_valid = np.sum(~np.isnan(beta), axis=1)

# Mean slope across folds, ignoring missing folds.
beta_cv = np.nanmean(beta, axis=1)
# Standard error of the mean: sample standard deviation (ddof=1) so that it is
# consistent with the n - 1 degrees of freedom used in the t-test below.
se_cv = np.nanstd(beta, axis=1, ddof=1) / np.sqrt(n_valid)
# t-value; the small epsilon avoids division by zero when all folds agree exactly.
t_stats = beta_cv / (se_cv + 1e-9)
# Two-sided p-value; the survival function stays accurate for large |t|,
# where 1 - cdf would round to 0.
p_values_cv = 2 * t.sf(np.abs(t_stats), n_valid - 1)

In [None]:
# Assemble the per-biomarker summary table.
summary_columns = {
    'Biomarker': biomarkers,
    'Beta': beta_cv,
    'SE': se_cv,
    't': t_stats,
    'p-value': p_values_cv,
    'abs_beta': np.abs(beta_cv),
}
df_logit_cv = pd.DataFrame(summary_columns)

# Largest absolute slope first.
df_logit_cv = df_logit_cv.sort_values(by='abs_beta', ascending=False)

# Attach the Field / Units annotations; a left join keeps every biomarker row
# even when it has no entry in the lookup table.
df_logit_cv = pd.merge(df_logit_cv, df_bio_names, on='Biomarker', how='left')

# Write the table for the selected cohort.
output_csv = f'data/data_balanced/logistic/biomarker_logit_cv_{dm_type}.csv'
df_logit_cv.to_csv(output_csv, index=False)