In [18]:
import pandas as pd
import numpy as np

In [19]:
dm_type = 'pre'
df = pd.read_csv(f'data/data_balanced/data_t2dm_{dm_type}_balanced_no_outliers.csv')
# df = pd.read_csv('data/data_balanced/data_t2dm_biomarker_imputed.csv')
df_bio_names = pd.read_csv('data/biomarker_field.csv')
df_bio_names = df_bio_names[['Biomarker', 'Field', 'Units']]

# Data Preprocessing

In [20]:
# check if there are any duplicate samples
print(len(df['Eid'].unique()) == len(df))

True


In [21]:
print(df['t2dm'].value_counts())

t2dm
1    20305
0    20305
Name: count, dtype: int64


In [22]:
# calculate log2 fold change for each biomarker between t2dm and control
biomarkers = df.columns[11:]
mean_t2dm = []
mean_control = []
log2fc = []
t_stats = []
p_values = []

In [23]:
from scipy import stats
for bio in biomarkers:
    t2dm = df[df['t2dm'] == 1][bio]
    control = df[df['t2dm'] == 0][bio]
    mean_t2dm.append(t2dm.mean())
    mean_control.append(control.mean())
    t_stat, p_value = stats.ttest_ind(t2dm, control)
    t_stats.append(t_stat)
    p_values.append(p_value)
    log2fc.append(np.log2(t2dm.mean() / (control.mean())))

  log2fc.append(np.log2(t2dm.mean() / (control.mean())))


In [24]:
df_log2fc = pd.DataFrame({
    'Biomarker': biomarkers, 
    'mean_t2dm': mean_t2dm, 
    'mean_control': mean_control, 
    'log2fc': log2fc, 
    't_stat': t_stats, 
    'p_value': p_values,
})
df_log2fc['Bonferroni_adj_p'] = df_log2fc['p_value'] * len(df_log2fc)
df_log2fc['Bonferroni_adj_p'] = df_log2fc['Bonferroni_adj_p'].apply(lambda x: 1 if x > 1 else x)

In [25]:
df_log2fc['abs_log2fc'] = np.abs(df_log2fc['log2fc'])
# direction 1: t2dm > control, -1: t2dm < control
df_log2fc['direction'] = np.sign(df_log2fc['mean_t2dm'] - df_log2fc['mean_control'])
df_log2fc = df_log2fc.sort_values(by='abs_log2fc', ascending=False)
df_log2fc = df_log2fc.merge(df_bio_names, left_on='Biomarker', right_on='Biomarker', how='left')
# significant if bonferroni adjusted p-value < 0.05, marked as "< 0.05" in the table else "NS"
df_log2fc['Sig_note'] = df_log2fc['Bonferroni_adj_p'].apply(lambda x: '< 0.05' if x < 0.05 else 'NS')
df_log2fc['direction'] = np.sign(df_log2fc['log2fc'])

In [26]:
df_log2fc.to_csv(f'data/data_balanced/log2fc/biomarkers_{dm_type}_log2fc_no_outlier.csv', index=False)

In [27]:
# z-score normalization
from utils import normalize_by_controls
df_zscored = normalize_by_controls(df, 't2dm', biomarkers)

In [28]:
# calculate logistic regression coefficients and p-values for each biomarker
import statsmodels.api as sm

biomarkers = df_zscored.columns[11:]
beta = []
p_values = []

for bio in biomarkers:
    print(bio)
    if df_zscored[bio].isna().sum() > 0:
        beta.append(np.nan)
        p_values.append(np.nan)
        continue
    X = np.asarray(df_zscored[bio]) 
    y = np.asarray(df_zscored['t2dm'])
    X = sm.add_constant(X.reshape(-1, 1))
    model = sm.Logit(y, X)
    try:
        result = model.fit()
        beta.append(result.params[1])
        p_values.append(result.pvalues[1])
    except:
        beta.append(np.nan)
        p_values.append(np.nan)

BMI
Optimization terminated successfully.
         Current function value: 0.611685
         Iterations 5
SBP
Optimization terminated successfully.
         Current function value: 0.690595
         Iterations 4
DBP
Optimization terminated successfully.
         Current function value: 0.690507
         Iterations 4
WBC
Optimization terminated successfully.
         Current function value: 0.674072
         Iterations 4
RBC
Optimization terminated successfully.
         Current function value: 0.689443
         Iterations 4
Hgb
Optimization terminated successfully.
         Current function value: 0.692904
         Iterations 3
Hct
Optimization terminated successfully.
         Current function value: 0.692723
         Iterations 3
MCV
Optimization terminated successfully.
         Current function value: 0.687594
         Iterations 4
MCH
Optimization terminated successfully.
         Current function value: 0.687819
         Iterations 4
MCHC
Optimization terminated successfully.
   

In [29]:
df_logreg = pd.DataFrame({
    'Biomarker': biomarkers, 
    'Beta': beta, 
    'P_value': p_values,
    'abs_Beta': np.abs(beta)
})
# bonferroni adjusted p-value, if p-value > 1, set it to 1
df_logreg['Bonferroni_adj_p'] = df_logreg['P_value'] * len(df_logreg)
df_logreg['Bonferroni_adj_p'] = df_logreg['Bonferroni_adj_p'].apply(lambda x: 1 if x > 1 else x)
# significant if bonferroni adjusted p-value < 0.05, marked as "< 0.05" in the table else "NS"
df_logreg['Sig_note'] = df_logreg['Bonferroni_adj_p'].apply(lambda x: '< 0.05' if x < 0.05 else 'NS')
df_logreg['direction'] = np.sign(df_logreg['Beta'])

In [30]:
df_logreg = df_logreg.merge(df_bio_names, on='Biomarker', how='left')

In [31]:
df_logreg = df_logreg.sort_values(by='abs_Beta', ascending=False)

In [32]:
df_logreg.to_csv(f'data/data_balanced/logistic/biomarkers_{dm_type}_logreg_no_outlier.csv', index=False)