In [None]:
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

from utils import zscore_regressed_out_covariates

In [None]:
# Load data: read the subtype CSV into `df`
subtype_csv_path = 'data/data_subtype.csv'
df = pd.read_csv(subtype_csv_path)
# Trailing expression so the cell displays the column index of the loaded table
df.columns

In [None]:
# Columns regressed out of every cognitive score before the subtype comparison
covariates = [
    'Sex',
    'Age',
    'Drinking_status',
    'Smoking_status',
    'Income',
    'Education',
]

# Cognitive score column -> category label, kept in one ordered mapping so the
# two derived lists below cannot drift out of step.
# NOTE(review): the pairing follows the positional order of the original two
# lists; presumably each label names the test behind the column - confirm.
score_to_category = {
    'Maximum_digits_remembered_correctly': 'Numeric memory',
    'Number_of_symbol_digit_matches_made_correctly': 'Symbol digit substitution',
    'Mean_time_to_correctly_identify_matches': 'Reaction time',
    'Fluid_intelligence_score': 'Fluid intelligence and reasoning',
    'Prospective_memory_result': 'Prospective memory',
    'Duration_to_complete_alphanumeric_path': 'Trail making',
}
# Score column names, in analysis order
cognitive_scores = list(score_to_category)
# Category labels, position-aligned with `cognitive_scores`
cog_cate = list(score_to_category.values())

In [None]:
# for each cognitive score, regress out the covariates and linear regression with subtype
def _fit_subtype_vs_control(df_zscored, score, subtype_label):
    """Regress `score` on a 0/1 subtype indicator (control = 0, subtype = 1).

    Parameters
    ----------
    df_zscored : pd.DataFrame
        Frame holding a 'Subtype' column and the `score` column.
    score : str
        Name of the outcome column.
    subtype_label : str
        Value of 'Subtype' to contrast against 'control' (e.g. 'Subtype 1').

    Returns
    -------
    list of float
        [beta, se, pvalue, tvalue, r2], where the first four refer to the
        subtype indicator and r2 is the model R-squared.

    Raises
    ------
    ValueError
        If the rows kept for this contrast do not contain both a control and
        a subtype group, since the subtype effect cannot be estimated then.
    """
    keep = df_zscored['Subtype'].isin(['control', subtype_label])
    contrast = df_zscored[keep].copy()

    # convert to int. control: 0, subtype: 1
    contrast['Subtype'] = contrast['Subtype'].map({'control': 0, subtype_label: 1})
    if contrast['Subtype'].nunique() < 2:
        raise ValueError(
            f"Need both 'control' and '{subtype_label}' rows to fit {score}"
        )

    design = sm.add_constant(contrast['Subtype'])
    fit = sm.OLS(contrast[score], design).fit()
    return [
        fit.params['Subtype'],
        fit.bse['Subtype'],
        fit.pvalues['Subtype'],
        fit.tvalues['Subtype'],
        fit.rsquared,
    ]


# One row per cognitive score; columns are beta, se, pvalue, tvalue, r2
res_subtype1 = np.zeros((len(cognitive_scores), 5))
res_subtype2 = np.zeros((len(cognitive_scores), 5))
for i, score in enumerate(cognitive_scores):
    print(f'Regressing out covariates for {score}')
    # Keep only rows with no missing value in the group, subtype, score or covariate columns
    df_cog = df[['t2dm', 'Subtype', score] + covariates].dropna()

    # The shape printed here is the one after the rows with missing values were dropped
    print('Original shape:', df_cog.shape)
    # NOTE(review): project helper from utils; presumably returns df_cog with
    # `score` replaced by covariate-adjusted z-scores - confirm against utils.
    df_cog_zscored = zscore_regressed_out_covariates(df_cog, 't2dm', [score], covariates)

    # Same model for both contrasts, each against the shared control group
    res_subtype1[i, :] = _fit_subtype_vs_control(df_cog_zscored, score, 'Subtype 1')
    res_subtype2[i, :] = _fit_subtype_vs_control(df_cog_zscored, score, 'Subtype 2')

In [None]:
# save results
# Statistic columns match the column order written into the result arrays
stat_columns = ['beta', 'se', 'pvalue', 'tvalue', 'r2']
# Identifier columns first, then the statistics
ordered_columns = ['Subtype', 'cognitive_score'] + stat_columns

res_subtype1_df = (
    pd.DataFrame(res_subtype1, columns=stat_columns)
    .assign(Subtype='Subtype 1', cognitive_score=cognitive_scores)
    .loc[:, ordered_columns]
)
res_subtype2_df = (
    pd.DataFrame(res_subtype2, columns=stat_columns)
    .assign(Subtype='Subtype 2', cognitive_score=cognitive_scores)
    .loc[:, ordered_columns]
)

In [None]:
# fdr correction
# Benjamini-Hochberg adjustment, run separately on each subtype's set of
# p-values; element [1] of the multipletests result holds the adjusted p-values.
# `multipletests` is imported in the notebook's top import cell.
for res_df in (res_subtype1_df, res_subtype2_df):
    res_df['P_FDR'] = multipletests(res_df['pvalue'], method='fdr_bh')[1]

In [None]:
# concatenate results
res_subtype = pd.concat([res_subtype1_df, res_subtype2_df])

# Flag rows whose FDR-adjusted p-value is below 0.05; everything else
# (including a missing value, which compares as False) is labelled 'NS'
is_significant = res_subtype['P_FDR'] < 0.05
res_subtype['Sig_Note'] = is_significant.map({True: '< 0.05', False: 'NS'})

# Create the output folder if needed, then write the combined table without the index
os.makedirs('results/cognitive', exist_ok=True)
res_subtype.to_csv('results/cognitive/cognitive_scores_subtype.csv', index=False)