In [1]:
import pandas as pd
import pickle
import os
import numpy as np
import sys
from sklearn.metrics import roc_auc_score, make_scorer,brier_score_loss,log_loss,average_precision_score
import shutil
import os

In [2]:
def convert_hba1c_mmol_mol_2_percentage(row):
    """Convert an HbA1c value from mmol/mol to percent.

    Applies the linear map ``0.0915 * row + 2.15``.
    NOTE(review): the coefficients look like the rounded IFCC -> NGSP master
    equation -- confirm against the intended reference.

    Parameters
    ----------
    row : number, numpy array or pandas Series/DataFrame
        Anything that supports multiplication by and addition of a float;
        array-like inputs are converted element-wise.

    Returns
    -------
    Same kind as ``row`` with the conversion applied, or ``None`` when the
    input does not support the arithmetic (e.g. ``None`` or a string).
    """
    try:
        return 0.0915 * row + 2.15
    except (TypeError, ValueError, ArithmeticError):
        # Best-effort conversion: unusable input maps to None. The exception
        # list is explicit so KeyboardInterrupt/SystemExit are not swallowed
        # the way a bare ``except:`` would.
        return None

An A1C level below 5.7% is considered normal
An A1C level between 5.7% and 6.4% is considered prediabetes
An A1C level of 6.5% or higher on two separate tests indicates type 2 diabetes

In [3]:
def get_strat_cohort_indeces(
    test_file_path="/net/mraid08/export/jafar/UKBioBank/Data/ukb29741_a1c_below_65_updates_scoreboard_test.csv",
    a1c_col="30750-0.0",
    prediabetes_threshold=5.7,
    min_valid_hba1c=4,
):
    """Split the test-set participants into HbA1c strata.

    Reads the ``eid`` and HbA1c columns from ``test_file_path``, converts the
    HbA1c values to percent with ``convert_hba1c_mmol_mol_2_percentage`` and
    builds two groups:

    * pre-diabetic: HbA1c% >= ``prediabetes_threshold``
    * healthy: ``min_valid_hba1c`` <= HbA1c% < ``prediabetes_threshold``

    Rows with a missing HbA1c value satisfy neither comparison and therefore
    end up in neither group. No upper bound is applied to the pre-diabetic
    group here.
    NOTE(review): the default file name suggests the data were already
    restricted to HbA1c below 6.5% upstream -- confirm.

    Parameters
    ----------
    test_file_path : str
        CSV file with an ``eid`` column and the HbA1c column.
    a1c_col : str
        Name of the HbA1c column (values in mmol/mol).
    prediabetes_threshold : float
        HbA1c% cut-off separating healthy from pre-diabetic.
    min_valid_hba1c : float
        Lower HbA1c% bound for the healthy group.

    Returns
    -------
    (healthy_index, pre_index) : tuple of pandas.Index
        ``eid`` labels of the healthy and the pre-diabetic group, in that
        order.
    """
    a1c_df = pd.read_csv(test_file_path, usecols=["eid", a1c_col], index_col="eid")

    # Convert the single HbA1c column directly instead of applying the helper
    # over the whole frame and assigning a one-column DataFrame to a column.
    hba1c = convert_hba1c_mmol_mol_2_percentage(a1c_df[a1c_col])

    # Both masks are built on the full frame's index, so no mask ever has to
    # be re-aligned to an already filtered frame (which triggers the pandas
    # "Boolean Series key will be reindexed" warning).
    pre_mask = hba1c >= prediabetes_threshold
    healthy_mask = (hba1c < prediabetes_threshold) & (hba1c >= min_valid_hba1c)

    a1c_df_pre_index = a1c_df.loc[pre_mask].index
    a1c_df_healthy_index = a1c_df.loc[healthy_mask].index
    return a1c_df_healthy_index, a1c_df_pre_index

In [4]:
def stratify_results(
    folder_name,
    base_path="/home/edlitzy/UKBB_Tree_Runs/For_article/Revision_runs/results_folder/",
):
    """Load one run's test labels and scores and split them by HbA1c stratum.

    Reads ``y_LR_test.csv`` and ``final_scores.csv`` from
    ``<base_path>/<folder_name>``, joins them on ``eid`` (left join on the
    label file) and selects the rows belonging to each group returned by
    ``get_strat_cohort_indeces``.

    Parameters
    ----------
    folder_name : str
        Sub-directory of ``base_path`` holding the run's result files.
    base_path : str
        Directory that contains the per-run result folders.

    Returns
    -------
    (res_pre_df, res_healthy_df) : tuple of pandas.DataFrame
        Joined label/score rows for the pre-diabetic and the healthy group,
        in that order. Row order follows the respective cohort index.
    """
    a1c_df_healthy_index, a1c_df_pre_index = get_strat_cohort_indeces()

    run_dir = os.path.join(base_path, folder_name)
    test_df = pd.read_csv(os.path.join(run_dir, "y_LR_test.csv"), index_col="eid")
    pred_df = pd.read_csv(os.path.join(run_dir, "final_scores.csv"), index_col="eid")

    # Label columns first, score columns second: calc_ci reads the columns
    # positionally (column 0 = truth, column 1 = score).
    tot_df = test_df.join(pred_df)

    # NOTE(review): this assumes every cohort eid is present in the run's
    # test file; how missing labels are handled depends on the pandas version.
    res_pre_df = tot_df.loc[a1c_df_pre_index, :]
    res_healthy_df = tot_df.loc[a1c_df_healthy_index, :]
    return res_pre_df, res_healthy_df

In [5]:
def calc_ci(df, folder_name, n_bootstrap=1000, quantiles=(0.025, 0.975)):
    """Bootstrap percentile intervals for auROC and average precision.

    Draws ``n_bootstrap`` resamples of ``df`` (same size, with replacement),
    scores each one with ``roc_auc_score`` and ``average_precision_score``
    and summarises the two score distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 is used as the true label and column 1 as the predicted
        score (both are read positionally).
    folder_name : str
        Label used as the index of the returned one-row table.
    n_bootstrap : int
        Number of resamples. Resample ``i`` is drawn with ``random_state=i``,
        so repeated calls give identical results.
    quantiles : tuple of two floats
        Lower and upper quantile reported in the "min" and "max" columns.

    Returns
    -------
    pandas.DataFrame
        One row with the columns "auROC min", "auROC mean", "auROC max",
        "APS min", "APS mean", "APS max". Every value is a string formatted
        to two decimals. Despite their names, the "min"/"max" columns hold
        the lower/upper quantile of the bootstrap distribution, not its
        extremes. The table is also printed.

    Raises
    ------
    ValueError
        If ``n_bootstrap`` is smaller than 1.
        NOTE(review): depending on the scikit-learn version, a resample that
        contains a single class may also make ``roc_auc_score`` raise --
        relevant for very small or very imbalanced cohorts.
    """
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    lower_q, upper_q = quantiles
    n_rows = df.shape[0]
    roc_list = []
    aps_list = []
    for seed in range(n_bootstrap):
        tmp_df = df.sample(n=n_rows, replace=True, random_state=seed)
        # Extract labels and scores once per resample and reuse them for
        # both metrics.
        y_true = tmp_df.iloc[:, 0].values
        y_score = tmp_df.iloc[:, 1]
        roc_list.append(roc_auc_score(y_true=y_true, y_score=y_score))
        aps_list.append(average_precision_score(y_true=y_true, y_score=y_score))

    def _fmt(value):
        # Two-decimal strings: build_summary_table concatenates these as text.
        return "{:.2f}".format(value)

    res_df = pd.DataFrame(
        index=[folder_name],
        columns=["auROC min", "auROC mean", "auROC max", "APS min", "APS mean", "APS max"],
    )
    res_df["auROC min"] = _fmt(np.quantile(roc_list, lower_q))
    res_df["auROC max"] = _fmt(np.quantile(roc_list, upper_q))
    res_df["auROC mean"] = _fmt(np.mean(roc_list))
    res_df["APS min"] = _fmt(np.quantile(aps_list, lower_q))
    res_df["APS max"] = _fmt(np.quantile(aps_list, upper_q))
    res_df["APS mean"] = _fmt(np.mean(aps_list))
    print(res_df)
    return res_df

In [6]:
# Bootstrap results for the "LR_No_reticulocytes_scoreboard" results folder,
# computed separately for the pre-diabetic and the healthy stratum.
res_list = []
folder_name = "LR_No_reticulocytes_scoreboard"
four_bt_res_pre_df, four_bt_res_healthy_df = stratify_results(folder_name)
# A generator keeps the loop variables out of the notebook namespace while
# still appending the two result rows one after the other.
res_list.extend(
    calc_ci(df=cohort_df, folder_name=prefix + folder_name)
    for prefix, cohort_df in (
        ("Pre diab ", four_bt_res_pre_df),
        ("Healthy ", four_bt_res_healthy_df),
    )
)

  # This is added back by InteractiveShellApp.init_path()


                                        auROC min auROC mean auROC max  \
Pre diab LR_No_reticulocytes_scoreboard      0.68       0.73      0.77   

                                        APS min APS mean APS max  
Pre diab LR_No_reticulocytes_scoreboard    0.15     0.20    0.26  
                                       auROC min auROC mean auROC max APS min  \
Healthy LR_No_reticulocytes_scoreboard      0.76       0.81      0.85    0.02   

                                       APS mean APS max  
Healthy LR_No_reticulocytes_scoreboard     0.03    0.05  


In [18]:
# Only the last expression of a cell is displayed by default, so this
# `.shape` line produces no output on its own.
# NOTE(review): the saved output of this cell is a shape tuple, which suggests
# the cell was edited after it was last executed -- re-run to refresh.
four_bt_res_pre_df.shape
# Sum of column "2443-3.0" divided by the number of rows in the pre-diabetic
# stratum; this is the positive rate if that column is a 0/1 outcome label
# (TODO confirm).
four_bt_res_pre_df["2443-3.0"].sum()/four_bt_res_pre_df.shape[0]

(1006, 2)

In [22]:
# (rows, columns) of the healthy-stratum label/score table.
four_bt_res_healthy_df.shape

(7948, 2)

In [23]:
# Sum of column "2443-3.0" divided by the number of rows in the healthy
# stratum; this is the positive rate if that column is a 0/1 outcome label
# (TODO confirm).
four_bt_res_healthy_df["2443-3.0"].sum()/four_bt_res_healthy_df.shape[0]

0.008303975842979365

In [7]:
# Same stratified bootstrap evaluation for the "LR_Anthro_scoreboard" results
# folder; the two rows are appended to the existing res_list.
folder_name = "LR_Anthro_scoreboard"
# NOTE: "anhtro_res_pre_df" is a misspelling of "anthro"; the name is kept
# unchanged because it lives in the notebook namespace.
anhtro_res_pre_df, anthro_res_healthy_df = stratify_results(folder_name)
res_list.extend(
    calc_ci(df=cohort_df, folder_name=prefix + folder_name)
    for prefix, cohort_df in (
        ("Pre diab ", anhtro_res_pre_df),
        ("Healthy ", anthro_res_healthy_df),
    )
)

  # This is added back by InteractiveShellApp.init_path()


                              auROC min auROC mean auROC max APS min APS mean  \
Pre diab LR_Anthro_scoreboard      0.68       0.73      0.77    0.15     0.20   

                              APS max  
Pre diab LR_Anthro_scoreboard    0.26  
                             auROC min auROC mean auROC max APS min APS mean  \
Healthy LR_Anthro_scoreboard      0.76       0.81      0.86    0.02     0.04   

                             APS max  
Healthy LR_Anthro_scoreboard    0.07  


In [8]:
# Stack the one-row result tables collected in res_list into a single frame
# (one row per cohort/run label).
tot_res=pd.concat(res_list)

In [9]:
# Display the combined bootstrap results.
tot_res

Unnamed: 0,auROC min,auROC mean,auROC max,APS min,APS mean,APS max
Pre diab LR_No_reticulocytes_scoreboard,0.68,0.73,0.77,0.15,0.2,0.26
Healthy LR_No_reticulocytes_scoreboard,0.76,0.81,0.85,0.02,0.03,0.05
Pre diab LR_Anthro_scoreboard,0.68,0.73,0.77,0.15,0.2,0.26
Healthy LR_Anthro_scoreboard,0.76,0.81,0.86,0.02,0.04,0.07


In [13]:
def build_summary_table(df):
    """Collapse the per-metric columns into "mean [min-max]" text columns.

    ``df`` must provide the string columns "auROC mean", "auROC min",
    "auROC max", "APS mean", "APS min" and "APS max". The result keeps
    ``df``'s index and has two columns, "auROC [95% CI]" and "APS [95% CI]".
    """
    # Output column -> (centre, lower bound, upper bound) source columns.
    layout = {
        "auROC [95% CI]": ("auROC mean", "auROC min", "auROC max"),
        "APS [95% CI]": ("APS mean", "APS min", "APS max"),
    }
    sum_df = pd.DataFrame(index=df.index, columns=list(layout))
    for target, (centre, low, high) in layout.items():
        # Element-wise string concatenation of the pre-formatted values.
        sum_df[target] = df[centre] + " [" + df[low] + "-" + df[high] + "]"
    return sum_df

In [14]:
# Turn the combined results into the two-column "mean [min-max]" table.
final_df=build_summary_table(tot_res)

In [15]:
# Display the formatted summary table.
final_df

Unnamed: 0,auROC [95% CI],APS [95% CI]
Pre diab LR_No_reticulocytes_scoreboard,0.73 [0.68-0.77],0.20 [0.15-0.26]
Healthy LR_No_reticulocytes_scoreboard,0.81 [0.76-0.85],0.03 [0.02-0.05]
Pre diab LR_Anthro_scoreboard,0.73 [0.68-0.77],0.20 [0.15-0.26]
Healthy LR_Anthro_scoreboard,0.81 [0.76-0.86],0.04 [0.02-0.07]


In [16]:
# Write the summary table to disk as CSV; with to_csv's default settings the
# index (cohort/run label) is written as the first column.
final_df.to_csv("/home/edlitzy/UKBB_Tree_Runs/For_article/Revision_runs/Tables/stratified_original_results.csv")