In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import *
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from scipy import interp
from sklearn.metrics import roc_auc_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Load the Data

In [3]:
# Data saved in csv/excel
import glob, os

# Folder holding the de-identified UDS export. Set the ARMADA_UDS_DIR
# environment variable to run the notebook on another machine; when it is
# not set, the original location is used unchanged.
data_dir = os.environ.get(
    "ARMADA_UDS_DIR",
    "C:/Users/tsb7592/Box/ARMADA Operations/Data and Analysis/Mar 2021 UDS data (deidentified)",
)
# The next cell reads the UDS csv by bare file name, so it relies on this chdir.
os.chdir(data_dir)

In [4]:
uds= pd.read_csv("ARMADA_UDS_MAR21_DI.CSV")
uds.shape

(1046, 618)

In [5]:
# Send everything the following cells print() into stat_output.txt instead of
# the notebook. The original stream is saved in stdoutOrigin so that a later
# cell can close the file and restore it.
# NOTE(review): if any cell in between raises, stdout stays redirected and the
# file stays open until the restore cell is run by hand.
import sys 

stdoutOrigin=sys.stdout 
sys.stdout = open("C:/Users/tsb7592/Box/ARMADA Operations/Data and Analysis/stat_output.txt", "w")

In [6]:
armada= pd.read_csv("C:/Users/tsb7592/Box/ARMADA Operations/Data and Analysis/May 2020 Interim Analysis Data for Manuscripts/ARMADA_TBSCORES_MAY2020.csv")
armada.shape

(1083, 189)

In [7]:
armada.head()

Unnamed: 0,ptid,assmnt,site,cohort2,LANGUAGE,AGE,gender_rc,ethnicity_rc,race_tb_rc,racetb_w,...,VisualAc_FCTScore,VisualAc_Raw,VisualAc_svalogmar,VisualAc_svasnellen,VisualAc_UCSScore,WIN_Raw_L,WIN_Raw_R,WIN_Thresh_L,WIN_Thresh_R,race_summary
0,1.45635,1,Emory,6,en-US,82,2,1,2,0,...,60.0,82.0,-0.04,20/16-3,95.0,20.0,18.0,10.0,11.6,2
1,1.46421,1,Emory,4,en-US,88,1,1,1,1,...,,,,,,,,,,1
2,1.46421,2,Emory,4,en-US,89,1,1,1,1,...,,76.0,0.08,20/20-4,82.0,0.0,0.0,26.0,26.0,1
3,1.46422,1,Emory,3,en-US,88,2,1,1,1,...,,70.0,0.2,20/30,70.0,7.0,9.0,20.4,18.8,1
4,1.46422,2,Emory,3,en-US,89,2,1,1,1,...,,77.0,0.06,20/20-3,84.0,5.0,8.0,22.0,19.6,1


# Data Cleaning

In [8]:
# Left-join the UDS table onto the ARMADA scores using the 'ptid' and 'assmnt'
# key columns: every ARMADA row is kept, UDS columns are NaN where no match exists.
# NOTE(review): assumes each (ptid, assmnt) pair occurs at most once in uds;
# duplicated keys would multiply ARMADA rows — confirm.
df = pd.merge(armada,uds, how="left", on=['ptid','assmnt'])

In [9]:
df.head()

Unnamed: 0,ptid,assmnt,site,cohort2,LANGUAGE,AGE,gender_rc,ethnicity_rc,race_tb_rc,racetb_w,...,RESPHEAR,RESPDIST,RESPINTR,RESPDISN,RESPFATG,RESPEMOT,RESPASST,RESPOTH,RESPOTHX,NACCMOCB
0,1.45635,1,Emory,6,en-US,82,2,1,2,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
1,1.46421,1,Emory,4,en-US,88,1,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
2,1.46421,2,Emory,4,en-US,89,1,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
3,1.46422,1,Emory,3,en-US,88,2,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
4,1.46422,2,Emory,3,en-US,89,2,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0


In [10]:
# Only keep the rows with a valid (non-null) MINDATEDIFF.
# .copy() makes df an independent frame, so the column assignments performed
# further down (cohort2, cohort_group, cognitive_status) write to df itself
# instead of to a slice of the merged frame (the SettingWithCopyWarning case).
df = df[df.MINDATEDIFF.notnull()].copy()

In [11]:
df.shape

(1046, 805)

In [12]:
# Columns MEMORY through COMPORT that are summarised below.
# NOTE(review): the earlier comment said 10 variables, but this list holds 9 —
# confirm that no column is missing.
keep = ['MEMORY', 'ORIENT', 'JUDGMENT', 'COMMUN', 'HOMEHOBB', 'PERSCARE',
       'CDRSUM', 'CDRGLOB', 'COMPORT']

In [13]:
df.columns

Index(['ptid', 'assmnt', 'site', 'cohort2', 'LANGUAGE', 'AGE', 'gender_rc',
       'ethnicity_rc', 'race_tb_rc', 'racetb_w',
       ...
       'RESPHEAR', 'RESPDIST', 'RESPINTR', 'RESPDISN', 'RESPFATG', 'RESPEMOT',
       'RESPASST', 'RESPOTH', 'RESPOTHX', 'NACCMOCB'],
      dtype='object', length=805)

In [14]:
df[keep]

Unnamed: 0,MEMORY,ORIENT,JUDGMENT,COMMUN,HOMEHOBB,PERSCARE,CDRSUM,CDRGLOB,COMPORT
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.5,1.0,1.0,1.0,1.0,5.5,1.0,0.0
2,2.0,1.0,2.0,2.0,2.0,2.0,11.0,2.0,1.0
3,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0
4,0.5,0.5,0.5,0.0,0.0,0.0,1.5,0.5,0.0
...,...,...,...,...,...,...,...,...,...
1078,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,
1079,,,,,,,,,
1080,,,,,,,,,
1081,0.5,0.0,0.0,0.0,0.0,0.0,,0.5,


In [15]:
df.head()

Unnamed: 0,ptid,assmnt,site,cohort2,LANGUAGE,AGE,gender_rc,ethnicity_rc,race_tb_rc,racetb_w,...,RESPHEAR,RESPDIST,RESPINTR,RESPDISN,RESPFATG,RESPEMOT,RESPASST,RESPOTH,RESPOTHX,NACCMOCB
0,1.45635,1,Emory,6,en-US,82,2,1,2,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
1,1.46421,1,Emory,4,en-US,88,1,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
2,1.46421,2,Emory,4,en-US,89,1,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
3,1.46422,1,Emory,3,en-US,88,2,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
4,1.46422,2,Emory,3,en-US,89,2,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0


In [16]:
cohort ={1:"Gen population healthy aging 85+", 2:"Gen population 65-85", 3:"Gen population MCI",
4:"Gen population AD", 5:"African American mild cognitive impairment",
6:"African American normal controls 65-85", 7:"Spanish Speaking mild cognitive impairment",
8:"Spanish Speaking normal controls 65-85", 9:"Spanish Speaking AD"}

In [17]:
# Replace the numeric codes in column cohort2 with the descriptive labels from
# the `cohort` dict defined above.
# Not idempotent: running this cell a second time looks the labels themselves
# up in the dict, finds no match, and turns the whole column into NaN.
df['cohort2'] = df['cohort2'].map(cohort)

In [18]:
df.cohort2.value_counts()

Gen population 65-85                          265
Gen population healthy aging 85+              134
Gen population MCI                            133
African American normal controls 65-85        127
Spanish Speaking normal controls 65-85        126
Spanish Speaking mild cognitive impairment    112
Gen population AD                              83
African American mild cognitive impairment     42
Spanish Speaking AD                            24
Name: cohort2, dtype: int64

In [19]:
select = [ 'Gen population 65-85', 'Gen population MCI','Gen population AD']
genpop = df[df.cohort2.isin(select)]

In [20]:
genpop[keep]

Unnamed: 0,MEMORY,ORIENT,JUDGMENT,COMMUN,HOMEHOBB,PERSCARE,CDRSUM,CDRGLOB,COMPORT
1,1.0,0.5,1.0,1.0,1.0,1.0,5.5,1.0,0.0
2,2.0,1.0,2.0,2.0,2.0,2.0,11.0,2.0,1.0
3,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0
4,0.5,0.5,0.5,0.0,0.0,0.0,1.5,0.5,0.0
6,1.0,0.0,0.0,0.0,0.5,0.0,1.5,0.5,0.0
...,...,...,...,...,...,...,...,...,...
840,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0
946,0.5,0.0,0.0,0.0,0.0,0.0,,0.5,
974,1.0,0.0,0.0,0.0,0.0,0.0,,0.5,
991,0.5,0.0,0.5,0.5,0.5,0.0,,0.5,


# 1) Count for each variable across all 3 gen pop groups 

In [21]:
# Number of non-missing values of every `keep` variable within each cohort2
# group, reshaped so that rows are the variables (index named 'level_0') and
# columns are the cohorts.
counts  = genpop.groupby('cohort2')[keep].count().unstack().reset_index().pivot(index='level_0', columns='cohort2', values=0)

In [22]:
counts =counts.reindex(keep)[select]
print(counts)

# 2) ANOVA & Tukey’s Test

https://www.statology.org/tukey-test-python/

In [23]:
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# The one-way ANOVA tests the null hypothesis that the 3 genpop groups have the same population mean.
# NOTE(review): the three samples passed in are the columns of `counts`, i.e.
# the per-variable non-missing counts of each cohort, not the variable scores
# themselves — confirm that comparing counts is the intended test.
print("----one-way ANOVA----")
print(f_oneway(counts.iloc[:,0],counts.iloc[:,1],counts.iloc[:,2]))

F_onewayResult(statistic=200.69804129168858, pvalue=1.039951452532637e-15)

In [24]:
df

Unnamed: 0,ptid,assmnt,site,cohort2,LANGUAGE,AGE,gender_rc,ethnicity_rc,race_tb_rc,racetb_w,...,RESPHEAR,RESPDIST,RESPINTR,RESPDISN,RESPFATG,RESPEMOT,RESPASST,RESPOTH,RESPOTHX,NACCMOCB
0,1.45635,1,Emory,African American normal controls 65-85,en-US,82,2,1,2,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
1,1.46421,1,Emory,Gen population AD,en-US,88,1,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
2,1.46421,2,Emory,Gen population AD,en-US,89,1,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
3,1.46422,1,Emory,Gen population MCI,en-US,88,2,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
4,1.46422,2,Emory,Gen population MCI,en-US,89,2,1,1,1,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1078,X20106,1,CU,Spanish Speaking normal controls 65-85,es,68,1,2,32,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,
1079,X20115,1,CU,Spanish Speaking normal controls 65-85,es,70,1,2,32,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,
1080,X20116,1,CU,Spanish Speaking mild cognitive impairment,es,66,1,2,32,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,
1081,X20119,1,CU,Spanish Speaking mild cognitive impairment,es,76,1,2,32,0,...,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,


In [26]:
counts

cohort2,Gen population 65-85,Gen population MCI,Gen population AD
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MEMORY,265,132,83
ORIENT,265,132,83
JUDGMENT,265,132,83
COMMUN,265,132,83
HOMEHOBB,265,132,83
PERSCARE,265,132,83
CDRSUM,265,128,83
CDRGLOB,265,132,83
COMPORT,168,127,83


In [25]:
# Long-format copy of `counts` for Tukey's test: one row per (variable, cohort)
# with the non-missing count in 'score' and the cohort label in 'group'.
# It is derived from `counts` rather than typed in by hand, so it cannot drift
# from the table when the data or the `keep` / `select` lists change.
# melt() stacks the columns in `select` order, giving all rows of the first
# cohort, then the second, then the third.
cdf = counts[select].melt(var_name='group', value_name='score')[['score', 'group']]

In [27]:
cdf

Unnamed: 0,score,group
0,265,Gen population 65-85
1,265,Gen population 65-85
2,265,Gen population 65-85
3,265,Gen population 65-85
4,265,Gen population 65-85
5,265,Gen population 65-85
6,265,Gen population 65-85
7,265,Gen population 65-85
8,168,Gen population 65-85
9,132,Gen population MCI


In [28]:
# perform Tukey's test
tukey = pairwise_tukeyhsd(endog=cdf['score'],
                          groups=cdf['group'],
                          alpha=0.05)

#display results
print(tukey)

# 3)  NACCFAM counts per group

In [29]:
df.NACCFAM.value_counts()

9.0    546
1.0    314
0.0    186
Name: NACCFAM, dtype: int64

In [30]:
#Drop the unknown NACCFAM 
Ncounts  = genpop[genpop.NACCFAM !=9 ].groupby('NACCFAM')['cohort2'].value_counts().unstack()
#Ncounts.to_excel('C:/Users/tsb7592/Downloads/NACCFAM.xlsx')

In [31]:
print(Ncounts)

In [32]:
#Chi square results got from R

#Pearson's Chi-squared test
print('X-squared = 7.7844, df = 2, p-value = 0.0204')
print("")

# HYPERTEN and HYPERCHO counts per group

In [33]:
df.HYPERTEN.unique()

array([ 1., -4.,  0.,  2.,  9.])

In [34]:
df.HYPERCHO.value_counts()

-4.0    463
 1.0    288
 0.0    282
 2.0     12
 9.0      1
Name: HYPERCHO, dtype: int64

In [35]:
h1=genpop[(genpop.HYPERTEN != 9 )&(genpop.HYPERTEN !=-4)].groupby('HYPERTEN')['cohort2'].value_counts().unstack().fillna(0)
print(h1)

In [36]:
# Pearson's Chi-squared test

# data:  h1
# (The line above is part of the pasted test output. It used to be left
# uncommented, which Python parses as a variable annotation that evaluates h1
# instead of as text.)
print("X-squared = 15.884, df = 4, p-value = 0.003179")
print("")

In [37]:
h2=genpop[(genpop.HYPERCHO != 9 )&(genpop.HYPERCHO !=-4)].groupby('HYPERCHO')['cohort2'].value_counts().unstack().fillna(0)
print(h2)

In [38]:
#Pearson's Chi-squared test

#data:  h2
print("X-squared = 7.0551, df = 4, p-value = 0.133")


In [39]:
# Close the file that stdout was redirected to (stat_output.txt) and put the
# original stream, saved earlier in stdoutOrigin, back in place.
sys.stdout.close()
sys.stdout=stdoutOrigin

# Two sets of regression and AUC/ROC plots

In [40]:
# Create the new column cohort_group: collapse the cohort2 labels into
# 'GenPop', 'Spanish' or 'AA' according to the substring each label contains.
# Rows whose label matches none of the three substrings are left as NaN.
df.loc[df['cohort2'].str.contains('Gen population'), 'cohort_group'] = 'GenPop'
df.loc[df['cohort2'].str.contains('Spanish'), 'cohort_group'] = 'Spanish'
df.loc[df['cohort2'].str.contains('African American'), 'cohort_group'] = 'AA'

In [41]:
df.shape

(1046, 806)

In [42]:
df.cohort_group.value_counts()

GenPop     615
Spanish    262
AA         169
Name: cohort_group, dtype: int64

In [43]:
from statsmodels.multivariate.manova import MANOVA

#maovt = MANOVA.from_formula('MEMORY +ORIENT +JUDGMENT +COMMUN +HOMEHOBB +PERSCARE +CDRSUM +CDRGLOB +COMPORT ~ cohort2', data=genpop)
#print(maovt.mv_test())

In [44]:
# Create the new column cognitive_status ('MCI', 'AD' or 'NC') from the cohort2 label.
# str.contains treats the pattern as a regular expression, so the literal '+'
# of "healthy aging 85+" is escaped; unescaped it is a quantifier ("one or
# more 5s") and the label only matched by accident.
df.loc[df['cohort2'].str.contains('mild cognitive impairment|MCI'), 'cognitive_status'] = 'MCI'
df.loc[df['cohort2'].str.contains('AD'), 'cognitive_status'] = 'AD'
df.loc[df['cohort2'].str.contains(r'65-85|normal controls|healthy aging 85\+'), 'cognitive_status'] = 'NC'
df['cognitive_status'].head()

0     NC
1     AD
2     AD
3    MCI
4    MCI
Name: cognitive_status, dtype: object

In [45]:
df.columns

Index(['ptid', 'assmnt', 'site', 'cohort2', 'LANGUAGE', 'AGE', 'gender_rc',
       'ethnicity_rc', 'race_tb_rc', 'racetb_w',
       ...
       'RESPINTR', 'RESPDISN', 'RESPFATG', 'RESPEMOT', 'RESPASST', 'RESPOTH',
       'RESPOTHX', 'NACCMOCB', 'cohort_group', 'cognitive_status'],
      dtype='object', length=807)

Two sets of regressions (A and B below) are run for each of three pairwise comparisons (AD vs. MCI, MCI vs. NC, and AD vs. NC) within each of the three race/ethnicity cohort groups, plus an overall model with all race/ethnicity groups collapsed.

A) AD vs. MCI = CCC + CFC  + age + sex + education



In [46]:
var1_acs= ['cohort_group', 'cognitive_status','AGE','SEX', 'College Degree', 'Graduate Degree', 'High School', 'Less than HS',
        'None', 'Some College','CCC_ACSScore','CFC_ACSScore']

In [47]:
var1_fct= ['cohort_group', 'cognitive_status','AGE','SEX', 'College Degree', 'Graduate Degree', 'High School', 'Less than HS',
        'None', 'Some College','CCC_FCTScore','CFC_FCTScore']

In [48]:
var1_ucs= ['cohort_group', 'cognitive_status','AGE','SEX', 'College Degree', 'Graduate Degree', 'High School', 'Less than HS',
        'None', 'Some College','CCC_UCSScore','CFC_UCSScore']

[ROC Curves, confusion matrix](https://scikit-plot.readthedocs.io/en/stable/metrics.html)

In [49]:
def dummy_variable(variable, df):
    '''
    Swap one categorical column for its indicator (dummy) columns.
    Inputs:
    variable: name of the single column to expand
    df: A panda dataframe
    Outputs:
    A new panda dataframe with every original column except `variable`,
    followed by one dummy column per distinct value of `variable` (the
    column labels are converted to str). The dataframe passed in is not
    modified.
    '''
    indicators = pd.get_dummies(df[variable]).rename(columns=str)
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=[variable])

In [50]:
# CREATING DUMMIES for EDUCATION
SCOL = ['EDUCATION']
print ('The following columns have been broken into dummies:' , SCOL)
for dummy in SCOL:
    df = dummy_variable(dummy, df)
print (' ')

The following columns have been broken into dummies: ['EDUCATION']
 


In [51]:
def select_subset(df, not_include, keep, race=None):
    '''
    Build the predictors and the binary target for one pairwise comparison of
    cognitive statuses.

    Inputs:
        df: the dataframe; it must contain every column named in `keep`
        not_include (string): the cognitive_status left out of the dataset
            (AD, MCI, NC); the two remaining statuses are compared
        keep (list of string): a list of column names for the regression; it
            must include 'cognitive_status' and 'cohort_group'
        race (None or string): cohort_group we are interested in; if None (or
            empty) all cohort groups are processed together

    Returns: X, y and the title for the figure
        X: dataframe with the `keep` columns minus 'cognitive_status' and
           'cohort_group', restricted to rows without missing values
        y: numpy array of 0/1 aligned with X; 1 marks MCI when AD is excluded
           and AD in every other case
        t: 'MCI vs. NC', 'AD vs. NC' or 'AD vs. MCI', with '--<race>' appended
           when a race is given
    '''
    # Excluded status -> (status coded as 1, figure title). Any other value of
    # not_include falls back to the AD vs. MCI comparison.
    comparisons = {
        'AD': ('MCI', 'MCI vs. NC'),
        'MCI': ('AD', 'AD vs. NC'),
    }
    positive, t = comparisons.get(not_include, ('AD', 'AD vs. MCI'))

    # Row filter: drop the excluded status and, optionally, keep one cohort group.
    rows = df['cognitive_status'] != not_include
    if race:
        rows = rows & (df['cohort_group'] == race)

    # Select rows and columns in one step and let dropna() return a new frame,
    # rather than calling dropna(inplace=True) on a slice of df.
    sub = df.loc[rows, keep].dropna()

    # Binarize the output
    y = np.where(sub['cognitive_status'] == positive, 1, 0)

    DROP_COLS = ['cognitive_status', 'cohort_group']
    X = sub.drop(DROP_COLS, axis=1)

    if race:
        t = t + '--' + race

    return X, y, t
   

In [52]:
def scores_at_k(y_true, y_scores, k):
    '''
    Precision, recall and F1 obtained when the first k% of the entries are
    predicted positive and the rest negative.

    The predictions come from generate_binary_at_k(y_scores, k); y_true is
    compared against them position by position.
    Returns the tuple (precision, recall, f1), each rounded to 2 decimals.
    '''
    predicted = generate_binary_at_k(y_scores, k)
    metrics = (precision_score, recall_score, f1_score)
    return tuple(round(metric(y_true, predicted), 2) for metric in metrics)

[Finding thresholds](https://amirhessam88.github.io/finding-thresholds/)

In [53]:
# Function: Plotting Confusion Matrix
def _plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    cm: 2-D numpy array holding the confusion matrix; rows are labelled
        'True Class' and columns 'Predicted Class'. Inside this function the
        name refers to the matrix, not to the matplotlib.cm module.
    classes: tick labels, one per row/column of the matrix
    normalize: when True each row is divided by its row sum and the cells are
        printed with two decimals; otherwise the cells are printed as integers
    title: text placed above the plot
    cmap: matplotlib colormap used for the cells

    Returns None. The figure is neither created nor shown here; the caller
    selects the axes beforehand and calls plt.show() afterwards.
    """
    import itertools

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize = 14)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Write the value of every cell at its centre.
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="black")

    plt.ylabel('True Class', fontsize = 14)
    plt.xlabel('Predicted Class', fontsize = 14)

    plt.tick_params(axis='both', which='major', labelsize=14)
    plt.tight_layout()

In [54]:
from math import sqrt
def roc_auc_ci(y_true, y_score, positive=1):
    '''
    Approximate 95% confidence interval of the ROC AUC.

    The standard error uses the Hanley & McNeil formula
        SE^2 = (A(1-A) + (N1-1)(Q1-A^2) + (N2-1)(Q2-A^2)) / (N1*N2)
    with Q1 = A/(2-A), Q2 = 2A^2/(1+A), N1 the number of positive labels and
    N2 the number of the other labels; the interval is A +/- 1.96*SE.

    y_true: true labels (list, array or Series)
    y_score: scores passed to roc_auc_score together with y_true
    positive: the label value counted as the positive class

    Returns (lower, upper), each rounded to 2 decimals and clipped to [0, 1].
    '''
    # Work on an array so the element-wise comparisons below also work when a
    # plain list is passed (list == 1 is a single bool, not a mask).
    labels = np.asarray(y_true)

    AUC = roc_auc_score(labels, y_score)
    N1 = np.sum(labels == positive)
    N2 = np.sum(labels != positive)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))

    # Round first, then clip, so the bounds never leave the valid AUC range.
    lower = max(round((AUC - 1.96*SE_AUC), 2), 0)
    upper = min(round((AUC + 1.96*SE_AUC), 2), 1)
    return (lower, upper)

In [55]:
def _threshold_finder(X, y_true, t):
    """
    Fit a logistic regression on (X, y_true), collect its metrics and plot the
    ROC curve, the confusion matrix at the Youden threshold and the
    sensitivity/specificity-versus-threshold curves.

    The model is fitted and evaluated on the same rows (there is no held-out
    split), so every number reported here is an in-sample estimate.

    X: the features (pandas dataframe or numpy array)
    y_true: the true class labels (list or array of 0's and 1's)
    t: title used in the figures and in the 'Title' column of the result

    return a one-row dataframe with the results of this regression, or None
    (after printing a message) when y_true does not contain both classes
    """
    y_true = np.asarray(y_true)
    if np.all((y_true == 0)):
        print(t, 'does not have class 1')
        return
    # A single-class target cannot be fitted either; skip it the same way.
    if np.all((y_true == 1)):
        print(t, 'does not have class 0')
        return
    results_df = pd.DataFrame(columns=('Title', 'Accuracy','AUC-ROC', '95% CI','AVG Precision','Youden index','Precision,Recall,F1 at_50'))

    # Everything below uses the y_true argument. The earlier version read a
    # global named `y`, which only worked because the calling cell happened to
    # use that variable name.
    model = LogisticRegression(random_state=0).fit(X, y_true)
    y_predict_proba = model.predict_proba(X)[:, 1]
    accuracy = round(model.score(X, y_true), 2)

    CI = roc_auc_ci(y_true, y_predict_proba, positive=1)

    fpr, tpr, thresholds = roc_curve(y_true, y_predict_proba)
    auc_value = roc_auc_score(y_true, y_predict_proba)

    class_names = [0, 1]
    # Youden's J = sensitivity + specificity - 1 = tpr - fpr; take the threshold maximising it.
    youden_idx = np.argmax(np.abs(tpr - fpr))
    youden_threshold = thresholds[youden_idx]
    # roc_curve defines point i as "score >= thresholds[i]", so >= is needed for
    # the confusion matrix to describe the same operating point as the marker
    # drawn on the ROC curve.
    y_pred_youden = (y_predict_proba >= youden_threshold).astype(int)
    cnf_matrix = confusion_matrix(y_true, y_pred_youden)
    np.set_printoptions(precision=2)

    #Zip, unzip to ensure corresponding order
    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_predict_proba, y_true), reverse=True))
    # The 'Youden index' column holds the probability threshold chosen above.
    results_df.loc[len(results_df)] = [t, accuracy, round(auc_value, 2), CI,
                                       round(average_precision_score(y_true, y_predict_proba), 2),
                                       round(youden_threshold, 2),
                                       scores_at_k(y_test_sorted, y_pred_probs_sorted, 50.0)]

    # Figure 1: ROC curve (left) and confusion matrix at the Youden threshold (right).
    plt.figure(figsize = (10, 5))
    plt.subplot(1,2,1)  # one row, two columns, first cell
    plt.plot(fpr, tpr, color = "red", label = F"AUC = {auc_value:.2f}")
    plt.plot(fpr[youden_idx], tpr[youden_idx], marker = "o", color = "green", ms =10, label =F"Youden Threshold = {youden_threshold:.2f}" )
    plt.axvline(x = fpr[youden_idx], ymin = fpr[youden_idx], ymax = tpr[youden_idx], color = "green", ls = "--")
    plt.plot([0,1], [0,1] , color = "black", ls = "--")
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('1 - Specificity' , fontsize=12)
    plt.ylabel('Sensitivity' , fontsize=12)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.legend( prop={'size':12} , loc = 4)
    plt.title('ROC Curve for '+t, fontsize = 12)

    plt.subplot(1,2,2)
    _plot_confusion_matrix(cnf_matrix, classes=class_names, normalize = False, cmap=plt.cm.Blues, title = F"Youden Threshold = {youden_threshold:.2f}\nAccuracy = {accuracy_score(y_true, y_pred_youden)*100:.2f}%")
    plt.show()

    # Figure 2: specificity and sensitivity as functions of the threshold, with
    # a vertical line where the two curves are closest (tpr + fpr - 1 nearest 0).
    balance_threshold = thresholds[np.argmin(abs(tpr + fpr - 1))]
    plt.figure(figsize = (12, 5))
    plt.subplot(1,2,1)
    # 1 - fpr is the specificity itself, hence the label.
    plt.plot(thresholds, 1-fpr, label = "Specificity")
    plt.plot(thresholds, tpr, label = "Sensitivity")
    plt.xlabel("Threshold", fontsize = 12)
    plt.ylabel("Score", fontsize = 12)
    plt.legend(loc = 0)
    plt.xlim([0.025, balance_threshold+0.2])
    plt.axvline(balance_threshold, color="k", ls = "--")
    plt.title(F"Threshold = {balance_threshold:.2f}", fontsize = 12)

    plt.show()

    #skplt.metrics.plot_confusion_matrix(y, y_pred, normalize=True,title='Normalized Confusion Matrix '+t)

    return results_df
    

In [56]:
def generate_binary_at_k(y_scores, k):
    '''
    Label the leading k% of the positions as 1 and every later position as 0.

    Only the length of y_scores is used, so the caller is expected to pass the
    scores already ordered. The number of ones is int(len * k / 100), kept
    within 0..len.
    '''
    total = len(y_scores)
    cutoff = int(total * (k / 100.0))
    n_ones = min(max(cutoff, 0), total)
    return [1] * n_ones + [0] * (total - n_ones)

In [57]:
var1 = [var1_acs,var1_fct,var1_ucs]

In [58]:
names=['Age', 'Fully', 'Uncorrected']

In [59]:
import scikitplot as skplt

In [60]:
df.cohort_group.value_counts()

GenPop     615
Spanish    262
AA         169
Name: cohort_group, dtype: int64

In [None]:
all

B) AD vs MCI (c(0,1)) = DCCS + Flanker + LSWM + … + age + sex + education
 


In [None]:
var2_acs =['cohort_group','cognitive_status','AGE', 'SEX', 'DCCS_ACSScore', 'Flanker_ACSScore',
       'LSWM_ACSScore',  'ORR_ACSScore',  'PCPS_ACSScore',  'PSM_ACSScore', 
       'TPVT_ACSScore','College Degree', 'Graduate Degree', 'High School', 'Less than HS',
       'None', 'Some College']

In [None]:
var2_ucs =['cohort_group','cognitive_status','AGE', 'SEX', 'DCCS_UCSScore', 'Flanker_UCSScore',
       'LSWM_UCSScore',  'ORR_UCSScore',  'PCPS_UCSScore',  'PSM_UCSScore', 
       'TPVT_UCSScore','College Degree', 'Graduate Degree', 'High School', 'Less than HS',
       'None', 'Some College']

In [None]:
var2_ftc =['cohort_group','cognitive_status','AGE', 'SEX', 'DCCS_FCTScore', 'Flanker_FCTScore',
       'LSWM_FCTScore',  'ORR_FCTScore',  'PCPS_FCTScore',  'PSM_FCTScore', 
       'TPVT_FCTScore','College Degree', 'Graduate Degree', 'High School', 'Less than HS',
       'None', 'Some College']

In [None]:
var2=[var2_acs,var2_ftc,var2_ucs]

In [None]:
# Regression set B: for every score correction (the three lists in var2, labelled
# by names[i]), every excluded cognitive status and every cohort group, fit the
# model and collect its one-row result frame.
fa = []
for i in range(3):
    # Start a fresh list for each correction set. With a single list shared by
    # all iterations, every later set also contained the rows of the earlier
    # sets and relabelled them with its own 'Corrections' value.
    rv = []
    for cg in ['AD','MCI','NC']:
        # `race` rather than `cohort`, so the loop does not overwrite the
        # `cohort` code-to-label dict used when cleaning cohort2.
        for race in ['GenPop','Spanish','AA']:
            X, y, t = select_subset(df, cg, var2[i], race)
            df1 = _threshold_finder(X, y , t)
            rv.append(df1)

    # pd.concat skips the None entries returned for subsets that could not be fitted.
    temp = pd.concat(rv)
    temp['Corrections'] = names[i]
    fa.append(temp)
all2 = pd.concat(fa, ignore_index=True)
all2['Regression'] = 2

In [None]:
all2

In [None]:
#Previously used function

def plot_roc(X,y, t):
    '''
    Fit a logistic regression on (X, y) and return its in-sample metrics.

    Despite the name, nothing is plotted. The model is scored on the same rows
    it was fitted on.

    X: the features (pandas dataframe or numpy array)
    y: the true class labels (0's and 1's)
    t: text stored in the 'Title' column

    Returns a one-row dataframe with the columns 'Title', 'accuracy',
    'auc-roc', 'avg precision' and 'precision,recall,f1 at_50'.
    '''
    results_df = pd.DataFrame(columns=('Title', 'accuracy','auc-roc', 'avg precision','precision,recall,f1 at_50'))

    lr = LogisticRegression(random_state=0).fit(X, y)
    # Probability of class 1 for every row.
    y_pred_probs = lr.predict_proba(X)[:,1]
    accuracy = round(lr.score(X, y),2)
    #Zip, unzip to ensure corresponding order
    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y), reverse=True))
    results_df.loc[len(results_df)] = [t, accuracy, round(roc_auc_score(y, y_pred_probs),2),
                                    round(average_precision_score(y, y_pred_probs),2),
                                    scores_at_k(y_test_sorted,y_pred_probs_sorted,50.0)]

    return results_df