# Get clinician diagnosis

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample

  return f(*args, **kwds)


In [2]:
# Read the test-set first-visit feature table into a DataFrame.
test_features_csv = "../preprocess_data/processed_data/processed_csv/first_visit_features/test_first_visit_features.csv"
with open(test_features_csv, mode="rt") as fin:
    # low_memory=False: parse the file in one pass instead of in internal chunks
    test_features = pd.read_csv(fin, low_memory=False)

In [3]:
# get the clinical diagnosis
def label_clinical_diagonsis(row):
    """
    Map one row to a clinician-diagnosis code based on two flag columns.

    Args:
        - row: mapping-like object (e.g. a DataFrame row) indexable by
               "NACCALZD" and "NACCLBDE"
    Returns:
        - 0 if only NACCALZD equals 1
        - 1 if only NACCLBDE equals 1
        - 2 if both equal 1
        - 3 if neither equals 1
    """
    alzd_flag = row["NACCALZD"] == 1
    lbde_flag = row["NACCLBDE"] == 1

    if alzd_flag:
        return 2 if lbde_flag else 0
    return 1 if lbde_flag else 3

In [4]:
# Add a "clinician_diagnosis" column holding the diagnosis code of every test-set row.
# The labelling function already takes a single row, so it is handed to apply() directly.
test_features["clinician_diagnosis"] = test_features.apply(label_clinical_diagonsis, axis=1)

In [7]:
# Show the row count next to the number of distinct NACCID values;
# equal numbers mean no NACCID appears more than once.
len(test_features), len(pd.unique(test_features["NACCID"]))

(380, 380)

# get the bootstrapping samples and get the sensitivity and specificity for each sample

In [8]:
# step 1: get an array containing one (naccid, label, clinician_diagnosis) entry per row
test_label_diag = test_features[["NACCID", "label", "clinician_diagnosis"]].values

In [9]:
# display the (naccid, label, clinician_diagnosis) array for inspection
test_label_diag

array([['NACC014159', 0, 0],
       ['NACC017967', 0, 0],
       ['NACC020729', 3, 2],
       ...,
       ['NACC988182', 1, 1],
       ['NACC993747', 0, 0],
       ['NACC997719', 3, 3]], dtype=object)

In [10]:
# step 2: define a function to get the bootstrap sample sets
def construct_bootstrap_test_samples(feature_list, sample_size, random_state=None):
    """
    Draw one bootstrap sample from the test set.

    Indices are drawn with sklearn.utils.resample (sampling with replacement
    by default), so the same entry may appear several times in one sample.

    Args:
        - feature_list: indexable sequence containing all test features
        - sample_size: the number of samples in each bootstrap sample
        - random_state: optional seed (or RandomState) forwarded to resample
                        so a draw can be reproduced; the default None keeps
                        the previous unseeded behaviour
    Returns:
        - features_resampled: list holding feature_list[i] for every drawn index i
        - resampled_index_array: numpy array of the drawn indices
    """
    index_array = np.arange(len(feature_list))  # get the index array for test_features
    resampled_index_array = resample(
        index_array, n_samples=sample_size, random_state=random_state
    )
    features_resampled = [feature_list[i] for i in resampled_index_array]

    return features_resampled, resampled_index_array


In [12]:
# step 3: use sklearn to compute the confusion matrix 
# and use bootstrapping_for_sensitivity_specificity to 
# compute sensitivity and specificity for each bootstrapping sample

from sensitivity_specificity import sensitivity_specificity_from_mtx  

def compute_sensitivity_specificity(sampled_array, labels=(0, 1, 2, 3)):
    """
    Compute sensitivity and specificity scores for each disease
    Args:
        - sampled_array: numpy array containing resampled test sample.
                         each sample has the format array([naccid, label, clinician_diagnosis])
        - labels: class codes that define the rows/columns of the confusion
                  matrix, in order. Fixing them keeps the matrix the same
                  shape even when a bootstrap sample contains no instance of
                  some class, so position k always refers to the same class.
    Returns:
        - (sensitivity, specificity) exactly as returned by
          sensitivity_specificity_from_mtx (presumably one value per class;
          verify against that helper)
    """
    # get labels and the corresponding clinician diagnosis
    resampled_labels = [sample[1] for sample in sampled_array]
    resampled_clinician_diag = [sample[2] for sample in sampled_array]

    # Without an explicit label list, confusion_matrix only covers the classes
    # present in this particular sample, which shifts or drops class positions.
    conf_mtx = confusion_matrix(
        resampled_labels, resampled_clinician_diag, labels=list(labels)
    )
    sensitivity, specificity = sensitivity_specificity_from_mtx(conf_mtx)

    return sensitivity, specificity

In [13]:
# step 4: use bootstrap to get sensitivity and specificity for each bootstrap sample set

# bootstrap configuration: number of resampling rounds and the size of each
# resample (80% of the number of test entries, truncated to an int)
n_iterations = 1000
n_size = int(0.8 * len(test_label_diag))

# per-class accumulators, one value appended per bootstrap round
ad_sensitivity_list, lbd_sensitivity_list = [], []
mix_sensitivity_list, others_sensitivity_list = [], []

ad_specificity_list, lbd_specificity_list = [], []
mix_specificity_list, others_specificity_list = [], []

# indices drawn in each round, kept so that a round can be inspected later
sample_indices_list = []

# position k in each tuple receives element k of the returned score lists
per_class_sensitivity = (
    ad_sensitivity_list,
    lbd_sensitivity_list,
    mix_sensitivity_list,
    others_sensitivity_list,
)
per_class_specificity = (
    ad_specificity_list,
    lbd_specificity_list,
    mix_specificity_list,
    others_specificity_list,
)

for bootstrap_round in range(n_iterations):

    # draw this round's bootstrap test set and remember which indices were used
    test_set, sample_indices = construct_bootstrap_test_samples(test_label_diag, n_size)
    sample_indices_list.append(sample_indices)

    # score the round and file each class's value into its accumulator
    sensitivity_list, specificity_list = compute_sensitivity_specificity(test_set)
    for class_idx, collected in enumerate(per_class_sensitivity):
        collected.append(sensitivity_list[class_idx])
    for class_idx, collected in enumerate(per_class_specificity):
        collected.append(specificity_list[class_idx])

In [15]:
# number of collected values; one is appended per bootstrap iteration
len(ad_specificity_list)

1000

# get the bootstrap statistics for each metric

In [17]:
from statistics import mean

In [20]:
# define a function to get the confidence interval for the metrics
def get_confident_interval(metrics_list, alpha):
    """
    Print and return the percentile confidence interval of bootstrap values.

    Args:
        - metrics_list: list containing all the bootstrap values for this metric
        - alpha: the confidence level as a fraction (e.g. 0.95 for a 95% interval)
    Returns:
        - (lower, upper): the two interval bounds as floats
    """
    # the interval leaves (1 - alpha) / 2 of the values in each tail
    tail = (1.0 - alpha) / 2.0

    # scalar percentiles give scalar results, so the bounds print as plain
    # numbers rather than as one-element arrays
    lower = float(np.percentile(metrics_list, tail * 100))
    upper = float(np.percentile(metrics_list, (alpha + tail) * 100))

    print("{}% confidence interval {} and {}".format(alpha*100, lower, upper))
    print('\n')

    return lower, upper

In [21]:
# confidence level passed to get_confident_interval in the cells below
alpha = 0.95

In [25]:
######### Sensitivity ############

# Print the bootstrap mean and confidence interval of the sensitivity for each
# diagnosis group, in the order AD, LBD, mixed AD + LBD, others.
sensitivity_by_group = [
    ("PURE AD", ad_sensitivity_list),
    ("PURE LBD", lbd_sensitivity_list),
    ("MIX AD + LBD", mix_sensitivity_list),
    ("OTHERS", others_sensitivity_list),
]
for group_title, bootstrap_values in sensitivity_by_group:
    print(group_title)
    print("mean is", mean(bootstrap_values))
    get_confident_interval(bootstrap_values, alpha)

PURE AD
mean is 0.6799437264825531
95.0% confidence interval [0.59677081] and [0.76106195]


PURE LBD
mean is 0.40548237391693276
95.0% confidence interval [0.] and [0.8]


MIX AD + LBD
mean is 0.03323714693598768
95.0% confidence interval [0.] and [0.07528927]


OTHERS
mean is 0.6709451781330792
95.0% confidence interval [0.56663043] and [0.76394751]




In [26]:
######### Specificity ############

# Print the bootstrap mean and confidence interval of the specificity for each
# diagnosis group, in the order AD, LBD, mixed AD + LBD, others.
specificity_by_group = [
    ("PURE AD", ad_specificity_list),
    ("PURE LBD", lbd_specificity_list),
    ("MIX AD + LBD", mix_specificity_list),
    ("OTHERS", others_specificity_list),
]
for group_title, bootstrap_values in specificity_by_group:
    print(group_title)
    print("mean is", mean(bootstrap_values))
    get_confident_interval(bootstrap_values, alpha)

PURE AD
mean is 0.495211214687952
95.0% confidence interval [0.42549144] and [0.56253324]


PURE LBD
mean is 0.964865379945087
95.0% confidence interval [0.94217687] and [0.98327759]


MIX AD + LBD
mean is 0.9700294942668849
95.0% confidence interval [0.94633158] and [0.99010263]


OTHERS
mean is 0.7702858308652755
95.0% confidence interval [0.71425368] and [0.82193061]


