In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import numpy as np

# Function for preprocessing data
def preprocess_data(data):
    # Drop unnecessary columns
    remove_columns = ['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit', 'Acq.Date', 'EXAMDATE', 'Dx Codes for Submission']
    data = data.drop(remove_columns, axis=1)

    # Handle missing values
    imputer = SimpleImputer(strategy="mean")
    data[["AGE", "PTEDUCAT", "MMSE"]] = imputer.fit_transform(data[["AGE", "PTEDUCAT", "MMSE"]])

    # One-Hot Encode categorical features
    categorical_features = ["PTGENDER", "PTETHCAT", "PTRACCAT", "APOE Genotype"]  # Add APOE Genotype
    encoder = OneHotEncoder(sparse_output=False)
    data = pd.concat([data, pd.DataFrame(encoder.fit_transform(data[categorical_features]))], axis=1)
    data.drop(categorical_features, axis=1, inplace=True)
    # data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

    # Feature scaling
    scaler = StandardScaler()
    data[["AGE", "MMSE"]] = scaler.fit_transform(data[["AGE", "MMSE"]])

    # Drop rows with missing values in the 'imputed_genotype' column
    data = data.dropna(subset=['imputed_genotype'])

    # Separate features (X) and target variable (y)
    X = data.drop("DX.bl", axis=1)
    y = data["DX.bl"]

    # y = pd.DataFrame()
    # y.loc[:, 'DX.bl'] = data.loc[:, 'DX.bl']

    X.columns = X.columns.astype(str)

    return X, y

# Load data
data = pd.read_csv("ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv")

# Preprocess data
X, y = preprocess_data(data.copy())



In [4]:
y

0        AD
1      LMCI
2        CN
3      LMCI
4        AD
       ... 
623    LMCI
624    LMCI
625    LMCI
626    LMCI
627    LMCI
Name: DX.bl, Length: 627, dtype: object

In [5]:
X.head()

Unnamed: 0,AGE,PTEDUCAT,APOE4,MMSE,imputed_genotype,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.917438,18.0,1,-2.703157,True,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.14778,10.0,0,0.031099,False,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.219929,16.0,0,0.812315,True,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.78275,13.0,0,-0.750117,True,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.189998,12.0,1,-1.140725,True,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
data.head()

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
0,4702245ea294ce5d4e9b8a87027dfdf4,011_S_0003,3,32237,MRI,1,09-01-2005,AD,09-12-2005,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34,AD
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/05,LMCI,11-08-2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
2,90419199306997753de8042f1fd55e38,011_S_0005,5,32246,MRI,1,09-02-2005,CN,09-07-2005,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33,CN
3,d8d175ffff1e2053e6a18c5df494ccdf,100_S_0006,6,33025,MRI,1,11/15/05,LMCI,11/29/05,80.4,Female,13,Not Hisp/Latino,White,0,25,True,33,MCI
4,986e75b2e604cd44b38feb2188476fb2,011_S_0010,10,32270,MRI,1,11-07-2005,AD,11-10-2005,73.9,Female,12,Not Hisp/Latino,White,1,24,True,34,AD


In [7]:
data[data['PTETHCAT']=="Hisp/Latino"]

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/05,LMCI,11-08-2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
5,1b4f75db908c740500a9f46c409b8a30,022_S_0014,14,59375,MRI,1,9/29/05,CN,11-04-2005,78.5,Female,12,Hisp/Latino,White,0,29,False,33,CN
462,4c38f5b5523467ce2cab128cfaf0a991,016_S_1028,1028,40799,MRI,1,11-02-2006,LMCI,11/29/06,76.9,Female,7,Hisp/Latino,White,1,25,True,34,MCI
469,885a8003a39878dab23aa8d98c1ea71c,128_S_1043,1043,69091,MRI,1,11/15/06,LMCI,12/14/06,68.5,Male,16,Hisp/Latino,White,0,25,False,33,MCI
498,854ebdaec732e67ad63a56a452563e00,016_S_1117,1117,46384,MRI,1,12-01-2006,LMCI,12-11-2006,68.9,Female,18,Hisp/Latino,White,0,26,True,33,MCI
502,3ef0dd85a37e4e1a01549f5b3da823b8,016_S_1121,1121,96234,MRI,1,12-06-2006,LMCI,12/28/06,56.2,Female,18,Hisp/Latino,White,0,24,False,33,MCI
503,3cb62964c907d3a51d87d0015ae305f1,003_S_1122,1122,52799,MRI,1,12-06-2006,LMCI,01-12-2007,76.6,Female,14,Hisp/Latino,White,0,28,False,33,MCI
570,60558ad84adb5792f213b1943e6e9a76,002_S_1280,1280,60056,MRI,1,2/13/07,CN,2/27/07,70.7,Female,14,Hisp/Latino,White,1,30,False,34,CN


In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to train the 1st stage Gaussian Naive Bayes classifier
def train_stage1_classifier(X_train, y_train):
    
    # Define hyperparameters grid for Gaussian Naive Bayes (Stage 1 Classifier)
    # gnb_param_grid = {'alpha': [0.1, 0.5, 1.0]}
    gnb_param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7]}
    
    # Perform grid search for Gaussian Naive Bayes
    model_stage1 = GridSearchCV(GaussianNB(), gnb_param_grid, cv=5)
    model_stage1.fit(X_train, y_train)
    model_stage1 = model_stage1.best_estimator_
    
    # model_stage1 = GaussianNB()
    # model_stage1.fit(X_train, y_train)
    return model_stage1

# Function to train the 2nd stage SVM or KNN classifier
def train_stage2_classifier(X_train, y_train, classification_type):
    if classification_type == 'SVM':

        # Define hyperparameters grid for SVM (Stage 2 Classifier)
        svm_param_grid = {'C': [0.1, 1, 10],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['linear', 'poly', 'rbf']}
        
        # Perform grid search for SVM
        model_stage2 = GridSearchCV(SVC(), svm_param_grid, cv=5)
        model_stage2.fit(X_train, y_train)
        model_stage2 = model_stage2.best_estimator_
        
        # model_stage2 = SVC()
    elif classification_type == 'KNN':

        knn_param_grid = {'n_neighbors': [3, 5, 7],
                          'p': [1, 2, 3],
                          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
        
        # Perform grid search for KNN
        model_stage2 = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5)
        model_stage2.fit(X_train, y_train)
        model_stage2 = model_stage2.best_estimator_
        
        # model_stage2 = KNeighborsClassifier()
    else:
        raise ValueError("Invalid classification type. Must be 'SVM' or 'KNN'.")
    
    # model_stage2.fit(X_train, y_train)
    return model_stage2

# Function for multistage classification
def multistage_classifier(input_object, model_stage1, model_stage2_svm, model_stage2_knn):
    result_stage1 = model_stage1.predict_proba([input_object])
    
    if result_stage1[0][0] >= 0.5:
        # High confidence in AD or MCI/NC classification, no need for further stage
        print("****",result_stage1)
        
        return 'AD'
    else:
        ma = max(result_stage1[0][:])
        print("max is ", ma)
        if ma>=0.5:
            # Uncertain classification from the first stage, proceed to the second stage
            result_stage2_svm = model_stage2_svm.predict([input_object])
            
            if result_stage2_svm[0] == 'LMCI':
                return 'LMCI'
            # elif result_stage2_svm[0] == 'CN':
            #     return 'CN'
            else:
                return 'CN'
                # return result_stage2_svm[0]
        else:
            result_stage2_knn = model_stage2_knn.predict([input_object])
            return result_stage2_knn[0]


In [9]:
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier

# # Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Function to train the 1st stage Gaussian Naive Bayes classifier
# def train_stage1_classifier(X_train, y_train):
#     model_stage1 = GaussianNB()
#     model_stage1.fit(X_train, y_train)
#     return model_stage1

# # Function to train the 2nd stage SVM or KNN classifier
# def train_stage2_classifier(X_train, y_train, classification_type):
#     if classification_type == 'SVM':
#         model_stage2 = SVC()
#     elif classification_type == 'KNN':
#         model_stage2 = KNeighborsClassifier()
#     else:
#         raise ValueError("Invalid classification type. Must be 'SVM' or 'KNN'.")
    
#     model_stage2.fit(X_train, y_train)
#     return model_stage2

# # Function for multistage classification
# def multistage_classifier(input_object, model_stage1, model_stage2_svm, model_stage2_knn):
#     result_stage1 = model_stage1.predict_proba([input_object])
    
#     if result_stage1[0][0] >= 0.5:
#         # High confidence in AD or MCI/NC classification, no need for further stage
#         print("****",result_stage1)
        
#         return 'AD'
#     else:
#         ma = max(result_stage1[0][:])
#         print("max is ", ma)
#         if ma>=0.7:
#             # Uncertain classification from the first stage, proceed to the second stage
#             result_stage2_svm = model_stage2_svm.predict([input_object])
            
#             if result_stage2_svm[0] == 'LMCI':
#                 return 'LMCI'
#             # elif result_stage2_svm[0] == 'CN':
#             #     return 'CN'
#             else:
#                 # return 'CN'
#                 return result_stage2_svm[0]
#         else:
#             result_stage2_knn = model_stage2_knn.predict([input_object])
#             return result_stage2_knn[0]


In [10]:
y_train.value_counts()

DX.bl
LMCI    244
CN      148
AD      109
Name: count, dtype: int64

In [11]:

# Convert X_train and X_test to NumPy arrays
X_train = X_train.values
X_test = X_test.values

In [12]:
# Train the 1st stage Gaussian Naive Bayes classifier
model_stage1 = train_stage1_classifier(X_train, y_train)

# Train the 2nd stage SVM classifier
model_stage2_svm = train_stage2_classifier(X_train, y_train, 'SVM')

# Train the 2nd stage KNN classifier
model_stage2_knn = train_stage2_classifier(X_train, y_train, 'KNN')




In [13]:
y_test

581    LMCI
591    LMCI
551    LMCI
213    LMCI
485      CN
       ... 
536      AD
319      CN
218      AD
344      AD
383    LMCI
Name: DX.bl, Length: 126, dtype: object

In [14]:
# # Evaluate on test data
# y_pred = []
# for sample in X_test.values:
#     y_pred.append(multistage_classifier(list(sample), model_stage1, model_stage2_svm, model_stage2_knn))



# Evaluate on test data
y_pred = []
for sample in X_test:
    # print(list(sample))
    y_pred.append(multistage_classifier(list(sample), model_stage1, model_stage2_svm, model_stage2_knn))
    # print(y_pred[-1])



max is  0.8559123469471349
**** [[1. 0. 0.]]
max is  0.7059871540203078
max is  0.8606997047322006
max is  0.7323925847330378
max is  0.9009223927466871
max is  0.6145370095444181
**** [[9.67344295e-01 2.17995113e-11 3.26557054e-02]]
**** [[1. 0. 0.]]
max is  0.8501283647524581
max is  0.8369063774552392
max is  0.9997202650964372
max is  0.9216399635801018
max is  0.8629006485476778
max is  0.8103648147582917
max is  0.927873051205026
max is  0.8983928625827877
**** [[9.81490599e-01 6.36849326e-15 1.85094007e-02]]
max is  0.8497325750392659
max is  0.5010166806043213
max is  0.5505031173839995
**** [[6.05711275e-01 3.63967674e-09 3.94288721e-01]]
max is  0.8921693288828029
max is  0.7806924559581367
**** [[9.94785273e-01 1.15326559e-15 5.21472709e-03]]
max is  0.516548273971117
max is  0.5930004070949076
max is  0.5708758488259633
max is  0.4939073445370629
max is  0.8744258748694883
max is  0.9963275788059123
**** [[8.60742755e-01 2.98246604e-06 1.39254262e-01]]
max is  0.60923847940

In [15]:
y_test.values

array(['LMCI', 'LMCI', 'LMCI', 'LMCI', 'CN', 'CN', 'LMCI', 'AD', 'CN',
       'LMCI', 'CN', 'CN', 'CN', 'LMCI', 'LMCI', 'CN', 'CN', 'LMCI',
       'LMCI', 'LMCI', 'LMCI', 'AD', 'LMCI', 'LMCI', 'AD', 'LMCI', 'CN',
       'CN', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'AD', 'CN', 'LMCI', 'CN',
       'AD', 'LMCI', 'LMCI', 'CN', 'AD', 'CN', 'CN', 'LMCI', 'AD', 'AD',
       'LMCI', 'CN', 'LMCI', 'LMCI', 'CN', 'CN', 'CN', 'LMCI', 'LMCI',
       'CN', 'CN', 'AD', 'CN', 'CN', 'LMCI', 'AD', 'LMCI', 'AD', 'LMCI',
       'LMCI', 'CN', 'CN', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'CN',
       'AD', 'LMCI', 'LMCI', 'CN', 'AD', 'LMCI', 'CN', 'CN', 'LMCI',
       'LMCI', 'LMCI', 'LMCI', 'LMCI', 'CN', 'CN', 'CN', 'LMCI', 'LMCI',
       'LMCI', 'CN', 'LMCI', 'CN', 'LMCI', 'AD', 'CN', 'AD', 'LMCI', 'CN',
       'CN', 'LMCI', 'LMCI', 'AD', 'CN', 'LMCI', 'AD', 'CN', 'AD', 'LMCI',
       'LMCI', 'CN', 'AD', 'LMCI', 'LMCI', 'AD', 'AD', 'LMCI', 'CN', 'AD',
       'CN', 'AD', 'AD', 'LMCI'], dtype=object)

In [16]:
pd.DataFrame(y_pred).values

array([['LMCI'],
       ['AD'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['AD'],
       ['AD'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['AD'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['AD'],
       ['CN'],
       ['LMCI'],
       ['AD'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['AD'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['AD'],
       ['CN'],
       ['CN'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['AD'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['CN'],
       ['AD'],
       ['LMC

In [17]:
pd.DataFrame(y_pred).value_counts()

LMCI    52
CN      50
AD      24
Name: count, dtype: int64

In [18]:
# import numpy as np

# # Initialize an empty list to store flattened predictions
# flattened_predictions = []

# # Iterate over each element in y_pred
# for pred in y_pred:
#     # Check if the element is an array
#     if isinstance(pred, np.ndarray):
#         # Flatten the array and append it to the list
#         flattened_predictions.extend(pred.flatten())
#     else:
#         # If it's not an array, simply append it to the list
#         flattened_predictions.append(pred)

# # Convert the list to a numpy array
# y_pred = np.array(flattened_predictions)

In [19]:
yp=pd.DataFrame(y_pred)
yp.value_counts()

LMCI    52
CN      50
AD      24
Name: count, dtype: int64

In [20]:
yt=pd.DataFrame(y_test)
yt.value_counts()
# len(y_test)

DX.bl
LMCI     60
CN       42
AD       24
Name: count, dtype: int64

In [21]:
print("Classification Report:")
print(classification_report(yp, yt))

Classification Report:
              precision    recall  f1-score   support

          AD       0.67      0.67      0.67        24
          CN       0.76      0.64      0.70        50
        LMCI       0.60      0.69      0.64        52

    accuracy                           0.67       126
   macro avg       0.68      0.67      0.67       126
weighted avg       0.68      0.67      0.67       126



In [22]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6666666666666666


In [23]:
print(accuracy_score(y_test, model_stage1.predict(X_test)))

0.6746031746031746


In [24]:
print(accuracy_score(y_test, model_stage2_svm.predict(X_test)))

0.6746031746031746


In [25]:
print(accuracy_score(y_test, model_stage2_knn.predict(X_test)))

0.6111111111111112
