In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import numpy as np

# Function for preprocessing data
def preprocess_data(data):
    """Turn the raw ADNI table into a feature frame and a diagnosis label series.

    Steps, in order:
      1. drop identifier / bookkeeping columns,
      2. mean-impute AGE, PTEDUCAT and MMSE,
      3. integer-encode the categorical columns,
      4. discard rows whose 'imputed_genotype' is missing,
      5. split off 'DX.bl' as the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table; must contain every column named below. The frame passed in
        is not modified because the first step already produces a new frame.

    Returns
    -------
    (X, y) : tuple
        X is the feature DataFrame (column labels cast to str) and y is the
        'DX.bl' column aligned with X.

    NOTE(review): the imputer is fitted on whatever frame is passed in; when
    that is the full dataset (before the train/test split) test rows
    contribute to the imputation means.
    """
    # Identifiers, dates and the duplicate diagnosis column are not used as features.
    data = data.drop(
        columns=[
            'directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality',
            'Visit', 'Acq.Date', 'EXAMDATE', 'Dx Codes for Submission',
        ]
    )

    # Fill gaps in the numeric columns with the per-column mean.
    numeric_columns = ["AGE", "PTEDUCAT", "MMSE"]
    data[numeric_columns] = SimpleImputer(strategy="mean").fit_transform(data[numeric_columns])

    # Each categorical column gets its own integer coding.
    for column in ("PTGENDER", "PTETHCAT", "PTRACCAT", "APOE Genotype"):
        data[column] = LabelEncoder().fit_transform(data[column])

    # Feature scaling is intentionally disabled:
    # scaler = StandardScaler()
    # data[["AGE", "MMSE"]] = scaler.fit_transform(data[["AGE", "MMSE"]])

    # Keep only rows that have a value in 'imputed_genotype'.
    has_genotype_flag = data['imputed_genotype'].notna()
    data = data[has_genotype_flag]

    # Target is the baseline diagnosis; everything else is a feature.
    y = data["DX.bl"]
    X = data.drop(columns="DX.bl")
    X.columns = X.columns.astype(str)

    print(data.head())

    return X, y

# Read the raw ADNI training table from the working directory
data = pd.read_csv(
    "ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv"
)

# Preprocess a deep copy so `data` keeps the raw columns for the inspection cells below
X, y = preprocess_data(data.copy(deep=True))


  DX.bl   AGE  PTGENDER  PTEDUCAT  PTETHCAT  PTRACCAT  APOE4  MMSE  \
0    AD  81.3         1      18.0         1         2      1  20.0   
1  LMCI  67.5         1      10.0         0         2      0  27.0   
2    CN  73.7         1      16.0         1         2      0  29.0   
3  LMCI  80.4         0      13.0         1         2      0  25.0   
4    AD  73.9         0      12.0         1         2      1  24.0   

  imputed_genotype  APOE Genotype  
0             True              4  
1            False              3  
2             True              3  
3             True              3  
4             True              4  


In [None]:
y

0        AD
1      LMCI
2        CN
3      LMCI
4        AD
       ... 
623    LMCI
624    LMCI
625    LMCI
626    LMCI
627    LMCI
Name: DX.bl, Length: 627, dtype: object

In [None]:
data.head()

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
0,4702245ea294ce5d4e9b8a87027dfdf4,011_S_0003,3,32237,MRI,1,09-01-2005,AD,09-12-2005,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34,AD
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/05,LMCI,11-08-2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
2,90419199306997753de8042f1fd55e38,011_S_0005,5,32246,MRI,1,09-02-2005,CN,09-07-2005,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33,CN
3,d8d175ffff1e2053e6a18c5df494ccdf,100_S_0006,6,33025,MRI,1,11/15/05,LMCI,11/29/05,80.4,Female,13,Not Hisp/Latino,White,0,25,True,33,MCI
4,986e75b2e604cd44b38feb2188476fb2,011_S_0010,10,32270,MRI,1,11-07-2005,AD,11-10-2005,73.9,Female,12,Not Hisp/Latino,White,1,24,True,34,AD


In [None]:
# Print the frequency table of every raw column, one after another
for _column_name, column_values in data.items():
    print(column_values.value_counts())

directory.id
4702245ea294ce5d4e9b8a87027dfdf4    1
982b5f23e7543256a2108a9d83daad03    1
37fb3790019f18d49bf8c91dca84f28d    1
9cd1f789b47ecb56c4c12d668e96eb17    1
8e3f79c75189702313ea21de877befc0    1
                                   ..
491bea0789af05de0b3c53225717cef6    1
381abe3ee0203923915332e08c3ac81d    1
44d2cdcb432b0560a16a6e8f7d3b3dee    1
d6bb5e19b1844cfedc6438c5f10b96bf    1
ee3a9cb8b2dfbb1e3649e8485e41f855    1
Name: count, Length: 628, dtype: int64
Subject
011_S_0003    1
082_S_0928    1
053_S_0919    1
033_S_0920    1
094_S_0921    1
             ..
131_S_0457    1
114_S_0458    1
137_S_0459    1
027_S_0461    1
127_S_1427    1
Name: count, Length: 628, dtype: int64
RID
3       1
928     1
919     1
920     1
921     1
       ..
457     1
458     1
459     1
461     1
1427    1
Name: count, Length: 628, dtype: int64
Image.Data.ID
32237    1
39805    1
65689    1
42481    1
49510    1
        ..
92406    1
39845    1
46629    1
34231    1
91126    1
Name: count, Length

In [None]:
X.head()

Unnamed: 0,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype
0,81.3,1,18.0,1,2,1,20.0,True,4
1,67.5,1,10.0,0,2,0,27.0,False,3
2,73.7,1,16.0,1,2,0,29.0,True,3
3,80.4,0,13.0,1,2,0,25.0,True,3
4,73.9,0,12.0,1,2,1,24.0,True,4


In [None]:
data.head()

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
0,4702245ea294ce5d4e9b8a87027dfdf4,011_S_0003,3,32237,MRI,1,09-01-2005,AD,09-12-2005,81.3,Male,18,Not Hisp/Latino,White,1,20,True,34,AD
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/05,LMCI,11-08-2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
2,90419199306997753de8042f1fd55e38,011_S_0005,5,32246,MRI,1,09-02-2005,CN,09-07-2005,73.7,Male,16,Not Hisp/Latino,White,0,29,True,33,CN
3,d8d175ffff1e2053e6a18c5df494ccdf,100_S_0006,6,33025,MRI,1,11/15/05,LMCI,11/29/05,80.4,Female,13,Not Hisp/Latino,White,0,25,True,33,MCI
4,986e75b2e604cd44b38feb2188476fb2,011_S_0010,10,32270,MRI,1,11-07-2005,AD,11-10-2005,73.9,Female,12,Not Hisp/Latino,White,1,24,True,34,AD


In [None]:
data[data['PTETHCAT']=="Hisp/Latino"]

Unnamed: 0,directory.id,Subject,RID,Image.Data.ID,Modality,Visit,Acq.Date,DX.bl,EXAMDATE,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype,Dx Codes for Submission
1,2e89e352af743597b2368c412e0f6de2,022_S_0004,4,64631,MRI,1,9/22/05,LMCI,11-08-2005,67.5,Male,10,Hisp/Latino,White,0,27,False,33,MCI
5,1b4f75db908c740500a9f46c409b8a30,022_S_0014,14,59375,MRI,1,9/29/05,CN,11-04-2005,78.5,Female,12,Hisp/Latino,White,0,29,False,33,CN
462,4c38f5b5523467ce2cab128cfaf0a991,016_S_1028,1028,40799,MRI,1,11-02-2006,LMCI,11/29/06,76.9,Female,7,Hisp/Latino,White,1,25,True,34,MCI
469,885a8003a39878dab23aa8d98c1ea71c,128_S_1043,1043,69091,MRI,1,11/15/06,LMCI,12/14/06,68.5,Male,16,Hisp/Latino,White,0,25,False,33,MCI
498,854ebdaec732e67ad63a56a452563e00,016_S_1117,1117,46384,MRI,1,12-01-2006,LMCI,12-11-2006,68.9,Female,18,Hisp/Latino,White,0,26,True,33,MCI
502,3ef0dd85a37e4e1a01549f5b3da823b8,016_S_1121,1121,96234,MRI,1,12-06-2006,LMCI,12/28/06,56.2,Female,18,Hisp/Latino,White,0,24,False,33,MCI
503,3cb62964c907d3a51d87d0015ae305f1,003_S_1122,1122,52799,MRI,1,12-06-2006,LMCI,01-12-2007,76.6,Female,14,Hisp/Latino,White,0,28,False,33,MCI
570,60558ad84adb5792f213b1943e6e9a76,002_S_1280,1280,60056,MRI,1,2/13/07,CN,2/27/07,70.7,Female,14,Hisp/Latino,White,1,30,False,34,CN


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

# Function to train the 1st stage Gaussian Naive Bayes classifier
def train_stage1_classifier(X_train, y_train):
    """Fit the stage-1 Gaussian Naive Bayes model with a small grid search.

    Runs a 5-fold cross-validated search over ``var_smoothing`` and returns
    the best estimator found by that search.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    search = GridSearchCV(
        estimator=GaussianNB(),
        param_grid={'var_smoothing': [1e-9, 1e-8, 1e-7]},
        cv=5,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

# Function to train the 2nd stage SVM or KNN classifier
def train_stage2_classifier(X_train, y_train, classification_type):
    """Fit a stage-2 classifier ('SVM' or 'KNN') via 5-fold grid search.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    classification_type : str
        'SVM' to tune an ``SVC``; 'KNN' to tune a ``KNeighborsClassifier``.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``classification_type`` is neither 'SVM' nor 'KNN'.
    """
    # Reject unknown kinds up front, before any estimator is built.
    if classification_type != 'SVM' and classification_type != 'KNN':
        raise ValueError("Invalid classification type. Must be 'SVM' or 'KNN'.")

    if classification_type == 'SVM':
        estimator = SVC()
        candidate_params = {
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto'],
            'kernel': ['linear', 'poly', 'rbf'],
        }
    else:
        estimator = KNeighborsClassifier()
        candidate_params = {
            'n_neighbors': [3, 5, 7],
            'p': [1, 2, 3],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        }

    # Both kinds share the same search-and-pick-best procedure.
    search = GridSearchCV(estimator, candidate_params, cv=5)
    search.fit(X_train, y_train)
    return search.best_estimator_

# Function for multistage classification
def multistage_classifier(input_object, model_stage1, model_stage2_svm, model_stage2_knn,
                          threshold=0.5, verbose=True):
    """Classify one sample as 'AD', 'LMCI' or 'CN' with a two-stage cascade.

    Decision rule:
      * If stage 1 gives P(AD) >= ``threshold``, return 'AD' immediately.
      * Otherwise, if stage 1's largest class probability is >= ``threshold``,
        ask the SVM. Its answer is collapsed to two outcomes: 'LMCI' stays
        'LMCI', every other label (including an 'AD' vote) becomes 'CN'.
      * Otherwise (stage 1 has no class at or above the threshold), return
        whatever the KNN model predicts.

    Parameters
    ----------
    input_object : sequence
        Feature values of a single sample, in training column order.
    model_stage1 : object
        Fitted model exposing ``predict_proba``. If it also exposes
        ``classes_``, the 'AD' probability column is located through it;
        without ``classes_`` column 0 is assumed to be 'AD'.
    model_stage2_svm, model_stage2_knn : object
        Fitted models exposing ``predict``.
    threshold : float, default 0.5
        Probability cut-off used in both stage-1 checks.
    verbose : bool, default True
        When True, print the stage-1 diagnostics (same text as before).

    Returns
    -------
    The predicted label: 'AD', 'LMCI', 'CN', or the KNN model's raw prediction.
    """
    result_stage1 = model_stage1.predict_proba([input_object])
    probabilities = result_stage1[0]

    # predict_proba columns follow model.classes_; look 'AD' up instead of
    # assuming it is always the first column.
    classes = getattr(model_stage1, 'classes_', None)
    if classes is None:
        ad_probability = probabilities[0]
    else:
        labels = list(classes)
        # A model that never saw 'AD' cannot assign it any probability.
        ad_probability = probabilities[labels.index('AD')] if 'AD' in labels else 0.0

    if ad_probability >= threshold:
        # Stage 1 alone is confident enough that the sample is AD.
        if verbose:
            print("****", result_stage1)
        return 'AD'

    ma = max(probabilities)
    if verbose:
        print("max is ", ma)

    if ma >= threshold:
        # Stage 1 favours a non-AD class: let the SVM choose between LMCI and CN.
        result_stage2_svm = model_stage2_svm.predict([input_object])
        return 'LMCI' if result_stage2_svm[0] == 'LMCI' else 'CN'

    # Stage 1 is undecided across all classes: fall back to KNN's label.
    result_stage2_knn = model_stage2_knn.predict([input_object])
    return result_stage2_knn[0]


In [None]:
y_train.value_counts()

DX.bl
LMCI    246
CN      148
AD      107
Name: count, dtype: int64

In [None]:

# Convert X_train and X_test to NumPy arrays.
# np.asarray leaves an existing ndarray untouched, so this cell can be re-run
# safely; `.values` would raise AttributeError the second time because the
# names already hold ndarrays.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [None]:
# Stage 1: tuned Gaussian Naive Bayes
model_stage1 = train_stage1_classifier(X_train, y_train)

# Stage 2: tuned SVM first, then tuned KNN, both on the same training data
model_stage2_svm, model_stage2_knn = (
    train_stage2_classifier(X_train, y_train, stage2_kind)
    for stage2_kind in ('SVM', 'KNN')
)




In [None]:
y_test

71       CN
473    LMCI
415    LMCI
517      AD
200      CN
       ... 
243      CN
457      CN
401      AD
455      CN
239    LMCI
Name: DX.bl, Length: 126, dtype: object

In [None]:
# Run every held-out sample through the cascade and collect the predicted labels
y_pred = [
    multistage_classifier(list(features), model_stage1, model_stage2_svm, model_stage2_knn)
    for features in X_test
]



max is  0.7331360877232674
**** [[0.54740877 0.00087748 0.45171376]]
max is  0.9190313176537951
**** [[9.65311344e-01 7.22046365e-12 3.46886563e-02]]
max is  0.7807288608268641
max is  0.8246361184356181
max is  0.7535950316866202
max is  0.5379078098442936
max is  0.647541766347202
max is  0.7303337356537529
**** [[9.92348067e-01 7.57191607e-17 7.65193310e-03]]
**** [[9.38310596e-01 3.40062849e-08 6.16893704e-02]]
max is  0.8259152879420532
**** [[6.65559032e-01 2.44943894e-06 3.34438519e-01]]
**** [[9.76572323e-01 1.35928698e-12 2.34276770e-02]]
max is  0.7439445069757239
**** [[9.29261616e-01 9.86136980e-09 7.07383739e-02]]
max is  0.8524698509354722
max is  0.8174065627896172
max is  0.7926256338665808
max is  0.6084623412210033
max is  0.8892540156041462
max is  0.8177950930557301
max is  0.8341096568508968
max is  0.8391006523047999
**** [[8.63686156e-01 2.90233257e-08 1.36313814e-01]]
**** [[9.80455204e-01 2.16712751e-14 1.95447960e-02]]
**** [[6.82720067e-01 3.70544670e-06 3.17

In [None]:
y_test.values

array(['CN', 'LMCI', 'LMCI', 'AD', 'CN', 'CN', 'LMCI', 'CN', 'LMCI',
       'LMCI', 'AD', 'AD', 'LMCI', 'AD', 'AD', 'LMCI', 'AD', 'CN', 'CN',
       'CN', 'LMCI', 'LMCI', 'CN', 'LMCI', 'CN', 'LMCI', 'AD', 'LMCI',
       'LMCI', 'LMCI', 'CN', 'CN', 'LMCI', 'AD', 'CN', 'LMCI', 'LMCI',
       'AD', 'LMCI', 'CN', 'CN', 'LMCI', 'AD', 'CN', 'LMCI', 'CN', 'CN',
       'LMCI', 'AD', 'LMCI', 'CN', 'LMCI', 'CN', 'CN', 'AD', 'LMCI',
       'LMCI', 'CN', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'AD',
       'CN', 'AD', 'CN', 'CN', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'LMCI',
       'AD', 'CN', 'AD', 'LMCI', 'LMCI', 'LMCI', 'LMCI', 'AD', 'LMCI',
       'CN', 'LMCI', 'LMCI', 'AD', 'LMCI', 'LMCI', 'CN', 'CN', 'CN',
       'LMCI', 'LMCI', 'LMCI', 'LMCI', 'CN', 'CN', 'AD', 'AD', 'LMCI',
       'LMCI', 'LMCI', 'LMCI', 'CN', 'AD', 'CN', 'CN', 'CN', 'CN', 'AD',
       'CN', 'CN', 'LMCI', 'AD', 'LMCI', 'LMCI', 'CN', 'AD', 'LMCI', 'AD',
       'CN', 'CN', 'AD', 'CN', 'LMCI'], dtype=object)

In [None]:
pd.DataFrame(y_pred).values

array([['CN'],
       ['AD'],
       ['LMCI'],
       ['AD'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['AD'],
       ['AD'],
       ['LMCI'],
       ['AD'],
       ['AD'],
       ['LMCI'],
       ['AD'],
       ['CN'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['CN'],
       ['AD'],
       ['AD'],
       ['AD'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['AD'],
       ['CN'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['AD'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
       ['AD'],
       ['AD'],
       ['CN'],
       ['CN'],
       ['LMCI'],
       ['CN'],
       ['AD'],
       ['LMCI'],
       ['AD'],
       ['CN'],
       ['LMCI'],
       ['LMCI'],
       ['LMCI'],
       ['CN'],
       ['LMCI'],
 

In [None]:
pd.DataFrame(y_pred).value_counts()

LMCI    55
CN      41
AD      30
Name: count, dtype: int64

In [None]:
# import numpy as np

# # Initialize an empty list to store flattened predictions
# flattened_predictions = []

# # Iterate over each element in y_pred
# for pred in y_pred:
#     # Check if the element is an array
#     if isinstance(pred, np.ndarray):
#         # Flatten the array and append it to the list
#         flattened_predictions.extend(pred.flatten())
#     else:
#         # If it's not an array, simply append it to the list
#         flattened_predictions.append(pred)

# # Convert the list to a numpy array
# y_pred = np.array(flattened_predictions)

In [None]:
yp=pd.DataFrame(y_pred)
yp.value_counts()

LMCI    55
CN      41
AD      30
Name: count, dtype: int64

In [None]:
yt=pd.DataFrame(y_test)
yt.value_counts()
# len(y_test)

DX.bl
LMCI     58
CN       42
AD       26
Name: count, dtype: int64

In [None]:
print("Classification Report:")
# classification_report expects (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports support for the predicted classes.
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

          AD       0.77      0.67      0.71        30
          CN       0.79      0.80      0.80        41
        LMCI       0.69      0.73      0.71        55

    accuracy                           0.74       126
   macro avg       0.75      0.73      0.74       126
weighted avg       0.74      0.74      0.74       126



In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7380952380952381


In [None]:
print(accuracy_score(y_test, model_stage1.predict(X_test)))

0.7222222222222222


In [None]:
print(accuracy_score(y_test, model_stage2_svm.predict(X_test)))

0.7619047619047619


In [None]:
print(accuracy_score(y_test, model_stage2_knn.predict(X_test)))

0.6746031746031746


In [None]:
X.head()
# y.head()

Unnamed: 0,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,imputed_genotype,APOE Genotype
0,0.917438,1,18.0,1,2,1,-2.703157,True,4
1,-1.14778,1,10.0,0,2,0,0.031099,False,3
2,-0.219929,1,16.0,1,2,0,0.812315,True,3
3,0.78275,0,13.0,1,2,0,-0.750117,True,3
4,-0.189998,0,12.0,1,2,1,-1.140725,True,4


In [None]:
# Take the sanity-check sample straight from X (row position 2) so its feature
# values always match the preprocessing the models were trained on. Hard-coded
# numbers go stale whenever preprocessing changes (e.g. scaling turned on/off).
ls = X.iloc[2].tolist()
ls

[-0.219929, 1, 16.0, 1, 2, 0, 0.812315, True, 3]

In [None]:
# check
multistage_classifier(list(ls), model_stage1, model_stage2_svm, model_stage2_knn)

max is  0.7310278936350827


'CN'