In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns



In [113]:
# 1. Load Data
df = pd.read_csv('RealisticSyntheticCognitiveData.csv')

In [114]:
# 3. Drop unnecessary columns
df.drop(columns=['participant_id'], inplace=True) #Inplace means it updates the data within the dataset

In [115]:
df.head(10)

Unnamed: 0,age,round1Correct,round2Correct,round3Correct,round1AverageResponseTime,round2AverageResponseTime,round3AverageResponseTime,round1TotalTime,round2TotalTime,round3TotalTime,totalCorrect,round2AudioReplays,overallTotalTime,diagnosis
0,57,7,10,5,9.847178,14.607397,5.0,94.416383,237.568961,43.631683,23,0,375.617026,no_alzheimers
1,36,8,15,3,11.064824,25.944789,14.538293,100.711052,395.98753,122.565326,27,0,619.263909,no_alzheimers
2,59,6,13,6,20.035533,20.321137,16.214687,188.486412,314.868772,141.127056,25,0,644.482241,no_alzheimers
3,36,4,15,6,18.161514,27.803161,15.817027,164.866756,425.944204,129.309475,25,0,720.120434,no_alzheimers
4,42,8,15,6,22.532329,20.191799,21.461666,206.494956,324.545773,181.403723,29,0,712.444452,no_alzheimers
5,48,6,15,6,14.189903,16.830218,20.180438,128.440203,257.602644,165.250833,28,0,551.29368,no_alzheimers
6,55,3,14,6,14.708611,24.704999,24.605838,137.792153,384.033585,206.665499,25,0,728.491238,no_alzheimers
7,21,7,14,1,9.739329,17.719409,11.392918,92.210542,281.35548,96.858788,23,0,470.42481,no_alzheimers
8,62,4,14,2,16.009272,18.088236,20.41119,151.245513,278.387837,165.952698,21,0,595.586048,no_alzheimers
9,49,6,15,4,19.013531,20.730888,22.726286,183.364431,318.612106,185.523095,25,1,687.499632,no_alzheimers


In [116]:
# 4. Check for nulls
print("Missing values:\n", df.isnull().sum())

Missing values:
 age                          0
round1Correct                0
round2Correct                0
round3Correct                0
round1AverageResponseTime    0
round2AverageResponseTime    0
round3AverageResponseTime    0
round1TotalTime              0
round2TotalTime              0
round3TotalTime              0
totalCorrect                 0
round2AudioReplays           0
overallTotalTime             0
diagnosis                    0
dtype: int64


In [117]:
# 5. Encode target
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

In [118]:
# 6. Feature & Target separation
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

In [119]:
# 7. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [120]:
# 8. Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [121]:
print(X_test_scaled.shape)

(160, 13)


In [122]:
# 9. Define models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='rbf', probability=True)
}

In [123]:
# 10. Train and evaluate
for name, model in models.items():
    print(f"\n🔍 Model: {name}")
    model.fit(X_train_scaled, y_train)

    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Accuracy scores
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy:     {test_acc:.4f}")

    # Updated: Classification Report with 3 classes
    print("Classification Report (Test Set):")
    print(classification_report(
        y_test, y_test_pred,
        target_names=['no_alzheimers', 'maybe_alzheimers', 'alzheimers']
    ))





🔍 Model: Decision Tree
Training Accuracy: 1.0000
Test Accuracy:     0.9563
Classification Report (Test Set):
                  precision    recall  f1-score   support

   no_alzheimers       0.94      1.00      0.97        32
maybe_alzheimers       0.90      0.88      0.89        32
      alzheimers       0.98      0.97      0.97        96

        accuracy                           0.96       160
       macro avg       0.94      0.95      0.94       160
    weighted avg       0.96      0.96      0.96       160


🔍 Model: Random Forest
Training Accuracy: 1.0000
Test Accuracy:     0.9812
Classification Report (Test Set):
                  precision    recall  f1-score   support

   no_alzheimers       0.97      0.94      0.95        32
maybe_alzheimers       0.94      0.97      0.95        32
      alzheimers       1.00      1.00      1.00        96

        accuracy                           0.98       160
       macro avg       0.97      0.97      0.97       160
    weighted avg     

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
