## Classifier code

In [2]:
#Import statements
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, balanced_accuracy_score, auc
from sklearn.model_selection import cross_val_predict, LeaveOneOut, train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.lines as mlines
# Print the installed PyTorch version so the run environment is recorded with the notebook output.
print(torch.__version__)

2.3.0


#### Load the unlabeled training and testing datasets, and the diagnosis-labeled test data

In [None]:
# Label tables (CSV): the first column of each file is used as the row index.
train_csv = pd.read_csv('model_data/train_brain_labels_deid.csv', index_col=0)
test_csv = pd.read_csv('model_data/test_brain_labels_deid.csv', index_col=0)
nd_csv = pd.read_csv('model_data/nd_brain_labels_deid.csv', index_col=0)

# Matching scaled feature tables (Parquet), one per label table above.
train_parquet_scaled = pd.read_parquet('model_data/train_brain_data_deid_scaled.parquet')
test_parquet_scaled = pd.read_parquet('model_data/test_brain_data_deid_scaled.parquet')
nd_parquet_scaled = pd.read_parquet('model_data/nd_brain_data_deid_scaled.parquet')

#### Input shortened labeled dataset

Retain only data with diagnosis of ad, bvftd, cu, or dlb, and age and sex information

In [None]:
# Filtered label table; the first CSV column becomes the row index.
nd_csv_short = pd.read_csv(
    'model_data/nd_filtered_data.csv',
    index_col=0,
)

#### Remove sex and age labels

In [None]:
# Columns to keep: age plus the four diagnosis indicator columns.
desired_columns = ['age_at_scan', 'ad', 'bvftd', 'cu', 'dlb']

# Selecting only `desired_columns` drops every other column of nd_csv_short
# (including the sex label). No rows are removed here.
nd_csv_nsex = nd_csv_short[desired_columns].copy()

# This cell used to derive "removed" columns from nd_csv_nsex itself and loop
# over them to filter rows. Because nd_csv_nsex already contains exactly
# `desired_columns`, that set was always empty and the loop never executed.
# The name is kept (as an empty list) so any later reference still resolves.
# NOTE(review): if rows carrying other diagnoses were meant to be excluded,
# that has to be done against nd_csv_short before the column selection.
nd_removed_columns = []

# Diagnosis indicators only (age removed); used to derive the class labels.
nd_csv_nage = nd_csv_nsex.drop(columns='age_at_scan')

In [None]:
# Load the 16-dim embedding table and discard the unnamed index column
# that was written out with the CSV.
x_data = (
    pd.read_csv('model_data/model_16dim_short_test_embeddings_scaled.csv')
    .drop(columns='Unnamed: 0')
)

In [None]:
# Feature matrix: the embedding table as a plain ndarray.
X = x_data.values

# Target: for each row, the name of the diagnosis column holding the row
# maximum, encoded to an integer class id.
# NOTE(review): idxmax returns the first column on ties, so a row with no
# diagnosis flagged would silently be labelled 'ad' - confirm the input has
# exactly one positive indicator per row.
# NOTE(review): X and y are paired purely by row position - confirm the
# embeddings file is in the same row order as nd_csv_short.
diagnosis_names = nd_csv_nage.idxmax(axis=1)
le = LabelEncoder()
y = le.fit_transform(diagnosis_names)

In [None]:
# Hold out 20% of the rows without shuffling, so the split follows the
# existing row order.
# NOTE(review): this split is neither shuffled nor stratified, while the
# ensemble and PCA sections use a shuffled split (random_state=42) - confirm
# the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

#### Logistic Regression

In [None]:
# Logistic-regression classifier on the embedding features.
# NOTE(review): recent scikit-learn releases emit a deprecation warning when
# the liblinear solver is used with more than two classes - check against the
# installed version.
lr_vae = LogisticRegression(solver='liblinear', C=10)
lr_vae.fit(X=X_train, y=y_train)

In [None]:
# Overall one-vs-rest ROC AUC of the logistic-regression model on the
# held-out rows (roc_auc_score's default averaging is used).
y_pred_proba = lr_vae.predict_proba(X_test)
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba,
    multi_class='ovr',
)
print(f"ROC AUC Score: {roc_auc}")

#### Calculate combined balanced accuracy

In [None]:
# Balanced accuracy of the logistic-regression hard predictions on the
# held-out rows.
y_pred = lr_vae.predict(X_test)
total_balanced_accuracy = balanced_accuracy_score(
    y_true=y_test,
    y_pred=y_pred,
)
print(f'Total combined balanced accuracy: {total_balanced_accuracy:.2f}')

Predict probabilities for each class

Calculate ROC AUC for each class

Print ROC AUC scores for each class

In [None]:
# Per-class (one-vs-rest) ROC AUC for the logistic-regression model.
classes = np.unique(y_test)
y_bin = label_binarize(y_test, classes=classes)
y_pred_proba = lr_vae.predict_proba(X_test)

# predict_proba columns are ordered like lr_vae.classes_ (the classes seen
# during fit), not like np.unique(y_test). Look the column up by class id so
# the scores stay aligned even if the test split is missing a class.
proba_column = {cls: col for col, cls in enumerate(lr_vae.classes_)}

roc_auc_scores = {}
for i, class_label in enumerate(classes):
    class_name = le.inverse_transform([class_label])[0]
    roc_auc_scores[class_name] = roc_auc_score(
        y_bin[:, i],
        y_pred_proba[:, proba_column[class_label]],
    )

for label, score in roc_auc_scores.items():
    print(f"ROC AUC Score for {label}: {score}")

In [None]:
# Label series for every row of nd_csv_short (copies, so later edits do not
# touch the source frame).
dementia_types = ['ad', 'bvftd', 'cu', 'dlb']

age_labels = nd_csv_short['age_at_scan'].copy()
sex_labels = nd_csv_short['sex_female'].copy()

# One indicator series per diagnosis, in the order given by dementia_types.
ad_labels, bvftd_labels, cu_labels, dlb_labels = (
    nd_csv_short[diagnosis].copy() for diagnosis in dementia_types
)
labels = [ad_labels, bvftd_labels, cu_labels, dlb_labels]

In [None]:
# Plot colour assigned to each diagnosis class.
color_dict = dict(
    ad='green',
    bvftd='dodgerblue',
    cu='mediumorchid',
    dlb='tomato',
)

#### K Nearest Neighbors

In [None]:
# 8-nearest-neighbour classifier using cosine distance on the embeddings.
knn_vae = KNeighborsClassifier(metric='cosine', n_neighbors=8)
knn_vae.fit(X=X_train, y=y_train)

In [None]:
# Overall one-vs-rest ROC AUC of the KNN model on the held-out rows.
y_pred_proba = knn_vae.predict_proba(X_test)
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba,
    multi_class='ovr',
)
print(f"ROC AUC Score: {roc_auc}")

#### KNN and LR Ensemble model

In [None]:
# Re-encode the diagnosis names and draw a shuffled 80/20 split (seed 42).
# This re-binds X_train / X_test / y_train / y_test: lr_vae and knn_vae were
# fitted on the earlier unshuffled split, so they should not be evaluated on
# these arrays.
diagnosis_names = nd_csv_nage.idxmax(axis=1)
le = LabelEncoder()
y = le.fit_transform(diagnosis_names)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

One-hot encode the test labels so that each column is a binary indicator for one class

In [None]:
dementia_types = ['ad', 'bvftd', 'cu', 'dlb']
y_test_one_hot = pd.get_dummies(y_test, prefix='class')
y_test_one_hot.columns = dementia_types
y_test_one_hot = y_test_one_hot.astype(int)

ad_labels_test = y_test_one_hot['ad']
bvftd_labels_test = y_test_one_hot['bvftd']
cu_labels_test = y_test_one_hot['cu']
dlb_labels_test = y_test_one_hot['dlb']

labels = [ad_labels_test, bvftd_labels_test, cu_labels_test, dlb_labels_test]

In [None]:
# Fit both base learners on the same training split.
knn_ens = KNeighborsClassifier(metric='cosine', n_neighbors=8)
lr_vae_ens = LogisticRegression(solver='liblinear', C=10)
for base_model in (knn_ens, lr_vae_ens):
    base_model.fit(X_train, y_train)

# Ensemble = unweighted average of the two class-probability matrices.
y_pred_proba_knn_ens = knn_ens.predict_proba(X_test)
y_pred_proba_lr_ens = lr_vae_ens.predict_proba(X_test)
y_pred_proba_ensemble = (y_pred_proba_knn_ens + y_pred_proba_lr_ens) / 2

In [None]:
# Overall one-vs-rest ROC AUC of the averaged ensemble probabilities.
roc_auc_ens = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba_ensemble,
    multi_class='ovr',
)
print(f"Ensemble ROC AUC Score: {roc_auc_ens}")

In [None]:
# Per-class (one-vs-rest) ROC AUC for the KNN + logistic-regression ensemble.
classes = np.unique(y_test)
y_bin = label_binarize(y_test, classes=classes)

# Recomputed here so the cell does not rely on arrays left by other cells.
y_pred_proba_knn = knn_ens.predict_proba(X_test)
y_pred_proba_lr = lr_vae_ens.predict_proba(X_test)
y_pred_proba_ensemble = (y_pred_proba_knn + y_pred_proba_lr) / 2

# Probability columns follow the fitted models' classes_ (both models were
# fitted on the same y_train), not the position of the class within
# np.unique(y_test). Look columns up by class id to keep them aligned.
proba_column = {cls: col for col, cls in enumerate(knn_ens.classes_)}

roc_auc_scores = {}
for i, class_label in enumerate(classes):
    class_name = le.inverse_transform([class_label])[0]
    roc_auc_scores[class_name] = roc_auc_score(
        y_bin[:, i],
        y_pred_proba_ensemble[:, proba_column[class_label]],
    )

for label, score in roc_auc_scores.items():
    print(f"Ensemble ROC AUC Score for {label}: {score}")

In [None]:
# One-vs-rest ROC curves of the KNN + logistic-regression ensemble, one curve
# per diagnosis, drawn on a single figure.
fig, ax = plt.subplots(figsize=(6, 6))

# The predictions do not depend on which class is being plotted, so compute
# them once. Loop-local names are used so the 2-D `y_pred_proba_ensemble`
# (and `roc_auc`) produced by other cells are not overwritten with
# single-class values.
ensemble_proba_all = (knn_ens.predict_proba(X_test) + lr_vae_ens.predict_proba(X_test)) / 2

# Map each encoded class id to its predict_proba column via classes_.
proba_column = {cls: col for col, cls in enumerate(knn_ens.classes_)}

for dementia_type, label in zip(dementia_types, labels):
    class_code = le.transform([dementia_type])[0]
    class_scores = ensemble_proba_all[:, proba_column[class_code]]

    fpr, tpr, _ = roc_curve(label, class_scores)
    class_auc = roc_auc_score(label, class_scores)
    ax.plot(
        fpr,
        tpr,
        lw=2,
        label=f'{dementia_type} ROC curve (area = {class_auc:.2f})',
        color=color_dict[dementia_type],
    )

# Chance-level diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve -- Ensemble Model')
ax.legend(fontsize="12", loc="lower right")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

#### PCA version

In [None]:
# Select the rows of the scaled feature table that appear in the filtered
# label table, in the label table's row order.
mask_indices = nd_csv_short.index
nd_data_org_short = nd_parquet_scaled.loc[mask_indices]

# Project to 16 whitened principal components. random_state is fixed so the
# projection is reproducible when PCA selects a randomized SVD solver.
# NOTE(review): PCA is fitted on all rows before the train/test split, so the
# held-out rows influence the projection - confirm this is acceptable for the
# baseline comparison.
pca_res = PCA(n_components=16, whiten=True, random_state=42).fit_transform(nd_data_org_short)

In [None]:
# Encode the diagnosis names and split the PCA features with a shuffled
# 80/20 split (seed 42).
diagnosis_names = nd_csv_nage.idxmax(axis=1)
le = LabelEncoder()
y = le.fit_transform(diagnosis_names)

X_pca = pca_res
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

# 8-nearest-neighbour classifier (cosine distance) on the PCA features.
knn_pca = KNeighborsClassifier(metric='cosine', n_neighbors=8)
knn_pca.fit(X=X_train_pca, y=y_train_pca)

In [None]:
# Overall one-vs-rest ROC AUC of the PCA + KNN model on the held-out rows.
y_pred_proba = knn_pca.predict_proba(X_test_pca)
roc_auc = roc_auc_score(
    y_true=y_test_pca,
    y_score=y_pred_proba,
    multi_class='ovr',
)
print(f"ROC AUC Score: {roc_auc}")

In [None]:
# Per-class (one-vs-rest) ROC AUC for the PCA + KNN model.
classes = np.unique(y_test_pca)
y_bin = label_binarize(y_test_pca, classes=classes)
y_pred_proba = knn_pca.predict_proba(X_test_pca)

# predict_proba columns are ordered like knn_pca.classes_ (classes seen
# during fit), not like np.unique(y_test_pca). Look the column up by class id
# so the scores stay aligned even if the test split is missing a class.
proba_column = {cls: col for col, cls in enumerate(knn_pca.classes_)}

roc_auc_scores = {}
for i, class_label in enumerate(classes):
    class_name = le.inverse_transform([class_label])[0]
    roc_auc_scores[class_name] = roc_auc_score(
        y_bin[:, i],
        y_pred_proba[:, proba_column[class_label]],
    )

for label, score in roc_auc_scores.items():
    print(f"ROC AUC Score for {label}: {score}")

One-hot encode the PCA test labels so that each column is a binary indicator for one class

In [None]:
# Binary (one-vs-rest) indicator columns for the PCA held-out labels, one
# column per diagnosis name.
dementia_types = ['ad', 'bvftd', 'cu', 'dlb']

# Build each column from the encoder's own code for that diagnosis instead of
# renaming get_dummies output by position: positional renaming raises when a
# class is missing from the test split and would mislabel columns if the
# encoder order ever differed from `dementia_types`.
class_codes = le.transform(dementia_types)
y_test_one_hot = pd.DataFrame(
    {
        name: (np.asarray(y_test_pca) == code).astype(int)
        for name, code in zip(dementia_types, class_codes)
    },
    columns=dementia_types,
)

ad_labels_test = y_test_one_hot['ad']
bvftd_labels_test = y_test_one_hot['bvftd']
cu_labels_test = y_test_one_hot['cu']
dlb_labels_test = y_test_one_hot['dlb']

# Same order as `dementia_types`, so the two can be zipped together.
labels = [ad_labels_test, bvftd_labels_test, cu_labels_test, dlb_labels_test]

In [None]:
# One-vs-rest ROC curves of the PCA + KNN model, one curve per diagnosis,
# drawn on a single figure.
fig, ax = plt.subplots(figsize=(6, 6))

# The predictions do not depend on which class is being plotted, so compute
# them once. Loop-local names are used so the 2-D `y_pred_proba` (and
# `roc_auc`) produced by other cells are not overwritten with single-class
# values.
pca_proba_all = knn_pca.predict_proba(X_test_pca)

# Map each encoded class id to its predict_proba column via classes_.
proba_column = {cls: col for col, cls in enumerate(knn_pca.classes_)}

for dementia_type, label in zip(dementia_types, labels):
    class_code = le.transform([dementia_type])[0]
    class_scores = pca_proba_all[:, proba_column[class_code]]

    fpr, tpr, _ = roc_curve(label, class_scores)
    class_auc = roc_auc_score(label, class_scores)
    ax.plot(
        fpr,
        tpr,
        lw=2,
        label=f'{dementia_type} ROC curve (area = {class_auc:.2f})',
        # Same per-diagnosis colours as the ensemble ROC figure, so the two
        # plots can be compared directly.
        color=color_dict[dementia_type],
    )

# Chance-level diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve -- PCA Model')
ax.legend(fontsize="12", loc="lower right")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()