**Aim:**

In this kernel, we aim to use the final model, designed in kernel main3, to test it on the hold out test set. 

### Import packages

In [1]:
import os
import tqdm
import pickle
import sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from imblearn import pipeline
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, roc_auc_score, confusion_matrix

from FeatureSelection import FeatureSelector
from Metrics import MetricsCal




### Loading the Data

In [2]:
FeatureDir = "../Data/ExtractedFeatures/Pearson/AVI_P_features_Thr0.875.csv"
with open("../Data/Test_IDs/TestID.pickle", "rb") as file:
    Test_IDs = pickle.load(file)

# Reading the Feature matrix
df = pd.read_csv(FeatureDir)


df_train = df[~df["ScanDir ID"].isin(Test_IDs)]
df_test = df[df["ScanDir ID"].isin(Test_IDs)]

# preparing the feature set and labels
X_train, y_train = df_train.iloc[:, 1:-5], np.array(df_train.loc[:, "DX"])
X_test, y_test = df_test.iloc[:, 1:-5], np.array(df_test.loc[:, "DX"])

### Feature selection

In [3]:
random_state, n_split = 0, 7
ImpFeatures = FeatureSelector(X_train, y_train, random_state, n_split, 1000, 45, 2)
X_train = X_train.loc[:, ImpFeatures]
X_test = X_test.loc[:, ImpFeatures]

print(len(ImpFeatures))

35


In [4]:
ImpFeatures.append("DX")
data2 = df.loc[:, ImpFeatures]
data2.to_csv("PearsonData2.csv")


Modeling pipeline

In [12]:
# Function to load data from a pickle file
def load_data(path):
    with open(path, "rb") as file:
        return pickle.load(file)

# Function to normalize data using StandardScaler
def normalize_data(X_train, X_test):
    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)
    return X_train_normalized, X_test_normalized

# Function to build a classifier based on the given name
def build_classifier(classifier_name):
    if classifier_name == "XGB":
        return XGBClassifier(
            max_depth=3,
            min_child_weight=1,
            subsample=0.5,
            learning_rate=0.8,
            n_estimators=150,
            scale_pos_weight=100
        )
    elif classifier_name == "BRF":
        return BalancedRandomForestClassifier(
            criterion='gini', 
            max_depth=10, 
            max_features='sqrt', 
            min_samples_split=2, 
            n_estimators=150
        )
    elif classifier_name == "EEC":
        return EasyEnsembleClassifier(n_estimators=10)
    else:
        raise ValueError(f"Invalid classifier name: {classifier_name}")

# Function to calculate evaluation metrics for a given classifier
def calculate_metrics(classifier_name, X_test, y_test, new_threshold=0.4):
    classifier = build_classifier(classifier_name)
    pipe_classifier = pipeline.Pipeline(steps=[("Scaler", StandardScaler()), ("Model", classifier)])
    pipe_classifier.fit(X_train, y_train)

    y_score = pipe_classifier.predict_proba(X_test)[:, 1]
    y_pred = (y_score > new_threshold).astype(int)

    cm = confusion_matrix(y_test, y_pred)
    TP = cm[1, 1]  # true positive
    TN = cm[0, 0]  # true negatives
    FP = cm[0, 1]  # false positives
    FN = cm[1, 0]  # false negatives
    Specificity = TN / (TN + FP)
    Sensitivity = TP / (FN + TP)

    AUC = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    ACC = accuracy_score(y_test, y_pred)

    return AUC, Sensitivity, Specificity, f1, ACC

# Function to plot adjacency subnetwork
def plot_adjacency_subnetwork(edges, network_order, mode, cmap, idx_to_label):
    network_colors = {region: "#232324" for region in network_order}
    plt.figure(figsize=(11, 11))
    plot_chord(
        idx_to_label,
        edges,
        network_order=network_order,
        linewidths=7,
        alphas=0.9,
        do_ROI_circles=True,
        do_ROI_circles_specific=True,
        ROI_circle_radius=0.02,
        cmap=cmap,
        network_colors=network_colors,
        mode=mode
    )
    plt.show()

# ... (data loading and preprocessing)

if __name__ == "__main__":
    AUC_BRF, Sensitivity_BRF, Specificity_BRF, AucPred_BRF, f1_BRF, ACC_BRF = [], [], [], [], [], []
    AUC_EEC, Sensitivity_EEC, Specificity_EEC, AucPred_EEC, f1_EEC, ACC_EEC = [], [], [], [], [], []
    AUCs, Specificities, Sensitivities, f1s, ACCs = [], [], [], [], []

    for _ in tqdm(range(50)):
        X_train_normalized, X_test_normalized = normalize_data(X_train, X_test)

        # EEC
        AUC_eec, Sensitivity_eec, Spcifity_eec, f1_eec, ACC_eec = calculate_metrics("EEC", X_test_normalized, y_test)
        AUC_EEC.append(AUC_eec)
        Sensitivity_EEC.append(Sensitivity_eec)
        Specificity_EEC.append(Spcifity_eec)
        f1_EEC.append(f1_eec)
        ACC_EEC.append(ACC_eec)

        # BRF
        AUC_brf, Sensitivity_brf, Spcifity_brf, f1_brf, ACC_brf = calculate_metrics("BRF", X_test_normalized, y_test)
        AUC_BRF.append(AUC_brf)
        Sensitivity_BRF.append(Sensitivity_brf)
        Specificity_BRF.append(Spcifity_brf)
        f1_BRF.append(f1_brf)
        ACC_BRF.append(ACC_brf)

        # XGB
        AUC_xgb, Sensitivity_xgb, Specificity_xgb, f1_xgb, ACC_xgb = calculate_metrics("XGB", X_test_normalized, y_test)
        AUCs.append(AUC_xgb)
        Sensitivities.append(Sensitivity_xgb)
        Specificities.append(Specificity_xgb)
        f1s.append(f1_xgb)
        ACCs.append(ACC_xgb)


 56%|█████▌    | 28/50 [02:15<01:33,  4.23s/it]

In [9]:
print(np.mean(AUC_BRF))
print(np.mean(Sensitivity_BRF))
print(np.mean(spcifity_BRF))
print(np.mean(f1_BRF))
print(np.mean(ACC_BRF))

0.45567272727272723
0.21454545454545457
0.6968000000000001
0.22430585700325906
0.5494444444444446


In [10]:
print(np.mean(AUC_EEC))
print(np.mean(Sensitivity_EEC))
print(np.mean(spcifity_EEC))
print(np.mean(f1_EEC))
print(np.mean(ACC_EEC))

0.44047272727272735
0.29454545454545455
0.5864000000000001
0.26111857192726756
0.4972222222222223


In [11]:
print(np.mean(AUCs))
print(np.mean(Specificities))
print(np.mean(Sensitivities))
print(np.mean(f1s))
print(np.mean(ACCs))

0.4763636363636364
0.68
0.2727272727272727
0.2727272727272727
0.5555555555555556
