In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import itertools
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier, DMatrix
import warnings
warnings.filterwarnings("ignore")
import xgboost as xgb

In [2]:
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import pairwise_kernels
from scipy.stats import skew, kurtosis, shapiro
import numpy as np

def DKPCA(features_scaled, n_components=None, kernel=None):
    
    # Perform KPCA
    kpca = KernelPCA(n_components=n_components, kernel=kernel, fit_inverse_transform=True)
    kpca.fit(features_scaled)
    
    # Transform the features
    projections = kpca.transform(features_scaled)
    
    # Determine thresholds
    thresholds = {}
    for j in range(projections.shape[1]):
        s = skew(projections[:, j])
        k = kurtosis(projections[:, j], fisher=False)
        stat, p_value = shapiro(projections[:, j])
        if p_value > 0.05:
            mean = np.mean(projections[:, j])
            std = np.std(projections[:, j])
            thresholds[j] = mean + 2 * std  # 95% confidence interval
        else:
            thresholds[j] = np.percentile(projections[:, j], 95)  # 95th percentile
    
    # Select subset indices
    subset_indices = []
    for j in range(projections.shape[1]):
        candidate_indices = np.where(projections[:, j] < thresholds[j])[0]
        if candidate_indices.size > 0:
            subset_index = candidate_indices[np.argmax(projections[candidate_indices, j])]
            subset_indices.append(subset_index)
    
    subset_indices = list(set(subset_indices))
    
    # Compute the new kernel matrix using the same kernel function
    K_new = pairwise_kernels(features_scaled[subset_indices, :], features_scaled, metric=kernel)
    
    # Compute the DKPCA features
    eigenvectors_subset = kpca.eigenvectors_[subset_indices, :]
    features_dkpca = np.dot(K_new.T, eigenvectors_subset)
    
    return features_dkpca

In [4]:
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA, KernelPCA
from concurrent.futures import ThreadPoolExecutor
import concurrent

def parallel(features_scaled, n_component, feature_reduction_method, kernel=None, labels= None):
    if feature_reduction_method == 'pca':
        pca = PCA(n_components=n_component)
        features_reduced = pca.fit_transform(features_scaled)
    elif feature_reduction_method == 'kpca':
        kpca = KernelPCA(n_components=n_component, kernel=kernel)
        features_reduced = kpca.fit_transform(features_scaled)
    elif feature_reduction_method == 'dkpca':
        features_reduced = DKPCA(features_scaled, n_components=n_component, kernel=kernel)
    else:
        features_reduced = features_scaled

    X_train, X_test, y_train, y_test = train_test_split(features_reduced, labels, test_size=0.3, random_state=42)
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.fit_transform(y_test)

    dtrain = DMatrix(X_train, label=y_train)
    dtest = DMatrix(X_test, label=y_test)

    classifiers = [xgb.XGBClassifier(device = 'cuda'), KNeighborsClassifier(n_neighbors= 14), LinearDiscriminantAnalysis(solver = 'lsqr')]
    clf_names = ['XGBClassifier', 'KNeighborsClassifier', 'LinearDiscriminantAnalysis']
    local_results = []
    for clf, clf_name in zip(classifiers, clf_names):

        if clf_name == 'XGBClassifier':
            param = clf.get_xgb_params()
            param['objective'] = 'multi:softmax'
            param['num_class'] = len(set(y_train))
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dtest = xgb.DMatrix(X_test, label=y_test)
            bst = xgb.train(param, dtrain)
            y_pred = bst.predict(dtest)
        else:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)

        local_results.append((report['accuracy'], clf_name, feature_reduction_method, n_component, kernel))

    return local_results

def evaluate_models(df, feature_reduction=None, components_range=None, kernels=None):
    results = []
    features = df.iloc[:, 2:]
    labels = df.iloc[:, 1]
    scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = scaler.fit_transform(features)

    if feature_reduction:
        loop_range = components_range
    else:
        loop_range = [None]

    for kernel in kernels:
        with ThreadPoolExecutor(max_workers=len(loop_range)) as executor:
            future_tasks = {executor.submit(parallel, features_scaled, n_component, feature_reduction, kernel, labels): n_component for n_component in loop_range}

            for future in concurrent.futures.as_completed(future_tasks):
                results.extend(future.result())
                
    results.sort(key=lambda x: x[0], reverse=True)
    top_5_results = results[:5]
    return top_5_results


In [8]:
import pandas as pd

types = ["ori", "segment"]
regions = ["hippo", "ven"]

file_path = f"/mnt/data_lab513/tramy/4CAD/data/entropy/Entropy_{regions[0]}_{types[0]}.csv"
merged_df = pd.read_csv(file_path)

for r in regions:
    for t in types[1:]:
        file_path = f"/mnt/data_lab513/tramy/4CAD/data/entropy/Entropy_{r}_{t}.csv"
        
        df = pd.read_csv(file_path)

        merged_df = pd.merge(merged_df, df, on=["subject", "label"])



In [34]:
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler

features = merged_df.iloc[:, 2:]
cols_to_keep = features.columns[(features != 0).any(axis=0)]
features = features[cols_to_keep]
scaler = MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)
labels = merged_df.iloc[:, 1]
not_significant_features = []

for feature in merged_df.columns:
    if feature not in ['subject', 'label']:
        class0 = merged_df[merged_df['label'] == 0][feature].dropna()
        class1 = merged_df[merged_df['label'] == 1][feature].dropna()
        class2 = merged_df[merged_df['label'] == 2][feature].dropna()
        class3 = merged_df[merged_df['label'] == 3][feature].dropna()

        if len(class0) < 2 or len(class1) < 2 or len(class2) < 2 or len(class3) < 2:
            continue
        F, p = stats.f_oneway(class0, class1, class2, class3)    
        if p > 0.05:
            not_significant_features.append(feature)

for feature in not_significant_features:
    print(f"Feature '{feature}' is not significant.")

merged_df = merged_df.drop(columns=not_significant_features)


In [42]:
merged_df = merged_df[merged_df['label'].isin([0, 2])]

In [35]:
top_5_None = evaluate_models(merged_df, feature_reduction=None, kernels=['rbf'])
for result in top_5_None:
     print(result)

(0.484375, 'LinearDiscriminantAnalysis', None, None, 'rbf')
(0.4479166666666667, 'KNeighborsClassifier', None, None, 'rbf')
(0.4270833333333333, 'XGBClassifier', None, None, 'rbf')


In [36]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'cosine'] 

In [38]:
top_5_PCA = evaluate_models(merged_df,feature_reduction='pca', components_range=range(2, 36), kernels = kernels)
for result in top_5_PCA:
     print(result)

(0.5364583333333334, 'LinearDiscriminantAnalysis', 'pca', 10, 'linear')
(0.5364583333333334, 'LinearDiscriminantAnalysis', 'pca', 10, 'poly')
(0.5364583333333334, 'LinearDiscriminantAnalysis', 'pca', 10, 'rbf')
(0.5364583333333334, 'LinearDiscriminantAnalysis', 'pca', 10, 'sigmoid')
(0.5364583333333334, 'LinearDiscriminantAnalysis', 'pca', 10, 'cosine')


In [39]:
top_5_KPCA = evaluate_models(merged_df, feature_reduction='kpca', components_range=range(2, 36), kernels = kernels)
for result in top_5_KPCA:
     print(result)

(0.5364583333333334, 'LinearDiscriminantAnalysis', 'kpca', 10, 'linear')
(0.5364583333333334, 'LinearDiscriminantAnalysis', 'kpca', 8, 'sigmoid')
(0.53125, 'LinearDiscriminantAnalysis', 'kpca', 9, 'linear')
(0.53125, 'LinearDiscriminantAnalysis', 'kpca', 8, 'poly')
(0.53125, 'LinearDiscriminantAnalysis', 'kpca', 10, 'sigmoid')


In [40]:
top_5_DKPCA = evaluate_models(merged_df, feature_reduction='dkpca', components_range=range(2, 36), kernels = kernels)
for result in top_5_DKPCA:
     print(result)

(0.53125, 'LinearDiscriminantAnalysis', 'dkpca', 6, 'rbf')
(0.5260416666666666, 'LinearDiscriminantAnalysis', 'dkpca', 9, 'linear')
(0.5260416666666666, 'LinearDiscriminantAnalysis', 'dkpca', 9, 'poly')
(0.5260416666666666, 'LinearDiscriminantAnalysis', 'dkpca', 10, 'rbf')
(0.5208333333333334, 'LinearDiscriminantAnalysis', 'dkpca', 6, 'cosine')


In [43]:
df = merged_df
features = df.iloc[:, 2:]
labels = df.iloc[:, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)
features_reduced = DKPCA(features_scaled, n_components=6, kernel='rbf')
X_train, X_test, y_train, y_test = train_test_split(features_reduced, labels, test_size=0.3, random_state=42)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)
clf = LinearDiscriminantAnalysis(solver = 'lsqr')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.66      0.68      0.67        63
           1       0.64      0.62      0.63        58

    accuracy                           0.65       121
   macro avg       0.65      0.65      0.65       121
weighted avg       0.65      0.65      0.65       121

