In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import itertools
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import pairwise_kernels
from scipy.stats import skew, kurtosis, shapiro
import numpy as np

def DKPCA(features_scaled, n_components=None, kernel=None):
    """Kernel-PCA projection restricted to a data-driven subset of samples.

    Procedure:
      1. Fit ``KernelPCA`` on ``features_scaled`` and project the same samples.
      2. For each projected component derive an upper threshold:
         ``mean + 2 * std`` when a Shapiro-Wilk test does not reject normality
         (p > 0.05), otherwise the 95th percentile of the component.
      3. For each component select the sample whose projection is the largest
         value strictly below that threshold.
      4. Evaluate the kernel between the selected samples and all samples and
         multiply it with the rows of the fitted eigenvectors that belong to
         the selected samples.

    Parameters
    ----------
    features_scaled : array-like of shape (n_samples, n_features)
        Input samples. Must support ``features_scaled[row_list, :]`` indexing.
    n_components : int or None
        Forwarded to ``KernelPCA``.
    kernel : str, callable or None
        Kernel forwarded to both ``KernelPCA`` and ``pairwise_kernels`` (both
        are called with their default kernel parameters). ``None`` falls back
        to ``"linear"``, which is ``KernelPCA``'s own default.

    Returns
    -------
    numpy.ndarray
        ``K_subset.T @ eigenvectors_[subset]``: one row per input sample, one
        column per eigenvector column of the fitted model.

    Raises
    ------
    ValueError
        If no sample can be selected for any component.
    """
    # Neither KernelPCA nor pairwise_kernels accepts ``None`` as a kernel.
    if kernel is None:
        kernel = "linear"

    # The inverse transform is never used here, so it is not fitted.
    kpca = KernelPCA(n_components=n_components, kernel=kernel)
    kpca.fit(features_scaled)
    projections = kpca.transform(features_scaled)

    selected = set()
    for j in range(projections.shape[1]):
        column = projections[:, j]

        # Shapiro-Wilk decides which kind of upper bound is used.
        _, p_value = shapiro(column)
        if p_value > 0.05:
            threshold = np.mean(column) + 2 * np.std(column)  # mean + 2 sigma
        else:
            threshold = np.percentile(column, 95)  # 95th percentile

        # Representative sample: largest projection strictly below the bound.
        candidates = np.flatnonzero(column < threshold)
        if candidates.size > 0:
            selected.add(int(candidates[np.argmax(column[candidates])]))

    # Several components may pick the same sample; keep each once, in a
    # reproducible order.
    subset_indices = sorted(selected)
    if not subset_indices:
        raise ValueError(
            "DKPCA could not select any sample: no projection lies strictly "
            "below its threshold."
        )

    # Kernel between the selected samples and every sample, same kernel as KPCA.
    K_new = pairwise_kernels(features_scaled[subset_indices, :], features_scaled, metric=kernel)

    # Combine with the eigenvector rows of the selected samples.
    eigenvectors_subset = kpca.eigenvectors_[subset_indices, :]
    features_dkpca = np.dot(K_new.T, eigenvectors_subset)

    return features_dkpca

In [54]:
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA, KernelPCA
from concurrent.futures import ThreadPoolExecutor
import concurrent
# from DKPCA import DKPCA  # Assuming DKPCA is a custom function or module you have

def parallel(features_scaled, scales_subset, windows_subset, n_component, feature_reduction_method, kernel=None, labels= None):
    """Reduce the features, train three classifiers and report their test accuracy.

    Parameters
    ----------
    features_scaled : array-like of shape (n_samples, n_features)
        Scaled feature matrix.
    scales_subset, windows_subset : tuple
        Not used for computation; copied into every result tuple so the
        caller can tell which configuration produced it.
    n_component : int or None
        Number of components for the reduction method.
    feature_reduction_method : {'pca', 'kpca', 'dkpca'} or anything else
        Any other value (including ``None``) leaves the features unchanged.
    kernel : str or None
        Kernel for 'kpca' / 'dkpca'; ignored by the other branches.
    labels : array-like of shape (n_samples,)
        Class labels. Required despite the ``None`` default.

    Returns
    -------
    list of tuple
        One ``(accuracy, scales_subset, windows_subset, clf_name,
        feature_reduction_method, n_component, kernel)`` tuple per classifier.

    Raises
    ------
    ValueError
        If ``labels`` is ``None``.
    """
    if labels is None:
        raise ValueError("labels must be provided")

    # NOTE(review): the reducer is fitted on all samples before the split, so
    # the held-out samples influence the learned projection.
    if feature_reduction_method == 'pca':
        features_reduced = PCA(n_components=n_component).fit_transform(features_scaled)
    elif feature_reduction_method == 'kpca':
        features_reduced = KernelPCA(n_components=n_component, kernel=kernel).fit_transform(features_scaled)
    elif feature_reduction_method == 'dkpca':
        features_reduced = DKPCA(features_scaled, n_components=n_component, kernel=kernel)
    else:
        features_reduced = features_scaled

    # Encode once on the complete label vector. Fitting a second encoder on
    # the test split alone would assign different codes whenever a class is
    # missing from one of the splits.
    encoded_labels = LabelEncoder().fit_transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(
        features_reduced, encoded_labels, test_size=0.3, random_state=42
    )

    # XGBoost is asked to run on CUDA; presumably this needs a GPU-enabled build.
    classifiers = [
        ('XGBClassifier', XGBClassifier(device = 'cuda')),
        ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors= 14)),
        ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis(solver = 'lsqr')),
    ]

    local_results = []
    for clf_name, clf in classifiers:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        local_results.append((report['accuracy'], scales_subset, windows_subset, clf_name, feature_reduction_method, n_component, kernel))

    return local_results

def evaluate_models(df, scales, windows, feature_reduction=None, components_range=None, kernels=None):
    """Search feature subsets and reduction settings; return the 5 best results.

    For every non-empty combination of ``scales`` and of ``windows`` the
    columns whose names match any of the chosen entries are *removed*, the
    remaining features are min-max scaled, and ``parallel`` is run for every
    (kernel, n_components) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 1 holds the labels, columns 2.. hold the features.
    scales, windows : sequence of str
        Column-name patterns. They are passed to ``str.contains`` and are
        therefore interpreted as regular expressions (pandas default).
    feature_reduction : {'pca', 'kpca', 'dkpca'} or None
        Reduction method forwarded to ``parallel``.
    components_range : iterable of int, optional
        Component counts to try. Required when ``feature_reduction`` is set.
    kernels : iterable of str, optional
        Kernels to try. Only consulted for 'kpca' and 'dkpca' (defaulting to
        ``['linear']``); for every other method a single run with
        ``kernel=None`` is performed, because the kernel would be ignored and
        each extra kernel would only duplicate identical results.

    Returns
    -------
    list of tuple
        Up to five result tuples as produced by ``parallel``, sorted by
        accuracy (first tuple element) in descending order.

    Raises
    ------
    ValueError
        If ``feature_reduction`` is set but ``components_range`` is ``None``.
    """
    features = df.iloc[:, 2:]
    labels = df.iloc[:, 1]

    if feature_reduction:
        if components_range is None:
            raise ValueError("components_range is required when feature_reduction is set")
        component_grid = list(components_range)
    else:
        component_grid = [None]

    if feature_reduction in ('kpca', 'dkpca'):
        kernel_grid = list(kernels) if kernels is not None else ['linear']
    else:
        kernel_grid = [None]

    results = []
    # One pool for the whole search; at most one worker per component count.
    with ThreadPoolExecutor(max_workers=max(1, len(component_grid))) as executor:
        for scales_subset in _non_empty_subsets(scales):
            for windows_subset in _non_empty_subsets(windows):
                # Drop every column that matches one of the selected patterns.
                features_remove = features
                for pattern in scales_subset + windows_subset:
                    features_remove = features_remove.loc[:, ~features_remove.columns.str.contains(pattern)]
                if features_remove.empty:
                    continue

                # NOTE(review): scaling is fitted on all samples, before the
                # train/test split performed inside ``parallel``.
                features_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(features_remove)

                future_tasks = [
                    executor.submit(parallel, features_scaled, scales_subset, windows_subset, n_component, feature_reduction, kernel, labels)
                    for kernel in kernel_grid
                    for n_component in component_grid
                ]
                for future in concurrent.futures.as_completed(future_tasks):
                    results.extend(future.result())

    # A single sort at the end is enough to pick the best entries.
    results.sort(key=lambda x: x[0], reverse=True)
    top_5_results = results[:5]

    return top_5_results


def _non_empty_subsets(items):
    """Return an iterator over every non-empty combination of ``items``, smallest first."""
    return itertools.chain.from_iterable(
        itertools.combinations(items, size) for size in range(1, len(items) + 1)
    )


In [145]:
# Read the three entropy-result tables and inner-join them on subject and label.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/mnt/data_lab513/tramy/EntropyResults_hippo.csv',
        '/mnt/data_lab513/tramy/EntropyResults_ventricles.csv',
        '/mnt/data_lab513/tramy/EntropyResults_ventricles_ori.csv',
    )
)

df = df1.merge(df2, on=['subject', 'label']).merge(df3, on=['subject', 'label'])

# Optional class restriction, currently disabled:
#df = df[df['label'].isin([0, 1, 2])]

# Search space: column-name fragments to exclude and kernels to try.
scales = [
    'Scale2',
    'Scale3',
    'Scale4',
]
windows = [
    'Window1',
    'Window2',
    'Window3',
    'Window4',
]
kernels = [
    'linear',
    'poly',
    'rbf',
    'sigmoid',
    'cosine',
]

In [146]:
import time

# Single hand-picked configuration: drop the listed column groups, scale,
# reduce with cosine Kernel PCA (14 components) and time one XGBoost fit.
features = df.iloc[:, 2:]
labels = df.iloc[:, 1]
for excluded in ("Scale2", "Window3", "Window1", "Window2"):
    features = features.loc[:, ~features.columns.str.contains(excluded)]

scaler = MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)
kpca = KernelPCA(n_components=14, kernel='cosine')
features_reduced = kpca.fit_transform(features_scaled)
X_train, X_test, y_train, y_test = train_test_split(features_reduced, labels, test_size=0.3, random_state=42)

# Fit the encoder once on all labels and only *transform* the splits, so the
# train and test codes are guaranteed to share the same class mapping.
le = LabelEncoder()
le.fit(labels)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Time the training step only.
start = time.time()
clf = XGBClassifier(device = 'cuda')
clf.fit(X_train, y_train)
end = time.time()
process_one_time = end - start

print(f"Time taken by process one: {process_one_time} seconds")
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

Time taken by process one: 1.002009630203247 seconds
              precision    recall  f1-score   support

           0       0.60      0.51      0.55        63
           1       0.48      0.52      0.50        66
           2       0.49      0.71      0.58        51
           3       0.31      0.21      0.25        52

    accuracy                           0.49       232
   macro avg       0.47      0.49      0.47       232
weighted avg       0.48      0.49      0.48       232



In [71]:
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler

# Scaled view of the features that are not all-zero.
# NOTE(review): these variables are computed before the non-significant
# columns are dropped from `df` below and are not refreshed afterwards.
features = df.iloc[:, 2:]
cols_to_keep = features.columns[(features != 0).any(axis=0)]
features = features[cols_to_keep]
scaler = MinMaxScaler(feature_range=(0, 1))
features_scaled = scaler.fit_transform(features)
labels = df.iloc[:, 1]

# One-way ANOVA per feature across the classes found in `label`.
# The frame is split by class once instead of re-filtering it for every
# feature, and the classes are taken from the data rather than hard-coded.
# NOTE(review): the test uses all samples, including those that later end up
# in the held-out split.
class_frames = [class_frame for _, class_frame in df.groupby('label')]
not_significant_features = []

for feature in df.columns:
    if feature in ('subject', 'label'):
        continue

    samples = [class_frame[feature].dropna() for class_frame in class_frames]

    # ANOVA needs at least two classes with at least two observations each;
    # features that cannot be tested are kept.
    if len(samples) < 2 or any(len(sample) < 2 for sample in samples):
        continue

    F, p = stats.f_oneway(*samples)
    if p > 0.05:
        not_significant_features.append(feature)

for feature in not_significant_features:
    print(f"Feature '{feature}' is not significant.")

df = df.drop(columns=not_significant_features)


In [None]:
# Baseline search without any feature reduction. No component range is
# needed; the single kernel entry is only carried into the result tuples.
top_5_None = evaluate_models(
    df,
    scales,
    windows,
    feature_reduction=None,
    kernels=['rbf'],
)
for result in top_5_None:
    print(result)

In [None]:
# PCA search over 5-17 retained components.
# PCA does not use a kernel, so a single placeholder entry is passed: looping
# over the full kernel list would repeat the identical search once per kernel
# and fill the top-5 with duplicated configurations.
top_5_PCA = evaluate_models(
    df,
    scales,
    windows,
    feature_reduction='pca',
    components_range=range(5, 18),
    kernels=[None],
)
for result in top_5_PCA:
    print(result)

In [None]:
# Kernel PCA search: 5-14 retained components for every kernel in `kernels`.
top_5_KPCA = evaluate_models(
    df,
    scales,
    windows,
    feature_reduction='kpca',
    components_range=range(5, 15),
    kernels=kernels,
)
for result in top_5_KPCA:
    print(result)

In [None]:
# DKPCA search: 5-14 retained components for every kernel in `kernels`.
top_5_DKPCA = evaluate_models(
    df,
    scales,
    windows,
    feature_reduction='dkpca',
    components_range=range(5, 15),
    kernels=kernels,
)
for result in top_5_DKPCA:
    print(result)