# Cross Dataset Classification

In [1]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as p
import cedalion.datasets
import cedalion.plots
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score,roc_curve, roc_auc_score, auc
import pickle
import os
import pandas as pd
from scipy.stats import pointbiserialr
from scipy.stats import binom
from classify_utils_old import plot_grouped_bars_by_subject, plot_grouped_bars_by_dt, loso_classification, extract_features, parcel_loso_classification, get_parcel_loso_features

import cedalion_parcellation
import cedalion_parcellation.datasets
import cedalion_parcellation.plots
import cedalion_parcellation.imagereco.forward_model as fw
from cedalion.imagereco.solver import pseudo_inverse_stacked
import configs
from configs import load_dataset_configs

import cedalion.geometry.landmarks as cd_landmarks

In [2]:
data_path = configs.data_path_prefix

In [3]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
def min_significant_correct(n, p=0.5, alpha=0.05):
    print(n)
    print(next(k for k in range(n + 1) if binom.sf(k - 1, n, p) < alpha))
    return next(k for k in range(n + 1) if binom.sf(k - 1, n, p) < alpha)

In [5]:
data_types = ['HD_Squeezing', 'BS_Laura']
dataset_configs = load_dataset_configs(data_types=data_types, load_sensitivity=True, test=True)
shared_parcel_subset = [p for p in dataset_configs['HD_Squeezing'].sensitive_parcels]
#dataset_configs = load_dataset_configs(data_types=data_types)
#shared_parcel_subset = [p for p in dataset_configs['HD_Squeezing']["sensitive_parcels"]]

In [6]:
len(shared_parcel_subset)

104

In [7]:
with open(os.path.join(data_path, 'BS_Laura', 'subsets_data'), 'rb') as file:
    subsets_data = pickle.load(file)

## Extract LOSO Features for both datasets

In [8]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

classifiers = {
    'LDA': LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'),
    #'SVC_lin': SVC(kernel='linear', probability=True, C=0.1),
}

save_plot = True

prune_by_zeroing = True
n_reduced_feat_ws = 30
n_reduced_feat_loso = 30
spatial_scaling = 2
datasets_path = "/home/thomas/Dokumente/Master/Master_Thesis/datasets/"

dt_conditions_parcel = [
    "all_od", 
    "all_od_ss_mean_full"
]

dt_labels_parcel = {
    "all_od": "Parcel No SS Correction",
    "all_od_ss_mean_full": "Parcel SS corrected",
}

all_parcel_features = {dtcp: {} for dtcp in dt_conditions_parcel}
parcel_features_by_dt = {dt: {dtcp: {} for dtcp in dt_conditions_parcel} for dt in data_types}

# Main pipeline
for data_type in data_types:

    print(f"\nProcessing {data_type} dataset... \n")

    cfg = dataset_configs[data_type]
    synthetic = cfg.synthetic
    subjects = cfg.subjects
    #subjects = subjects[0:2]  # For testing, only use the first subject
    base_path = cfg.base_path
    ft_slices = cfg.feature_slices
    long_chs = cfg.long_channels
    probe_area = cfg.probe_area
    with open(os.path.join(base_path, 'subsets_data'), 'rb') as file:
        subsets_data = pickle.load(file)
    subset_keys = list(reversed(subsets_data.keys()))

    print("SUBSETS")
    print(subsets_data)

    clean_ch_map = {}
    for subject_idx, subject in enumerate(subjects):
        clean_ch_map[subject] = {}
        for run in range(cfg.n_runs(subject_idx)):
            clean_ch_map[subject][run] = {}
            with open(os.path.join(base_path, cfg.clean_channels_path(subject, run)), 'rb') as f:
                clean_ch_map[subject][run]['parcel'] = pickle.load(f)

    result_path = f'/home/thomas/Dokumente/Master/Master_Thesis/results/{data_type}/channel_space/'

    for int_scaling in ['03']:
        data = {}
        for subject_idx, subject in enumerate(subjects):
            data[subject] = {}
            n_runs = cfg.n_runs(subject_idx)
            for run in range(n_runs):
                ep_path = cfg.epochs_labels_path(subject, run, int_scaling, spatial_scaling)
                print(os.path.join(base_path, ep_path))
                with open(os.path.join(base_path, ep_path), 'rb') as f:
                    data[subject][run] = pickle.load(f)

        for feature_types in ['Slope']:

            print(f"\nFeature type: {feature_types} \n")

            for prune_channels in [True]:

                print(f"\nPruning channels: {prune_channels} \n")

                for reduce_features in [False]:

                    print(f"\nFeature reduction: {reduce_features} \n")

                    ############### ---- PARCEL SPACE CLASSIFICATION (per subject) ############### 

                    full_channel_subset = subsets_data['full']['all']

                    selected_Adot = cfg.Adot
                    selected_B = cfg.B

                    if synthetic:
                        selected_channels_mask = np.isin(selected_Adot.channel.values, full_channel_subset)
                        selected_Adot = selected_Adot[selected_channels_mask,:]
                        selected_channels_mask_stacked = np.tile(selected_channels_mask, 2)
                        selected_B = selected_B[:, selected_channels_mask_stacked]


                    print("biggest subset size: ", len(full_channel_subset)) 

                    print("parcel subset size: ", len(shared_parcel_subset))

                    ############### LOSO Classification (Parcel Space) ############### 

                    # need to change dt to 'full' version if 'ss' in dt

                    LOSO_parcel_results = {}

                    for dt in dt_conditions_parcel:

                        data_parcel = {}
                        for subject in subjects:
                            data_parcel[subject] = {}
                            for run in data[subject]:
                                if "ss" in dt:
                                    if dt not in data[subject][run]['full']:
                                        continue
                                    data_parcel[subject][run] = data[subject][run]['full'][dt]
                                else:
                                    data_parcel[subject][run] = data[subject][run]['all_od']        

                        all_parcel_features[dt] = all_parcel_features[dt] | get_parcel_loso_features(
                            data=data_parcel,
                            subjects=subjects,
                            feature_types=feature_types,
                            ft_slices=ft_slices,
                            Adot=cfg.Adot,
                            B=cfg.B,
                            clean_ch_map=clean_ch_map,
                            parcels=shared_parcel_subset,
                            prune=prune_channels
                        )

                        parcel_features_by_dt[data_type][dt] = get_parcel_loso_features(
                            data=data_parcel,
                            subjects=subjects,
                            feature_types=feature_types,
                            ft_slices=ft_slices,
                            Adot=cfg.Adot,
                            B=cfg.B,
                            clean_ch_map=clean_ch_map,
                            parcels=shared_parcel_subset,
                            prune=prune_channels
                        )


Processing HD_Squeezing dataset... 

SUBSETS
{'full': {'n_optodes': 46, 'n_optodes_percent': 100.0, 'all': ['S1D1', 'S1D2', 'S1D4', 'S1D5', 'S1D6', 'S1D8', 'S2D2', 'S2D3', 'S2D5', 'S2D6', 'S2D7', 'S2D9', 'S3D1', 'S3D4', 'S3D5', 'S3D8', 'S3D10', 'S3D11', 'S3D14', 'S4D2', 'S4D4', 'S4D5', 'S4D6', 'S4D7', 'S4D8', 'S4D9', 'S4D10', 'S4D11', 'S4D12', 'S4D13', 'S4D15', 'S5D3', 'S5D6', 'S5D7', 'S5D9', 'S5D12', 'S5D13', 'S5D16', 'S6D8', 'S6D10', 'S6D11', 'S6D12', 'S6D14', 'S6D15', 'S7D9', 'S7D11', 'S7D12', 'S7D13', 'S7D15', 'S7D16', 'S8D17', 'S8D18', 'S8D20', 'S8D21', 'S8D22', 'S8D24', 'S9D18', 'S9D19', 'S9D21', 'S9D22', 'S9D23', 'S9D25', 'S10D17', 'S10D20', 'S10D21', 'S10D24', 'S10D26', 'S10D27', 'S10D30', 'S11D18', 'S11D20', 'S11D21', 'S11D22', 'S11D23', 'S11D24', 'S11D25', 'S11D26', 'S11D27', 'S11D28', 'S11D29', 'S11D31', 'S12D19', 'S12D22', 'S12D23', 'S12D25', 'S12D28', 'S12D29', 'S12D32', 'S13D24', 'S13D26', 'S13D27', 'S13D28', 'S13D30', 'S13D31', 'S14D25', 'S14D27', 'S14D28', 'S14D29', 'S

In [9]:
'''
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

classifiers = {
    'LDA': LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'),
    #'SVC_lin': SVC(kernel='linear', probability=True, C=0.1),
}

save_plot = True

prune_by_zeroing = True
n_reduced_feat_ws = 30
n_reduced_feat_loso = 30
spatial_scaling = 2
datasets_path = "/home/thomas/Dokumente/Master/Master_Thesis/datasets/"

dt_conditions_parcel = [
    "all_od", 
    "all_od_ss_mean_full"
]

dt_labels_parcel = {
    "all_od": "Parcel No SS Correction",
    "all_od_ss_mean_full": "Parcel SS corrected",
}

all_parcel_features = {dtcp: {} for dtcp in dt_conditions_parcel}
parcel_features_by_dt = {dt: {dtcp: {} for dtcp in dt_conditions_parcel} for dt in data_types}

# Main pipeline
for data_type in data_types:

    print(f"\nProcessing {data_type} dataset... \n")

    cfg = dataset_configs[data_type]
    synthetic = cfg["synthetic"]
    subjects = cfg["subjects"]
    #subjects = subjects[0:2]  # For testing, only use the first subject
    base_path = cfg["base_path"]
    ft_slices = cfg["feature_slices"]
    long_chs = cfg["long_channels"]
    probe_area = cfg["probe_area"]
    with open(os.path.join(base_path, 'subsets_data'), 'rb') as file:
        subsets_data = pickle.load(file)
    subset_keys = list(reversed(subsets_data.keys()))

    print("SUBSETS")
    print(subsets_data)

    clean_ch_map = {}
    for subject_idx, subject in enumerate(subjects):
        clean_ch_map[subject] = {}
        for run in range(cfg["n_runs"](subject_idx)):
            clean_ch_map[subject][run] = {}
            with open(os.path.join(base_path, cfg["clean_channels_path"](subject, run)), 'rb') as f:
                clean_ch_map[subject][run]['parcel'] = pickle.load(f)

    result_path = f'/home/thomas/Dokumente/Master/Master_Thesis/results/{data_type}/channel_space/'

    for int_scaling in ['03']:
        data = {}
        for subject_idx, subject in enumerate(subjects):
            data[subject] = {}
            n_runs = cfg["n_runs"](subject_idx)
            for run in range(n_runs):
                ep_path = cfg["epochs_labels_path"](subject, run, int_scaling, spatial_scaling)
                print(os.path.join(base_path, ep_path))
                with open(os.path.join(base_path, ep_path), 'rb') as f:
                    data[subject][run] = pickle.load(f)

        for feature_types in ['Slope']:

            print(f"\nFeature type: {feature_types} \n")

            for prune_channels in [True]:

                print(f"\nPruning channels: {prune_channels} \n")

                for reduce_features in [False]:

                    print(f"\nFeature reduction: {reduce_features} \n")

                    ############### ---- PARCEL SPACE CLASSIFICATION (per subject) ############### 

                    full_channel_subset = subsets_data['full']['all']

                    selected_Adot = cfg["Adot"]
                    selected_B = cfg["B"]

                    if synthetic:
                        selected_channels_mask = np.isin(selected_Adot.channel.values, full_channel_subset)
                        selected_Adot = selected_Adot[selected_channels_mask,:]
                        selected_channels_mask_stacked = np.tile(selected_channels_mask, 2)
                        selected_B = selected_B[:, selected_channels_mask_stacked]


                    print("biggest subset size: ", len(full_channel_subset)) 

                    print("parcel subset size: ", len(shared_parcel_subset))

                    ############### LOSO Classification (Parcel Space) ############### 

                    # need to change dt to 'full' version if 'ss' in dt

                    LOSO_parcel_results = {}

                    for dt in dt_conditions_parcel:

                        data_parcel = {}
                        for subject in subjects:
                            data_parcel[subject] = {}
                            for run in data[subject]:
                                if "ss" in dt:
                                    if dt not in data[subject][run]['full']:
                                        continue
                                    data_parcel[subject][run] = data[subject][run]['full'][dt]
                                else:
                                    data_parcel[subject][run] = data[subject][run]['all_od']        

                        all_parcel_features[dt] = all_parcel_features[dt] | get_parcel_loso_features(
                            data=data_parcel,
                            subjects=subjects,
                            feature_types=feature_types,
                            ft_slices=ft_slices,
                            Adot=cfg["Adot"],
                            B=cfg["B"],
                            clean_ch_map=clean_ch_map,
                            parcels=shared_parcel_subset,
                            prune=prune_channels
                        )

                        parcel_features_by_dt[data_type][dt] = get_parcel_loso_features(
                            data=data_parcel,
                            subjects=subjects,
                            feature_types=feature_types,
                            ft_slices=ft_slices,
                            Adot=cfg["Adot"],
                            B=cfg["B"],
                            clean_ch_map=clean_ch_map,
                            parcels=shared_parcel_subset,
                            prune=prune_channels
                        )
'''

'\nimport os\nimport pickle\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\n\nclassifiers = {\n    \'LDA\': LinearDiscriminantAnalysis(solver=\'lsqr\', shrinkage=\'auto\'),\n    #\'SVC_lin\': SVC(kernel=\'linear\', probability=True, C=0.1),\n}\n\nsave_plot = True\n\nprune_by_zeroing = True\nn_reduced_feat_ws = 30\nn_reduced_feat_loso = 30\nspatial_scaling = 2\ndatasets_path = "/home/thomas/Dokumente/Master/Master_Thesis/datasets/"\n\ndt_conditions_parcel = [\n    "all_od", \n    "all_od_ss_mean_full"\n]\n\ndt_labels_parcel = {\n    "all_od": "Parcel No SS Correction",\n    "all_od_ss_mean_full": "Parcel SS corrected",\n}\n\nall_parcel_features = {dtcp: {} for dtcp in dt_conditions_parcel}\nparcel_features_by_dt = {dt: {dtcp: {} for dtcp in dt_condit

In [10]:
all_parcel_features['all_od'].keys()

dict_keys(['sub-170', 'sub-173', 'sub-174', 'sub-176', 'sub-177', 'sub-179', 'sub-181', 'sub-182', 'sub-183', 'sub-185', 'sub-577', 'sub-580', 'sub-586', 'sub-587', 'sub-592', 'sub-613', 'sub-618', 'sub-619', 'sub-621', 'sub-633', 'sub-638', 'sub-640'])

# Classify

In [11]:
classifiers = {
    'LDA': LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'),
    'Linear SVM': SVC(kernel='linear', C=0.1, max_iter=10000),
    'RBF SVM': SVC(kernel='rbf', C=0.1, gamma='scale'),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    'MLP' : MLPClassifier(hidden_layer_sizes=(50, 20), max_iter=1000, early_stopping=True, random_state=69),
}
#classifier = classifiers["LDA"]
classifier = classifiers["Linear SVM"]
#classifier = classifiers["RBF SVM"]
#classifier = classifiers["Random Forest"]
#classifier = classifiers["MLP"]

In [12]:
results_loso_laura =  loso_classification(parcel_features_by_dt['BS_Laura']['all_od'], list(parcel_features_by_dt['BS_Laura']['all_od'].keys()), classifier)
results_loso_ss_laura =  loso_classification(parcel_features_by_dt['BS_Laura']['all_od_ss_mean_full'], list(parcel_features_by_dt['BS_Laura']['all_od'].keys()), classifier)

results_loso_hdsq =  loso_classification(parcel_features_by_dt['HD_Squeezing']['all_od'], list(parcel_features_by_dt['HD_Squeezing']['all_od'].keys()), classifier)
results_loso_ss_hdsq =  loso_classification(parcel_features_by_dt['HD_Squeezing']['all_od_ss_mean_full'], list(parcel_features_by_dt['HD_Squeezing']['all_od'].keys()), classifier)

X ALL:
(498, 104)
4.898399959027884e-09
X ALL:
(498, 104)
3.3998824493825618e-09
X ALL:
(900, 104)
1.576239517520676e-09
X ALL:
(900, 104)
1.5592138586758084e-09


In [13]:
print("HD_Squeezing")
print(np.mean(list(results_loso_hdsq.values())))
print(np.mean(list(results_loso_ss_hdsq.values())))
print("BS Laura")
print(np.mean(list(results_loso_laura.values())))
print(np.mean(list(results_loso_ss_laura.values())))
print("All (mean acc)")
print(np.mean(list(results_loso_laura.values()) + list(results_loso_hdsq.values())))
print(np.mean(list(results_loso_ss_laura.values()) + list(results_loso_ss_hdsq.values())))

HD_Squeezing
0.7633333333333334
0.7899999999999999
BS Laura
0.7327039583137144
0.7906569461447511
All (mean acc)
0.7466264015044503
0.7903583342607732


## Train on pooled / combined data

In [14]:
results_loso =  loso_classification(all_parcel_features['all_od'], list(all_parcel_features['all_od'].keys()), classifier, False, 30)
results_loso_ss =  loso_classification(all_parcel_features['all_od_ss_mean_full'], list(all_parcel_features['all_od'].keys()), classifier, False, 30)

X ALL:
(1398, 104)
2.75967006106187e-09
X ALL:
(1398, 104)
2.214902669957613e-09


In [15]:
print(list(all_parcel_features['all_od'].keys()))
print(list(results_loso.values()))

['sub-170', 'sub-173', 'sub-174', 'sub-176', 'sub-177', 'sub-179', 'sub-181', 'sub-182', 'sub-183', 'sub-185', 'sub-577', 'sub-580', 'sub-586', 'sub-587', 'sub-592', 'sub-613', 'sub-618', 'sub-619', 'sub-621', 'sub-633', 'sub-638', 'sub-640']
[0.7666666666666667, 0.7777777777777778, 0.6222222222222222, 0.7777777777777778, 0.6333333333333333, 0.7111111111111111, 0.7444444444444445, 0.8666666666666667, 0.7, 0.9777777777777777, 0.5952380952380952, 0.5853658536585366, 0.8809523809523809, 0.8333333333333334, 0.5714285714285714, 0.6190476190476191, 0.8095238095238095, 0.7857142857142857, 0.7142857142857143, 0.8095238095238095, 0.7027027027027027, 0.6666666666666666]


In [16]:
print("HD Squeezing")
print(np.mean(list(results_loso.values())[:10]))
print(np.mean(list(results_loso_ss.values())[:10]))
print("BS Laura")
print(np.mean(list(results_loso.values())[10:]))
print(np.mean(list(results_loso_ss.values())[10:]))
print("All")
print(np.mean(list(results_loso.values())))
print(np.mean(list(results_loso_ss.values())))


HD Squeezing
0.7577777777777779
0.768888888888889
BS Laura
0.7144819035062938
0.758862521057643
All
0.7341618463569684
0.7634199609809366


## Unseen dataset classifcation

In [17]:
def classify_on_all(all_data, subjects, clf, scaler, fit_clf=True):
    # either train or test classifier on whole dataset - used for unseen dataset classification
    X_all, y_all, subj_idx = [], [], []
    for i, subject in enumerate(subjects):
        for X, y in all_data[subject]:  # each run already processed to X, y
            X_all.append(X)
            y_all.append(y)
            subj_idx += [i] * len(y)

    X_all = np.vstack(X_all)
    y_all = np.concatenate(y_all)
    subj_idx = np.array(subj_idx)

    if fit_clf:
        X_all = scaler.fit_transform(X_all)
        clf.fit(X_all, y_all)
        results = None
    else: 
        X_all = scaler.transform(X_all)
        y_pred = clf.predict(X_all)
        
        # Compute accuracy per subject
        results = {}
        for i, subject in enumerate(subjects):
            idx = subj_idx == i
            acc = accuracy_score(y_all[idx], y_pred[idx])
            results[subject] = acc

    return results, clf, scaler

Important: don't reuse one classifier object but define new one with every function call so the saved fitted classifier objects don't get manipulated

In [18]:
results_all_hd_sq, clf_all_hd_sq, scaler_all_hd_sq = classify_on_all(parcel_features_by_dt['HD_Squeezing']['all_od'], parcel_features_by_dt['HD_Squeezing']['all_od'].keys(), SVC(kernel='linear', C=0.1, max_iter=10000), StandardScaler(), fit_clf=True)
results_all_ss_hd_sq, clf_all_ss_hd_sq, scaler_all_ss_hd_sq = classify_on_all(parcel_features_by_dt['HD_Squeezing']['all_od_ss_mean_full'], parcel_features_by_dt['HD_Squeezing']['all_od'].keys(), SVC(kernel='linear', C=0.1, max_iter=10000), StandardScaler(), fit_clf=True)

results_unseen_laura, _, __ = classify_on_all(parcel_features_by_dt['BS_Laura']['all_od'], parcel_features_by_dt['BS_Laura']['all_od'].keys(), clf_all_hd_sq, scaler_all_hd_sq, fit_clf=False)
results_unseen_ss_laura, _, __ =  classify_on_all(parcel_features_by_dt['BS_Laura']['all_od_ss_mean_full'], parcel_features_by_dt['BS_Laura']['all_od'].keys(), clf_all_ss_hd_sq, scaler_all_ss_hd_sq, fit_clf=False)

In [19]:
print(results_unseen_laura)
print(np.mean(list(results_unseen_laura.values())))
print(results_unseen_ss_laura)
print(np.mean(list(results_unseen_ss_laura.values())))

{'sub-577': 0.38095238095238093, 'sub-580': 0.6341463414634146, 'sub-586': 0.5952380952380952, 'sub-587': 0.7142857142857143, 'sub-592': 0.4523809523809524, 'sub-613': 0.6666666666666666, 'sub-618': 0.6190476190476191, 'sub-619': 0.6190476190476191, 'sub-621': 0.5, 'sub-633': 0.7857142857142857, 'sub-638': 0.8378378378378378, 'sub-640': 0.5714285714285714}
0.6147288403385964
{'sub-577': 0.35714285714285715, 'sub-580': 0.7073170731707317, 'sub-586': 0.8809523809523809, 'sub-587': 0.7619047619047619, 'sub-592': 0.5476190476190477, 'sub-613': 0.7857142857142857, 'sub-618': 0.7380952380952381, 'sub-619': 0.6904761904761905, 'sub-621': 0.7619047619047619, 'sub-633': 0.9761904761904762, 'sub-638': 0.7837837837837838, 'sub-640': 0.6666666666666666}
0.7214806269684318
