In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [8]:
# Load the raw Dortmund feature table.
sub_id_col = 'Unnamed: 0'  # unnamed first CSV column; used below as the subject-ID column
df = pd.read_csv('dataset/Dortmund_features.csv')

print("Shape of raw data:", df.shape)
print("Columns:", df.columns.tolist())

# Every column except the ID column is treated as an EEG feature.
is_feature = df.columns != sub_id_col
eeg_features = df.columns[is_feature].tolist()
print("Number of EEG features:", len(eeg_features))


Shape of raw data: (600, 148)
Columns: ['Unnamed: 0', 'nl_rqa mean diag length', 'kurtosis_amp_theta', 'lt_delta', 'envelope_mean_alpha', 'nl_correlation dim', 'kurtosis_amp_alpha', 'mod_index_theta-gamma', 'nl_rqa rte', 'nl_higuchi fractal dim', 'acw50', 'envelope_std_beta', 'wt_delta', 'mod_index_theta-beta', 'spec_ent_beta', 'wt_beta', 'spec_ent_delta', 'nl_lyapunov exponent', 'nl_katz fractal dim', 'mod_index_theta-alpha', 'total_power_theta', 'skewness_amp_theta', 'mod_index_alpha-beta', 'nl_rqa determinism', 'rel_amp_gamma', 'skewness_amp_gamma', 'spec_ent_gamma', 'nl_rqa laminarity', 'acw0', 'envelope_mean_beta', 'total_power_beta', 'kurtosis_amp_beta', 'spec_ent_theta', 'kurtosis_amp_gamma', 'wt_alpha', 'spec_ent_alpha', 'wt_theta', 'rel_amp_theta', 'envelope_std_theta', 'rel_amp_beta', 'total_power_alpha', 'skewness_amp_alpha', 'skewness_amp_beta', 'nl_hjorth mobility', 'lt_alpha', 'skewness_amp_delta', 'envelope_std_gamma', 'wt_gamma', 'lt_theta', 'nl_rqa trapping time', 'kur

In [9]:
# Remove near-constant features: columns whose sample variance, computed on the
# raw (not yet standardized) values, falls below the cut-off.
var_threshold = 1e-6  # variance cut-off - selected based on data scale; reused by the Lemon section
variances = df[eeg_features].var()
is_low_var = variances < var_threshold
low_var_cols = variances.loc[is_low_var].index.tolist()

print(f"\nDropping {len(low_var_cols)} low-variance features (var < {var_threshold}):")
print(low_var_cols)

# Keep the original column order of the surviving features.
dropped_names = set(low_var_cols)
remaining_after_var = [name for name in eeg_features if name not in dropped_names]
X_var = df.loc[:, remaining_after_var].copy()
print("Number of EEG features after low-variance filter:", X_var.shape[1])


Dropping 7 low-variance features (var < 1e-06):
['mod_index_theta-gamma', 'mod_index_theta-beta', 'mod_index_theta-alpha', 'mod_index_alpha-beta', 'skewness_amp_gamma', 'mod_index_beta-gamma', 'mod_index_alpha-gamma']
Number of EEG features after low-variance filter: 140


In [10]:
# Standardize the remaining features with a StandardScaler fitted on this dataset,
# then wrap the resulting array back into a DataFrame that keeps the original
# row index and column names.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_var),
    index=df.index,
    columns=X_var.columns,
)

print("Standardized feature matrix shape:", X_scaled.shape)


Standardized feature matrix shape: (600, 140)


In [11]:
# Drop highly correlated features

# Absolute pairwise correlations between the standardized features.
corr_matrix = X_scaled.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is inspected once and self-correlations are ignored; masked cells become NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

corr_threshold = 0.95  # also reused by the Lemon section
# A column is flagged when it exceeds the threshold against any column that
# precedes it, so the first member of each correlated group is the one kept.
exceeds_threshold = (upper > corr_threshold).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

print("Dropping due to high correlation (>|", corr_threshold, "|):")
print(to_drop)

X_nocorr = X_scaled.drop(columns=to_drop)
remaining_features = X_nocorr.columns.tolist()

# "Features before" reports the raw feature count, i.e. before the
# low-variance filter as well as before this correlation filter.
print("Features before:", len(eeg_features))
print("Features after correlation filtering:", len(remaining_features))


Dropping due to high correlation (>| 0.95 |):
['spec_ent_gamma', 'envelope_mean_beta', 'total_power_beta', 'rel_amp_theta', 'envelope_std_theta', 'rel_amp_beta', 'total_power_alpha', 'nl_hjorth mobility', 'lt_alpha', 'lt_theta', 'envelope_mean_gamma', 'rel_amp_delta', 'envelope_std_delta', 'nl_lzc exhaustive', 'envelope_std_alpha', 'nl_rqa entropy', 'rel_amp_alpha', 'nl_sampling ent', 'total_power_gamma', 'envelope_mean_theta', 'lt_beta', 'lt_gamma', 'total_power_delta', 'plv_local-eff_beta', 'plv_clust-coef_alpha', 'plv_clust-coef_theta', 'wpli_local-eff_alpha', 'wpli_clust-coef_beta', 'coh_local-eff_beta', 'coh_clust-coef_alpha', 'plv_clust-coef_beta', 'plv_global-eff_alpha', 'wpli_global-eff_beta', 'coh_clust-coef_theta', 'wpli_clust-coef_theta', 'plv_ns_theta', 'plv_betw-cent_alpha', 'plv_local-eff_alpha', 'coh_global-eff_beta', 'wpli_clust-coef_alpha']
Features before: 147
Features after correlation filtering: 100


In [12]:
# Handle outliers: z-score clipping
# The features were standardized above, so clipping to [-z_clip, z_clip] caps
# each value at that many standard deviations from the column mean.
z_clip = 3.0  # clip at +/-3 standard deviations - selected based on data scale; reused by the Lemon section

# DataFrame.clip returns a new frame, so X_nocorr itself is left unmodified.
X_clean = X_nocorr.clip(lower=-z_clip, upper=z_clip)

print("Shape after clipping extreme values (subjects x features):", X_clean.shape)

Shape after clipping extreme values (subjects x features): (600, 100)


In [13]:
# Save preprocessed data

# Subject-ID column first, followed by the cleaned feature columns; both frames
# share df's row index, so the column-wise concat lines the rows up.
df_preprocessed = pd.concat([df[[sub_id_col]], X_clean], axis=1)

# Write to CSV without the pandas row index.
output_path = 'dataset/Dortmund_features_preprocessed.csv'
df_preprocessed.to_csv(output_path, index=False)

print(f"Preprocessed features saved to {output_path}")




Preprocessed features saved to dataset/Dortmund_features_preprocessed.csv


# Lemon Dataset

In [14]:
# Load the raw Lemon feature table.
df_lemon = pd.read_csv('dataset/Lemon_features.csv')

print("Shape of raw data:", df_lemon.shape)
print("Columns:", df_lemon.columns.tolist())

# Unnamed first CSV column; used below as the subject-ID column.
sub_id_col = 'Unnamed: 0'

# Every column except the ID column is treated as an EEG feature.
eeg_features_lemon = [c for c in df_lemon.columns if c != sub_id_col]
# Count the Lemon list here, not the Dortmund `eeg_features` list from the
# earlier section, so this cell reports its own data.
print("Number of EEG features:", len(eeg_features_lemon))

Shape of raw data: (190, 148)
Columns: ['Unnamed: 0', 'range_assym_alpha', 'coh_global-eff_theta', 'skewness_amp_alpha', 'skewness_amp_delta', 'range_cv_alpha', 'wpli_betw-cent_beta', 'spec_ent_alpha', 'nl_rqa mean diag length', 'kurtosis_amp_theta', 'plv_betw-cent_theta', 'range_assym_gamma', 'envelope_std_gamma', 'coh_ns_beta', 'coh_ns_alpha', '1f_slope', 'rel_amp_beta', 'wpli_betw-cent_theta', 'acw50', 'wpli_betw-cent_alpha', 'wt_gamma', 'range_cv_beta', 'plv_ns_beta', 'osc_theta', 'total_power_theta', '1f_offset', 'kurtosis_amp_delta', 'fei_alpha', 'plv_ns_alpha', 'nl_lzc exhaustive', 'envelope_std_beta', 'nl_approximate ent', 'wpli_global-eff_alpha', 'wt_alpha', 'nl_hjorth mobility', 'kurtosis_amp_alpha', 'coh_clust-coef_beta', 'plv_global-eff_beta', 'coh_betw-cent_beta', 'mod_index_theta-beta', 'coh_local-eff_theta', 'plv_local-eff_beta', 'coh_betw-cent_theta', 'nl_hjorth complexity', 'rel_amp_alpha', 'nl_rqa max vertical line', 'mod_index_alpha-beta', 'coh_local-eff_alpha', 'rel

In [15]:
# Remove near-constant Lemon features, using the same `var_threshold` that was
# defined in the Dortmund section (variance is computed on the raw values).
variances_lemon = df_lemon[eeg_features_lemon].var()
below_cutoff_lemon = variances_lemon < var_threshold
low_var_cols_lemon = variances_lemon.loc[below_cutoff_lemon].index.tolist()

print(f"\nDropping {len(low_var_cols_lemon)} low-variance Lemon features (var < {var_threshold}):")
print(low_var_cols_lemon)

# Keep the original column order of the surviving features.
dropped_lemon = set(low_var_cols_lemon)
remaining_after_var_lemon = [name for name in eeg_features_lemon if name not in dropped_lemon]
X_var_lemon = df_lemon.loc[:, remaining_after_var_lemon].copy()
print("Number of Lemon EEG features after low-variance filter:", X_var_lemon.shape[1])


Dropping 7 low-variance Lemon features (var < 1e-06):
['mod_index_theta-beta', 'mod_index_alpha-beta', 'mod_index_beta-gamma', 'mod_index_alpha-gamma', 'mod_index_theta-alpha', 'mod_index_theta-gamma', 'skewness_amp_gamma']
Number of Lemon EEG features after low-variance filter: 140


In [16]:
# Standardize the Lemon features with their own scaler (fitted on Lemon only,
# independently of the Dortmund scaler), keeping row index and column names.
scaler_lemon = StandardScaler()
X_scaled_lemon = pd.DataFrame(
    scaler_lemon.fit_transform(X_var_lemon),
    index=df_lemon.index,
    columns=X_var_lemon.columns,
)
print("Standardized Lemon feature matrix shape:", X_scaled_lemon.shape)


Standardized Lemon feature matrix shape: (190, 140)


In [17]:
# Drop highly correlated Lemon features, reusing `corr_threshold` from the
# Dortmund section.
corr_matrix_lemon = X_scaled_lemon.corr().abs()

# Strict upper triangle only (k=1 skips the diagonal); masked cells become NaN.
upper_mask_lemon = np.triu(np.ones(corr_matrix_lemon.shape, dtype=bool), k=1)
upper_lemon = corr_matrix_lemon.where(upper_mask_lemon)

# Flag a column when it exceeds the threshold against any preceding column.
exceeds_lemon = (upper_lemon > corr_threshold).any(axis=0)
to_drop_lemon = exceeds_lemon[exceeds_lemon].index.tolist()

print("Dropping Lemon features due to high correlation (>|", corr_threshold, "|):")
print(to_drop_lemon)

X_nocorr_lemon = X_scaled_lemon.drop(columns=to_drop_lemon)
remaining_features_lemon = X_nocorr_lemon.columns.tolist()

# "Lemon features before" is the raw feature count, i.e. before the
# low-variance filter as well as before this correlation filter.
print("Lemon features before:", len(eeg_features_lemon))
print("Lemon features after correlation filtering:", len(remaining_features_lemon))

Dropping Lemon features due to high correlation (>| 0.95 |):
['nl_approximate ent', 'wpli_global-eff_alpha', 'nl_hjorth mobility', 'coh_local-eff_theta', 'plv_local-eff_beta', 'total_power_gamma', 'plv_clust-coef_alpha', 'coh_global-eff_alpha', 'total_power_alpha', 'wpli_ns_alpha', 'envelope_mean_alpha', 'envelope_mean_delta', 'nl_rqa determinism', 'plv_clust-coef_theta', 'wpli_ns_beta', 'envelope_std_delta', 'envelope_std_alpha', 'envelope_std_theta', 'nl_lzc primitive', 'envelope_mean_gamma', 'rel_amp_theta', 'envelope_mean_beta', 'lt_alpha', 'nl_rqa entropy', 'wpli_local-eff_alpha', 'spec_ent_gamma', 'wpli_clust-coef_beta', 'coh_local-eff_beta', 'wpli_local-eff_theta', 'coh_clust-coef_alpha', 'plv_clust-coef_beta', 'plv_global-eff_alpha', 'lt_theta', 'envelope_mean_theta', 'nl_permutation ent', 'nl_rqa laminarity', 'wpli_global-eff_beta', 'spec_ent_beta', 'coh_clust-coef_theta', 'wpli_clust-coef_theta', 'plv_ns_theta', 'plv_betw-cent_alpha', 'plv_local-eff_alpha', 'coh_global-eff_be

In [18]:
# Outlier handling for Lemon data: clip the standardized values to
# [-z_clip, z_clip], with `z_clip` taken from the Dortmund section.
# DataFrame.clip returns a new frame, so X_nocorr_lemon is left unmodified.
X_clean_lemon = X_nocorr_lemon.clip(lower=-z_clip, upper=z_clip)
print("Shape after clipping extreme Lemon values (subjects x features):", X_clean_lemon.shape)


Shape after clipping extreme Lemon values (subjects x features): (190, 91)


In [19]:
# Save preprocessed Lemon data: subject-ID column first, then the cleaned
# feature columns (both frames share df_lemon's row index).
df_preprocessed_lemon = pd.concat([df_lemon[[sub_id_col]], X_clean_lemon], axis=1)

# Write to CSV without the pandas row index.
output_path_lemon = 'dataset/Lemon_features_preprocessed.csv'
df_preprocessed_lemon.to_csv(output_path_lemon, index=False)
print(f"Preprocessed Lemon features saved to {output_path_lemon}")

Preprocessed Lemon features saved to dataset/Lemon_features_preprocessed.csv
