In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [2]:
# Load data

df = pd.read_csv('dataset/Dortmund_features.csv')

print("Shape of raw data:", df.shape)
print("Columns:", df.columns.tolist())

sub_id_col = 'Unnamed: 0'

eeg_features = [c for c in df.columns if c != sub_id_col]
print("Number of EEG features:", len(eeg_features))


Shape of raw data: (600, 148)
Columns: ['Unnamed: 0', 'nl_rqa mean diag length', 'kurtosis_amp_theta', 'lt_delta', 'envelope_mean_alpha', 'nl_correlation dim', 'kurtosis_amp_alpha', 'mod_index_theta-gamma', 'nl_rqa rte', 'nl_higuchi fractal dim', 'acw50', 'envelope_std_beta', 'wt_delta', 'mod_index_theta-beta', 'spec_ent_beta', 'wt_beta', 'spec_ent_delta', 'nl_lyapunov exponent', 'nl_katz fractal dim', 'mod_index_theta-alpha', 'total_power_theta', 'skewness_amp_theta', 'mod_index_alpha-beta', 'nl_rqa determinism', 'rel_amp_gamma', 'skewness_amp_gamma', 'spec_ent_gamma', 'nl_rqa laminarity', 'acw0', 'envelope_mean_beta', 'total_power_beta', 'kurtosis_amp_beta', 'spec_ent_theta', 'kurtosis_amp_gamma', 'wt_alpha', 'spec_ent_alpha', 'wt_theta', 'rel_amp_theta', 'envelope_std_theta', 'rel_amp_beta', 'total_power_alpha', 'skewness_amp_alpha', 'skewness_amp_beta', 'nl_hjorth mobility', 'lt_alpha', 'skewness_amp_delta', 'envelope_std_gamma', 'wt_gamma', 'lt_theta', 'nl_rqa trapping time', 'kur

In [3]:
# Drop low-variance features
var_threshold = 1e-6  
variances = df[eeg_features].var()
low_var_cols = variances[variances < var_threshold].index.tolist()

print(f"\nDropping {len(low_var_cols)} low-variance features (var < {var_threshold}):")
print(low_var_cols)

remaining_after_var = [c for c in eeg_features if c not in low_var_cols]
X_var = df[remaining_after_var].copy()
print("Number of EEG features after low-variance filter:", len(X_var.columns))


Dropping 7 low-variance features (var < 1e-06):
['mod_index_theta-gamma', 'mod_index_theta-beta', 'mod_index_theta-alpha', 'mod_index_alpha-beta', 'skewness_amp_gamma', 'mod_index_beta-gamma', 'mod_index_alpha-gamma']
Number of EEG features after low-variance filter: 140


In [6]:
# Standardization

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_var)
X_scaled = pd.DataFrame(X_scaled, columns=X_var.columns, index=df.index)

print("Standardized feature matrix shape:", X_scaled.shape)


Standardized feature matrix shape: (600, 140)


In [None]:
# Drop highly correlated features

corr_matrix = X_scaled.corr().abs()
# Only keep higher triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

corr_threshold = 0.95
to_drop = [column for column in upper.columns if any(upper[column] > corr_threshold)]

print("Dropping due to high correlation (>|", corr_threshold, "|):")
print(to_drop)

X_nocorr = X_scaled.drop(columns=to_drop)
remaining_features = X_nocorr.columns.tolist()

print("Features before:", len(eeg_features))
print("Features after correlation filtering:", len(remaining_features))


Dropping due to high correlation (>| 0.95 |):
['spec_ent_gamma', 'envelope_mean_beta', 'total_power_beta', 'rel_amp_theta', 'envelope_std_theta', 'rel_amp_beta', 'total_power_alpha', 'nl_hjorth mobility', 'lt_alpha', 'lt_theta', 'envelope_mean_gamma', 'rel_amp_delta', 'envelope_std_delta', 'nl_lzc exhaustive', 'envelope_std_alpha', 'nl_rqa entropy', 'rel_amp_alpha', 'nl_sampling ent', 'total_power_gamma', 'envelope_mean_theta', 'lt_beta', 'lt_gamma', 'total_power_delta', 'plv_local-eff_beta', 'plv_clust-coef_alpha', 'plv_clust-coef_theta', 'wpli_local-eff_alpha', 'wpli_clust-coef_beta', 'coh_local-eff_beta', 'coh_clust-coef_alpha', 'plv_clust-coef_beta', 'plv_global-eff_alpha', 'wpli_global-eff_beta', 'coh_clust-coef_theta', 'wpli_clust-coef_theta', 'plv_ns_theta', 'plv_betw-cent_alpha', 'plv_local-eff_alpha', 'coh_global-eff_beta', 'wpli_clust-coef_alpha']
Features before: 147
Features after correlation filtering: 100


In [8]:
# Handle ouliers: Z-score clipping

X_clean = X_nocorr.copy()
z_clip = 3.0  # clip at Â±3 standard deviations

for col in X_clean.columns:
    X_clean[col] = np.clip(X_clean[col], -z_clip, z_clip)

print("Shape after clipping extreme values (subjects x features):", X_clean.shape)

Shape after clipping extreme values (subjects x features): (600, 100)


In [9]:
# Save preprocessed data 

# Attach subject IDs
df_preprocessed = df[[sub_id_col]].copy()  # start with subject IDs
df_preprocessed = pd.concat([df_preprocessed, X_clean], axis=1)

# Save to CSV
output_path = 'dataset/Dortmund_features_preprocessed.csv'
df_preprocessed.to_csv(output_path, index=False)

print(f"Preprocessed features saved to {output_path}")


Preprocessed features saved to dataset/Dortmund_features_preprocessed.csv
