# Data visualization


In [8]:
print(f"X shape: {X.shape}, dtype: {X.dtype}")
print(f"y shape: {y.shape}, dtype: {y.dtype}\n")

# View 1st sample
sample_index = 11050 # Change to 1, 2, etc., to view other samples
print(f"Sample {sample_index} EEG shape: {X[sample_index].shape}")  # (64, 416)
print(f"Label for sample {sample_index}: {y[sample_index]}\n")

# Print first 5 channels of the sample (each has 416 values)
for i in range(64):  # 0 to 4
    print(f"Channel {i}:")
    print(X[sample_index][i])  # 1D array of 416 time points
    print()

NameError: name 'X' is not defined

In [9]:
data = X


# Trimming data


In [10]:
import numpy as np

def trim_trailing_zeros(data):
    # data shape: (samples, channels, timepoints)
    trimmed_data = []
    for sample in data:
        # Find timepoints where at least one channel is non-zero
        mask = np.any(sample != 0, axis=0)
        nonzero_indices = np.where(mask)[0]
        
        if len(nonzero_indices) == 0:
            # Entire sample is zero; keep one zero timepoint to avoid empty arrays
            trimmed_sample = sample[:, :1]
        else:
            last_valid_idx = nonzero_indices[-1] + 1
            trimmed_sample = sample[:, :last_valid_idx]
        
        trimmed_data.append(trimmed_sample)
    
    return trimmed_data

# Padding data

In [11]:
def pad_samples(trimmed_data, pad_to=None):
    # pad_to: max length to pad to (if None, auto-detect)
    n_samples = len(trimmed_data)
    n_channels = trimmed_data[0].shape[0]
    max_len = pad_to or max(s.shape[1] for s in trimmed_data)

    padded = np.zeros((n_samples, n_channels, max_len))
    for i, s in enumerate(trimmed_data):
        padded[i, :, :s.shape[1]] = s
    return padded

In [12]:
trimmed = trim_trailing_zeros(data)
padded_data = pad_samples(trimmed)  # Now ready for CNNs or ML models

In [13]:
data_clipped = np.clip(padded_data, -100, 100)  # values in microvolts

In [14]:
import numpy as np

# Assuming your data is in a NumPy array called eeg_data
# eeg_data.shape == (11057, 64, 256)

# Remove the last 3 channels (keep channels 0 to 60)
eeg_data_cleaned = data_clipped[:, :61, :]

# eeg_data_cleaned.shape will be (11057, 61, 256)


In [29]:
print(eeg_data_cleaned.shape)  # should be (11053, 64, 416)
print(np.isnan(eeg_data_cleaned).sum())  # check for NaNs
print(np.min(eeg_data_cleaned), np.max(eeg_data_cleaned), np.mean(eeg_data_cleaned), np.std(eeg_data_cleaned))  # global stats

print(f"y shape: {y.shape}, dtype: {y.dtype}\n")
print(y)

(11057, 61, 256)
0
-100.0 100.0 -1.0627293675958795 9.827144638860839
y shape: (11057,), dtype: int64

[1 1 1 ... 0 0 0]


In [2]:
import numpy as np

# Suppose your cleaned data is in the variable 'preprocessed_data'
np.save('eeg_data_cleaned.npy', eeg_data_cleaned)
np.save('y.npy', y)


NameError: name 'eeg_data_cleaned' is not defined

In [3]:
import matplotlib.pyplot as plt

sample_idx = 5675
channels_to_plot = [60]  # Choose 3 channels

for ch in channels_to_plot:
    plt.plot(eeg_data_cleaned[sample_idx, ch], label=f'Channel {ch}')
plt.title("Raw EEG Signal (One Sample)")
plt.xlabel("Timepoints")
plt.ylabel("μV")
plt.legend()
plt.show()

ImportError: Matplotlib requires numpy>=1.23; you have 1.21.6

In [4]:
eeg_data_cleaned = np.load('eeg_data_cleaned.npy')
y = np.load('y.npy')

In [5]:
import numpy as np
from scipy.signal import butter, lfilter, iirnotch

# --- Parameters ---
sfreq = 256  # Sampling frequency in Hz
lowcut = 1.0
highcut = 45.0
notch_freq = 50.0
notch_quality = 30.0
order = 4

# --- Filter Design ---
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def bandpass_filter(data, lowcut=1.0, highcut=45.0, fs=256, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

def notch_filter(data, fs=256, freq=50.0, quality=30.0):
    b, a = iirnotch(freq / (fs / 2), quality)
    y = lfilter(b, a, data)
    return y

def baseline_correction(data):
    mean = np.mean(data, axis=-1, keepdims=True)
    return data - mean

# --- Preprocessing Pipeline ---
def preprocess_eeg(eeg_data, sfreq=256):
    samples, channels, timepoints = eeg_data.shape
    preprocessed = np.zeros_like(eeg_data)
    for i in range(samples):
        for ch in range(channels):
            signal = eeg_data[i, ch, :]
            # 1. Bandpass filter
            filtered = bandpass_filter(signal, lowcut=1.0, highcut=45.0, fs=sfreq, order=4)
            # 2. Notch filter
            filtered = notch_filter(filtered, fs=sfreq, freq=50.0, quality=30.0)
            # 3. Baseline correction
            corrected = baseline_correction(filtered)
            preprocessed[i, ch, :] = corrected
    return preprocessed

# Example usage (for a subset due to memory constraints)
# eeg_data_cleaned: shape (samples, channels, timepoints)
# For demonstration, use a small subset:


preprocessed_data = preprocess_eeg(eeg_data_cleaned, sfreq=sfreq)
print(preprocessed_data.shape, preprocessed_data.dtype)

  from scipy.signal import butter, lfilter, iirnotch


(11057, 61, 256) float64


In [6]:
from sklearn.model_selection import train_test_split

# X: EEG data, shape (11057, 61, 256)
# y: labels, shape (11057,)

# First split off the test set (15%)
X_temp, X_test, y_temp, y_test = train_test_split(
    eeg_data_cleaned, y, test_size=0.15, random_state=42, stratify=y
)

# Now split the remaining data into training (70%) and validation (15%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp
)
# 0.1765 ≈ 0.15 / 0.85, to make validation 15% of original data

# Optional: add channel dimension for CNN
X_train = X_train[..., None]
X_val = X_val[..., None]
X_test = X_test[..., None]


# Data Augmentation

In [7]:
import numpy as np

# --- 3D EEG Augmentation Functions ---

def add_gaussian_noise(X, noise_level=0.02):
    noise = np.random.normal(0, noise_level, X.shape)
    return X + noise

def random_time_shift(X, max_shift=8):
    shifted = np.zeros_like(X)
    for i in range(X.shape[0]):
        shift = np.random.randint(-max_shift, max_shift)
        if shift > 0:
            shifted[i, :, shift:] = X[i, :, :-shift]
        elif shift < 0:
            shifted[i, :, :shift] = X[i, :, -shift:]
        else:
            shifted[i] = X[i]
    return shifted

def random_amplitude_scaling(X, scale_range=(0.95, 1.05)):
    # X shape: (samples, channels, timepoints, 1)
    scales = np.random.uniform(scale_range[0], scale_range[1], (X.shape[0], 1, 1, 1))
    return X * scales

def channel_reflection(X):
    # Reflect channels (reverse channel order)
    return X[:, ::-1, :]

def channel_masking(X, mask_prob=0.1):
    X_masked = X.copy()
    for i in range(X.shape[0]):
        for ch in range(X.shape[1]):
            if np.random.rand() < mask_prob:
                X_masked[i, ch, :] = 0
    return X_masked

def signal_flipping(X):
    # Reverse time axis
    return X[:, :, ::-1]

def mixup(X, y, alpha=0.2):
    lam = np.random.beta(alpha, alpha)
    batch_size = X.shape[0]
    index = np.random.permutation(batch_size)
    X_mix = lam * X + (1 - lam) * X[index]
    y_mix = lam * y + (1 - lam) * y[index]
    return X_mix, y_mix

# --- Apply Augmentations ---

# Apply augmentations sequentially
# Assume you already split your data:
# X_train, X_val, X_test, y_train, y_val, y_test

# Apply augmentations to X_train only
X_aug = add_gaussian_noise(X_train)
X_aug = random_time_shift(X_aug)
X_aug = random_amplitude_scaling(X_aug)
X_aug = channel_reflection(X_aug)
X_aug = channel_masking(X_aug)
X_aug = signal_flipping(X_aug)

# Mixup (on training set only)
X_mix, y_mix = mixup(X_train, y_train, alpha=0.2)

# Combine all data
X_train_combined = np.concatenate([X_train, X_aug, X_mix], axis=0)
y_train_combined = np.concatenate([y_train, y_train, y_mix], axis=0)


print("Original:", eeg_data_cleaned.shape)
print("Augmented:", X_aug.shape)
print("Mixup:", X_mix.shape)
print("Combined:", X_train_combined.shape)
print("Labels:", y_train_combined.shape)


Original: (11057, 61, 256)
Augmented: (7739, 61, 256, 1)
Mixup: (7739, 61, 256, 1)
Combined: (23217, 61, 256, 1)
Labels: (23217,)


In [10]:
from tensorflow.keras import layers, models

input_shape = (256, 61)  # (timesteps, channels)

model = models.Sequential([
    # CNN block
    layers.Conv1D(64, kernel_size=3, activation='elu', padding='same', input_shape=input_shape),
    layers.BatchNormalization(),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.3),

    layers.Conv1D(128, kernel_size=3, activation='elu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.3),

    # Reshape for LSTM
    layers.Reshape((-1, 128)),  # Adjust based on last Conv1D filters

    # LSTM block
    layers.Bidirectional(layers.LSTM(64, return_sequences=False)),
    layers.BatchNormalization(),
    layers.Dropout(0.4),

    # Dense layers
    layers.Dense(64, activation='elu'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),

    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 256, 64)           11776     
                                                                 
 batch_normalization (BatchN  (None, 256, 64)          256       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 128, 64)          0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 128, 64)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 128, 128)          24704     
                                                                 
 batch_normalization_1 (Batc  (None, 128, 128)         5

In [11]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)


In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 256, 64)           11776     
                                                                 
 batch_normalization (BatchN  (None, 256, 64)          256       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 128, 64)          0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 128, 64)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 128, 128)          24704     
                                                                 
 batch_normalization_1 (Batc  (None, 128, 128)         5

In [15]:
import tensorflow as tf
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',         # You can also use 'val_accuracy'
    patience=7,                 # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True   # Restores model weights from the epoch with the best value of the monitored quantity
)

In [16]:
history = model.fit(
    X_train, y_train,
    epochs=50,                  # Set a large max epoch, early stopping will halt earlier if needed
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping]
)

MemoryError: Unable to allocate 461. MiB for an array with shape (7739, 61, 256, 1) and data type float32

In [17]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Test accuracy: 0.9554


In [50]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step
[[ 544   60]
 [  36 1019]]
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       604
           1       0.94      0.97      0.96      1055

    accuracy                           0.94      1659
   macro avg       0.94      0.93      0.94      1659
weighted avg       0.94      0.94      0.94      1659



In [51]:
model.save('best_eeg_cnn_model_with_preprocessing.keras')