In [None]:
# ## Importing Libraries
import os
import gc
import numpy as np
import pandas as pd
import logging

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# ## Import Utilities
from utilities import (
    string2array,
    EEGDataFrameUI,
    display_waveform,
    display_fft,
    display_psd,
    prepare_signals_for_training,
    process_raw_signals,
    extract_features_from_processed,
    aggregate_multichannel_features,
    select_best_features,
    prepare_for_training
)

In [None]:
# ## Setup Logging
# Configure the root logger at INFO level; every record carries its timestamp,
# level, originating file name and line number ahead of the message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(filename)s : %(lineno)d - %(message)s',
    level=logging.INFO,
)

In [None]:
# Trigger an explicit garbage-collection pass before any datasets are loaded.
gc.collect()

In [None]:
# Root folder of the published data. The default is the original workstation
# path; set the EEG_DATA_DIR environment variable to point the notebook at a
# different location without editing this cell.
data_dir = os.environ.get(
    'EEG_DATA_DIR',
    os.path.join('/mnt', 'd', 'work', 'Walsh', 'Capstone', 'Published', 'Data'),
)
# Folder holding the raw per-device Parquet files and every pipeline checkpoint.
prk_folder = os.path.join(data_dir, 'parquets')

# Global variables
# Sampling rate used for each device code, keyed by the same two-letter code
# that names the device's Parquet file.
SAMPLING_RATES = {
    "MW": 512,
    "EP": 128,
    "MU": 220,
    "IN": 128,
}

In [None]:
# ## Main Pipeline Function
def _load_or_compute_stage(checkpoint_path, load_msg, compute_msg, save_msg, compute_fn):
    """
    Return one pipeline stage's DataFrame, reusing its Parquet checkpoint if present.

    Args:
        checkpoint_path: Parquet file that caches this stage's output.
        load_msg: Log prefix used when the checkpoint is read back.
        compute_msg: Log message emitted before the stage is computed.
        save_msg: Log prefix used after the freshly computed stage is saved.
        compute_fn: Zero-argument callable producing the stage's DataFrame;
            only invoked when the checkpoint file does not exist.

    Returns:
        The stage's DataFrame, either loaded from disk or freshly computed.
    """
    if os.path.exists(checkpoint_path):
        logging.info(f"{load_msg} {checkpoint_path}")
        return pd.read_parquet(checkpoint_path)

    logging.info(compute_msg)
    stage_df = compute_fn()
    stage_df.to_parquet(checkpoint_path, index=False)
    logging.info(f"{save_msg} {checkpoint_path}")
    return stage_df


def run_full_pipeline(device_name, raw_df, sampling_rate, pipeline_params, prk_folder, k_best_features=40):
    """
    Executes the full, checkpointed EEG processing pipeline for a given device.

    The pipeline runs four stages in order (signal processing, feature
    extraction, feature aggregation, feature selection). Each stage writes its
    result to ``<prk_folder>/<device_name>_<stage>.parquet`` and, on later runs,
    is loaded from that file instead of being recomputed.

    NOTE: a checkpoint is reused purely because the file exists. Changing
    `pipeline_params`, `k_best_features` or the raw data does NOT invalidate
    existing checkpoints; delete the relevant files to force a recompute.

    Args:
        device_name: Device code used to name the checkpoint files. "MW" is
            handled as single-channel; any other value as multi-channel.
        raw_df: Raw signals DataFrame, passed to `process_raw_signals` when the
            Stage 1 checkpoint is missing.
        sampling_rate: Sampling rate forwarded to `process_raw_signals`.
        pipeline_params: Settings forwarded to `process_raw_signals` and
            `extract_features_from_processed`. Must contain the key 'no_car'
            whenever Stage 2 has to be computed.
        prk_folder: Directory for the checkpoint files; created if missing.
        k_best_features: Forwarded as `k` to `select_best_features`.

    Returns:
        The model-ready DataFrame produced (or loaded) by Stage 4.

    Raises:
        KeyError: If Stage 2 must be computed and `pipeline_params` has no
            'no_car' entry.
    """
    logging.info(f"\n--- Starting Full Pipeline for {device_name} ---")

    # Make sure the checkpoint directory exists before any stage tries to write.
    os.makedirs(prk_folder, exist_ok=True)

    # Only the MW device is treated as single-channel.
    is_multichannel = device_name != "MW"

    # Define file paths for each stage
    processed_file = os.path.join(prk_folder, f'{device_name}_processed.parquet')
    features_file = os.path.join(prk_folder, f'{device_name}_features.parquet')
    aggregated_file = os.path.join(prk_folder, f'{device_name}_aggregated.parquet')
    model_ready_file = os.path.join(prk_folder, f'{device_name}_model_ready.parquet')

    # --- Stage 1: Signal Processing ---
    processed_df = _load_or_compute_stage(
        processed_file,
        "Loading existing processed data from",
        "Stage 1: Processing raw signals...",
        "Saved processed signals to",
        lambda: process_raw_signals(raw_df, sampling_rate, pipeline_params),
    )

    # --- Stage 2: Denoising and Feature Extraction ---
    features_df = _load_or_compute_stage(
        features_file,
        "Loading existing features data from",
        "Stage 2: Denoising and extracting features...",
        "Saved features to",
        lambda: extract_features_from_processed(
            processed_df,
            pipeline_params,
            is_multichannel=is_multichannel,
            no_car=pipeline_params['no_car'],
        ),
    )

    # --- Stage 3: Feature Aggregation ---
    aggregated_df = _load_or_compute_stage(
        aggregated_file,
        "Loading existing aggregated data from",
        "Stage 3: Aggregating features for multi-channel data...",
        "Saved aggregated features to",
        lambda: aggregate_multichannel_features(features_df, is_multichannel=is_multichannel),
    )

    # --- Stage 4: Feature Selection ---
    model_ready_df = _load_or_compute_stage(
        model_ready_file,
        "Loading existing model-ready data from",
        "Stage 4: Selecting best features for model training...",
        "Saved model-ready data to",
        lambda: select_best_features(aggregated_df, k=k_best_features),
    )

    # Hand the final result back to the caller instead of discarding it.
    return model_ready_df


In [None]:
# Settings handed to the pipeline stages for the MW device.
mw_pipeline_params = dict(
    target_sr=128,
    target_duration_s=2,
    low_cut_bp=0.6,
    high_cut_bp=60,
    notch_freq=50,
    filter_order_bp=5,
    q_factor_notch=30.0,
    segment_selection_criteria='min_abs_amplitude',
    normalization_method='z-score',
    correlation_threshold=0.9,
    no_car=True,
)

# Read the raw MW recordings from the Parquet folder.
logging.info("Loading raw MindBigData datasets from Parquet files...")
mw_df = pd.read_parquet(os.path.join(prk_folder, 'MW.parquet'))
logging.info("Raw datasets loaded.")

# --- Execute Pipeline for MW device (Single-channel example) ---
run_full_pipeline(
    "MW",
    mw_df,
    SAMPLING_RATES['MW'],
    mw_pipeline_params,
    prk_folder,
    k_best_features=20,
)

In [None]:
# Settings handed to the pipeline stages for the MU device.
mu_pipeline_params = dict(
    target_sr=128,
    target_duration_s=2,
    low_cut_bp=0.6,
    high_cut_bp=60,
    notch_freq=50,
    filter_order_bp=5,
    q_factor_notch=30.0,
    segment_selection_criteria='min_abs_amplitude',
    normalization_method='z-score',
    correlation_threshold=0.9,
    no_car=True,
)

# Read the raw MU recordings from the Parquet folder.
logging.info("Loading raw MindBigData datasets from Parquet files...")
mu_df = pd.read_parquet(os.path.join(prk_folder, 'MU.parquet'))
logging.info("Raw datasets loaded.")

# --- Execute Pipeline for MU device (Multi-channel example) ---
run_full_pipeline(
    "MU",
    mu_df,
    SAMPLING_RATES['MU'],
    mu_pipeline_params,
    prk_folder,
    k_best_features=20,
)

In [None]:
# Settings handed to the pipeline stages for the IN device.
in_pipeline_params = dict(
    target_sr=128,
    target_duration_s=2,
    low_cut_bp=0.6,
    high_cut_bp=60,
    notch_freq=50,
    filter_order_bp=5,
    q_factor_notch=30.0,
    segment_selection_criteria='min_abs_amplitude',
    normalization_method='z-score',
    correlation_threshold=0.9,
    no_car=True,
)

# Read the raw IN recordings from the Parquet folder.
logging.info("Loading raw MindBigData datasets from Parquet files...")
in_df = pd.read_parquet(os.path.join(prk_folder, 'IN.parquet'))
logging.info("Raw datasets loaded.")

# --- Execute Pipeline for IN device (Multi-channel example) ---
run_full_pipeline(
    "IN",
    in_df,
    SAMPLING_RATES['IN'],
    in_pipeline_params,
    prk_folder,
    k_best_features=20,
)

In [None]:
# Settings handed to the pipeline stages for the EP device.
ep_pipeline_params = dict(
    target_sr=128,
    target_duration_s=2,
    low_cut_bp=0.6,
    high_cut_bp=60,
    notch_freq=50,
    filter_order_bp=5,
    q_factor_notch=30.0,
    segment_selection_criteria='min_abs_amplitude',
    normalization_method='z-score',
    correlation_threshold=0.9,
    no_car=True,
)

# Read the raw EP recordings from the Parquet folder.
logging.info("Loading raw MindBigData datasets from Parquet files...")
ep_df = pd.read_parquet(os.path.join(prk_folder, 'EP.parquet'))
logging.info("Raw datasets loaded.")

# --- Execute Pipeline for EP device (Multi-channel example) ---
run_full_pipeline(
    "EP",
    ep_df,
    SAMPLING_RATES['EP'],
    ep_pipeline_params,
    prk_folder,
    k_best_features=20,
)

In [None]:
# (Re)load the raw frame of every device, in MW, MU, IN, EP order, so this cell
# does not depend on the per-device pipeline cells having been run.
mw_df, mu_df, in_df, ep_df = (
    pd.read_parquet(os.path.join(prk_folder, f'{device_code}.parquet'))
    for device_code in ('MW', 'MU', 'IN', 'EP')
)

In [None]:
# --- Optional: Interactive UI for exploring raw data ---
# Requires a Jupyter environment.
logging.info("\nTo explore data interactively, run the following lines in a Jupyter Notebook:")

# Raw frames offered by the UI, keyed by the same device codes as SAMPLING_RATES.
DATAFRAME_OPTIONS_UI = dict(
    MW=mw_df,
    MU=mu_df,
    IN=in_df,
    EP=ep_df,
)

# Build the explorer from the frames, the string-to-array converter, the
# per-device sampling rates and the three plotting callbacks.
eeg_ui = EEGDataFrameUI(
    DATAFRAME_OPTIONS_UI,
    string2array,
    SAMPLING_RATES,
    display_waveform,
    display_fft,
    display_psd,
)