## Data Cleaning Test

In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pywt
from scipy import signal, stats
from sklearn.preprocessing import StandardScaler
import re
import contextlib
from pathlib import Path

In [82]:
# Dataset-specific loaders
def load_io_data(filepath: str) -> pd.DataFrame:
    """Read an EEG-IO recording and give its first eleven columns fixed names.

    The file is parsed as ';'-separated text. The first two lines are skipped,
    the following line is consumed as the header, and the header names are
    then replaced by the fixed names below.

    Args:
        filepath: Location of the data file to read.

    Returns:
        DataFrame with the columns ``SampleIndex``, ``FP1``, ``FP2`` and
        ``Channel3`` through ``Channel10``, in that order.
    """
    column_names = ['SampleIndex', 'FP1', 'FP2']
    column_names += [f'Channel{number}' for number in range(3, 11)]

    frame = pd.read_csv(filepath, sep=';', skiprows=2, usecols=range(11))
    frame.columns = column_names
    return frame

def load_vr_data(filepath: str) -> pd.DataFrame:
    """Load an EEG-VR recording, letting pandas sniff the delimiter.

    The first two lines of the file are skipped. Header names are stripped of
    every non-alphanumeric character. If the four expected columns are present
    by name they are returned in a fixed order; otherwise the first four
    columns are taken positionally and given the expected names.

    Args:
        filepath: Location of the data file to read.

    Returns:
        DataFrame with exactly the columns ``SampleIndex``, ``FP1``, ``FP2``,
        ``Blink`` in that order.

    Raises:
        ValueError: If the file has fewer than four columns and the expected
            names are not all present.
        Exception: Any parsing error is reported on stdout and re-raised.
    """
    # A list (not a set) so the returned column order is the same on every run.
    expected = ['SampleIndex', 'FP1', 'FP2', 'Blink']
    try:
        df = pd.read_csv(
            filepath,
            sep=None,  # let the python engine sniff the delimiter
            engine='python',
            skiprows=2,
            on_bad_lines='warn',
            # NOTE(review): these keys are matched against the raw header, i.e.
            # before the name normalization below - confirm that is sufficient.
            dtype={'SampleIndex': int, 'FP1': float, 'FP2': float},
        )

        # Normalize header names: trim, then drop every non-alphanumeric run.
        df.columns = df.columns.str.strip().str.replace(r'[^A-Za-z0-9]+', '', regex=True)

        if set(expected).issubset(df.columns):
            return df[expected]

        # Positional fallback: only meaningful when at least four columns exist.
        if df.shape[1] < len(expected):
            raise ValueError(
                f"expected at least {len(expected)} columns, found {df.shape[1]}"
            )
        return df.iloc[:, :len(expected)].set_axis(expected, axis=1)

    except Exception as e:
        print(f"VR load failed for {Path(filepath).name}: {str(e)}")
        raise

def load_vv_data(filepath: str) -> pd.DataFrame:
    """Read a tab-separated EEG-VV recording and rename its columns.

    The first three lines of the file are skipped. ``TimeStamp``, ``Blink``
    and ``Condition`` are renamed to ``Timestamp``, ``BlinkFlag`` and
    ``TrialType``; ``FP1`` and ``FP2`` are kept under their own names.

    Args:
        filepath: Location of the data file to read.

    Returns:
        DataFrame with the columns ``Timestamp``, ``FP1``, ``FP2``,
        ``BlinkFlag``, ``TrialType`` in that order.
    """
    renames = {
        'TimeStamp': 'Timestamp',
        'Blink': 'BlinkFlag',
        'Condition': 'TrialType',
    }
    kept = ['Timestamp', 'FP1', 'FP2', 'BlinkFlag', 'TrialType']

    raw = pd.read_csv(filepath, delimiter='\t', skiprows=3)
    renamed = raw.rename(columns=renames)
    return renamed[kept]

def load_labels(filepath: str) -> pd.DataFrame:
    """Read a label CSV and keep only the four label columns.

    No explicit validation is performed: a file lacking any of the four
    columns makes the column selection raise ``KeyError``.

    Args:
        filepath: Location of the label file to read.

    Returns:
        DataFrame with the columns ``StartSample``, ``EndSample``, ``Blink``,
        ``TrialType`` in that order.
    """
    wanted = ['StartSample', 'EndSample', 'Blink', 'TrialType']
    table = pd.read_csv(filepath)
    return table.loc[:, wanted]

def standardize_columns(df: pd.DataFrame, sfreq: float = 250.0) -> pd.DataFrame:
    """Bring a (possibly concatenated) recording table to one column layout.

    The input frame is left untouched; all work happens on a copy.

    * ``SampleIndex``: rows that carry a ``Timestamp`` get
      ``int(Timestamp * sfreq)``; rows without one keep their existing
      ``SampleIndex``.
    * ``Blink``: first non-missing value of ``Blink`` then ``BlinkFlag``;
      rows with neither (or no blink column at all) get 0.
    * ``Voluntary``: ``TrialType`` mapped through voluntary/V -> 1 and
      involuntary/I -> 0; anything else, or a missing ``TrialType`` column,
      gives 0.

    Args:
        df: Table with at least ``FP1``, ``FP2``, ``DatasetType`` and either
            ``SampleIndex`` or ``Timestamp``.
        sfreq: Factor used to turn ``Timestamp`` values into sample indices.

    Returns:
        New DataFrame with the columns ``SampleIndex``, ``FP1``, ``FP2``,
        ``Blink``, ``Voluntary``, ``DatasetType``.

    Raises:
        ValueError: If some row has neither a timestamp nor a sample index.
        KeyError: If a required column is missing.
    """
    df = df.copy()  # callers keep their original frame unchanged

    # Timestamps -> sample indices, but only where a timestamp actually exists.
    if 'Timestamp' in df.columns:
        sample_index = df['Timestamp'] * sfreq
        if 'SampleIndex' in df.columns:
            # .to_numpy() keeps this positional, so repeated index labels
            # (typical after pd.concat) cannot trigger an alignment error.
            sample_index = sample_index.where(
                df['Timestamp'].notna(), df['SampleIndex'].to_numpy()
            )
        if sample_index.isna().any():
            raise ValueError(
                "Some rows have neither a Timestamp nor a SampleIndex"
            )
        df['SampleIndex'] = sample_index.astype('int32')

    # Coalesce the blink sources instead of silently dropping the second one.
    blink_sources = [col for col in ('Blink', 'BlinkFlag') if col in df.columns]
    if blink_sources:
        blink = df[blink_sources[0]]
        for extra in blink_sources[1:]:
            blink = blink.where(blink.notna(), df[extra].to_numpy())
        # Same default as the "no blink column" case below.
        df['Blink'] = blink.fillna(0)
    else:
        df['Blink'] = 0

    # Map trial types; a missing column is treated like an unmapped value.
    trial_map = {'voluntary': 1, 'V': 1, 'involuntary': 0, 'I': 0}
    if 'TrialType' in df.columns:
        voluntary = df['TrialType'].map(trial_map).fillna(0).astype('int8')
    else:
        voluntary = pd.Series(0, index=df.index, dtype='int8')
    df['Voluntary'] = voluntary

    return df[['SampleIndex', 'FP1', 'FP2', 'Blink', 'Voluntary', 'DatasetType']]

# def _add_dataset_type(df: pd.DataFrame, dataset_type: str) -> pd.DataFrame:
#     return df.assign(DatasetType=dataset_type)

# def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
#     """Ensure consistent column names and units"""
#     # Convert timestamps to sample indices if needed
#     if 'Timestamp' in df.columns:
#         df['SampleIndex'] = (df['Timestamp'] * 250).astype(int)  # Assuming 250Hz
    
#     # Standardize blink labels
#     df['Blink'] = df['Blink'].map({'yes': 1, 'no': 0}).fillna(0).astype(int)
    
#     # Convert voluntary to binary
#     df['Voluntary'] = df['Voluntary'].map({'voluntary': 1, 'involuntary': 0}).fillna(0)
    
#     return df[['SampleIndex', 'FP1', 'FP2', 'Blink', 'Voluntary', 'DatasetType']]

# # Denoise Method
# def wavelet_denoise(signal_data, wavelet='db4', level=3):
#     """Improved wavelet denoising with auto-padding"""
#     # Calculate required length (next multiple of 2^level)
#     required_length = ((len(signal_data) + (2 ** level - 1)) // (2 ** level)) * (2 ** level)
#     padded_signal = np.pad(signal_data, (0, required_length - len(signal_data)), 
#                          mode='edge')
    
#     # Now calculate max_level based on the padded signal length
#     max_level = pywt.swt_max_level(len(padded_signal))
#     adjusted_level = min(level, max_level)
    
#     if adjusted_level < 1:
#         raise ValueError(f"Cannot perform SWT with level {adjusted_level}. Need at least level 1.")
        
#     # Perform SWT with adjusted level
#     coeffs = pywt.swt(padded_signal, wavelet, level=adjusted_level)

#     # Adaptive thresholding
#     sigma = np.median(np.abs(coeffs[-1][1])) / 0.6745
#     threshold = sigma * np.sqrt(2 * np.log(len(padded_signal)))
    
#     # Apply threshold to detail coefficients
#     denoised_coeffs = [coeffs[0]]  # Keep approximation coefficients
#     for c in coeffs[1:]:
#         denoised_coeffs.append(pywt.threshold(c, threshold, mode='soft'))
    
#     # Reconstruct signal
#     denoised = pywt.iswt(denoised_coeffs, wavelet)
#     return denoised[:len(signal_data)]  # Remove padding



In [83]:
# New unified processing pipeline
def _attach_interval_labels(data: pd.DataFrame, labels: pd.DataFrame,
                            sfreq: float = 250.0) -> pd.DataFrame:
    """Copy Blink/TrialType from each label interval onto the samples it spans.

    ``labels`` holds one row per interval (``StartSample``..``EndSample``,
    both inclusive), so it cannot be joined to per-sample data on a shared key.
    Samples outside every interval keep the values the loader supplied.
    """
    if 'SampleIndex' in data.columns:
        position = data['SampleIndex']
    elif 'Timestamp' in data.columns:
        # Same timestamp -> sample conversion factor standardize_columns uses.
        position = data['Timestamp'] * sfreq
    else:
        raise KeyError(
            "cannot align labels: data has neither 'SampleIndex' nor 'Timestamp'"
        )

    labelled = data.copy()

    # Start from whatever blink / trial information the loader already gave.
    if 'Blink' in labelled.columns:
        blink = labelled['Blink']
    elif 'BlinkFlag' in labelled.columns:
        blink = labelled['BlinkFlag']
    else:
        blink = pd.Series(0, index=labelled.index)
    # object dtype so label values of any type can be written without errors
    blink = blink.astype(object)
    if 'TrialType' in labelled.columns:
        trial = labelled['TrialType'].astype(object)
    else:
        trial = pd.Series(None, index=labelled.index, dtype=object)

    for span in labels.itertuples(index=False):
        inside = position.between(span.StartSample, span.EndSample)
        blink[inside] = span.Blink
        trial[inside] = span.TrialType

    labelled['Blink'] = blink.infer_objects()  # back to a numeric dtype if possible
    labelled['TrialType'] = trial
    return labelled


def process_all_data(base_path: str) -> pd.DataFrame:
    """Load every dataset folder under *base_path* into one standardized table.

    Files are discovered with glob patterns relative to *base_path*, loaded
    with the matching loader, optionally annotated from the sibling
    ``*_labels.csv`` file, tagged with a ``DatasetType`` and finally
    concatenated and passed through ``standardize_columns``. A file that fails
    to load is reported on stdout and skipped.

    Args:
        base_path: Directory containing the ``EEG-IO``, ``EEG-VR`` and
            ``EEG-VV`` folders.

    Returns:
        The concatenated, standardized table (fresh 0..n-1 index).

    Raises:
        ValueError: If no file could be loaded at all.
    """
    base_path = Path(base_path)
    datasets = []

    def process_dataset(pattern, loader, needs_labels=False, dataset_type=None):
        # glob() order is unspecified; sorting keeps the output reproducible.
        for file in sorted(base_path.glob(pattern)):
            try:
                data = loader(file)

                if needs_labels:
                    label_file = file.with_name(
                        file.name.replace('_data.csv', '_labels.csv')
                    )
                    if label_file.exists():
                        data = _attach_interval_labels(data, load_labels(label_file))
                    else:
                        print(f"⚠️ No label file for {file.name}; using loader columns only")

                # Default tag is the folder suffix, e.g. "EEG-VR" -> "VR".
                data['DatasetType'] = dataset_type or file.parent.name.split('-')[-1]
                datasets.append(data)
                print(f"✅ Processed {file.name}")

            except Exception as e:
                # Deliberate best effort: one broken file must not stop the run.
                print(f"❌ Failed {file.name}: {str(e)}")
                continue

    # Process all dataset types
    process_dataset("EEG-IO/*_data.csv", load_io_data, dataset_type='IO')
    process_dataset("EEG-VR/*_data.csv", load_vr_data, needs_labels=True)
    process_dataset("EEG-VV/*_data.csv", load_vv_data, needs_labels=True)

    # Add recalibration data
    # NOTE(review): "*R_data.csv" is a subset of "EEG-VR/*_data.csv" above, so
    # matching files are loaded twice (as 'VR' and as 'VR-Recal') - confirm
    # that this duplication is intended.
    process_dataset("EEG-VR/*R_data.csv",
                    lambda f: pd.read_csv(f).rename(columns={'Time': 'SampleIndex'})[['SampleIndex', 'FP1', 'FP2']],
                    dataset_type='VR-Recal')

    if not datasets:
        raise ValueError("No valid data files found in any dataset folder")

    # ignore_index avoids repeated 0..n labels from the individual files.
    return pd.concat(datasets, ignore_index=True).pipe(standardize_columns)

# def process_all_data(base_path: str) -> pd.DataFrame:
#     """Main processing pipeline"""
#     base_path = Path(base_path)
#     dfs = []

#     # Initialize all dataset lists
#     io_dfs, vr_dfs, vv_dfs, recal_dfs = [], [], [], []

#     # Process IO data
#     try:
#         io_files = list(base_path.glob("EEG-EyeBlinks/EEG-IO/*_data.csv"))
#         for io_file in io_files:
#             try:
#                 io_dfs.append(load_io_data(io_file))
#                 print(f"Processed IO file: {io_file.name}")
#             except Exception as e:
#                 print(f"Skipped IO file {io_file.name}: {str(e)}")
#     except Exception as e:
#         print(f"IO processing failed: {str(e)}")

#     # VR processing
#     try:
#         vr_files = list(base_path.glob("EEG-EyeBlinks/EEG-VR/*_data.csv"))
#         for vr_file in vr_files:
#             try:
#                 data = load_vr_data(f)
#                 label_path = f.parent / f.name.replace('_data.csv', '_labels.csv')
                
#                 if label_path.exists():
#                     labels = load_labels(label_path)
#                     merged = pd.merge(data, labels, on='SampleIndex', how='left')
#                 else:
#                     print(f"Warning: Missing labels for {f.name}, using defaults")
#                     merged = data.assign(Blink=0, TrialType='voluntary' if 'V' in f.stem else 'involuntary')
                
#                 vr_dfs.append(merged)
#             except Exception as e:
#                 print(f"Skipped VR file {f.name}: {str(e)}")
#                 continue
#     except Exception as e:
#         print(f"VR processing failed: {str(e)}")
    
#     # Process VV data with labels
#     vv_dfs = []
#     for vv_file in base_path.glob("EEG-EyeBlinks/EEG-VV/*_data.csv"):
#         try:
#             data = load_vv_data(vv_file)
#             labels = load_labels(vv_file.parent / vv_file.name.replace('_data.csv', '_labels.csv'))
#             merged = pd.merge(data, labels, left_on='Timestamp', right_on='StartSample')
#             vv_dfs.append(merged)
#             print(f"Processed VV file: {vv_file.name}")
#         except Exception as e:
#             print(f"Skipped VV file {vv_file.name}: {str(e)}")


#     # Process Recalibration files
#     recal_dfs = []
#     for recal_file in base_path.glob("EEG-VR/*R_data.csv"):
#         try:
#             df = pd.read_csv(recal_file, sep='\t', skiprows=3)
#             df = df.rename(columns={
#                 'Time': 'SampleIndex',
#                 'EEG1': 'FP1',
#                 'EEG2': 'FP2'
#             })[['SampleIndex', 'FP1', 'FP2']]
#             df['Blink'] = 0
#             df['DatasetType'] = 'VR-Recal'
#             dfs.append(df)
#         except Exception as e:
#             print(f"Skipped recalibration file {recal_file.name}: {str(e)}")

    
#     # Combine all datasets
#     return pd.concat([
#         *(_add_dataset_type(df, 'IO') for df in io_dfs),
#         *(_add_dataset_type(df, 'VR') for df in vr_dfs),
#         *(_add_dataset_type(df, 'VV') for df in vv_dfs),
#         *recal_dfs
#     ]).pipe(standardize_columns)
    
    # # Standardize columns
    # return combined_df.rename(columns={
    #     'BlinkFlag': 'Blink',
    #     'TrialType': 'Voluntary'
    # }).pipe(standardize_columns)

In [84]:
# Data Cleaning
def clean_eeg_data(df: pd.DataFrame, sfreq: float) -> pd.DataFrame:
    """Clean the EEG channels of a standardized table and return them with metadata.

    Steps applied to every EEG channel present in *df* (any of ``FP1``,
    ``FP2``, ``Channel3``, ``Channel4``, ``Channel5``):

    1. readings equal to the invalid marker (-187500.02, within 0.05) -> NaN
    2. forward fill, then backward fill
    3. divide by 1e6 (the original comments describe this as microvolts -> volts)
    4. rolling-median outlier replacement (window 100, threshold 3 * MAD)
    5. zero-phase Butterworth band-pass, 1 Hz to min(40 Hz, 0.95 * Nyquist)
    6. zero-phase 50 Hz notch, skipped when 50 Hz is not below Nyquist

    NOTE(review): the filters run over the whole column, so if *df* holds
    several concatenated recordings they are filtered across file boundaries.

    Args:
        df: Table with ``SampleIndex``, ``Blink``, ``Voluntary``,
            ``DatasetType`` and at least one EEG channel column.
        sfreq: Sampling frequency in Hz.

    Returns:
        DataFrame with the four metadata columns followed by the cleaned EEG
        channels. The input frame is not modified.

    Raises:
        ValueError: If no EEG channel is present or the band-pass range is
            empty for the given *sfreq*.
        KeyError: If a metadata column is missing.
    """
    candidate_channels = ['FP1', 'FP2', 'Channel3', 'Channel4', 'Channel5']
    # Work on the channels that actually exist; standardized frames carry
    # only FP1/FP2, so demanding all five would always fail.
    eeg_channels = [ch for ch in candidate_channels if ch in df.columns]
    if not eeg_channels:
        raise ValueError(f"No EEG channels found; expected any of {candidate_channels}")

    # Validate the filter range before doing any work.
    nyquist = 0.5 * sfreq
    low = 1.0
    high = min(40.0, nyquist * 0.95)  # Ensure we stay below Nyquist
    if low >= high:
        raise ValueError(f"Invalid filter range: low={low}Hz, high={high}Hz (Nyquist={nyquist}Hz)")

    metadata = df[['SampleIndex', 'Blink', 'Voluntary', 'DatasetType']].astype({
        'SampleIndex': 'int32',
        'Blink': 'int8',
        'Voluntary': 'int8',
        'DatasetType': 'category'
    })
    eeg = df[eeg_channels].astype('float64')  # private copy of the signals

    # 1. Remove the constant invalid value; tolerance instead of exact float
    #    equality so tiny parsing differences still match.
    invalid_reading = -187500.02
    eeg = eeg.mask(np.isclose(eeg.to_numpy(), invalid_reading, rtol=0.0, atol=0.05))

    # 2. Handle missing values
    eeg = eeg.ffill().bfill()

    # 3. Convert from μV to V
    eeg = eeg / 1e6

    # 4. Outlier replacement with a rolling median / MAD rule.
    #    min_periods=1 lets the rule also cover the first/last half window.
    for ch in eeg_channels:
        values = eeg[ch]
        median = values.rolling(window=100, center=True, min_periods=1).median()
        deviation = (values - median).abs()
        mad = deviation.rolling(window=100, center=True, min_periods=1).median()
        eeg[ch] = values.mask(deviation > 3 * mad, median)

    # 5. Band-pass, all channels at once along the time axis
    sos = signal.butter(2, [low, high], btype='bandpass', fs=sfreq, output='sos')
    filtered = signal.sosfiltfilt(sos, eeg.to_numpy(), axis=0)

    # 6. Notch filter (50 Hz) - only definable below the Nyquist frequency
    if 50.0 < nyquist:
        b, a = signal.iirnotch(50, 30, fs=sfreq)
        filtered = signal.filtfilt(b, a, filtered, axis=0)

    cleaned = pd.DataFrame(filtered, index=eeg.index, columns=eeg_channels)

    # Merge the cleaned signals (not the raw snapshot) back with the metadata.
    return pd.concat([metadata, cleaned], axis=1)

In [85]:
# Main execution
if __name__ == "__main__":
    # Script entry point: load everything under "EEG_EyeBlinks", clean it at
    # 250 Hz and verify the result. Any failure is printed and re-raised so
    # the full traceback stays visible.
    try:
        combined_raw = process_all_data("EEG_EyeBlinks")
        # Add dtype check
        print("Raw dtypes:", combined_raw.dtypes)

        # Clean data
        clean_df = clean_eeg_data(combined_raw, 250)

        # Final validation - an explicit raise, because `assert` statements
        # are removed when Python runs with -O.
        if clean_df.isna().any().any():
            raise ValueError("NaNs in cleaned data")
        print("Processing complete!")

    except Exception as e:
        print(f"Fatal error: {str(e)}")
        raise

Fatal error: No objects to concatenate


ValueError: No objects to concatenate

In [None]:
# all_dfs = []
# sfreqs = set()

# # Process all CSV files in EEG* subdirectories
# print("Searching in:", Path("EEG_EyeBlinks").absolute())
# found_files = list(Path("EEG_EyeBlinks").glob("EEG*/**/*.csv"))
# print(f"Found {len(found_files)} CSV files")

# for csv_file in found_files:
#     # Add these checks before processing
#     if not csv_file.name.lower().endswith("_data.csv"):
#         print(f"Skipping non-data file: {csv_file.name}")
#         continue
        
#     try:
#         df, sf = load_eeg_data(str(csv_file))
#         print(f"Successfully loaded {len(df)} samples from {csv_file.name}")
#         all_dfs.append(df)
#         sfreqs.add(sf)
#     except Exception as e:
#         print(f"SKIPPED {csv_file.name} - {str(e)}")
#         continue

# # Verify consistent sample rates
# if len(sfreqs) > 1:
#     raise ValueError(f"Multiple sample rates detected: {sfreqs}. All files must have the same rate.")
# if not sfreqs:
#     raise ValueError("No EEG files found in EEG* subfolders")

# raw_df = pd.concat(all_dfs, ignore_index=True)
# sfreq = sfreqs.pop()

# print(f"Loaded {len(raw_df):,} samples from {len(all_dfs)} files")
# print("Sample rate:", sfreq)

# # Clean data
# clean_df = clean_eeg_data(raw_df, sfreq)

# # Apply wavelet denoising to kept channel
# eeg_channels = ['FP1', 'FP2', 'Channel3', 'Channel4', 'Channel5']

# # Apply wavelet denoising
# for ch in eeg_channels:
#     clean_df[ch] = wavelet_denoise(clean_df[ch].values)