# Sleep-EDF Dataset Preprocessing

Este notebook executa o pré-processamento completo do dataset Sleep-EDF.

## O que este notebook faz:
1. Carrega dados EDF (gravações EEG/EOG) e hipnogramas
2. Aplica filtragem Butterworth
3. Segmenta em épocas de 30 segundos
4. Extrai features (tempo + frequência)
5. Normaliza e divide em train/val/test
6. Salva dados processados no Google Drive

**⚠️ IMPORTANTE**: Execute este notebook apenas UMA VEZ. Os dados processados são salvos permanentemente.


In [None]:
# ============================================================================
# STEP 1: Setup (run 00_colab_setup.ipynb first)
# ============================================================================
# Prints a banner, imports the project's preprocessing entry points, defines
# the raw/processed dataset locations, and reports which .edf files are
# present in the raw directory.

print("=" * 70, "SLEEP-EDF PREPROCESSING", "=" * 70, sep="\n")

# Project preprocessing helpers plus the stdlib module used for path checks
from src.preprocessing.sleep_edf import preprocess_sleep_edf, load_processed_sleep_edf
import os

# Dataset locations
RAW_DATA_PATH = '/content/drive/MyDrive/mhealth-data/raw/sleep-edf'
PROCESSED_DATA_PATH = '/content/drive/MyDrive/mhealth-data/processed/sleep-edf'

print(f"Raw data path: {RAW_DATA_PATH}")
print(f"Processed data path: {PROCESSED_DATA_PATH}")

if os.path.exists(RAW_DATA_PATH):
    print("✅ Raw data directory found")

    # Split the directory listing into hypnograms ('.hyp.edf') and recordings
    # (every other '.edf' file); listing order is preserved in both lists.
    files = os.listdir(RAW_DATA_PATH)
    hyp_files = [entry for entry in files if entry.endswith('.hyp.edf')]
    edf_files = [
        entry
        for entry in files
        if entry.endswith('.edf') and not entry.endswith('.hyp.edf')
    ]

    print(f"Found {len(edf_files)} recording files and {len(hyp_files)} hypnogram files")
    if edf_files:
        # Show at most the first three recordings as a quick sanity check
        print("Sample files:", edf_files[:3])
else:
    # Nothing to inspect: tell the user how to obtain the dataset
    print(f"❌ Raw data directory not found: {RAW_DATA_PATH}")
    print("Please download Sleep-EDF dataset and place it in the raw directory.")
    print("See data/README.md for download instructions.")


In [None]:
# ============================================================================
# STEP 2: Run Preprocessing
# ============================================================================
# Runs the preprocessing pipeline when the raw directory exists and at least
# one recording file was found by the setup cell; otherwise prints a hint.

# The path check comes first so `edf_files` is only consulted when the raw
# directory exists (it is only assigned in that case by the setup cell).
if not os.path.exists(RAW_DATA_PATH) or not edf_files:
    print("❌ Cannot proceed without raw data files")
    print("Please download the Sleep-EDF dataset first.")
else:
    print("Starting Sleep-EDF preprocessing...")

    # Fixed split fractions and seed are passed through to the pipeline
    preprocessing_info = preprocess_sleep_edf(
        data_dir=RAW_DATA_PATH,
        output_dir=PROCESSED_DATA_PATH,
        test_size=0.15,
        val_size=0.15,
        random_state=42,
    )

    print(f"\n{'=' * 70}")
    print("PREPROCESSING COMPLETE!")
    print("=" * 70)
    print(f"Preprocessing info: {preprocessing_info}")


In [None]:
# ============================================================================
# STEP 3: Verify Processed Data
# ============================================================================
# Reloads the processed dataset from PROCESSED_DATA_PATH and prints the split
# shapes, per-split label counts, class names and feature count.

# numpy is needed for np.bincount below; no earlier cell in this notebook
# imports it, so import it here to keep this cell runnable on its own.
import numpy as np

if os.path.exists(PROCESSED_DATA_PATH):
    print("Verifying processed data...")

    # Load processed data to verify.
    # scaler and label_encoder are returned by the loader but not inspected here.
    X_train, X_val, X_test, y_train, y_val, y_test, scaler, label_encoder, info = load_processed_sleep_edf(PROCESSED_DATA_PATH)

    print("\nData shapes:")
    print(f"  Train: {X_train.shape}")
    print(f"  Val:   {X_val.shape}")
    print(f"  Test:  {X_test.shape}")

    # np.bincount reports how many samples carry each label value.
    # NOTE(review): assumes y_* are 1-D arrays of non-negative integer labels
    # (as np.bincount requires) — confirm against load_processed_sleep_edf.
    print("\nLabel distribution:")
    print(f"  Train: {np.bincount(y_train)}")
    print(f"  Val:   {np.bincount(y_val)}")
    print(f"  Test:  {np.bincount(y_test)}")

    print(f"\nClass names: {info['class_names']}")
    print(f"Features per sample: {info['n_features']}")

    print("\n✅ Sleep-EDF preprocessing completed successfully!")
    print("Data is ready for training models.")

else:
    print("❌ Processed data not found. Please run preprocessing first.")
