In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Configuration shared by the Phase 1 cells below
SAMPLE_RATE = 16000  # Hz; librosa.load resamples every file to this rate
DURATION = 3.0  # seconds per segment (SAMPLE_RATE * DURATION samples each)
RANDOM_STATE = 42  # seed passed to train_test_split and DataFrame.sample

print("🔧 PHASE 1 FIXED: Split FIRST, Then Augment")
print("=" * 50)

🔧 PHASE 1 FIXED: Split FIRST, Then Augment


In [3]:
# Same file discovery as before
# All dataset paths are relative to the notebook's working directory.
base_path = Path('footstepData')

# Folders whose audio is labelled as the negative (non-footstep) class.
non_footstep_folders = [
    base_path / 'Bo6GunSounds' / 'GunReloading',
    base_path / 'Bo6GunSounds' / 'Gunshot Sounds', 
    base_path / 'UselessSoundPack'
]

# Positive class root; the sub-folder below is passed as `exclude` to
# list_audio_files so nothing underneath it is collected.
footstep_folder = base_path / 'FootstepSounds'
excluded_folder = footstep_folder / 'Gun+Footsteppack'

def list_audio_files(folders, exclude=None):
    """Recursively collect audio files under ``folders`` in a stable order.

    Args:
        folders: Iterable of ``pathlib.Path`` directories to search.
            Folders that do not exist are skipped.
        exclude: Optional ``Path``; any file that has this directory among
            its ancestors is left out.

    Returns:
        list[Path]: Regular files whose suffix (case-insensitive) is one of
        the supported audio formats, de-duplicated and sorted by path.
        Sorting matters because the result is fed to a seeded
        ``train_test_split``: the raw ``rglob`` order depends on the
        filesystem, so without it the same seed can produce different splits
        on different machines. De-duplication keeps one file from being
        listed twice (and possibly landing in two splits) when the search
        folders overlap.
    """
    supported_formats = {'.mp4', '.wav', '.mp3', '.m4a', '.flac'}
    found = set()

    for folder in folders:
        if not folder.exists():
            continue
        for file in folder.rglob('*'):
            # A directory named e.g. "pack.wav" is not an audio file.
            if not file.is_file():
                continue
            if file.suffix.lower() not in supported_formats:
                continue
            if exclude is not None and exclude in file.parents:
                continue
            found.add(file)

    return sorted(found)

# Positive class: everything under FootstepSounds except the excluded pack.
footstep_files = list_audio_files([footstep_folder], exclude=excluded_folder)
# Negative class: everything under the non-footstep folders.
non_footstep_files = list_audio_files(non_footstep_folders)

print(f"📊 Original Files Found:")
print(f"   Footstep files: {len(footstep_files)}")
print(f"   Non-footstep files: {len(non_footstep_files)}")

📊 Original Files Found:
   Footstep files: 21
   Non-footstep files: 103


In [5]:
print("🎯 STEP 1: SPLITTING ORIGINAL FILES (NO AUGMENTATION YET)")
print("=" * 60)

# The split is done on the list of source FILES, once per class, before any
# segmentation or augmentation: 30% is held out first, then that hold-out is
# halved into validation and test (roughly 70/15/15 overall).

# Split footstep files
footstep_train_files, footstep_temp = train_test_split(
    footstep_files, test_size=0.3, random_state=RANDOM_STATE
)
footstep_val_files, footstep_test_files = train_test_split(
    footstep_temp, test_size=0.5, random_state=RANDOM_STATE
)

# Split non-footstep files
non_footstep_train_files, non_footstep_temp = train_test_split(
    non_footstep_files, test_size=0.3, random_state=RANDOM_STATE
)
non_footstep_val_files, non_footstep_test_files = train_test_split(
    non_footstep_temp, test_size=0.5, random_state=RANDOM_STATE
)

print(f"✅ Original File Splits (LEAK-FREE):")
print(f"   Train: {len(footstep_train_files)} footstep, {len(non_footstep_train_files)} non-footstep")
print(f"   Val: {len(footstep_val_files)} footstep, {len(non_footstep_val_files)} non-footstep")
print(f"   Test: {len(footstep_test_files)} footstep, {len(non_footstep_test_files)} non-footstep")

# Verify no overlap
# The comparison is on file stems (name without extension), pooled across both
# classes. NOTE(review): two different files that share a stem (same name in
# different folders, or same name with different extensions) will be reported
# as leakage even though their paths differ - confirm whether that stricter
# behaviour is intended before trusting a "leakage" result.
train_stems = {f.stem for f in footstep_train_files + non_footstep_train_files}
val_stems = {f.stem for f in footstep_val_files + non_footstep_val_files}
test_stems = {f.stem for f in footstep_test_files + non_footstep_test_files}

# `or` returns the first non-empty intersection, so `overlap` is truthy as soon
# as any pair of splits shares a stem.
overlap = train_stems.intersection(val_stems) or train_stems.intersection(test_stems) or val_stems.intersection(test_stems)
print(f"   Overlap check: {'✅ NO LEAKAGE' if not overlap else '❌ STILL HAVE LEAKAGE'}")

🎯 STEP 1: SPLITTING ORIGINAL FILES (NO AUGMENTATION YET)
✅ Original File Splits (LEAK-FREE):
   Train: 14 footstep, 72 non-footstep
   Val: 3 footstep, 15 non-footstep
   Test: 4 footstep, 16 non-footstep
   Overlap check: ✅ NO LEAKAGE


In [7]:
def segment_and_augment_file(file_path, target_samples=50, rng=None):
    """Load one audio file, slice it into fixed-length segments and augment them.

    The file is loaded as mono at ``SAMPLE_RATE``, zero-padded up to one
    segment if it is shorter than ``DURATION`` seconds, and cut into
    ``DURATION``-second windows with 50% overlap (a trailing remainder
    shorter than one window is dropped). For every window the function emits,
    in this order: the original, pitch shifts of -1 and +1 semitone, time
    stretches at rates 0.9 and 1.1 (cropped or zero-padded back to the window
    length) and a copy with additive Gaussian noise (sigma 0.005).

    Args:
        file_path: Path of the audio file to load.
        target_samples: Maximum number of samples to return. The list is
            truncated from the end, so for long files only the earliest
            windows (and their augmentations) are kept.
        rng: Optional ``numpy.random.Generator`` used for the noise
            augmentation, to make it reproducible. When omitted, the global
            ``np.random`` state is used.

    Returns:
        list[np.ndarray]: The segment/augmentation arrays, or an empty list
        if the file could not be loaded or segmented (the error is printed).
        An individual augmentation that fails is reported and skipped; the
        remaining variants of that segment are still produced.
    """
    noise_source = np.random if rng is None else rng

    try:
        audio, sr = librosa.load(str(file_path), sr=SAMPLE_RATE, mono=True)

        # Segment audio (3-second chunks with overlap)
        target_length = int(SAMPLE_RATE * DURATION)
        if len(audio) < target_length:
            audio = np.pad(audio, (0, target_length - len(audio)))

        # Create overlapping segments
        hop_length = target_length // 2  # 50% overlap
        segments = [
            audio[start:start + target_length]
            for start in range(0, len(audio) - target_length + 1, hop_length)
        ]

        # Apply augmentation to segments
        augmented_samples = []
        for segment in segments:
            # Original
            augmented_samples.append(segment)

            # Pitch shifts
            for n_steps in [-1, 1]:
                try:
                    pitched = librosa.effects.pitch_shift(segment, sr=sr, n_steps=n_steps)
                    augmented_samples.append(pitched)
                except Exception as e:
                    # Best effort: report and keep going instead of hiding the failure.
                    print(f"   ⚠️ Pitch shift ({n_steps:+d}) failed for {file_path}: {e}")

            # Time stretch
            for rate in [0.9, 1.1]:
                try:
                    stretched = librosa.effects.time_stretch(segment, rate=rate)
                    # Stretching changes the length; restore the fixed window size.
                    if len(stretched) > len(segment):
                        stretched = stretched[:len(segment)]
                    else:
                        stretched = np.pad(stretched, (0, len(segment) - len(stretched)))
                    augmented_samples.append(stretched)
                except Exception as e:
                    print(f"   ⚠️ Time stretch (rate={rate}) failed for {file_path}: {e}")

            # Noise: cast to the segment dtype so the noisy copy is not
            # silently promoted to float64 while its siblings stay narrower.
            noise = noise_source.normal(0, 0.005, len(segment)).astype(segment.dtype)
            augmented_samples.append(segment + noise)

        # Limit to target number of samples
        if len(augmented_samples) > target_samples:
            augmented_samples = augmented_samples[:target_samples]

        return augmented_samples

    except Exception as e:
        print(f"   ❌ Error processing {file_path}: {e}")
        return []

print("✅ Augmentation functions defined")

✅ Augmentation functions defined


In [9]:
def process_file_split(files, label, split_name, target_samples_per_file=30):
    """Build the sample records for one class within one split.

    Each file is passed to ``segment_and_augment_file`` with
    ``target_samples_per_file`` as the cap, and every returned array becomes
    one record.

    Args:
        files: Paths of the source audio files belonging to this split.
        label: Integer class label stored on every record (1 is reported as
            'footstep', anything else as 'non_footstep').
        split_name: Split tag used in progress messages and in ``sample_id``.
        target_samples_per_file: Per-file cap on generated samples.

    Returns:
        list[dict]: One dict per sample with keys ``audio_data``, ``label``,
        ``class_name``, ``original_file`` and ``sample_id``.
    """
    total_files = len(files)
    print(f"🔧 Processing {split_name} split: {total_files} files...")

    class_name = 'footstep' if label == 1 else 'non_footstep'
    records = []

    for file_number, file_path in enumerate(files, start=1):
        clips = segment_and_augment_file(file_path, target_samples_per_file)
        records.extend(
            {
                'audio_data': clip,
                'label': label,
                'class_name': class_name,
                'original_file': str(file_path),
                'sample_id': f"{file_path.stem}_{split_name}_{clip_number:03d}",
            }
            for clip_number, clip in enumerate(clips)
        )

        # Progress line after every fifth file.
        if file_number % 5 == 0:
            print(f"   Processed {file_number}/{total_files} files...")

    print(f"   ✅ Generated {len(records)} samples for {split_name}")
    return records

# Process each split separately (THIS IS THE KEY FIX)
# Augmentation happens only after the file-level split, so every augmented
# copy stays in the same split as the file it was derived from.
# Per-file caps: 35 samples for footstep files, 25 for non-footstep files.
# NOTE(review): validation and test go through the same
# segment_and_augment_file path as training, so their samples include
# pitch-shifted / stretched / noisy copies - confirm this is intended for
# evaluation data.
print("🚀 PROCESSING SPLITS SEPARATELY (NO LEAKAGE)")
print("=" * 50)

# Training set
train_footstep_samples = process_file_split(footstep_train_files, 1, 'train', 35)
train_non_footstep_samples = process_file_split(non_footstep_train_files, 0, 'train', 25)
train_samples = train_footstep_samples + train_non_footstep_samples

# Validation set
val_footstep_samples = process_file_split(footstep_val_files, 1, 'val', 35)
val_non_footstep_samples = process_file_split(non_footstep_val_files, 0, 'val', 25)
val_samples = val_footstep_samples + val_non_footstep_samples

# Test set
test_footstep_samples = process_file_split(footstep_test_files, 1, 'test', 35) 
test_non_footstep_samples = process_file_split(non_footstep_test_files, 0, 'test', 25)
test_samples = test_footstep_samples + test_non_footstep_samples

print(f"\n📊 FINAL LEAK-FREE DATASET:")
print(f"   Training: {len(train_samples)} samples")
print(f"   Validation: {len(val_samples)} samples") 
print(f"   Test: {len(test_samples)} samples")

🚀 PROCESSING SPLITS SEPARATELY (NO LEAKAGE)
🔧 Processing train split: 14 files...
   Processed 5/14 files...
   Processed 10/14 files...
   ✅ Generated 274 samples for train
🔧 Processing train split: 72 files...
   Processed 5/72 files...
   Processed 10/72 files...
   Processed 15/72 files...
   Processed 20/72 files...
   Processed 25/72 files...
   Processed 30/72 files...
   Processed 35/72 files...
   Processed 40/72 files...
   Processed 45/72 files...
   Processed 50/72 files...
   Processed 55/72 files...
   Processed 60/72 files...
   Processed 65/72 files...
   Processed 70/72 files...
   ✅ Generated 833 samples for train
🔧 Processing val split: 3 files...
   ✅ Generated 48 samples for val
🔧 Processing val split: 15 files...
   Processed 5/15 files...
   Processed 10/15 files...
   Processed 15/15 files...
   ✅ Generated 189 samples for val
🔧 Processing test split: 4 files...
   ✅ Generated 60 samples for test
🔧 Processing test split: 16 files...
   Processed 5/16 files...
  

In [11]:
# Convert to DataFrames and save
# Each split is shuffled with a fixed seed so the row order is reproducible,
# then re-indexed 0..n-1.
train_df = pd.DataFrame(train_samples).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
val_df = pd.DataFrame(val_samples).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
test_df = pd.DataFrame(test_samples).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Save to pickle (preserves audio data)
train_df.to_pickle('train_manifest_FIXED.pkl')
val_df.to_pickle('val_manifest_FIXED.pkl')
test_df.to_pickle('test_manifest_FIXED.pkl')

print(f"✅ SAVED LEAK-FREE DATASETS:")
print(f"   train_manifest_FIXED.pkl ({len(train_df)} samples)")
print(f"   val_manifest_FIXED.pkl ({len(val_df)} samples)")
print(f"   test_manifest_FIXED.pkl ({len(test_df)} samples)")

print(f"\n🎯 Class Distribution:")
for split_name, df in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
    footstep_count = int((df['label'] == 1).sum())
    non_footstep_count = int((df['label'] == 0).sum())
    # Report the true footstep : non-footstep ratio. The previous
    # `non_footstep_count + 1` denominator avoided a ZeroDivisionError but
    # skewed every printed value; guard the empty case explicitly instead.
    ratio = footstep_count / non_footstep_count if non_footstep_count else float('inf')
    print(f"   {split_name}: {footstep_count} footstep, {non_footstep_count} non-footstep (ratio: {ratio:.2f})")

print(f"\n🚀 READY FOR PHASE 2 WITH FIXED DATA!")
print(f"   Expected model performance: 75-90% (realistic)")
print(f"   No more perfect scores from data leakage")

✅ SAVED LEAK-FREE DATASETS:
   train_manifest_FIXED.pkl (1107 samples)
   val_manifest_FIXED.pkl (237 samples)
   test_manifest_FIXED.pkl (229 samples)

🎯 Class Distribution:
   Train: 274 footstep, 833 non-footstep (ratio: 0.33)
   Val: 48 footstep, 189 non-footstep (ratio: 0.25)
   Test: 60 footstep, 169 non-footstep (ratio: 0.35)

🚀 READY FOR PHASE 2 WITH FIXED DATA!
   Expected model performance: 75-90% (realistic)
   No more perfect scores from data leakage


In [13]:
# Phase 2: Feature Extraction (Leak-Free)
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Feature extraction configuration
SAMPLE_RATE = 16000  # Hz; must match the rate the Phase 1 segments were created at
DURATION = 3.0  # seconds per clip; clips are padded/truncated to SAMPLE_RATE * DURATION samples
N_MFCC = 13  # number of MFCC coefficients per frame
N_MELS = 128  # number of mel bands in the mel-spectrogram
HOP_LENGTH = 512  # samples between frames (94 frames per 3 s clip, per the shapes printed below)
N_FFT = 2048  # FFT window size in samples (2048 is also librosa's default n_fft)

print("🎵 Phase 2: Feature Extraction (LEAK-FREE)")
print("=" * 50)
print(f"📊 Configuration:")
print(f"   Sample Rate: {SAMPLE_RATE} Hz")
print(f"   Duration: {DURATION} seconds")
print(f"   MFCC coefficients: {N_MFCC}")
print(f"   Mel filters: {N_MELS}")
print(f"   Hop length: {HOP_LENGTH}")


🎵 Phase 2: Feature Extraction (LEAK-FREE)
📊 Configuration:
   Sample Rate: 16000 Hz
   Duration: 3.0 seconds
   MFCC coefficients: 13
   Mel filters: 128
   Hop length: 512


In [15]:
print("📂 Loading leak-free datasets...")

# Load the fixed datasets
# These are the manifests written by the Phase 1 cell above; reading them back
# lets Phase 2 run without re-doing segmentation/augmentation.
# NOTE: pickle files execute code on load - only read manifests you produced.
train_df = pd.read_pickle('train_manifest_FIXED.pkl')
val_df = pd.read_pickle('val_manifest_FIXED.pkl')
test_df = pd.read_pickle('test_manifest_FIXED.pkl')

print(f"✅ Loaded leak-free datasets:")
print(f"   Training: {len(train_df)} samples")
print(f"   Validation: {len(val_df)} samples")
print(f"   Test: {len(test_df)} samples")

# Verify audio data is present
# Only the first training row is inspected as a spot check.
print(f"\n🔍 Data Verification:")
print(f"   Training audio shape: {train_df['audio_data'].iloc[0].shape}")
print(f"   Audio data type: {type(train_df['audio_data'].iloc[0])}")

# Check class distribution
for split_name, df in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
    footstep_count = len(df[df['label'] == 1])
    non_footstep_count = len(df[df['label'] == 0])
    print(f"   {split_name}: {footstep_count} footstep, {non_footstep_count} non-footstep")


📂 Loading leak-free datasets...
✅ Loaded leak-free datasets:
   Training: 1107 samples
   Validation: 237 samples
   Test: 229 samples

🔍 Data Verification:
   Training audio shape: (48000,)
   Audio data type: <class 'numpy.ndarray'>
   Train: 274 footstep, 833 non-footstep
   Val: 48 footstep, 189 non-footstep
   Test: 60 footstep, 169 non-footstep


In [17]:
def extract_mfcc_features(audio, sr=SAMPLE_RATE, n_mfcc=N_MFCC, hop_length=HOP_LENGTH, n_fft=N_FFT):
    """Extract MFCC features from audio.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to compute per frame.
        hop_length: Number of samples between successive frames.
        n_fft: FFT window size in samples. Defaults to the notebook-wide
            ``N_FFT`` so the value recorded in the saved feature config is
            the value actually used.

    Returns:
        The array returned by ``librosa.feature.mfcc`` (coefficients x frames).
    """
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

def extract_mel_spectrogram(audio, sr=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, n_fft=N_FFT):
    """Extract a dB-scaled mel-spectrogram for CNN input.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate of ``audio`` in Hz.
        n_mels: Number of mel bands.
        hop_length: Number of samples between successive frames.
        n_fft: FFT window size in samples. Defaults to the notebook-wide
            ``N_FFT`` so the value recorded in the saved feature config is
            the value actually used.

    Returns:
        The mel power spectrogram converted to decibels relative to its own
        maximum (``ref=np.max``), so the loudest bin of each clip is 0 dB.
    """
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )
    return librosa.power_to_db(mel_spec, ref=np.max)

def extract_spectral_features(audio, sr=SAMPLE_RATE, hop_length=HOP_LENGTH):
    """Summarise four frame-level spectral descriptors of a clip.

    Computes spectral centroid, spectral rolloff, spectral bandwidth and
    zero-crossing rate with librosa, then reduces each one to its mean and
    standard deviation over the clip.

    Returns:
        dict: Eight entries, ``<name>`` (mean) followed by ``<name>_std`` for
        each of ``spectral_centroid``, ``spectral_rolloff``,
        ``spectral_bandwidth`` and ``zcr``, in that order.
    """
    # (output key, per-frame values) in the order the keys must appear.
    per_frame_tracks = (
        ('spectral_centroid',
         librosa.feature.spectral_centroid(y=audio, sr=sr, hop_length=hop_length)),
        ('spectral_rolloff',
         librosa.feature.spectral_rolloff(y=audio, sr=sr, hop_length=hop_length)),
        ('spectral_bandwidth',
         librosa.feature.spectral_bandwidth(y=audio, sr=sr, hop_length=hop_length)),
        ('zcr',
         librosa.feature.zero_crossing_rate(y=audio, hop_length=hop_length)),
    )

    summary = {}
    for key, track in per_frame_tracks:
        summary[key] = np.mean(track)
        summary[key + '_std'] = np.std(track)
    return summary

def extract_temporal_features(audio, sr=SAMPLE_RATE):
    """Extract temporal features from a clip.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        dict: ``rms`` and ``rms_std`` (mean and standard deviation of the
        frame-level RMS energy) and ``tempo`` as a plain float. If tempo
        estimation fails, ``tempo`` falls back to 120.0.
    """
    features = {}

    # RMS energy
    rms = librosa.feature.rms(y=audio)
    features['rms'] = np.mean(rms)
    features['rms_std'] = np.std(rms)

    # Tempo (if detectable)
    try:
        tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
        # Depending on the librosa version the tempo comes back as a scalar or
        # as a 1-element array; normalise to a float so the feature table gets
        # a numeric column rather than a column of arrays.
        features['tempo'] = float(np.asarray(tempo).ravel()[0])
    except Exception:
        features['tempo'] = 120.0  # Default tempo

    return features

def process_dataset_features(df, split_name):
    """Extract all feature families for every sample of one split.

    Args:
        df: DataFrame with an ``audio_data`` column (one array of samples per
            row) and a ``label`` column.
        split_name: Name used in the progress and summary messages.

    Returns:
        dict with keys:
            ``mfcc``: stacked array of per-sample MFCC matrices,
            ``mel_spectrogram``: stacked array of per-sample mel-spectrograms,
            ``spectral``: list of per-sample dicts of spectral statistics,
            ``temporal``: list of per-sample dicts of temporal statistics,
            ``labels``: array of the labels of the samples that were kept.
        A sample whose extraction raises is reported and left out of every
        entry, so all five stay aligned.
    """
    total = len(df)
    print(f"🔧 Extracting features from {split_name} set ({total} samples)...")

    # Every clip is forced to exactly this many samples before extraction.
    target_length = int(SAMPLE_RATE * DURATION)

    mfcc_features = []
    mel_features = []
    spectral_features = []
    temporal_features = []
    labels = []
    skipped = 0

    # `position` (not the index label) drives the progress counter, so the
    # messages stay correct for any index the caller happens to pass in.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        try:
            # Get audio data
            audio = np.array(row['audio_data'], dtype=np.float32)

            # Ensure correct length
            if len(audio) < target_length:
                audio = np.pad(audio, (0, target_length - len(audio)))
            elif len(audio) > target_length:
                audio = audio[:target_length]

            # Extract features (all four must succeed before anything is stored)
            mfcc = extract_mfcc_features(audio)
            mel_spec = extract_mel_spectrogram(audio)
            spectral = extract_spectral_features(audio)
            temporal = extract_temporal_features(audio)
        except Exception as e:
            print(f"   ❌ Error processing sample {idx}: {e}")
            skipped += 1
            continue

        mfcc_features.append(mfcc)
        mel_features.append(mel_spec)
        spectral_features.append(spectral)
        temporal_features.append(temporal)
        labels.append(row['label'])

        if position % 100 == 0:
            print(f"   Processed {position}/{total} samples...")

    if skipped:
        print(f"   ⚠️ Skipped {skipped}/{total} samples due to errors")

    # Convert to numpy arrays
    features = {
        'mfcc': np.array(mfcc_features),
        'mel_spectrogram': np.array(mel_features),
        'spectral': spectral_features,  # Keep as list of dicts for now
        'temporal': temporal_features,  # Keep as list of dicts for now
        'labels': np.array(labels)
    }

    print(f"✅ {split_name} feature extraction completed!")
    print(f"   MFCC: {features['mfcc'].shape}")
    print(f"   Mel-spectrogram: {features['mel_spectrogram'].shape}")
    print(f"   Spectral: {len(features['spectral'])} samples")
    print(f"   Temporal: {len(features['temporal'])} samples")

    return features

print("✅ Feature extraction functions defined")


✅ Feature extraction functions defined


In [19]:
print("🎵 TRAINING SET FEATURE EXTRACTION")
print("=" * 40)

# Raw (un-normalised) features for the training split.
train_features = process_dataset_features(train_df, "Training")

# Quick quality check
# Mean absolute MFCC value over the whole training set, plus the per-class
# sample counts (index 0 = non-footstep, index 1 = footstep).
mfcc_mean = np.mean(np.abs(train_features['mfcc']))
print(f"\n📊 Training Feature Quality:")
print(f"   MFCC mean magnitude: {mfcc_mean:.3f} (should be > 1.0)")
print(f"   Labels: {np.bincount(train_features['labels'])}")


🎵 TRAINING SET FEATURE EXTRACTION
🔧 Extracting features from Training set (1107 samples)...
   Processed 100/1107 samples...
   Processed 200/1107 samples...
   Processed 300/1107 samples...
   Processed 400/1107 samples...
   Processed 500/1107 samples...
   Processed 600/1107 samples...
   Processed 700/1107 samples...
   Processed 800/1107 samples...
   Processed 900/1107 samples...
   Processed 1000/1107 samples...
   Processed 1100/1107 samples...
✅ Training feature extraction completed!
   MFCC: (1107, 13, 94)
   Mel-spectrogram: (1107, 128, 94)
   Spectral: 1107 samples
   Temporal: 1107 samples

📊 Training Feature Quality:
   MFCC mean magnitude: 39.091 (should be > 1.0)
   Labels: [833 274]


In [21]:
print("\n🎵 VALIDATION SET FEATURE EXTRACTION")
print("=" * 40)

# Raw (un-normalised) features for the validation split.
val_features = process_dataset_features(val_df, "Validation")

# Same sanity figures as for the training split: mean |MFCC| and class counts.
print(f"\n📊 Validation Feature Quality:")
print(f"   MFCC mean magnitude: {np.mean(np.abs(val_features['mfcc'])):.3f}")
print(f"   Labels: {np.bincount(val_features['labels'])}")



🎵 VALIDATION SET FEATURE EXTRACTION
🔧 Extracting features from Validation set (237 samples)...
   Processed 100/237 samples...
   Processed 200/237 samples...
✅ Validation feature extraction completed!
   MFCC: (237, 13, 94)
   Mel-spectrogram: (237, 128, 94)
   Spectral: 237 samples
   Temporal: 237 samples

📊 Validation Feature Quality:
   MFCC mean magnitude: 40.069
   Labels: [189  48]


In [22]:
print("\n🎵 TEST SET FEATURE EXTRACTION")
print("=" * 40)

# Raw (un-normalised) features for the test split.
test_features = process_dataset_features(test_df, "Test")

# Same sanity figures as for the training split: mean |MFCC| and class counts.
print(f"\n📊 Test Feature Quality:")
print(f"   MFCC mean magnitude: {np.mean(np.abs(test_features['mfcc'])):.3f}")
print(f"   Labels: {np.bincount(test_features['labels'])}")



🎵 TEST SET FEATURE EXTRACTION
🔧 Extracting features from Test set (229 samples)...
   Processed 100/229 samples...
   Processed 200/229 samples...
✅ Test feature extraction completed!
   MFCC: (229, 13, 94)
   Mel-spectrogram: (229, 128, 94)
   Spectral: 229 samples
   Temporal: 229 samples

📊 Test Feature Quality:
   MFCC mean magnitude: 41.026
   Labels: [169  60]


In [25]:
print("🔧 FEATURE NORMALIZATION")
print("=" * 30)

def safe_normalize_features(train_features, val_features, test_features):
    """Scale every feature family with statistics learned on the training split only.

    MFCC matrices are flattened per sample and standardised, mel-spectrograms
    are flattened and min-max scaled to [-1, 1], and the spectral and temporal
    dicts are turned into DataFrames and standardised. Each scaler is fitted
    on the training data and merely applied to validation and test. The
    flattened arrays are reshaped back to their original shapes afterwards.

    Args:
        train_features, val_features, test_features: Dicts as produced by
            ``process_dataset_features`` (keys ``mfcc``, ``mel_spectrogram``,
            ``spectral``, ``temporal``, ``labels``).

    Returns:
        tuple: ``(normalized_features, scalers)`` where ``normalized_features``
        maps ``<split>_<family>`` (plus ``<split>_labels``) to arrays and
        ``scalers`` maps ``mfcc``, ``mel``, ``spectral`` and ``temporal`` to
        the fitted scaler objects.
    """
    per_split = (train_features, val_features, test_features)
    normalized_features = {}
    scalers = {}

    def _fit_on_train(scaler, train_block, val_block, test_block):
        """Fit ``scaler`` on the training block, then transform the other two."""
        scaled_train = scaler.fit_transform(train_block)
        return scaled_train, scaler.transform(val_block), scaler.transform(test_block)

    # 1 + 2: families stored as stacked 2-D maps (flatten -> scale -> restore shape)
    stacked_jobs = (
        ("   Normalizing MFCC features...", 'mfcc', StandardScaler(),
         'mfcc', ('train_mfcc', 'val_mfcc', 'test_mfcc')),
        ("   Normalizing Mel-spectrogram features...", 'mel_spectrogram',
         MinMaxScaler(feature_range=(-1, 1)),
         'mel', ('train_mel', 'val_mel', 'test_mel')),
    )
    for message, source_key, scaler, scaler_key, output_keys in stacked_jobs:
        print(message)
        stacks = [features[source_key] for features in per_split]
        flattened = [stack.reshape(len(stack), -1) for stack in stacks]
        scaled_blocks = _fit_on_train(scaler, *flattened)
        for output_key, stack, scaled in zip(output_keys, stacks, scaled_blocks):
            normalized_features[output_key] = scaled.reshape(stack.shape)
        scalers[scaler_key] = scaler

    # 3 + 4: families stored as lists of dicts (tabulate -> scale)
    tabular_jobs = (
        ("   Normalizing spectral features...", 'spectral',
         ('train_spectral', 'val_spectral', 'test_spectral')),
        ("   Normalizing temporal features...", 'temporal',
         ('train_temporal', 'val_temporal', 'test_temporal')),
    )
    for message, family, output_keys in tabular_jobs:
        print(message)
        tables = [pd.DataFrame(features[family]) for features in per_split]
        scaler = StandardScaler()
        for output_key, scaled in zip(output_keys, _fit_on_train(scaler, *tables)):
            normalized_features[output_key] = scaled
        scalers[family] = scaler

    # Store labels
    for output_key, features in zip(('train_labels', 'val_labels', 'test_labels'), per_split):
        normalized_features[output_key] = features['labels']

    return normalized_features, scalers

# Apply normalization
# `scalers` holds the fitted scaler objects so the same transform can be
# re-applied later; both results are saved in the next cell.
normalized_features, scalers = safe_normalize_features(train_features, val_features, test_features)

# Shape report: normalisation must not change any array shape.
print(f"\n✅ Feature normalization completed!")
print(f"   MFCC shapes: Train {normalized_features['train_mfcc'].shape}, Val {normalized_features['val_mfcc'].shape}, Test {normalized_features['test_mfcc'].shape}")
print(f"   Mel-spec shapes: Train {normalized_features['train_mel'].shape}, Val {normalized_features['val_mel'].shape}, Test {normalized_features['test_mel'].shape}")
print(f"   Spectral shapes: Train {normalized_features['train_spectral'].shape}")
print(f"   Temporal shapes: Train {normalized_features['train_temporal'].shape}")


🔧 FEATURE NORMALIZATION
   Normalizing MFCC features...
   Normalizing Mel-spectrogram features...
   Normalizing spectral features...
   Normalizing temporal features...

✅ Feature normalization completed!
   MFCC shapes: Train (1107, 13, 94), Val (237, 13, 94), Test (229, 13, 94)
   Mel-spec shapes: Train (1107, 128, 94), Val (237, 128, 94), Test (229, 128, 94)
   Spectral shapes: Train (1107, 8)
   Temporal shapes: Train (1107, 3)


In [27]:
print("💾 SAVING PROCESSED FEATURES (LEAK-FREE)")
print("=" * 40)

# Everything needed to reuse the features later: the normalised arrays, the
# fitted scalers and the extraction settings.
extraction_config = {
    'sample_rate': SAMPLE_RATE,
    'duration': DURATION,
    'n_mfcc': N_MFCC,
    'n_mels': N_MELS,
    'hop_length': HOP_LENGTH,
    'n_fft': N_FFT
}
features_to_save = {
    'normalized_features': normalized_features,
    'scalers': scalers,
    'config': extraction_config
}

# One pickle holding the complete bundle
with open('extracted_features_FIXED.pkl', 'wb') as f:
    pickle.dump(features_to_save, f)

print(f"✅ Saved complete feature set to 'extracted_features_FIXED.pkl'")

# Individual .npy exports: (target file, key in normalized_features).
# Only MFCC, mel-spectrogram and labels are exported this way.
npy_exports = [
    ('train_mfcc_FIXED.npy', 'train_mfcc'),
    ('val_mfcc_FIXED.npy', 'val_mfcc'),
    ('test_mfcc_FIXED.npy', 'test_mfcc'),
    ('train_mel_spec_FIXED.npy', 'train_mel'),
    ('val_mel_spec_FIXED.npy', 'val_mel'),
    ('test_mel_spec_FIXED.npy', 'test_mel'),
    ('train_labels_FIXED.npy', 'train_labels'),
    ('val_labels_FIXED.npy', 'val_labels'),
    ('test_labels_FIXED.npy', 'test_labels'),
]
for npy_path, feature_key in npy_exports:
    np.save(npy_path, normalized_features[feature_key])

print(f"✅ Saved individual feature matrices (FIXED versions)")

print(f"\n🎉 PHASE 2 COMPLETED (LEAK-FREE)!")
print(f"   Ready for Phase 3: CNN Training with realistic performance")
print(f"   Expected accuracy: 75-85% (no more 100% fake scores)")


💾 SAVING PROCESSED FEATURES (LEAK-FREE)
✅ Saved complete feature set to 'extracted_features_FIXED.pkl'
✅ Saved individual feature matrices (FIXED versions)

🎉 PHASE 2 COMPLETED (LEAK-FREE)!
   Ready for Phase 3: CNN Training with realistic performance
   Expected accuracy: 75-85% (no more 100% fake scores)
