In [5]:
import os
import glob
import numpy as np
import pandas as pd
import librosa

# --- CONFIGURATION ---
# Root folder of the dataset; each category below is a sub-folder of it.
# Edit this path to point at your local copy.
DATASET_PATH = r"../data/raw/Audio/Baby Cry Dataset/"

# Binary target per category folder: 1 = Pain, 0 = No Pain.
# Folders are processed in the order they are listed here.
LABEL_MAP = {
    # Pain classes
    "belly pain": 1,
    "cold_hot": 1,
    "discomfort": 1,
    # No-pain classes
    "hungry": 0,
    "tired": 0,
    "burping": 0,
    "lonely": 0,
    "scared": 0,
}

def add_noise(data, noise_factor=0.005):
    """Return a copy of ``data`` with random Gaussian noise mixed in.

    The noise standard deviation is ``noise_factor * u * peak``, where ``u``
    is drawn uniformly from [0, 1) and ``peak`` is the largest absolute
    sample value in ``data``.

    Args:
        data: NumPy array of audio samples.
        noise_factor: Upper bound of the noise level relative to the peak
            amplitude. Defaults to 0.005.

    Returns:
        A new float array with the same shape as ``data``. Empty input is
        returned as an empty copy (no random numbers are drawn).
    """
    data = np.asarray(data)
    if data.size == 0:
        # np.amax has no identity for zero-size arrays and would raise.
        return data.astype(np.float64)

    # Scale by the peak magnitude, not the signed maximum: a clip whose
    # largest sample is <= 0 would otherwise get no (or mis-scaled) noise.
    peak = np.amax(np.abs(data))
    noise_amp = noise_factor * np.random.uniform() * peak
    return data + noise_amp * np.random.normal(size=data.shape)

def shift_pitch(data, sr, n_steps=2.0):
    """Return ``data`` pitch-shifted by ``n_steps`` via librosa.

    Args:
        data: Audio samples, passed to ``librosa.effects.pitch_shift`` as ``y``.
        sr: Sampling rate of ``data``.
        n_steps: Size of the shift, forwarded unchanged to librosa. Defaults
            to 2.0, the value this function always used; per the librosa
            docs positive values raise the pitch and negative values lower it.

    Returns:
        The pitch-shifted audio array produced by librosa.
    """
    return librosa.effects.pitch_shift(y=data, sr=sr, n_steps=n_steps)

def stretch_time(data, rate=1.2):
    """Return ``data`` time-stretched by ``rate`` via librosa.

    Args:
        data: Audio samples, passed to ``librosa.effects.time_stretch`` as ``y``.
        rate: Stretch factor, forwarded unchanged to librosa. Defaults to
            1.2, the value this function always used; per the librosa docs a
            rate above 1 speeds the audio up and a rate below 1 slows it down.

    Returns:
        The time-stretched audio array produced by librosa (its length
        differs from the input when ``rate`` is not 1).
    """
    return librosa.effects.time_stretch(y=data, rate=rate)

def extract_features(y, sr):
    """Build one flat feature vector for an audio clip.

    Computes 40 MFCCs and a mel spectrogram (librosa defaults) for the
    clip, averages each matrix over its second axis, and joins the two
    averaged vectors: MFCC means first, mel means after.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``.

    Returns:
        1-D NumPy array holding the MFCC means followed by the mel means.
    """
    mfcc_matrix = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    mel_matrix = librosa.feature.melspectrogram(y=y, sr=sr)

    # Collapse the second axis so every clip yields the same-length vector.
    mfcc_means = mfcc_matrix.T.mean(axis=0)
    mel_means = mel_matrix.T.mean(axis=0)

    return np.concatenate((mfcc_means, mel_means))

# --- MAIN LOOP ---
# For every category folder: load each .wav clip, extract one feature vector
# from it, and (for pain clips only) from three augmented copies as well.
# All vectors and their labels are then written to a single CSV.
features = []
labels = []


def _clip_variants(y, sr, augment):
    """Yield the original clip, then its augmented copies if ``augment``.

    Variants are produced lazily, one at a time, so a failure in a later
    augmentation does not discard the vectors already stored for the clip.
    """
    yield y
    if augment:
        yield add_noise(y)        # Augment 1: add noise
        yield shift_pitch(y, sr)  # Augment 2: pitch shift
        yield stretch_time(y)     # Augment 3: time stretch


print("Starting Data Processing with AUGMENTATION...")

for folder_name, label in LABEL_MAP.items():
    folder_path = os.path.join(DATASET_PATH, folder_name)
    # glob returns files in arbitrary order; sort so the CSV row order is
    # the same on every run and every machine.
    audio_files = sorted(glob.glob(os.path.join(folder_path, "*.wav")))

    print(f"Processing '{folder_name}' (Class {label}). Found {len(audio_files)} files.")

    for file in audio_files:
        # Broad catch is deliberate: one unreadable or corrupt clip is
        # reported and skipped instead of aborting the whole run.
        try:
            # Load audio (at most the first 5 seconds of each clip)
            y, sr = librosa.load(file, duration=5.0)

            # Only the pain class (1) is augmented.
            # NOTE(review): three extra copies per pain clip can overshoot
            # "balanced" -- check the class distribution printed below.
            # NOTE(review): augmented copies sit next to their original in
            # the CSV; if a later step splits rows at random, near-duplicates
            # can land on both sides of the split -- confirm how it is done.
            for clip in _clip_variants(y, sr, augment=(label == 1)):
                features.append(extract_features(clip, sr))
                labels.append(label)

        except Exception as e:
            print(f"Error file {file}: {e}")

# --- SAVE ---
# Column layout mirrors extract_features: 40 MFCC means, then 128 mel means.
cols = [f'mfcc_{i}' for i in range(40)] + [f'mel_{i}' for i in range(128)]
df = pd.DataFrame(features, columns=cols)
df['label'] = labels

print("\nPROCESSING COMPLETE!")
print(f"Total samples processed: {len(df)}")
print(f"Class Distribution:\n{df['label'].value_counts()}")

output_path = "../data/processed/processed.csv"
# Create the output folder if needed so a missing directory does not throw
# away the whole processing run at the very last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("Saved to processed.csv")

Starting Data Processing with AUGMENTATION...
Processing 'belly pain' (Class 1). Found 127 files.


  "class": algorithms.Blowfish,


Processing 'cold_hot' (Class 1). Found 107 files.
Processing 'discomfort' (Class 1). Found 138 files.
Processing 'hungry' (Class 0). Found 382 files.
Processing 'tired' (Class 0). Found 136 files.
Processing 'burping' (Class 0). Found 118 files.
Processing 'lonely' (Class 0). Found 11 files.
Processing 'scared' (Class 0). Found 20 files.

PROCESSING COMPLETE!
Total samples processed: 2155
Class Distribution:
label
1    1488
0     667
Name: count, dtype: int64
Saved to processed.csv'
