## Notebook 2: Prepare Synthetic Speech Commands Dataset (Final, Robust Version)

This notebook downloads the [Synthetic Speech Commands Dataset](https://www.kaggle.com/datasets/jbuchner/synthetic-speech-commands-dataset) and prepares it for training.

**Features:**
- **Skips Download**: Skips the download when the dataset directory already exists and contains more than one entry.
- **Retry Logic**: Makes up to 3 download attempts in total, waiting 5 seconds between attempts whenever one fails.
- **Progress Bars**: Displays progress for all major operations.
- **Robust File Paths**: Builds every path from the project root, which is taken to be the parent of the notebook's working directory.

In [None]:
import os
import shutil
import kaggle
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import joblib
from tqdm.notebook import tqdm
import time

# --- 1. Setup & Path Definitions ---
# Audio and feature-extraction settings shared by the cells below.
sample_rate, duration = 16000, 1.0
n_mels, desired_frames = 256, 61

# Keyword folder names, grouped by the class label each group is mapped to.
emergency_phrases = ['stop', 'yes', 'no']
movement_phrases = [
    'backward', 'forward', 'up', 'down',
    'go', 'left', 'right', 'follow',
]

# The project root is the parent of the current working directory; the raw
# download and the processed output both live underneath it.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
output_dir = os.path.join(project_root, 'processed_data', 'synthetic-speech-commands')
download_path = os.path.join(project_root, 'datasets', 'synthetic-speech-commands-raw')

# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    f"Project Root: {project_root}",
    f"Download Path: {download_path}",
    f"Output Path: {output_dir}",
    sep="\n",
)

In [None]:
# --- 2. Download Dataset with Retry Logic ---
# Fetches the Kaggle dataset into `download_path` unless a populated copy is
# already present. Sets `download_success` and aborts with SystemExit when
# every attempt fails.
dataset_slug = 'jbuchner/synthetic-speech-commands-dataset'
max_retries = 3
download_success = False

# NOTE(review): the threshold is "more than one entry", not merely
# "non-empty" -- presumably so a single stray file does not count as a
# finished download; confirm against the layout of the unzipped dataset.
if os.path.exists(download_path) and len(os.listdir(download_path)) > 1:
    print('Dataset directory already exists. Skipping download.')
    download_success = True
else:
    for attempt in range(max_retries):
        # Start every attempt from an empty directory so leftovers from an
        # earlier run or a failed attempt cannot mix with fresh files.
        if os.path.exists(download_path):
            shutil.rmtree(download_path)
        os.makedirs(download_path, exist_ok=True)

        try:
            print(f"Attempt {attempt + 1}/{max_retries}: Downloading dataset...")
            kaggle.api.dataset_download_files(dataset_slug, path=download_path, unzip=True, quiet=False)
        except Exception as e:
            print(f"Download failed on attempt {attempt + 1} with error: {e}")
            # Remove whatever the failed attempt left behind (zip archive or
            # partially extracted files). Otherwise a later run could see
            # more than one entry and wrongly skip the download.
            shutil.rmtree(download_path, ignore_errors=True)
            if attempt < max_retries - 1:
                print("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print("Max retries reached. Download failed.")
        else:
            download_success = True
            print('\n✅ Dataset downloaded and unzipped successfully.')
            break

if not download_success:
    raise SystemExit("Fatal: Could not download the dataset.")

In [None]:
# --- 3. Process Data ---
# Turns every .wav file found in the recognised keyword folders into a
# normalised mel spectrogram, labels it 'emergency' or 'movement', and saves
# the resulting arrays plus the fitted label encoder to `output_dir`.
if download_success:
    x_data = []
    y_labels = []
    data_path = download_path
    # Fixed clip length in samples, derived from the configured duration
    # rather than assuming a one-second clip.
    target_samples = int(round(sample_rate * duration))

    print(f'Processing audio files from {os.path.abspath(data_path)}...')
    # Sorted so the order of samples in the saved arrays is reproducible;
    # os.listdir returns entries in arbitrary order.
    keyword_folders = sorted(
        d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))
    )
    for keyword_folder in tqdm(keyword_folders, desc='Processing folders'):
        if keyword_folder in emergency_phrases:
            label = 'emergency'
        elif keyword_folder in movement_phrases:
            label = 'movement'
        else:
            # Folders that are not a known keyword are ignored.
            continue

        keyword_path = os.path.join(data_path, keyword_folder)
        for file in sorted(os.listdir(keyword_path)):
            if not file.endswith(".wav"):
                continue

            path = os.path.join(keyword_path, file)
            audio, _ = librosa.load(path, sr=sample_rate, duration=duration)
            # Zero-pad short clips and truncate long ones so every clip has
            # exactly `target_samples` samples.
            if len(audio) < target_samples:
                audio = np.pad(audio, (0, target_samples - len(audio)))
            else:
                audio = audio[:target_samples]

            mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            # With ref=np.max the values are <= 0 dB; this maps -80 dB to 0
            # and 0 dB to 1 (relies on power_to_db's 80 dB floor -- see the
            # librosa documentation).
            mel_spec_db = mel_spec_db / 80.0 + 1.0

            # Record the feature and its label together, only after the file
            # was processed, so both lists always stay the same length.
            x_data.append(mel_spec_db)
            y_labels.append(label)

    if not x_data:
        # Fail with an actionable message instead of an IndexError further
        # down when no keyword folder (or no .wav file) was found.
        raise RuntimeError(
            f"No .wav files found in keyword folders under {os.path.abspath(data_path)}. "
            f"Sub-folders present: {keyword_folders}. "
            f"Expected folders named after: {emergency_phrases + movement_phrases}."
        )

    print('\n✅ Data processing complete.')

    # --- 4. Format and Save Files ---
    # Stack the spectrograms and append a trailing singleton axis; axis 2 is
    # the frame axis that gets padded or cropped to `desired_frames`.
    x_train = np.array(x_data)
    x_train = x_train[..., np.newaxis]
    current_frames = x_train.shape[2]
    if current_frames < desired_frames:
        # Constant (zero) padding at the end of the frame axis.
        pad_width = ((0, 0), (0, 0), (0, desired_frames - current_frames), (0, 0))
        x_train = np.pad(x_train, pad_width, mode='constant')
    elif current_frames > desired_frames:
        x_train = x_train[:, :, :desired_frames, :]

    # String labels -> integer ids -> one-hot rows.
    label_encoder = LabelEncoder()
    y_doa_int = label_encoder.fit_transform(y_labels)
    y_doa = to_categorical(y_doa_int)
    # Every retained clip gets a 1 (presumably an "event present" flag --
    # confirm against the model that consumes y_sed).
    y_sed = np.ones((len(y_doa), 1))

    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, 'x_train.npy'), x_train)
    np.save(os.path.join(output_dir, 'y_doa.npy'), y_doa)
    np.save(os.path.join(output_dir, 'y_sed.npy'), y_sed)
    joblib.dump(label_encoder, os.path.join(output_dir, 'label_encoder.joblib'))

    print(f"\n✅ Processed data saved to: {os.path.abspath(output_dir)}")
    print(f"x_train shape: {x_train.shape}")
    print(f"y_doa shape: {y_doa.shape}")
    print(f"Classes: {label_encoder.classes_}")