## Notebook 2: Download and Prepare Synthetic Speech Commands Dataset

This notebook downloads the [Synthetic Speech Commands Dataset](https://www.kaggle.com/datasets/jbuchner/synthetic-speech-commands-dataset) from Kaggle and prepares it for training.

**Steps:**
1.  **Set up Kaggle API**: Make sure your Kaggle API token file `kaggle.json` is in place (by default at `~/.kaggle/kaggle.json`) before running the first cell.
2.  **Download and Unzip**: Download and extract the dataset files.
3.  **Process and Prepare Data**: Convert the `.wav` files to normalized log-Mel spectrograms and save them, together with their labels, as `.npy` arrays.

In [None]:
import os
import kaggle
import shutil

# 1. Download Dataset
# Fetch the Kaggle archive into a freshly created folder and extract it there.
print('Downloading synthetic-speech-commands-dataset from Kaggle...')
dataset_slug = 'jbuchner/synthetic-speech-commands-dataset'
download_path = './datasets/synthetic-speech-commands'

# Start from an empty target directory: anything left over from an earlier
# run is removed before the new copy is extracted.
if os.path.exists(download_path):
    shutil.rmtree(download_path)
os.makedirs(download_path, exist_ok=True)

# unzip=True asks the Kaggle client to extract the archive into `path`.
kaggle.api.dataset_download_files(
    dataset_slug,
    path=download_path,
    unzip=True,
)

print(f'Dataset successfully downloaded and unzipped to: {download_path}')

### 2. Prepare Data for Training

In [None]:
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import joblib

# --- Parameters ---
sample_rate = 16000   # Hz; librosa resamples every clip to this rate on load
duration = 1.0        # seconds of audio kept per clip
n_mels = 256          # number of mel bands (axis 0 of each spectrogram)
desired_frames = 61   # number of time frames (axis 1) after padding/cropping
# Fixed clip length in samples. Derived from `duration` so that changing the
# duration above actually changes the length every clip is padded/cropped to.
target_samples = int(sample_rate * duration)

# --- Command categorization ---
# Based on 'Drone Research' document
movement_phrases = ['backward', 'forward', 'up', 'down', 'go', 'left', 'right', 'follow']
emergency_phrases = ['stop', 'yes', 'no']
# No 'other' category specified for this dataset in the document

# Folder name -> class label. 'emergency' is applied last so it takes
# precedence if a word were ever listed in both groups.
keyword_to_label = {word: 'movement' for word in movement_phrases}
keyword_to_label.update({word: 'emergency' for word in emergency_phrases})

# --- Initialize lists ---
x_data = []     # one normalized log-mel spectrogram per clip
y_labels = []   # class label for the clip at the same index in x_data
data_path = download_path

print("Processing audio files...")
# Walk the whole tree rather than only the first level, so keyword folders are
# found even when the extracted archive nests them below wrapper directories.
# NOTE(review): if the archive contains several subsets with the same keyword
# folders, all of them are included — confirm that this is what training wants.
for keyword_path, subdirs, files in os.walk(data_path):
    # Sorting in place fixes the traversal order, so the saved arrays have the
    # same sample order on every machine and run.
    subdirs.sort()

    label = keyword_to_label.get(os.path.basename(keyword_path))
    if label is None:
        continue  # Not a keyword folder we use; its subfolders are still visited

    for file in sorted(files):
        if not file.lower().endswith(".wav"):
            continue
        path = os.path.join(keyword_path, file)

        # Load and process audio: force every clip to exactly target_samples
        # by zero-padding short clips at the end and cropping long ones.
        audio, _ = librosa.load(path, sr=sample_rate, duration=duration)
        if len(audio) < target_samples:
            audio = np.pad(audio, (0, target_samples - len(audio)))
        else:
            audio = audio[:target_samples]

        # Feature extraction: mel power spectrogram -> dB relative to the
        # clip's own peak (<= 0 dB), then rescaled by /80 + 1. With
        # power_to_db's default 80 dB floor this maps values into [0, 1].
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_db = mel_spec_db / 80.0 + 1.0

        # Record feature and label together, only after the clip was
        # processed, so the two lists always stay aligned.
        x_data.append(mel_spec_db)
        y_labels.append(label)

# Fail early with a clear message instead of an IndexError on x_train.shape
# further down when nothing matched.
if not x_data:
    raise RuntimeError(
        f"No .wav files found in any keyword folder under {data_path!r}. "
        "Check that the dataset was downloaded and that its folder names "
        "match movement_phrases / emergency_phrases."
    )

print("Data processing complete.")

# --- Convert, reshape, and save ---
# Stack to (clips, n_mels, frames) and append a trailing channel axis.
x_train = np.array(x_data)
x_train = x_train[..., np.newaxis]

# Bring the time axis (axis 2) to exactly desired_frames. Padding uses 0,
# which is the bottom of the normalized range produced above.
current_frames = x_train.shape[2]
if current_frames < desired_frames:
    pad_width = ((0, 0), (0, 0), (0, desired_frames - current_frames), (0, 0))
    x_train = np.pad(x_train, pad_width, mode='constant')
elif current_frames > desired_frames:
    x_train = x_train[:, :, :desired_frames, :]

# Encode the string labels as integers, then one-hot vectors.
label_encoder = LabelEncoder()
y_doa_int = label_encoder.fit_transform(y_labels)
y_doa = to_categorical(y_doa_int)
# Every kept clip contains a command, so this target is 1 for all samples.
y_sed = np.ones((len(y_doa), 1))

output_dir = './processed_data/synthetic-speech-commands'
os.makedirs(output_dir, exist_ok=True)
np.save(os.path.join(output_dir, 'x_train.npy'), x_train)
np.save(os.path.join(output_dir, 'y_doa.npy'), y_doa)
np.save(os.path.join(output_dir, 'y_sed.npy'), y_sed)
# The fitted encoder is saved alongside the arrays so class indices can be
# mapped back to label names later.
joblib.dump(label_encoder, os.path.join(output_dir, 'label_encoder.joblib'))

print(f"\n✅ Processed data saved to: {output_dir}")
print(f"x_train shape: {x_train.shape}")
print(f"y_doa shape: {y_doa.shape}")
print(f"Classes: {label_encoder.classes_}")