In [3]:
import os
import glob
import shutil
import random
# %pip install librosa
import librosa
import kagglehub
from typing import Dict, Optional, List

In [4]:
# Words whose sub-folders are sampled from the full dataset.
LIST_OF_WORDS = [
    "bird", "eight", "four", "happy", "left",
    "marvin", "one", "seven", "three", "zero",
]
# Local directory that receives the sampled subset.
DATA_DIR = "data"
# Number of utterances sampled for each word.
FILES_PER_WORD = 100
# Kaggle Hub reference of the dataset to download.
KAGGLE_DATASET_REF = "neehakurelli/google-speech-commands"

In [5]:
def get_full_dataset_path() -> Optional[str]:
    """Fetch the dataset through Kaggle Hub and report where it was stored.

    Returns:
        The value returned by ``kagglehub.dataset_download`` converted to a
        string, or ``None`` when the download raised an exception (the error
        is printed rather than propagated).
    """
    print(f'Downloading dataset from Kaggle: {KAGGLE_DATASET_REF}..')
    try:
        downloaded = kagglehub.dataset_download(KAGGLE_DATASET_REF)
    except Exception as e:
        print(f"Error downloading dataset from Kaggle: {e}")
        return None
    else:
        return str(downloaded)

In [6]:
# Download the dataset and keep the resulting local path (None if the download failed).
path = get_full_dataset_path()

Downloading dataset from Kaggle: neehakurelli/google-speech-commands..


In [7]:
def get_word_count(path: str) -> int:
    """Return how many word directories sit directly under ``path``.

    Every immediate sub-directory counts as one word, except the special
    ``_background_noise_`` folder. Plain files are ignored.

    Args:
        path: Root directory of the dataset.

    Returns:
        The number of word folders, or 0 when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return 0

    return sum(
        1
        for name in os.listdir(path)
        if name != "_background_noise_" and os.path.isdir(os.path.join(path, name))
    )

def get_speakers_per_word(path: str) -> Dict[str, int]:
    """Count the distinct speakers recorded for each word folder under ``path``.

    The speaker ID is taken to be the part of a ``.wav`` file name that
    precedes the first underscore. The ``_background_noise_`` folder and
    anything that is not a directory are skipped.

    Args:
        path: Root directory of the dataset.

    Returns:
        A dict mapping each word (inserted in sorted order) to its number of
        unique speaker IDs; empty when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return {}

    counts: Dict[str, int] = {}
    for name in sorted(os.listdir(path)):
        folder = os.path.join(path, name)
        if name == "_background_noise_" or not os.path.isdir(folder):
            continue
        # Keep only the prefix before the first underscore of every wav file name.
        speaker_ids = {
            os.path.basename(wav).split('_')[0]
            for wav in glob.glob(os.path.join(folder, '*.wav'))
        }
        counts[name] = len(speaker_ids)

    return counts


In [8]:
def select_and_copy_random_files(source_dir: str, target_dir: str, words: List[str], num_files: int = FILES_PER_WORD, seed: Optional[int] = None) -> None:
    """Copy a random sample of ``.wav`` files for each word into ``target_dir``.

    For every word, up to ``num_files`` files are drawn without replacement
    from ``<source_dir>/<word>`` and copied into ``<target_dir>/<word>``.
    Words without a source folder are skipped with a warning. If a folder
    holds fewer than ``num_files`` wav files, all of them are copied and a
    warning is printed instead of raising ``ValueError``.

    Args:
        source_dir: Root of the full dataset (one sub-folder per word).
        target_dir: Root of the subset to create; created if missing.
        words: Names of the word folders to sample from.
        num_files: Number of files to sample per word.
        seed: Optional seed for a private random generator. When ``None``
            the module-level ``random`` state is used, as before.

    Returns:
        None. Progress and a final summary are printed.
    """
    os.makedirs(target_dir, exist_ok=True)

    print(f"\n3. Selecting {num_files} random files per word and copying to ./{target_dir}...")

    # A dedicated generator keeps a seeded run from touching the global random state.
    rng = random.Random(seed) if seed is not None else random
    copied_total = 0

    for word in words:
        source_word_path = os.path.join(source_dir, word)
        target_word_path = os.path.join(target_dir, word)

        if not os.path.isdir(source_word_path):
            print(f"\n[Warning] Source folder not found for '{word}'. Skipping.")
            continue

        # Sorted so that, for a given random state, the sample does not
        # depend on the order in which the filesystem lists the files.
        all_files = sorted(glob.glob(os.path.join(source_word_path, '*.wav')))

        # random.sample raises ValueError when asked for more items than exist.
        sample_size = min(num_files, len(all_files))
        if sample_size < num_files:
            print(f"\n[Warning] Only {len(all_files)} files available for '{word}' (requested {num_files}). Copying all of them.")

        files_to_select = rng.sample(all_files, sample_size)

        os.makedirs(target_word_path, exist_ok=True)

        for file_path in files_to_select:
            shutil.copy(file_path, target_word_path)
        copied_total += len(files_to_select)

    # Report what was actually copied; skipped or short folders lower the total.
    print(f"\nDataset subset creation complete. Total {copied_total} files copied.")

def check_sampling_rate(data_dir: str, words_list: List[str]) -> Optional[float]:
    """Report the sampling rate of one wav file found under ``data_dir``.

    The word folders are scanned in the order given by ``words_list`` and the
    first ``.wav`` file that turns up is loaded with ``librosa.load`` using
    ``sr=None``.

    Args:
        data_dir: Directory containing one sub-folder per word.
        words_list: Word folder names to search, in priority order.

    Returns:
        The sampling rate returned by librosa, or ``None`` when no wav file
        is found or loading it fails (the error is printed).
    """
    # Lazily walk the folders; at most the first match of each folder is yielded.
    candidates = (
        wav
        for word in words_list
        for wav in glob.glob(os.path.join(data_dir, word, '*.wav'))[:1]
    )
    sample_path = next(candidates, None)

    if not sample_path:
        return None

    try:
        _, rate = librosa.load(sample_path, sr=None)
    except Exception as e:
        print(f"Error checking sampling rate: {e}")
        return None
    return rate


In [9]:
# Resolve the local dataset path; None means the download failed.
source_dir = get_full_dataset_path()

if __name__ == '__main__':
    
    if source_dir:
        print("\n--- 1. FULL DATASET STATISTICS (Google Speech Commands) ---")
        
        total_words = get_word_count(source_dir)
        print(f"Total distinct word folders: {total_words}")
        
        speaker_counts = get_speakers_per_word(source_dir)
        if speaker_counts:
            # The dict is non-empty here, so the division is safe.
            avg_speakers = sum(speaker_counts.values()) / len(speaker_counts)
            print(f"Average unique speakers per word: {avg_speakers:.1f}")
        
        # Select and copy the subset
        select_and_copy_random_files(source_dir, DATA_DIR, LIST_OF_WORDS, FILES_PER_WORD)

        # Copy background noise files
        background_noise_source = os.path.join(source_dir, "_background_noise_")
        background_noise_target = os.path.join(DATA_DIR, "_background_noise_")
        
        if os.path.exists(background_noise_source):
            shutil.copytree(background_noise_source, background_noise_target, dirs_exist_ok=True)
            print(f"\nBackground noise files copied to {background_noise_target}")
        else:
            print("\n[Warning] No background noise folder found in the source dataset.")

        # Verification and report documentation
        print("\n--- 4. FINAL SUBSET DOCUMENTATION ---")        
        sampling_rate = check_sampling_rate(DATA_DIR, LIST_OF_WORDS)
        if sampling_rate is None:
            # Avoid reporting a misleading "None Hz" when no file could be read.
            print("\n[Warning] Sampling rate could not be determined from the copied subset.")
        else:
            print(f"\nSampling Rate (SR): {sampling_rate} Hz (used for audio files)")
        print(f"Words Selected: {LIST_OF_WORDS}")
        print(f"Utterances per word: {FILES_PER_WORD}")
    else:
        print("\nStopping script due to dataset acquisition failure.")


Downloading dataset from Kaggle: neehakurelli/google-speech-commands..

--- 1. FULL DATASET STATISTICS (Google Speech Commands) ---
Total distinct word folders: 30
Average unique speakers per word: 1186.8

3. Selecting 100 random files per word and copying to ./data...

Dataset subset creation complete. Total 1000 files copied.

Background noise files copied to data/_background_noise_

--- 4. FINAL SUBSET DOCUMENTATION ---

Sampling Rate (SR): 16000 Hz (used for audio files)
Words Selected: ['bird', 'eight', 'four', 'happy', 'left', 'marvin', 'one', 'seven', 'three', 'zero']
Utterances per word: 100
