This script takes the data downloaded from Kaggle and turns it into a structure that is easy to work with, similar to the setup in the first project, where you had 3 directories: **train**, **test**, **validation**, plus silence data created out of the long audio tracks. I prefer to do it once, save it in a local directory, and not worry about it in the future.

**Download only train.7z, then after extraction move only its CONTENTS to the data directory.**

I expect the data directory to contain a sub-directory `audio` and 2 files: `testing_list.txt` and `validation_list.txt`.

In [1]:
import os
import shutil
from pathlib import Path

In [2]:
# Locations of the extracted Kaggle archive that the rest of the notebook reads.
base_dir = "./../data"
audio_dir = os.path.join(base_dir, "audio")
testing_file = os.path.join(base_dir, "testing_list.txt")
validation_file = os.path.join(base_dir, "validation_list.txt")

# (label used in the message, path, predicate that must hold for the path)
required_inputs = (
    ("directory", audio_dir, os.path.isdir),
    ("file", testing_file, os.path.isfile),
    ("file", validation_file, os.path.isfile),
)

# Report every missing input before failing, so one run shows all problems.
missing_inputs = []
for label, required_path, is_present in required_inputs:
    if not is_present(required_path):
        print(f"Missing {label}: {required_path}")
        missing_inputs.append(required_path)

all_exist = not missing_inputs

if not all_exist:
    # Name the missing paths in the exception as well, so the traceback alone
    # is enough to see what has to be fixed.
    raise FileNotFoundError(
        "One or more of the required paths were not found: "
        + ", ".join(missing_inputs)
    )

In [3]:
def _read_lines(list_path):
    """Return the lines of the text file at ``list_path`` without line endings."""
    with open(list_path, "r") as handle:
        return handle.read().splitlines()


# Entries of the two split lists; cell 4 compares them against each .wav
# file's POSIX-style path relative to the audio directory.
testing_list = _read_lines(testing_file)
validation_list = _read_lines(validation_file)

In [4]:
base_dir = Path("./../data")
audio_dir = base_dir / "audio"
for split in ["train", "test", "validation"]:
    (base_dir / split).mkdir(parents=True, exist_ok=True)

# Collect all .wav files from the dataset
all_audio_files = list(audio_dir.rglob("*.wav"))
print(f"Read {len(all_audio_files)} files")

# Build the lookup sets once: testing a path against a list scans the whole
# list for every audio file, while a set answers in constant time.
testing_paths = set(testing_list)
validation_paths = set(validation_list)

# Distribute files into correct folders
for audio_file in all_audio_files:
    # The split lists are compared against the forward-slash path relative to
    # the audio directory, hence as_posix().
    rel_path = audio_file.relative_to(audio_dir).as_posix()

    if rel_path in testing_paths:
        dest_dir = base_dir / "test"
    elif rel_path in validation_paths:
        dest_dir = base_dir / "validation"
    else:
        # Everything not named in either list is training data.
        dest_dir = base_dir / "train"

    # Keep the sub-directory of the file below the split directory.
    target_path = dest_dir / rel_path
    target_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(audio_file, target_path)

print("Dataset restructured into train/test/validation successfully.")

Read 64727 files
Dataset restructured into train/test/validation successfully.


In [5]:
# The background-noise folder is not named in either split list, so the
# previous cell copied it into train/; it is moved up next to the split
# directories here.
background_src = base_dir.joinpath("train", "_background_noise_")
background_dst = base_dir.joinpath("_background_noise_")

if not background_src.exists():
    print("_background_noise_ not found in train/")
else:
    shutil.move(str(background_src), str(background_dst))
    print("Moved _background_noise_ to data directory.")

# Every file has been copied into a split directory, so the original tree
# is deleted.
if not audio_dir.exists():
    print("audio directory not found or already removed.")
else:
    shutil.rmtree(audio_dir)
    print("Removed original audio directory.")

Moved _background_noise_ to data directory.
Removed original audio directory.


In [None]:
# Run this in a terminal (not in the notebook) before continuing: sudo apt install ffmpeg

In [37]:
import os
from pathlib import Path
from pydub import AudioSegment

In [38]:
# Root of the restructured dataset (holds train/, test/ and validation/).
base_dir = Path("./../data")
# Folder with the long background-noise recordings that are cut into chunks below.
background_dst = base_dir.joinpath("_background_noise_")

In [39]:
def save_chunks(chunks, split, filename):
    """Export each chunk as a WAV file into the ``silence`` folder of a split.

    Files are written to ``<base_dir>/<split>/silence/<filename>_<index>.wav``,
    where ``base_dir`` is the module-level dataset root and ``index`` is the
    chunk's position in ``chunks``, starting at 0. This function does not
    create the target folder.

    Args:
        chunks: Iterable of objects exposing ``export(path, format=...)``.
        split: Name of the split directory below ``base_dir``.
        filename: Prefix used for every exported file name.
    """
    index = 0
    for segment in chunks:
        wav_name = f"{filename}_{index}.wav"
        destination = os.path.join(base_dir, split, "silence", wav_name)
        segment.export(destination, format="wav")
        index += 1

In [40]:
chunk_length_ms = 1000  # every exported clip is 1 second long
# Distance between the starts of two consecutive windows. With 1000 ms
# windows this is a 200 ms hop, i.e. neighbouring chunks overlap by 800 ms.
hop_size_ms = chunk_length_ms // 5
# Share of each background track given to each region, in the order the track
# is cut below: leading part -> train, middle part -> test, tail -> validation.
split_ratios = (0.8, 0.1, 0.1)

# Create output dirs
for split in ["train", "validation", "test"]:
    os.makedirs(os.path.join(base_dir, split, "silence"), exist_ok=True)

# Gather and split audio
for filename in os.listdir(background_dst):
    if not filename.endswith(".wav"):
        continue

    audio = AudioSegment.from_wav(os.path.join(background_dst, filename))

    # Cut the track into three contiguous regions BEFORE windowing. Because
    # the windows overlap, windowing the whole track and splitting the chunk
    # list afterwards puts chunks that share up to 800 ms of audio on both
    # sides of a split boundary, leaking training audio into the evaluation
    # splits.
    track_ms = len(audio)
    train_end = int(split_ratios[0] * track_ms)
    test_end = train_end + int(split_ratios[1] * track_ms)
    regions = (
        ("train", audio[:train_end]),
        ("test", audio[train_end:test_end]),
        ("validation", audio[test_end:]),
    )

    name = os.path.splitext(filename)[0]

    for split_name, region in regions:
        # Only full-length windows are kept: the last admissible start leaves
        # exactly chunk_length_ms of audio. A region shorter than one window
        # yields an empty range and therefore no chunks.
        last_start = len(region) - chunk_length_ms
        chunks = [
            region[start : start + chunk_length_ms]
            for start in range(0, last_start + 1, hop_size_ms)
        ]
        save_chunks(chunks, split_name, name)

print("Done splitting and saving background noise chunks!")

Done splitting and saving background noise chunks!
