This script takes the data downloaded from Kaggle and turns it into a structure that is easy to work with, similar to the setup in the first project, where you had 3 directories: **train**, **test**, **validation**, with silence data created out of long audio tracks. I prefer to do it once, save it in a local directory, and not worry about it in the future.

I expect the data directory to contain a sub-directory `audio` and 2 files: `testing_list.txt` and `validation_list.txt`.

In [10]:
import os
import shutil
from pathlib import Path

In [11]:
# Locations of the raw dataset; the relative path is resolved against the
# current working directory.
base_dir = "./../data"
audio_dir = os.path.join(base_dir, "audio")
testing_file = os.path.join(base_dir, "testing_list.txt")
validation_file = os.path.join(base_dir, "validation_list.txt")

# Check every required input and report each missing one before failing, so
# a single run lists all problems instead of stopping at the first.
all_exist = True

if not os.path.isdir(audio_dir):
    print(f"Missing directory: {audio_dir}")
    all_exist = False

if not os.path.isfile(testing_file):
    print(f"Missing file: {testing_file}")
    all_exist = False

if not os.path.isfile(validation_file):
    print(f"Missing file: {validation_file}")
    all_exist = False


if not all_exist:
    raise FileNotFoundError("One of the required files was not found.")

In [12]:
# Read the split definition files into lists, one entry per line.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open(testing_file, "r", encoding="utf-8") as f:
    testing_list = f.read().splitlines()

with open(validation_file, "r", encoding="utf-8") as f:
    validation_list = f.read().splitlines()

In [20]:
# Work with pathlib paths from here on; the split directories are created
# next to the original `audio` directory.
base_dir = Path("./../data")
audio_dir = base_dir / "audio"
for split in ["train", "test", "validation"]:
    (base_dir / split).mkdir(parents=True, exist_ok=True)

# Build the lookup sets once: membership tests against the raw lists are
# linear scans, which would be repeated for every audio file below.
testing_set = set(testing_list)
validation_set = set(validation_list)

# Collect all .wav files from the dataset
all_audio_files = list(audio_dir.rglob("*.wav"))
print(f"Read {len(all_audio_files)} files")

# Distribute files into correct folders
for audio_file in all_audio_files:
    # POSIX-style relative path (forward slashes) is what gets compared
    # against the entries of the split lists.
    rel_path = audio_file.relative_to(audio_dir).as_posix()

    if rel_path in testing_set:
        dest_dir = base_dir / "test"
    elif rel_path in validation_set:
        dest_dir = base_dir / "validation"
    else:
        # Anything not listed in either split file goes to the training set.
        dest_dir = base_dir / "train"

    # Keep the sub-directory layout from `audio` inside the split directory.
    target_path = dest_dir / rel_path
    target_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(audio_file, target_path)

print("Dataset restructured into train/test/validation successfully.")

Read 64727 files
Dataset restructured into train/test/validation successfully.
