In [3]:
from datasets import load_dataset, Audio
import pandas as pd
import os, io
import soundfile as sf
from pydub import AudioSegment

In [4]:
file_path = "metadata_commonvoice.csv"

# Build the CommonVoice metadata table once. The CSV acts as a cache marker:
# if it is already on disk, the whole conversion pass below is skipped.
if os.path.isfile(file_path):
    print("File already exists.")
else:
    # Paths
    cv_root = "cv-corpus-22.0-delta-2025-06-20/en"
    clips_dir = os.path.join(cv_root, "clips")
    validated_tsv = os.path.join(cv_root, "validated.tsv")

    # Load validated metadata (tab-separated)
    validated = pd.read_csv(validated_tsv, sep="\t")

    rows = []
    for _, row in validated.iterrows():
        mp3_file = os.path.join(clips_dir, row["path"])
        # Swap only the file extension. A plain str.replace(".mp3", ".wav")
        # would rewrite every ".mp3" occurrence in the path and would leave
        # the name unchanged for clips with a different extension.
        wav_file = os.path.splitext(mp3_file)[0] + ".wav"

        # Convert MP3 -> 16 kHz mono WAV if not already done
        if not os.path.exists(wav_file):
            audio = AudioSegment.from_mp3(mp3_file)
            audio = audio.set_frame_rate(16000).set_channels(1)
            audio.export(wav_file, format="wav")

        # Extract duration + sampling rate from the WAV header
        with sf.SoundFile(wav_file) as f:
            # len(f) is the frame count, so frames / rate gives seconds
            duration = len(f) / f.samplerate
            rows.append({
                "file": wav_file,
                "duration_sec": round(duration, 2),
                "sampling_rate": f.samplerate,
                "dataset": "CommonVoice",
                "speaker_id": row.get("client_id", ""),
                "environment": "unknown",
                "text": row.get("sentence", "")
            })

    cv_df = pd.DataFrame(rows)
    # Write to the same path the guard above checks, so the two cannot drift.
    cv_df.to_csv(file_path, index=False)

File already exists.


In [None]:
file_path = "librispeech_metadata.csv"

# Build the LibriSpeech subset + metadata once. The CSV acts as a cache
# marker: if it is already on disk, nothing below runs.
if os.path.isfile(file_path):
    print("File already exists.")
else:
    # 1. Load dataset without decoding (skip torchcodec)
    dataset = load_dataset("librispeech_asr", "clean")
    dataset = dataset.cast_column("audio", Audio(decode=False))

    # 2. Take a small subset for testing (fixed seed keeps it reproducible)
    subset = dataset["train.100"].shuffle(seed=42).select(range(200))

    # 3. Output folder
    out_dir = "librispeech_subset_wav"
    os.makedirs(out_dir, exist_ok=True)

    rows = []
    for i, item in enumerate(subset):
        audio_info = item["audio"]
        audio_bytes = audio_info["bytes"]   # encoded audio bytes, or None
        speaker_id = item["speaker_id"]
        text = item.get("text", "")

        # With decode=False the audio entry is a {"bytes", "path"} dict.
        # io.BytesIO(None) would silently become an empty buffer and fail
        # with an opaque decode error, so fall back to the "path" entry when
        # no bytes are embedded.
        # NOTE(review): assumes "path" is readable locally in that case.
        if audio_bytes is not None:
            audio_source = io.BytesIO(audio_bytes)
        else:
            audio_source = audio_info["path"]

        # Decode with soundfile
        audio_array, sr = sf.read(audio_source)
        duration_sec = len(audio_array) / sr

        # Save as WAV
        out_path = os.path.join(out_dir, f"sample_{i}.wav")
        sf.write(out_path, audio_array, sr)

        # Metadata row
        rows.append({
            "file": out_path,
            "duration_sec": round(duration_sec, 2),
            "sampling_rate": sr,
            "dataset": "LibriSpeech",
            "speaker_id": speaker_id,
            "environment": "clean",
            "text": text
        })

    # 4. Save metadata to the same path the guard above checks
    df = pd.DataFrame(rows)
    df.to_csv(file_path, index=False)

File already exists.


In [7]:
def unify_metadata(output_path, sources):
    """Merge per-dataset metadata CSVs into a single unified CSV.

    Each existing file in ``sources`` is read, any missing required column is
    added as an empty-string column, and ``file`` paths are rewritten relative
    to the current working directory. The frames are concatenated, rows that
    repeat an earlier ``file`` path are dropped, and the result is written to
    ``output_path``.

    Rows whose ``file`` value is missing or empty are kept as they are: they
    are neither passed to ``os.path.relpath`` (which raises on an empty
    path) nor treated as duplicates of one another.

    Args:
        output_path: Destination path of the unified CSV.
        sources: Iterable of metadata CSV paths; paths that do not exist are
            reported and skipped.

    Returns:
        None. If none of the sources exist, a message is printed and nothing
        is written.
    """
    required_columns = ["file", "duration_sec", "sampling_rate", "dataset",
                        "speaker_id", "environment", "text"]

    def _has_path(value):
        # read_csv yields NaN (a float) for blank cells, and a missing "file"
        # column is filled with "" below; neither is a usable path.
        return isinstance(value, str) and value != ""

    dfs = []
    for src in sources:
        if not os.path.isfile(src):
            print(f"{src} not found, skipping")
            continue

        print(f"Found {src}, adding to unified dataset")
        df = pd.read_csv(src)

        # Ensure required columns exist (fill missing if needed)
        for col in required_columns:
            if col not in df.columns:
                df[col] = ""

        # Make file paths relative, leaving blank/missing entries untouched
        df["file"] = df["file"].apply(
            lambda p: os.path.relpath(p) if _has_path(p) else p
        )
        dfs.append(df)

    if not dfs:
        print("No metadata files found. Run dataset preprocessing first.")
        return

    # Merge all datasets
    full_df = pd.concat(dfs, ignore_index=True)

    # Drop rows repeating an earlier file path (first occurrence wins). Rows
    # without a path are exempt so they do not collapse into a single row.
    has_path = full_df["file"].apply(_has_path).astype(bool)
    repeated = full_df.duplicated(subset="file")
    full_df = full_df[~(has_path & repeated)]

    # Save unified metadata
    full_df.to_csv(output_path, index=False)
    print(f"Unified metadata saved to {output_path}")
    print(f"Total files: {len(full_df)}")

In [6]:
file_path = "metadata.csv"

# Only build the unified table when it is not on disk yet.
if not os.path.isfile(file_path):
    # Per-dataset metadata files to merge. Append further entries
    # (e.g. "metadata_custom.csv") here once they are ready.
    sources = ["metadata_commonvoice.csv", "librispeech_metadata.csv"]
    unify_metadata(file_path, sources)
else:
    print("File already exists.")

File already exists.
