In [3]:
#!/usr/bin/env python
# coding: utf-8

import os
import json
import csv

# --------------------------
# 1. Configuration
# --------------------------
# Root of the data tree, relative to the working directory
# (adjust if the notebook is run from the Experiments folder).
BASE_DATA_PATH = "../Data"

# Sub-folders of the data root:
#   labels  -> one label text file per data type
#   audio   -> one folder per data type holding the recordings
#   dataset -> where the generated CSV/JSON files are written
LABELS_PATH, AUDIO_PATH, OUTPUT_DATASET_PATH = (
    os.path.join(BASE_DATA_PATH, subfolder)
    for subfolder in ("labels", "audio", "dataset")
)

# The output folder is created up front; an existing folder is left alone.
os.makedirs(OUTPUT_DATASET_PATH, exist_ok=True)

# Data types to process; each one doubles as a folder name under AUDIO_PATH
# and as the base name of its label file.
DATA_TYPES = ["bcd", "car", "dat", "id", "mon", "phn", "tim"]

# --------------------------
# 2. Helper Functions
# --------------------------
def build_dataset_for_type(data_type, label_file_suffix="txt", audio_ext="ogg"):
    """
    Builds the list of dataset entries for one data type.

    Reads the label file LABELS_PATH/<data_type>.<label_file_suffix>, where
    each useful line has the form "<audio name> ~ <transcript>", and keeps only
    the entries whose audio file exists at
    AUDIO_PATH/<data_type>/voices/<audio name>.<audio_ext>.

    Only the first "~" on a line separates the audio name from the transcript,
    so a transcript that itself contains "~" is kept intact. Blank lines and
    lines without any "~" are skipped. The label file is decoded as UTF-8 and a
    leading byte-order mark, if present, is ignored.

    Args:
        data_type: folder / label-file base name, e.g. "bcd".
        label_file_suffix: extension of the label file, without the dot.
        audio_ext: extension of the audio files, without the dot.

    Returns:
        A list of dicts, in label-file order: {
            "audio":          <file name without extension>,
            "audio_filepath": <full path to the audio file>,
            "speaker":        <part of the file name before the first underscore>,
            "transcript":     <transcript string>
        }.
        An empty list (after printing a warning) when the label file or the
        audio folder does not exist.
    """
    label_file = os.path.join(LABELS_PATH, f"{data_type}.{label_file_suffix}")
    audio_folder = os.path.join(AUDIO_PATH, data_type, "voices")

    if not os.path.isfile(label_file):
        print(f"[Warning] Label file not found: {label_file}")
        return []

    if not os.path.isdir(audio_folder):
        print(f"[Warning] Audio folder not found: {audio_folder}")
        return []

    dataset = []
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise become part of the first audio name and make it unmatched.
    with open(label_file, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Split on the first "~" only: everything after it is transcript,
            # even if the transcript contains further "~" characters.
            name_part, separator, transcript_part = line.partition("~")
            if not separator:
                continue  # skip malformed lines (no "~" at all)

            audio_name = name_part.strip()    # e.g. "1150481435_413681_413682_..."
            transcript = transcript_part.strip()

            # Entries without a matching audio file are dropped.
            audio_file = os.path.join(audio_folder, f"{audio_name}.{audio_ext}")
            if not os.path.isfile(audio_file):
                continue

            dataset.append({
                "audio": audio_name,
                "audio_filepath": audio_file,
                # Speaker is everything before the first underscore
                "speaker": audio_name.split("_")[0],
                "transcript": transcript,
            })

    return dataset


def save_to_csv(data_list, output_csv):
    """
    Writes the dataset rows to the CSV file at output_csv.

    The file is (re)created as UTF-8, starts with a header row, and uses the
    fixed column order audio, audio_filepath, speaker, transcript. Each dict
    in data_list becomes one row.
    """
    columns = ["audio", "audio_filepath", "speaker", "transcript"]
    # newline="" lets the csv module control line endings itself.
    with open(output_csv, "w", encoding="utf-8", newline="") as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(data_list)


def save_to_json(data_list, output_json):
    """
    Writes the dataset rows to the JSON file at output_json.

    The file is (re)created as UTF-8 and holds data_list serialized with a
    two-space indent; non-ASCII characters are written as-is rather than
    escaped.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    with open(output_json, "w", encoding="utf-8") as json_file:
        # Stream the encoded text to the file piece by piece.
        for chunk in encoder.iterencode(data_list):
            json_file.write(chunk)

# --------------------------
# 3. Main Process
# --------------------------
# Rows from every data type, collected for the combined export further down.
all_data = []

for dt in DATA_TYPES:
    print(f"Processing data type: {dt}")
    dataset = build_dataset_for_type(dt)
    print(f"  Found {len(dataset)} valid entries for {dt}")

    # A type with no valid entries produces no files and adds nothing
    # to the combined list, so move straight on to the next type.
    if not dataset:
        continue

    # Per-type export: <type>_dataset.csv and <type>_dataset.json.
    csv_path = os.path.join(OUTPUT_DATASET_PATH, f"{dt}_dataset.csv")
    json_path = os.path.join(OUTPUT_DATASET_PATH, f"{dt}_dataset.json")
    save_to_csv(dataset, csv_path)
    save_to_json(dataset, json_path)
    print(f"  Saved CSV : {csv_path}")
    print(f"  Saved JSON: {json_path}")

    # Keep the rows for the single combined file.
    all_data += dataset

# Combined export covering every data type; skipped when nothing was found.
if all_data:
    combined_csv = os.path.join(OUTPUT_DATASET_PATH, "combined_dataset.csv")
    combined_json = os.path.join(OUTPUT_DATASET_PATH, "combined_dataset.json")

    save_to_csv(all_data, combined_csv)
    save_to_json(all_data, combined_json)

    print("\nCreated combined CSV :", combined_csv)
    print("Created combined JSON:", combined_json)


Processing data type: bcd
  Found 755 valid entries for bcd
  Saved CSV : ../Data/dataset/bcd_dataset.csv
  Saved JSON: ../Data/dataset/bcd_dataset.json
Processing data type: car
  Found 693 valid entries for car
  Saved CSV : ../Data/dataset/car_dataset.csv
  Saved JSON: ../Data/dataset/car_dataset.json
Processing data type: dat
  Found 4079 valid entries for dat
  Saved CSV : ../Data/dataset/dat_dataset.csv
  Saved JSON: ../Data/dataset/dat_dataset.json
Processing data type: id
  Found 2343 valid entries for id
  Saved CSV : ../Data/dataset/id_dataset.csv
  Saved JSON: ../Data/dataset/id_dataset.json
Processing data type: mon
  Found 2650 valid entries for mon
  Saved CSV : ../Data/dataset/mon_dataset.csv
  Saved JSON: ../Data/dataset/mon_dataset.json
Processing data type: phn
  Found 789 valid entries for phn
  Saved CSV : ../Data/dataset/phn_dataset.csv
  Saved JSON: ../Data/dataset/phn_dataset.json
Processing data type: tim
  Found 169 valid entries for tim
  Saved CSV : ../Data/d