# In [None]:
import datasets
import os
import shutil


def _iter_splits(dataset_name, splits_to_download):
    """Yield ``(split_name, dataset)`` pairs for the requested splits."""
    if splits_to_download is None:
        # A single call without `split=` returns every split, so each one
        # does not have to be loaded a second time by name.
        all_splits = datasets.load_dataset(dataset_name, streaming=False)
        yield from all_splits.items()
        return

    for split_name in splits_to_download:
        yield split_name, datasets.load_dataset(
            dataset_name, streaming=False, split=split_name)


def _export_entry(audio_entry, transcription, output_folder, fallback_name):
    """
    Write one audio clip and its transcription into ``output_folder``.

    Args:
        audio_entry: One value of the "audio" column; expected to be a dict
                     with optional 'bytes' and 'path' items, or None.
        transcription: The matching transcription text, or None.
        output_folder: Existing directory that receives the files.
        fallback_name: File name to use when the entry carries no usable path.

    Returns:
        True if the audio and transcription were written, False if the entry
        was skipped because it holds no retrievable audio.
    """
    if not audio_entry:
        return False

    audio_path = audio_entry.get("path")
    audio_bytes = audio_entry.get("bytes")
    if not audio_path and audio_bytes is None:
        return False

    # The 'path' value may be None or end in a separator; fall back to a
    # generated name so such entries are still exported.
    file_name = os.path.basename(audio_path or "") or fallback_name
    destination = os.path.join(output_folder, file_name)

    if audio_bytes is not None:
        # Embedded audio: the recorded path is often just the original file
        # name and need not exist on this machine, so write the bytes directly.
        with open(destination, "wb") as audio_file:
            audio_file.write(audio_bytes)
    elif os.path.isfile(audio_path):
        try:
            shutil.copy2(audio_path, destination)
        except shutil.SameFileError:
            pass  # The file already lives in the output folder.
    else:
        print(f"Skipping '{audio_path}': no embedded bytes and file not found")
        return False

    stem = os.path.splitext(file_name)[0]
    transcription_path = os.path.join(output_folder, f"{stem}.txt")
    with open(transcription_path, "w", encoding="utf-8") as text_file:
        # A null transcription becomes an empty file instead of a TypeError.
        text_file.write(transcription or "")
    return True


def extract_audio_from_arrow(dataset_name, output_folder, splits_to_download=None):
    """
    Export the audio clips and transcriptions of a Hugging Face dataset.

    Each requested split is loaded and its "audio" and "transcription"
    columns are read from the underlying Arrow table. For every row, the
    audio is written to ``output_folder`` (from the embedded bytes when
    present, otherwise copied from the recorded local path) next to a
    ``<name>.txt`` file holding the transcription. All splits share the same
    folder, so rows with the same file name overwrite each other.

    Errors are reported with ``print`` rather than raised: an I/O failure on
    one row skips that row, and any other failure stops the export.

    Args:
        dataset_name: The name of the Hugging Face dataset.
        output_folder: The path to the output folder (created if missing).
        splits_to_download: A list of split names to download (e.g., ['train', 'test']).
                           If None, downloads all splits.
    """
    try:
        os.makedirs(output_folder, exist_ok=True)

        for split_name, dataset in _iter_splits(dataset_name, splits_to_download):
            # Access the underlying Arrow table via the `.data` attribute.
            table = dataset.data

            # NOTE: this materializes both columns as Python lists, so the
            # whole split's audio is held in memory at once.
            audio_entries = table.column("audio").to_pylist()
            transcriptions = table.column("transcription").to_pylist()

            rows = zip(audio_entries, transcriptions)
            for index, (audio_entry, transcription) in enumerate(rows):
                # Used only when the entry has no path; the real audio
                # format is unknown in that case, hence the neutral suffix.
                fallback_name = f"{split_name}_{index:06d}.bin"
                try:
                    _export_entry(audio_entry, transcription,
                                  output_folder, fallback_name)
                except OSError as e:
                    # One unreadable or unwritable file should not abort
                    # the rest of the export.
                    print(f"Skipping entry {index} of split '{split_name}': {e}")

    except Exception as e:
        print(f"An error occurred: {e}")


# Example usage: pull a single split of one dataset into a local folder.
splits_to_download = ['test']  # restrict the run to the 'test' split
output_folder = "extracted_audio"
dataset_name = "ysdede/yeni-split-0"

extract_audio_from_arrow(
    dataset_name=dataset_name,
    output_folder=output_folder,
    splits_to_download=splits_to_download,
)

# Printed unconditionally: extract_audio_from_arrow reports failures itself
# via print instead of raising, so this line runs even after an error.
print(f"Audio and transcriptions extracted to: {output_folder}")