## Load dataset

In [None]:
import os

from datasets import Audio, load_dataset

# Number of examples to keep after shuffling; None (or 0) keeps the whole split.
SIZE = None

ds = load_dataset(
    "OrcinusOrca/YouTube-Cantonese",
    split="train",
    streaming=False,
    # os.cpu_count() is documented to return None when the count cannot be
    # determined; fall back to 1 so the multiplication cannot raise TypeError.
    num_proc=(os.cpu_count() or 1) * 2,
)
# Fixed seed so the optional subset below is the same on every run.
ds = ds.shuffle(seed=42)
# Clamp the subset size so a SIZE larger than the dataset does not raise IndexError.
ds = ds.select(range(min(SIZE, len(ds)))) if SIZE else ds
# Declare the "mp3" column as audio with a 16 kHz sampling rate.
ds = ds.cast_column("mp3", Audio(sampling_rate=16000))
ds

Resolving data files:   0%|          | 0/45 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/45 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/112 [00:00<?, ?it/s]

Dataset({
    features: ['mp3', 'json', '__key__', '__url__'],
    num_rows: 236576
})

In [None]:
def process_columns(row):
    """Copy the caption stored under ``row["json"]["caption"]`` to a top-level ``caption`` key.

    The row is modified in place and returned.
    """
    metadata = row["json"]
    row["caption"] = metadata["caption"]
    return row


# Promote the caption to its own column and drop the columns that are no
# longer needed, leaving only the audio and the caption.
ds = ds.map(
    function=process_columns,
    num_proc=os.cpu_count(),
    remove_columns=["json", "__key__", "__url__"],
)
ds

Dataset({
    features: ['mp3', 'caption'],
    num_rows: 236576
})

In [None]:
import re


def is_spoken_cantonese(string: str) -> bool:
    """Return True if ``string`` contains at least one Cantonese-specific marker.

    Every marker is a plain literal (no regex metacharacters), so a substring
    test is used instead of ``re.search``. The markers live in a constant tuple
    rather than a list that is rebuilt on every call.

    Args:
        string: Caption text to inspect.

    Returns:
        True when any marker occurs anywhere in ``string``, otherwise False
        (including for the empty string).
    """
    markers = (
        # Single-character markers
        "嘅", "啲", "㗎", "嗰", "喺", "咗", "搵", "睇",
        "咩", "嚟", "佢", "乜", "嘢", "畀", "唔", "喇",
        "哋", "咋", "嘛", "囉", "喎", "吓", "啱", "冇",
        "咁", "晒", "諗", "搞", "咪", "攰", "呃", "嗌",
        "拗", "嘈", "啖", "掟", "噉", "𥄫", "嚿", "叻",
        "拎", "咯", "乸", "噏", "屙", "嘥", "揸",
        "屌", "鳩", "閪", "撚", "柒", "𨳒", "𨳍", "𨳊", "𨶙",
        # Multi-character markers
        "同埋", "你係", "我係", "邊度", "邊個", "唔駛", "呢度", "點算",
        "返去", "返來", "翻去", "翻來", "加埋", "屋企", "仆街", "仲要",
    )

    return any(marker in string for marker in markers)


# Keep only the captions that contain NO Cantonese-specific marker
# (note the `not`): rows flagged by is_spoken_cantonese are dropped.
ds = ds.filter(
    lambda batch: [not is_spoken_cantonese(caption) for caption in batch["caption"]],
    batched=True,
    # os.cpu_count() may return None; fall back to 1 so the multiplication
    # cannot raise TypeError.
    batch_size=(os.cpu_count() or 1) * 1000,
)
ds

Dataset({
    features: ['mp3', 'caption'],
    num_rows: 213865
})

In [None]:
def normalize_caption(string: str) -> str:
    """Strip bracketed annotations and unsupported characters from a caption.

    Steps, in order:
      1. Remove parentheses / square brackets together with their content.
         Both the ASCII forms ``()[]`` and the full-width forms ``（）［］`` are
         handled; previously the full-width forms were not recognised, so the
         annotation text inside them leaked into the caption.
      2. Keep only ASCII letters, characters in U+4E00–U+9FFF, digits and
         spaces. Everything else (punctuation, newlines, CJK characters
         outside that range) is deleted, not replaced by a space.
      3. Insert a space at every boundary between an ASCII letter and a
         Chinese character.
      4. Collapse runs of spaces and trim both ends.

    Args:
        string: Raw caption text.

    Returns:
        The normalized caption; may be the empty string.
    """
    english = r"[a-zA-Z]"
    chinese = r"[\u4e00-\u9fff]"

    # Non-greedy so that "(a) b (c)" loses both groups but keeps "b".
    # "." does not match newlines, so a bracket pair split across lines is
    # left for the character filter below (same as before).
    string = re.sub(r"[\(\[（［].*?[\)\]）］]", "", string)

    # One substitution instead of three re.match calls per character.
    string = re.sub(r"[^a-zA-Z\u4e00-\u9fff\d ]", "", string)

    # Add space between English and Chinese characters (both directions).
    string = re.sub(f"({english})({chinese})", r"\1 \2", string)
    string = re.sub(f"({chinese})({english})", r"\1 \2", string)

    # Remove multiple spaces.
    string = re.sub(r" {2,}", " ", string)
    return string.strip()


# os.cpu_count() may return None; fall back to 1 so the arithmetic below
# cannot raise TypeError.
ds = ds.map(
    lambda batch: {"caption": [normalize_caption(caption) for caption in batch["caption"]]},
    batched=True,
    batch_size=(os.cpu_count() or 1) * 1000,
    num_proc=(os.cpu_count() or 1) * 2,
)
# Normalization can leave a caption empty (e.g. it was only an annotation);
# drop those rows.
ds = ds.filter(
    lambda batch: [caption.strip() != "" for caption in batch["caption"]],
    batched=True,
    batch_size=(os.cpu_count() or 1) * 1000,
)
ds

Dataset({
    features: ['mp3', 'caption'],
    num_rows: 213786
})

In [None]:
from opencc import OpenCC


def s2hk(content: str, converter: "OpenCC | None" = None) -> str:
    """Convert Simplified Chinese text to Traditional Chinese using the ``s2hk`` config.

    Args:
        content: Text to convert.
        converter: Optional ``OpenCC("s2hk")`` instance to reuse. When omitted
            a new converter is created for this call (the original behavior).

    Returns:
        The converted text.
    """
    if converter is None:
        converter = OpenCC("s2hk")
    return converter.convert(content)


def _s2hk_batch(batch: dict[str, list]) -> dict[str, list]:
    """Convert every caption in a batch, building the converter once per batch.

    The converter is created inside the function, not at module level, so the
    mapped function itself carries no converter object when it is handed to
    worker processes.
    """
    converter = OpenCC("s2hk")
    return {"caption": [s2hk(caption, converter) for caption in batch["caption"]]}


ds = ds.map(
    _s2hk_batch,
    batched=True,
    batch_size=64,
    # os.cpu_count() may return None; fall back to 1 so the multiplication
    # cannot raise TypeError.
    num_proc=(os.cpu_count() or 1) * 2,
)
ds

Dataset({
    features: ['mp3', 'caption'],
    num_rows: 213786
})

In [None]:
import random

from rich import print

# Spot-check a few processed captions. The sample size is capped at the
# dataset length because random.sample raises ValueError when asked for more
# items than the population holds (e.g. with a very small SIZE).
sample_indices = random.sample(range(len(ds)), min(10, len(ds)))
print(ds.select(sample_indices)["caption"])

## Load Models

### Processor (Feature Extractor & Tokenizer)

In [None]:
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer

# Checkpoint whose processor is loaded below.
model_id = "openai/whisper-large-v3-turbo"

# The processor bundles the feature extractor and the tokenizer; both are
# also exposed under their own names for convenience.
processor: WhisperProcessor = WhisperProcessor.from_pretrained(
    model_id,
    task="transcribe",
)
tokenizer: WhisperTokenizer = processor.tokenizer
feature_extractor: WhisperFeatureExtractor = processor.feature_extractor

## Prepare Dataset

### Feature Extraction & Tokenization

In [None]:
def prepare_batch(batch: dict[str, list]) -> dict[str, list]:
    """Add model inputs and label ids to a batched slice of the dataset.

    Reads the decoded audio from ``batch["mp3"]`` and the text from
    ``batch["caption"]``, then stores the feature extractor output under
    ``"input_features"`` and the tokenizer ids under ``"labels"``. The batch
    is modified in place and returned.

    The sampling rate is taken from the first audio entry only, so every
    entry in the batch is assumed to share it.
    """
    audio_column = batch["mp3"]
    waveforms = [sample["array"] for sample in audio_column]
    sampling_rate = audio_column[0]["sampling_rate"]

    # Audio -> features via the processor's feature extractor.
    extracted = processor.feature_extractor(waveforms, sampling_rate=sampling_rate)
    batch["input_features"] = extracted.input_features

    # Captions -> token ids via the processor's tokenizer.
    encoded = processor.tokenizer(batch["caption"])
    batch["labels"] = encoded.input_ids

    return batch


# os.cpu_count() may return None. Passing batch_size=None to map() would
# hand the whole dataset to prepare_batch as one batch, so fall back to 1.
num_workers = os.cpu_count() or 1
ds = ds.map(
    prepare_batch,
    batched=True,
    batch_size=num_workers,
    num_proc=num_workers,
    # Only the derived "input_features" and "labels" columns are kept.
    remove_columns=["mp3", "caption"],
)

Map (num_proc=72):   0%|          | 0/213786 [00:00<?, ? examples/s]

2025-05-08 03:13:20.630291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-08 03:13:20.630441: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-08 03:13:20.630517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-08 03:13:20.630577: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-08 03:13:20.631147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for 

## Save

In [None]:
# Encode the subset size in the directory name so that a subset run and a
# full run do not overwrite each other.
ds_name = f"ds_{SIZE}" if SIZE else "ds"
# os.cpu_count() may return None; fall back to 1 so the multiplication
# cannot raise TypeError.
ds.save_to_disk(f"dataset_local/{ds_name}", num_proc=(os.cpu_count() or 1) * 2)

Saving the dataset (0/658 shards):   0%|          | 0/213786 [00:00<?, ? examples/s]