In [None]:
# Install the Hugging Face `datasets` library together with its audio extras
# into the environment used by this kernel.
%pip install datasets[audio]


In [None]:
from datasets import load_dataset
# Load the speechocean762 dataset from the Hugging Face Hub; `ds` is indexed
# by split name in the cells below.
ds = load_dataset("mispeech/speechocean762")


In [None]:
# Show the loaded dataset object (splits and their contents summary).
print(ds)

In [None]:
# Slicing a split yields a column-oriented batch: index by column name first,
# then by position within the batch.
batch = ds["train"][:5]  # Fetches first 5 examples
print(batch["text"])
# Shape of the first example's decoded waveform array.
print(batch["audio"][0]["array"].shape)


In [None]:
#!/usr/bin/env python3
# Filename: prepare_speechocean_all.py

import os
import torch
import torchaudio
from tqdm import tqdm
from datasets import load_dataset

# Feature settings and output location
N_MELS, WIN_MS, HOP_MS = 40, 25, 10  # mel bands, window (ms), hop (ms)
OUTPUT_DIR = "speechocean_all"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Power mel-spectrogram extractor. Window and hop are given in milliseconds
# above and converted to sample counts at 16 kHz here; n_fft is not set, so
# the library default applies.
_win_samples = int(16000 * WIN_MS / 1000)
_hop_samples = int(16000 * HOP_MS / 1000)
spec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    win_length=_win_samples,
    hop_length=_hop_samples,
    n_mels=N_MELS,
    power=2.0,
)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Convert a waveform into a log-mel spectrogram in decibels.

    Args:
        waveform: Audio samples; callers in this cell pass shape ``[1, T]``.

    Returns:
        ``10 * log10(max(mel, 1e-5))`` of the power mel spectrogram produced
        by ``spec_transform``. For a ``[1, T]`` input the result keeps the
        leading dimension, i.e. ``[1, n_mels, frames]``.
    """
    mel = spec_transform(waveform)
    # `db_multiplier` is a required argument of amplitude_to_DB (omitting it
    # raises TypeError). It is log10 of the reference value, so 0.0 means
    # "relative to 1.0" and leaves the plain 10*log10 scaling unchanged.
    return torchaudio.functional.amplitude_to_DB(
        mel, multiplier=10.0, amin=1e-5, db_multiplier=0.0
    )

def process_and_save(split_name: str, dataset):
    """Extract log-mel features for one split and save one ``.pt`` file per example.

    Args:
        split_name: Split label; used as the sub-directory of ``OUTPUT_DIR``
            and as the prefix of every generated utterance ID.
        dataset: Sized iterable of examples. Each example is read as
            ``ex["audio"]["array"]`` (NumPy waveform) and ``ex["words"]``
            (list of dicts holding a ``"phones"`` list) — this matches the
            speechocean762 schema on the Hub; verify with
            ``ds[split].features`` if the dataset version changes.

    Each saved file is a dict with keys ``"utt"``, ``"log_mel"`` and
    ``"phonemes"``.
    """
    split_dir = os.path.join(OUTPUT_DIR, split_name)
    os.makedirs(split_dir, exist_ok=True)

    for idx, ex in enumerate(tqdm(dataset, desc=f"Processing {split_name}")):
        # The Hub version of the dataset has no `utt_name` column, so build a
        # stable ID from the split name and the example's position.
        utt = f"{split_name}_{idx:05d}"
        # Cast to float32: the decoded array may be float64, which the mel
        # transform rejects with a dtype mismatch.
        waveform = torch.from_numpy(ex["audio"]["array"]).unsqueeze(0).float()  # [1, T]
        log_mel = waveform_to_logmel(waveform)  # [1, n_mels, frames]

        # Phones are nested under each word entry; flatten to one sequence.
        phones_flat = [ph for w in ex["words"] for ph in w["phones"]]
        output = {
            "utt": utt,
            "log_mel": log_mel,           # Tensor
            "phonemes": phones_flat       # List[str]
        }

        save_path = os.path.join(split_dir, f"{utt}.pt")
        torch.save(output, save_path)

    print(f"Saved {len(dataset)} files to {split_dir}")

def main():
    """Load speechocean762 and export its train and test splits, in that order."""
    corpus = load_dataset("mispeech/speechocean762")
    process_and_save("train", corpus["train"])
    process_and_save("test", corpus["test"])

if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
# prepare_speechocean_las.py

import os
import torch
import torchaudio
from tqdm import tqdm
from datasets import load_dataset

# Feature settings and output location
N_MELS, WIN_MS, HOP_MS = 40, 25, 10  # mel bands, window (ms), hop (ms)
OUTPUT_DIR = "speechocean_prepared"

# One output folder per split
for split in ("train", "test"):
    split_dir = os.path.join(OUTPUT_DIR, split)
    os.makedirs(split_dir, exist_ok=True)

# Power mel-spectrogram extractor; window/hop converted from ms to samples
# at 16 kHz. n_fft is not set, so the library default applies.
_win_samples = int(16000 * WIN_MS / 1000)
_hop_samples = int(16000 * HOP_MS / 1000)
spec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    win_length=_win_samples,
    hop_length=_hop_samples,
    n_mels=N_MELS,
    power=2.0,
)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Converts waveform tensor to log‑mel spectrogram (decibels).

    Args:
        waveform: Audio samples; callers in this cell pass shape ``[1, T]``.

    Returns:
        ``10 * log10(max(mel, 1e-5))`` of the power mel spectrogram from
        ``spec_transform``; ``[1, n_mels, frames]`` for a ``[1, T]`` input.
    """
    mel = spec_transform(waveform)
    # amplitude_to_DB requires `db_multiplier` (log10 of the reference
    # value); omitting it raises TypeError. 0.0 == reference of 1.0.
    return torchaudio.functional.amplitude_to_DB(
        mel, multiplier=10.0, amin=1e-5, db_multiplier=0.0
    )

def process_split(split_name: str, dataset):
    """
    Processes a split of the dataset:
    - Converts audio to log-mel features
    - Flattens phoneme sequences
    - Saves each sample as a .pt file

    Args:
        split_name: Split label; selects the sub-directory of ``OUTPUT_DIR``
            (expected to exist already) and prefixes each utterance ID.
        dataset: Sized iterable of examples read as
            ``ex["audio"]["array"]`` and ``ex["words"][i]["phones"]`` —
            matches the speechocean762 schema on the Hub; verify with
            ``ds[split].features`` if the dataset version changes.
    """
    out_dir = os.path.join(OUTPUT_DIR, split_name)
    print(f"\nProcessing split '{split_name}' with {len(dataset)} examples")

    for idx, ex in enumerate(tqdm(dataset, desc=f"{split_name} 💾")):
        # The Hub version of the dataset has no `utt_name` column, so derive
        # a stable ID from the split name and the example's position.
        utt_id = f"{split_name}_{idx:05d}"
        # Cast to float32: the decoded array may be float64, which the mel
        # transform rejects with a dtype mismatch.
        wav = torch.from_numpy(ex["audio"]["array"]).unsqueeze(0).float()  # [1, T]
        log_mel = waveform_to_logmel(wav)  # [1, n_mels, frames]

        # Phones are nested under each word entry; flatten to one sequence.
        phonemes = [ph for w in ex["words"] for ph in w["phones"]]

        output = {
            "utt": utt_id,
            "log_mel": log_mel,       # Tensor [1 x n_mels x frames]
            "phonemes": phonemes      # List[str]
        }

        torch.save(output, os.path.join(out_dir, f"{utt_id}.pt"))

    print(f"➡️ Saved {len(dataset)} samples to '{out_dir}'")

def main():
    """Download speechocean762 and run the export for train, then test."""
    corpus = load_dataset("mispeech/speechocean762")
    for name in ["train", "test"]:
        subset = corpus[name]
        process_split(name, subset)

if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
# File: prepare_speechocean_las.py

import os
import torch
import torchaudio
from tqdm import tqdm
from datasets import load_dataset

# Feature settings (mel bands, window and hop in milliseconds) and output root
N_MELS = 40
WIN_MS, HOP_MS = 25, 10
OUTPUT_DIR = "speechocean_prepared"

# Make sure each split has its own output folder
for split in ("train", "test"):
    split_dir = os.path.join(OUTPUT_DIR, split)
    os.makedirs(split_dir, exist_ok=True)

# Power mel-spectrogram extractor; frame sizes are converted from
# milliseconds to samples at 16 kHz. n_fft is left at the library default.
_frame_sizes = {
    "win_length": int(16000 * WIN_MS / 1000),
    "hop_length": int(16000 * HOP_MS / 1000),
}
spec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_mels=N_MELS,
    power=2.0,
    **_frame_sizes,
)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Convert waveform [1, T] into decibel log-mel spectrogram [1, n_mels, time_frames].

    The values are ``10 * log10(max(mel, 1e-5))`` of the power mel
    spectrogram produced by ``spec_transform``.
    """
    mel = spec_transform(waveform)
    # `db_multiplier` (log10 of the reference value) is a required argument;
    # without it the call raises TypeError. 0.0 keeps a reference of 1.0.
    return torchaudio.functional.amplitude_to_DB(
        mel, multiplier=10.0, amin=1e-5, db_multiplier=0.0
    )

def process_split(split_name: str, dataset):
    """Process and save one dataset split.

    Args:
        split_name: Split label; selects the sub-directory of ``OUTPUT_DIR``
            (expected to exist already) and prefixes each utterance ID.
        dataset: Sized iterable of examples read as
            ``ex["audio"]["array"]`` and ``ex["words"][i]["phones"]`` —
            matches the speechocean762 schema on the Hub; verify with
            ``ds[split].features`` if the dataset version changes.

    One ``.pt`` file is written per example, holding a dict with keys
    ``"utt"``, ``"log_mel"`` and ``"phonemes"``.
    """
    out_dir = os.path.join(OUTPUT_DIR, split_name)
    print(f"Processing '{split_name}' — {len(dataset)} examples")

    for idx, ex in enumerate(tqdm(dataset, desc=split_name)):
        # The Hub version of the dataset has no `utt_name` column, so derive
        # a stable identifier from the split name and the example's position.
        utt = f"{split_name}_{idx:05d}"
        # Cast to float32: the decoded array may be float64, which the mel
        # transform rejects with a dtype mismatch.
        waveform = torch.from_numpy(ex["audio"]["array"]).unsqueeze(0).float()  # [1, T]
        log_mel = waveform_to_logmel(waveform)  # [1, n_mels, frames]

        # Flatten phoneme lists (nested per word) into one list
        phonemes = [ph for w in ex["words"] for ph in w["phones"]]

        data = {
            "utt": utt,
            "log_mel": log_mel,
            "phonemes": phonemes
        }
        torch.save(data, os.path.join(out_dir, f"{utt}.pt"))

    print(f"✔ Saved {len(dataset)} files to '{out_dir}'")

def main():
    """Fetch speechocean762 and export the train split followed by the test split."""
    corpus = load_dataset("mispeech/speechocean762")
    process_split("train", corpus["train"])
    process_split("test", corpus["test"])

if __name__ == "__main__":
    main()


In [None]:
# Inspect the column schema of the training split to see which keys each
# example exposes.
print(ds["train"].features)


In [None]:
#!/usr/bin/env python3
# File: prepare_speechocean_las.py

import os
import torch
import torchaudio
from tqdm import tqdm
from datasets import load_dataset

# Feature settings and output root
N_MELS, WIN_MS, HOP_MS = 40, 25, 10  # mel bands, window (ms), hop (ms)
OUTPUT_DIR = "speechocean_prepared"

# Create one folder per split up front
for split in ("train", "test"):
    split_dir = os.path.join(OUTPUT_DIR, split)
    os.makedirs(split_dir, exist_ok=True)

# Power mel-spectrogram extractor; window/hop are converted from
# milliseconds to samples at 16 kHz. n_fft is left at the library default.
_win_samples = int(16000 * WIN_MS / 1000)
_hop_samples = int(16000 * HOP_MS / 1000)
spec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    hop_length=_hop_samples,
    win_length=_win_samples,
    n_mels=N_MELS,
    power=2.0,
)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Convert a waveform into a log-mel spectrogram in decibels.

    Args:
        waveform: Audio samples; callers in this cell pass shape ``[1, T]``.

    Returns:
        ``10 * log10(max(mel, 1e-5))`` of the power mel spectrogram from
        ``spec_transform``; ``[1, n_mels, frames]`` for a ``[1, T]`` input.
    """
    mel = spec_transform(waveform)
    # amplitude_to_DB requires `db_multiplier` (log10 of the reference
    # value); omitting it raises TypeError. 0.0 == reference of 1.0.
    return torchaudio.functional.amplitude_to_DB(
        mel, multiplier=10.0, amin=1e-5, db_multiplier=0.0
    )

def process_split(split_name: str, dataset):
    """Extract log-mel features for one split and save one ``.pt`` file per example.

    Args:
        split_name: Split label; selects the sub-directory of ``OUTPUT_DIR``
            (expected to exist already) and prefixes each utterance ID.
        dataset: Sized iterable of examples read as
            ``ex["audio"]["array"]`` and ``ex["words"][i]["phones"]``.

    Each file holds a dict with keys ``"utt"``, ``"log_mel"`` and
    ``"phonemes"``.
    """
    out_dir = os.path.join(OUTPUT_DIR, split_name)
    print(f"\nProcessing '{split_name}' — {len(dataset)} examples")

    for i, ex in enumerate(tqdm(dataset, desc=split_name)):
        utt_id = f"{split_name}_{i:05d}"
        # Cast to float32: the decoded array may be float64, which the mel
        # transform rejects with a dtype mismatch. A no-op if already float32.
        wav = torch.from_numpy(ex["audio"]["array"]).unsqueeze(0).float()  # [1, T]
        log_mel = waveform_to_logmel(wav)                                  # [1, n_mels, frames]

        # Flatten phoneme lists per word into one sequence
        phonemes = [ph for w in ex["words"] for ph in w["phones"]]

        torch.save({
            "utt": utt_id,
            "log_mel": log_mel,
            "phonemes": phonemes
        }, os.path.join(out_dir, f"{utt_id}.pt"))

    print(f"✔ Saved {len(dataset)} files to '{out_dir}'")

def main():
    """Load speechocean762, export train then test, and report completion."""
    corpus = load_dataset("mispeech/speechocean762")
    for name in ["train", "test"]:
        subset = corpus[name]
        process_split(name, subset)
    print("\n✅ All done!")

if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
# File: prepare_speechocean_las.py

import os
import torch
import torchaudio
from tqdm import tqdm
from datasets import load_dataset

# Feature settings (mel bands, window and hop in milliseconds) and output root
N_MELS = 40
WIN_MS, HOP_MS = 25, 10
OUTPUT_DIR = "speechocean_prepared"

# One output folder per split
for split in ("train", "test"):
    split_dir = os.path.join(OUTPUT_DIR, split)
    os.makedirs(split_dir, exist_ok=True)

# Power mel-spectrogram extractor; frame sizes are converted from
# milliseconds to samples at 16 kHz. n_fft is left at the library default.
_frame_sizes = {
    "win_length": int(16000 * WIN_MS / 1000),
    "hop_length": int(16000 * HOP_MS / 1000),
}
spec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_mels=N_MELS,
    power=2.0,
    **_frame_sizes,
)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Convert a float32 waveform into a log-mel spectrogram in decibels.

    Args:
        waveform: Audio samples; callers in this cell pass shape ``[1, T]``.

    Returns:
        ``10 * log10(max(mel, 1e-5))`` of the power mel spectrogram from
        ``spec_transform``; ``[1, n_mels, frames]`` for a ``[1, T]`` input.
    """
    mel = spec_transform(waveform)
    # `db_multiplier` is required by amplitude_to_DB (its absence raises
    # TypeError). It is log10 of the reference value; 0.0 keeps the plain
    # 10*log10 scaling relative to 1.0.
    return torchaudio.functional.amplitude_to_DB(
        mel, multiplier=10.0, amin=1e-5, db_multiplier=0.0
    )

def process_split(split_name: str, dataset):
    """Write one ``.pt`` file of log-mel features and phonemes per example.

    Args:
        split_name: Split label; selects the sub-directory of ``OUTPUT_DIR``
            and prefixes each utterance ID.
        dataset: Sized iterable of examples read as
            ``ex["audio"]["array"]`` and ``ex["words"][i]["phones"]``.
    """
    out_dir = os.path.join(OUTPUT_DIR, split_name)
    print(f"\nProcessing '{split_name}' — {len(dataset)} examples")

    for position, example in enumerate(tqdm(dataset, desc=split_name)):
        name = f"{split_name}_{position:05d}"

        # Add a leading channel axis and cast to float32 so the dtype matches
        # what the mel transform expects.
        samples = torch.from_numpy(example["audio"]["array"])
        features = waveform_to_logmel(samples.unsqueeze(0).float())

        # Gather every word's phones into a single flat sequence
        flat_phones = []
        for word in example["words"]:
            flat_phones.extend(word["phones"])

        record = {"utt": name, "log_mel": features, "phonemes": flat_phones}
        torch.save(record, os.path.join(out_dir, f"{name}.pt"))

    print(f"✔ Saved {len(dataset)} files to '{out_dir}'")

def main():
    """Fetch speechocean762, export train then test, and report completion."""
    corpus = load_dataset("mispeech/speechocean762")
    process_split("train", corpus["train"])
    process_split("test", corpus["test"])
    print("\n✅ All done!")

if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
# File: prepare_speechocean_las.py

import os
import torch
import torchaudio
from torchaudio import transforms
from tqdm import tqdm
from datasets import load_dataset

# 1. Configuration: mel bands, window (ms), hop (ms), output root
N_MELS, WIN_MS, HOP_MS = 40, 25, 10
OUTPUT_DIR = "speechocean_prepared"

# 2. Create output folders, one per split
for split in ("train", "test"):
    split_dir = os.path.join(OUTPUT_DIR, split)
    os.makedirs(split_dir, exist_ok=True)

# 3. Define transforms: a power mel spectrogram (frame sizes converted from
#    milliseconds to samples at 16 kHz) followed by a power-to-decibel
#    conversion limited to an 80 dB range.
_win_samples = int(16000 * WIN_MS / 1000)
_hop_samples = int(16000 * HOP_MS / 1000)
spec_transform = transforms.MelSpectrogram(
    sample_rate=16000,
    win_length=_win_samples,
    hop_length=_hop_samples,
    n_mels=N_MELS,
    power=2.0,
)
db_transform = transforms.AmplitudeToDB(stype="power", top_db=80.0)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Turn a waveform into a decibel-scaled mel spectrogram.

    Args:
        waveform: float32 tensor; callers in this cell pass shape ``[1, T]``.

    Returns:
        ``db_transform`` applied to ``spec_transform(waveform)``. The leading
        dimension of the input is kept, so a ``[1, T]`` input yields
        ``[1, n_mels, time_frames]``.
    """
    return db_transform(spec_transform(waveform))

def process_split(split_name: str, dataset):
    """Write one ``.pt`` file of log-mel features and phonemes per example.

    Args:
        split_name: Split label; selects the sub-directory of ``OUTPUT_DIR``
            and prefixes each utterance ID.
        dataset: Sized iterable of examples read as
            ``ex["audio"]["array"]`` and ``ex["words"][i]["phones"]``.
    """
    out_dir = os.path.join(OUTPUT_DIR, split_name)
    print(f"\n▶️  Processing split '{split_name}' with {len(dataset)} examples")
    for position, example in enumerate(tqdm(dataset, desc=split_name)):
        # Utterance ID: split name plus zero-padded position
        name = f"{split_name}_{position:05d}"

        # Waveform as float32 with a leading channel axis -> [1, T]
        samples = torch.from_numpy(example["audio"]["array"])
        features = waveform_to_logmel(samples.unsqueeze(0).float())

        # All phones of all words, in order, as one flat list
        flat_phones = [ph for word in example["words"] for ph in word["phones"]]

        record = {"utt": name, "log_mel": features, "phonemes": flat_phones}
        torch.save(record, os.path.join(out_dir, f"{name}.pt"))
    print(f"✔️  Saved {len(dataset)} tensors to '{out_dir}'")

def main():
    """Load speechocean762, export train then test, and print where the data went."""
    corpus = load_dataset("mispeech/speechocean762")
    process_split("train", corpus["train"])
    process_split("test", corpus["test"])
    print("\n🎉 All splits processed. Data is under 'speechocean_prepared/'")

if __name__ == "__main__":
    main()


In [None]:
import os
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

OUTPUT_FILE = "speechocean_raw.pkl"

# Fetch the dataset from the Hub
ds = load_dataset("mispeech/speechocean762")

# Collect one record per utterance across both splits, then build the frame
# in a single step.
rows = []
for split in ("train", "test"):
    for idx, ex in enumerate(tqdm(ds[split], desc=f"Processing {split}")):
        audio = ex["audio"]

        # All phones of all words, in order, as one flat list
        phonemes = []
        for word in ex["words"]:
            phonemes.extend(word["phones"])

        # NOTE(review): the waveform is stored as a Python list of floats,
        # which is far larger in memory than the NumPy array — confirm this
        # format is really needed by consumers of the pickle.
        rows.append(dict(
            split=split,
            utt=f"{split}_{idx:05d}",
            waveform=audio["array"].tolist(),
            sampling_rate=audio["sampling_rate"],
            phonemes=phonemes,
        ))

df = pd.DataFrame(rows)
df.to_pickle(OUTPUT_FILE)
print(f"Saved raw data to {OUTPUT_FILE}")


In [None]:
import pandas as pd
# Reload the raw export written by the previous cell. Pickle files can run
# arbitrary code when loaded, so only read files produced by this notebook.
df = pd.read_pickle("speechocean_raw.pkl")


In [None]:
# Plain-text overview of the reloaded frame.
print(df)

In [None]:
# Phoneme sequence stored under index label 0
print(df["phonemes"][0])

In [None]:
import pandas as pd
import torch
import torchaudio
from datasets import load_dataset
from tqdm import tqdm

OUTPUT_FILE = "speechocean_logmel.pkl"

# Log-mel filterbank settings: 40 bands, 25 ms window, 10 ms hop at 16 kHz
SAMPLE_RATE = 16000
N_MELS = 40
N_FFT = 512
WIN_LENGTH = int(0.025 * SAMPLE_RATE)
HOP_LENGTH = int(0.010 * SAMPLE_RATE)

# Mel spectrogram followed by a power-to-decibel conversion
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)
to_db = torchaudio.transforms.AmplitudeToDB(stype="power")

ds = load_dataset("mispeech/speechocean762")

# Collect one record per utterance across both splits, then build the frame
# in a single step.
rows = []
for split in ("train", "test"):
    for idx, ex in enumerate(tqdm(ds[split], desc=f"Processing {split}")):
        audio = ex["audio"]
        sr = audio["sampling_rate"]
        waveform = torch.tensor(audio["array"], dtype=torch.float32)

        # Bring off-rate audio to the rate the mel transform was built for
        if sr != SAMPLE_RATE:
            waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

        # Give 1-D audio a leading channel axis
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)

        # Drop the leading axis again and put time first -> (Time, 40)
        log_mel_spec = to_db(mel_transform(waveform)).squeeze(0).transpose(0, 1)

        # All phones of all words, in order, as one flat list
        phonemes = []
        for word in ex["words"]:
            phonemes.extend(word["phones"])

        rows.append(dict(
            split=split,
            utt_id=f"{split}_{idx:05d}",
            log_mel=log_mel_spec.numpy(),  # stored as a NumPy array
            phonemes=phonemes,
        ))

df = pd.DataFrame(rows)
df.to_pickle(OUTPUT_FILE)
print(f"✅ Saved log-mel features to: {OUTPUT_FILE}")


In [None]:
import pandas as pd

df = pd.read_pickle("speechocean_logmel.pkl")

print(df)

# Take the first utterance and look at its features and phoneme labels.
# `phe` is reused by the encode/decode check further down.
first_row = df.iloc[0]
log_mel = first_row["log_mel"]
phe = first_row["phonemes"]
print("Shape:", log_mel.shape)  # e.g., (Time, 40)
print("Phonemes:", phe)


In [None]:
import os
import sys

# Use current working directory as base (not __file__): notebooks do not
# define __file__, so the parent of the working directory is put on the
# import path instead.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard against adding the same entry again when the cell is re-run.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Then try import — this must stay after the sys.path update above, since
# the `utils` package is looked up through it.
from utils.utils import encode, decode


In [None]:
# Round-trip check on `phe` (the phoneme list taken from the first row in an
# earlier cell): encode it, then decode the result and print both.
encoded = encode(phe)
print("Encoded phonemes:", encoded)
decoded = decode(encoded)
print("Decoded phonemes:", decoded)

In [None]:
# Longest phoneme sequence across all rows of the frame
max_len = max(map(len, df["phonemes"]))
print("Maximum phoneme sequence length:", max_len)
