In [1]:
# Make the audio file 20s 16kHz mono (POWSM requirement: 16kHz, 20s fixed duration)
import librosa
import soundfile as sf

def process_audio(input_file, output_file, target_sr=16000, target_duration=20):
    """Write a mono, fixed-rate, fixed-length copy of an audio file.

    The input is down-mixed to mono, resampled to ``target_sr``, then
    zero-padded at the end or truncated so that it lasts exactly
    ``target_duration`` seconds, and finally written to ``output_file``.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the audio file to write.
        target_sr: Output sample rate in Hz.
        target_duration: Output duration in seconds; may be fractional.

    Raises:
        ValueError: If ``target_duration`` is negative.
    """
    if target_duration < 0:
        raise ValueError(f"target_duration must be non-negative, got {target_duration}")

    # librosa.load down-mixes and resamples in a single step when `sr` is given,
    # so no separate resample call is needed.
    y, sr = librosa.load(input_file, sr=target_sr, mono=True)

    # Sample counts must be integers; rounding keeps fractional durations usable.
    target_length = int(round(target_duration * sr))

    # fix_length zero-pads at the end when the signal is too short and trims it
    # when it is too long, covering both cases with one call.
    y = librosa.util.fix_length(y, size=target_length)

    # Save with soundfile (librosa.output was removed in newer librosa versions)
    sf.write(output_file, y, sr)

# Produce the 16 kHz / 20 s mono file that the following cells read.
process_audio(
    input_file="./audio/pi_mono_trimmed.wav",
    output_file="./audio/pi_powsm.wav",
)

In [None]:
# Note: Make sure torchaudio is installed: pip install torchaudio
from espnet2.bin.s2t_inference_language import Speech2Language
import soundfile as sf      # or librosa

# Language identification with POWSM: the model scores every language symbol
# between first_lang_sym and last_lang_sym and returns the `nbest` most likely.
s2t = Speech2Language.from_pretrained(
    "espnet/powsm",
    device="cpu",
    nbest=1,                # number of possible languages to return
    first_lang_sym="<afr>", # fixed; defined in vocab list
    last_lang_sym="<zul>"   # fixed; defined in vocab list
)

# Use the processed audio file (16kHz, 20s) from the previous cell
speech, rate = sf.read("./audio/pi_powsm.wav")
# The preprocessing cell targets 16 kHz; fail loudly if a different file slipped in.
assert rate == 16000, f"expected 16 kHz audio, got {rate} Hz"

pred = s2t(speech)[0]     # top-ranked (language symbol, probability) pair
print(pred)

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 124936.71it/s]
  with autocast(False):


('<eng>', 0.9905632734298706)


In [None]:
# Phone Recognition (PR) with POWSM
from espnet2.bin.s2t_inference import Speech2Text
import soundfile as sf

# Use the processed audio file (16kHz, 20s) from the preprocessing cell
speech, rate = sf.read("./audio/pi_powsm.wav")
print(f"Audio loaded: {len(speech)} samples at {rate} Hz ({len(speech)/rate:.2f} seconds)")

# Initialize model for Phone Recognition
s2t_pr = Speech2Text.from_pretrained(
    "espnet/powsm",
    device="cpu",  # Use "cuda" if you have GPU
    lang_sym="<eng>",   # ISO 639-3 language code; set to <unk> for unseen languages
    task_sym="<pr>",    # Phone Recognition task
)

# For PR, prompt should be "<na>" (not applicable)
prompt = "<na>"

# Get prediction. Failures are reported with a traceback instead of aborting the
# cell, so the diagnostics printed below stay visible.
try:
    result = s2t_pr(speech, text_prev=prompt)
    print(f"Raw result type: {type(result)}")
    print(f"Raw result: {result}")

    if not (isinstance(result, (list, tuple)) and len(result) > 0):
        print("Unexpected result format")
        print(result)
    else:
        # The best hypothesis comes first; when it is itself a sequence,
        # its first element is the decoded text.
        pred = result[0]
        if isinstance(pred, (list, tuple)) and len(pred) > 0:
            pred = pred[0]

        if not isinstance(pred, str):
            print(f"Unexpected prediction format: {type(pred)}")
            print(pred)
        else:
            # Keep everything after the first <notimestamps> marker (maxsplit=1
            # so that a repeated marker cannot silently drop trailing text).
            if "<notimestamps>" in pred:
                pred = pred.split("<notimestamps>", 1)[1]

            # Remove slashes for PR (phones are enclosed in slashes in the model output)
            pred = pred.strip().replace("/", "")

            print("\nPhone Recognition Result:")
            if pred.strip():
                print(pred)
            else:
                # Make an empty transcription explicit rather than printing a blank line.
                print("(empty transcription: the model returned no phones)")
except Exception as e:
    print(f"Error during inference: {e}")
    import traceback
    traceback.print_exc()


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 158703.39it/s]
  with autocast(False):


Phone Recognition Result:



In [10]:
# Example: All POWSM Tasks (PR, ASR, G2P, P2G)
# IMPORTANT: Each task needs its own model instance - you cannot change task_sym after initialization
from espnet2.bin.s2t_inference import Speech2Text
import soundfile as sf

# Load audio
speech, rate = sf.read("./audio/pi_powsm.wav")
print(f"Audio loaded: {len(speech)} samples at {rate} Hz\n")


def _load_powsm(task_sym, lang_sym="<eng>", device="cpu"):
    """Create a POWSM Speech2Text instance bound to a single task symbol."""
    return Speech2Text.from_pretrained(
        "espnet/powsm",
        device=device,
        lang_sym=lang_sym,
        task_sym=task_sym,
    )


def _run_powsm(model, audio, prompt="<na>"):
    """Run inference and return the top hypothesis text after <notimestamps>."""
    text = model(audio, text_prev=prompt)[0][0]
    if "<notimestamps>" in text:
        text = text.split("<notimestamps>")[1].strip()
    return text


def _banner(title, leading_newline=True):
    """Print the section header that precedes each task's output."""
    rule = "=" * 50
    print(("\n" if leading_newline else "") + rule)
    print(title)
    print(rule)


# Task 1: Phone Recognition (PR)
_banner("Phone Recognition (PR):", leading_newline=False)
s2t_pr = _load_powsm("<pr>")
# Keep the raw, slash-delimited phones: P2G below needs them as its prompt.
pred_pr_slashed = _run_powsm(s2t_pr, speech)
pred_pr = pred_pr_slashed.replace("/", "")  # Remove slashes for PR
print(pred_pr)

# Task 2: Automatic Speech Recognition (ASR)
_banner("Automatic Speech Recognition (ASR):")
s2t_asr = _load_powsm("<asr>")
pred_asr = _run_powsm(s2t_asr, speech)
print(pred_asr)

# Task 3: Grapheme-to-Phoneme (G2P) - needs ASR transcript as prompt
_banner("Grapheme-to-Phoneme (G2P):")
s2t_g2p = _load_powsm("<g2p>")
# Use ASR transcript as prompt
pred_g2p = _run_powsm(s2t_g2p, speech, prompt=pred_asr)
pred_g2p = pred_g2p.replace("/", "")  # Remove slashes for G2P
print(pred_g2p)

# Task 4: Phoneme-to-Grapheme (P2G) - needs phone transcription with slashes as prompt
_banner("Phoneme-to-Grapheme (P2G):")
s2t_p2g = _load_powsm("<p2g>")
# Use phone transcription with slashes as prompt (from PR result)
# Format: /pʰ//ɔ//s//ə//m/ (each phone enclosed in slashes)
# The prompt is the PR output as the model produced it. Once the slashes have
# been removed, the phone boundaries are gone and cannot be rebuilt by
# splitting on whitespace.
phone_with_slashes = pred_pr_slashed
pred_p2g = _run_powsm(s2t_p2g, speech, prompt=phone_with_slashes)
print(pred_p2g)


Audio loaded: 320000 samples at 16000 Hz

Phone Recognition (PR):


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 59796.59it/s]
  with autocast(False):




Automatic Speech Recognition (ASR):


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 102657.79it/s]
  with autocast(False):


AN ⁇  ANOT ⁇ ER C ⁇ A ⁇ ER  ⁇  FOR A ⁇ U ⁇ T ⁇ N ⁇  T ⁇ E FREQUENC ⁇  OF AFTE ⁇  CO ⁇ ONENT ⁇  E ⁇ T ⁇ ER  ⁇ A ⁇ E ⁇  OR  ⁇ UZZ ⁇ ER CURRENT ⁇   ⁇ AN ⁇ ER T ⁇ E ⁇ E  ⁇   ⁇ A ⁇  TO Z ⁇ RO AN ⁇   ⁇ ERE  ⁇ N  ⁇ EAR  ⁇ N  ⁇ EAR ⁇  OR  ⁇ UZZ ⁇ ER CURRENT ⁇   ⁇ AN ⁇ ER T ⁇ E ⁇ E  ⁇   ⁇ EA ⁇  TO T ⁇ E ER ⁇ T ⁇ N ⁇   ⁇ N ACT ⁇ ON

Grapheme-to-Phoneme (G2P):


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 61422.86it/s]
  with autocast(False):




Phoneme-to-Grapheme (P2G):


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 95948.13it/s]
  with autocast(False):


AN ⁇  ANOT ⁇ ER  ⁇ N ⁇ ER  ⁇  FOR A ⁇ U ⁇ T ⁇ N ⁇  T ⁇ E FREQUENC ⁇  OF AFTE ⁇  CO ⁇ ONENT ⁇  E ⁇ T ⁇ ER  ⁇ A ⁇ E ⁇  OR  ⁇ UZZ ⁇ ER CURRENT ⁇   ⁇ AN ⁇ ER T ⁇ E ⁇ E  ⁇   ⁇ A ⁇  TO Z ⁇ RO AN ⁇   ⁇ ERE  ⁇ N A  ⁇ EAR  ⁇ A ⁇ E ⁇  OR  ⁇ UZZ ⁇ ER CURRENT ⁇   ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER T ⁇ E  ⁇ AN ⁇ ER  ⁇ AN ⁇ ER  ⁇ AN ⁇ ER  ⁇ AN ⁇ ER


In [None]:
# Diagnostic: Check audio properties and try a simple PR example
import soundfile as sf
import numpy as np

# Load the preprocessed file and summarise its basic signal statistics.
speech, rate = sf.read("./audio/pi_powsm.wav")

n_samples = len(speech)
duration_s = n_samples / rate
lowest, highest = speech.min(), speech.max()
rms = np.sqrt(np.mean(speech**2))  # root-mean-square amplitude

print("Audio properties:")
print(f"  Sample rate: {rate} Hz")
print(f"  Duration: {duration_s:.2f} seconds")
print(f"  Samples: {n_samples}")
print(f"  Data type: {speech.dtype}")
print(f"  Min/Max values: {lowest:.4f} / {highest:.4f}")
print(f"  RMS (loudness): {rms:.4f}")

# Note: If the audio is not actual speech (e.g., music, tones, digits read aloud),
# the model may produce garbled output. POWSM is designed for natural speech.
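# The special character ⁇ is how the tokenizer renders an unknown (<unk>) token, which can happen when:
# 1. The audio content doesn't match the training data distribution
# 2. The audio quality is poor
# 3. The language doesn't match the lang_sym setting
# 4. Possibly, the tokenizer/vocabulary used for decoding doesn't match the model (TODO confirm);
#    worth checking when the output is otherwise readable text with only some letters replaced by ⁇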
