## Convert audio files under ./audio/powsm into .wav (16 kHz, 16-bit, mono, 20 s)

In [1]:
import librosa
import soundfile as sf
import numpy as np
import os
from pathlib import Path

# Convert every source recording found in the numbered sentence directories under
# `base_dir` into a fixed-format WAV file named "<stem>-r.wav" next to the original.

# Configuration
base_dir = "./audio/powsm"
target_sr = 16000  # Output sample rate in Hz (16 kHz)
target_duration = 20  # Output length in seconds; longer clips are cut, shorter ones zero-padded
target_channels = 1  # Mono. Informational only: the code below always down-mixes to one channel

# Supported audio extensions
audio_extensions = {'.m4a', '.mp3', '.wav', '.flac', '.ogg', '.aac', '.mp4', '.m4v'}


def normalize_audio(audio, sr, target_sr, target_duration):
    """Return ``audio`` as a mono signal at ``target_sr`` with a fixed length.

    Args:
        audio: NumPy array of samples. An array with more than one dimension is
            treated as multi-channel and down-mixed with ``librosa.to_mono``.
        sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.
        target_duration: Desired length in seconds.

    Returns:
        1-D array holding exactly ``int(target_sr * target_duration)`` samples:
        the start of the signal if it was longer, or the signal followed by
        zeros if it was shorter.
    """
    # Convert to mono if the array carries more than one channel
    if audio.ndim > 1:
        audio = librosa.to_mono(audio)

    # Resample only when the rates differ
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Trim or zero-pad at the end to the exact sample count
    target_samples = int(target_sr * target_duration)
    if len(audio) > target_samples:
        audio = audio[:target_samples]
    elif len(audio) < target_samples:
        audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')
    return audio


# Find all numbered subdirectories (sentence directories).
# A missing base directory is treated like an empty one instead of raising.
base_path = Path(base_dir)
if base_path.is_dir():
    sentence_dirs = sorted(
        (d for d in base_path.iterdir() if d.is_dir() and d.name.isdigit()),
        key=lambda d: int(d.name),
    )
else:
    sentence_dirs = []

# Source files that could not be converted; reported at the end of the run
failed_files = []

if not sentence_dirs:
    print(f"No sentence directories found in {base_dir}")
else:
    print(f"Found {len(sentence_dirs)} sentence directory(ies) to process:\n")

for sentence_dir in sentence_dirs:
    sentence_num = sentence_dir.name
    text_file = sentence_dir / "text"

    # Read transcript (shown for reference only; it does not affect the conversion)
    transcript = ""
    if text_file.exists():
        with open(text_file, 'r', encoding='utf-8') as f:
            transcript = f.read().strip()
        print(f"üìù Sentence {sentence_num} transcript:")
        print(f"   {transcript}\n")
    else:
        print(f"‚ö†Ô∏è  Sentence {sentence_num}: No transcript file found\n")

    # Find all audio files in this directory (excluding already converted -r.wav files).
    # Sorted so the processing order does not depend on the filesystem.
    audio_files = sorted(
        f for f in sentence_dir.iterdir()
        if f.is_file()
        and f.suffix.lower() in audio_extensions
        and not f.name.endswith('-r.wav')  # Skip already converted files
    )

    if not audio_files:
        print(f"   No audio files to convert in sentence {sentence_num}\n")
        continue

    for audio_file in audio_files:
        filename = audio_file.name
        output_filename = audio_file.stem + "-r.wav"
        output_path = sentence_dir / output_filename

        print(f"   Processing {filename}...")

        try:
            # Load at the native sample rate, keeping all channels
            audio, sr = librosa.load(str(audio_file), sr=None, mono=False)
            audio = normalize_audio(audio, sr, target_sr, target_duration)

            # Save as 16-bit WAV file
            sf.write(str(output_path), audio, target_sr, subtype='PCM_16')
        except Exception as e:
            # One unreadable file must not abort the rest of the batch
            failed_files.append(audio_file)
            print(f"   Failed to convert {filename}: {e}")
            continue

        print(f"   ‚úì Saved {output_filename} ({len(audio)/target_sr:.2f}s, {target_sr}Hz, mono, 16-bit)")

    print()  # Empty line between sentences

if failed_files:
    print(f"{len(failed_files)} file(s) could not be converted.")
print("Conversion complete!")

Found 2 sentence directory(ies) to process:

üìù Sentence 12 transcript:
   The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.

   Processing umit12.m4a...


  audio, sr = librosa.load(str(audio_file), sr=None, mono=False)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


   ‚úì Saved umit12-r.wav (20.00s, 16000Hz, mono, 16-bit)
   Processing yusuf12.m4a...
   ‚úì Saved yusuf12-r.wav (20.00s, 16000Hz, mono, 16-bit)

üìù Sentence 14 transcript:
   The red car arrived early in the morning. The driver parked near the restaurant and ordered breakfast. The fresh bread was really delicious.

   Processing umit14.m4a...
   ‚úì Saved umit14-r.wav (20.00s, 16000Hz, mono, 16-bit)

Conversion complete!


  audio, sr = librosa.load(str(audio_file), sr=None, mono=False)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


## Phone Recognition

In [2]:
from espnet2.bin.s2t_inference import Speech2Text
from espnet2.bin.s2t_inference_language import Speech2Language
import soundfile as sf
from pathlib import Path
import json
from datetime import datetime

import traceback  # stdlib; prints the full stack trace when a file fails

# Phone Recognition (PR) over every converted "-r.wav" file in the numbered sentence
# directories under `base_dir`. Results are collected in `results` and written to a
# JSON file and a human-readable text report.

# Configuration
base_dir = "./audio/powsm"
device = "cpu"  # Change to "cuda" if you have GPU
output_file_json = "./powsm_pr_results.json"
output_file_txt = "./powsm_pr_results.txt"
default_lang = "<eng>"  # Language symbol given to the PR model; also the fallback when detection fails


def strip_special_tokens(prediction):
    """Return the text following the ``<notimestamps>`` marker, whitespace-stripped.

    When the marker is absent the whole prediction is returned stripped.
    """
    if "<notimestamps>" in prediction:
        return prediction.split("<notimestamps>")[1].strip()
    return prediction.strip()


# Initialize results storage
results = {
    "timestamp": datetime.now().isoformat(),
    "model": "espnet/powsm",
    "task": "Phone Recognition (PR)",
    "results": []
}

# Find all numbered subdirectories (sentence directories).
# A missing base directory is treated like an empty one instead of raising.
base_path = Path(base_dir)
if base_path.is_dir():
    sentence_dirs = sorted(
        (d for d in base_path.iterdir() if d.is_dir() and d.name.isdigit()),
        key=lambda d: int(d.name),
    )
else:
    sentence_dirs = []

if not sentence_dirs:
    print(f"No sentence directories found in {base_dir}")
else:
    print(f"Found {len(sentence_dirs)} sentence directory(ies) to process:\n")

    # Initialize language detection model (for automatic language detection)
    print("Loading language detection model...")
    s2lang = Speech2Language.from_pretrained(
        "espnet/powsm",
        device=device,
        nbest=1,
        first_lang_sym="<afr>",
        last_lang_sym="<zul>"
    )

    # Initialize PR model
    print("Loading Phone Recognition model...")
    s2t_pr = Speech2Text.from_pretrained(
        "espnet/powsm",
        device=device,
        lang_sym=default_lang,
        task_sym="<pr>",
    )

    for sentence_dir in sentence_dirs:
        sentence_num = sentence_dir.name
        text_file = sentence_dir / "text"

        # Read transcript (stored alongside the prediction for reference)
        transcript = ""
        if text_file.exists():
            with open(text_file, 'r', encoding='utf-8') as f:
                transcript = f.read().strip()

        print(f"\n{'='*70}")
        print(f"Sentence {sentence_num}")
        print(f"{'='*70}")
        if transcript:
            print(f"üìù Transcript: {transcript}\n")

        # Find all converted audio files (-r.wav), sorted for a stable report order
        audio_files = sorted(
            f for f in sentence_dir.iterdir()
            if f.is_file() and f.name.endswith('-r.wav')
        )

        if not audio_files:
            print(f"   No converted audio files found in sentence {sentence_num}\n")
            continue

        for audio_file in audio_files:
            filename = audio_file.name
            print(f"\nüé§ Processing: {filename}")

            # Load audio
            speech, rate = sf.read(str(audio_file))
            print(f"   Audio: {len(speech)/rate:.2f}s at {rate}Hz")

            # Detect language. The result is only recorded in the report; the PR
            # model below keeps the language symbol it was created with.
            try:
                lang_pred = s2lang(speech)[0]
                detected_lang = lang_pred[0] if lang_pred else default_lang
                print(f"   Detected language: {detected_lang}")
            except Exception as e:
                # Catch Exception only, so Ctrl-C still interrupts the run
                detected_lang = default_lang
                print(f"   Language detection failed: {e}")
                print(f"   Using default language: {detected_lang}")

            # Phone Recognition
            try:
                result = s2t_pr(speech, text_prev="<na>")
                pred = strip_special_tokens(result[0][0])

                # Remove slashes for cleaner output
                pred_clean = pred.replace("/", "")

                print(f"\n   üìû Phone Recognition Result:")
                print(f"   {pred_clean}")

                # Store result
                result_entry = {
                    "sentence_id": sentence_num,
                    "audio_file": filename,
                    "transcript": transcript,
                    "detected_language": detected_lang,
                    "phone_recognition": {
                        "with_slashes": pred,
                        "clean": pred_clean
                    }
                }
                results["results"].append(result_entry)

            except Exception as e:
                print(f"   ‚ùå Error during PR: {e}")
                traceback.print_exc()

                # Store error so the failure is visible in the report files
                result_entry = {
                    "sentence_id": sentence_num,
                    "audio_file": filename,
                    "transcript": transcript,
                    "detected_language": detected_lang,
                    "error": str(e)
                }
                results["results"].append(result_entry)

        print()  # Empty line between sentences

# Write results to files
print("\n" + "="*70)
print("Writing results to files...")

# Write JSON file (ensure_ascii=False keeps non-ASCII phone symbols readable)
with open(output_file_json, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"‚úì JSON results saved to: {output_file_json}")

# Write human-readable text file
with open(output_file_txt, 'w', encoding='utf-8') as f:
    f.write("="*70 + "\n")
    f.write("POWSM Phone Recognition Results\n")
    f.write("="*70 + "\n")
    f.write(f"Timestamp: {results['timestamp']}\n")
    f.write(f"Model: {results['model']}\n")
    f.write(f"Task: {results['task']}\n")
    f.write("="*70 + "\n\n")

    for entry in results["results"]:
        f.write(f"\nSentence ID: {entry['sentence_id']}\n")
        f.write(f"Audio File: {entry['audio_file']}\n")
        f.write(f"Transcript: {entry.get('transcript', 'N/A')}\n")
        f.write(f"Detected Language: {entry.get('detected_language', 'N/A')}\n")

        if 'error' in entry:
            f.write(f"Error: {entry['error']}\n")
        else:
            f.write(f"Phone Recognition (clean): {entry['phone_recognition']['clean']}\n")
            f.write(f"Phone Recognition (with slashes): {entry['phone_recognition']['with_slashes']}\n")

        f.write("-"*70 + "\n")

print(f"‚úì Text results saved to: {output_file_txt}")

print("\n" + "="*70)
print("Phone Recognition complete!")
print("="*70)


Failed to import Flash Attention, using ESPnet default: No module named 'flash_attn'


  from .autonotebook import tqdm as notebook_tqdm


Found 2 sentence directory(ies) to process:

Loading language detection model...


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 26522.25it/s]


Loading Phone Recognition model...


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 137840.98it/s]



Sentence 12
üìù Transcript: The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.


üé§ Processing: umit12-r.wav
   Audio: 20.00s at 16000Hz


  with autocast(False):


   Detected language: <eng>

   üìû Phone Recognition Result:
   ðəwɛðɜ˞ɪzɹəðɜ˞wɔɹmdðɪsθɜ˞nzdeɪaɪθɪ̃ŋkwiʃʊdkoʊtʰuðəθiətɜ˞tʰəɡɛðɜ˞θæ̃ŋkjufɹɜ˞θɪ̃ŋkɪ̃ŋəbaʊtðɪstɔɹoʊl̴i

üé§ Processing: yusuf12-r.wav
   Audio: 20.00s at 16000Hz
   Detected language: <eng>

   üìû Phone Recognition Result:
   ðəwɛðɜ˞ɪzɹəðɜ˞wɔɹmðɪsθɜ˞dzdeɪaɪθɪ̃ŋkwiʃikʰɔzɪtθiətɜ˞tʰəɡɛðɜ˞θæ̃ŋkjufɹɜ˞θɪ̃ŋkɪ̃ŋəbaʊtɪzəθɔɹoʊl̴i


Sentence 14
üìù Transcript: The red car arrived early in the morning. The driver parked near the restaurant and ordered breakfast. The fresh bread was really delicious.


üé§ Processing: umit14-r.wav
   Audio: 20.00s at 16000Hz
   Detected language: <eng>

   üìû Phone Recognition Result:
   ðəɹɛdkʰɑɹɜ˞aɪvdɜ˞l̴iɪ̃nðəmɔɹnɪ̃ŋðətʰɹaɪvɜ˞pʰɑɹknɪɹðəɹɛstɜ˞ɑ̃ntəndɔɹdɜ˞dpɹɛkfəsttʰufɹɛʃpɹɛdwɑzɹil̴itɪl̴ɪʃəs


Writing result

## Audio-Guided Grapheme-to-Phoneme (G2P) Conversion

In [3]:
from espnet2.bin.s2t_inference import Speech2Text
import soundfile as sf
from pathlib import Path
import json
from datetime import datetime

import traceback  # stdlib; prints the full stack trace when a file fails

# Audio-guided Grapheme-to-Phoneme (G2P) over every converted "-r.wav" file in the
# numbered sentence directories under `base_dir`. Each file is first transcribed by
# the ASR task; that transcript is then passed as the text prompt to the G2P task.
# Results are collected in `results` and written to a JSON file and a text report.

# Configuration
base_dir = "./audio/powsm"
device = "cpu"  # Change to "cuda" if you have GPU
output_file_json = "./powsm_g2p_results.json"
output_file_txt = "./powsm_g2p_results.txt"


def strip_special_tokens(prediction):
    """Return the text following the ``<notimestamps>`` marker, whitespace-stripped.

    When the marker is absent the whole prediction is returned stripped.
    """
    if "<notimestamps>" in prediction:
        return prediction.split("<notimestamps>")[1].strip()
    return prediction.strip()


# Initialize results storage
results = {
    "timestamp": datetime.now().isoformat(),
    "model": "espnet/powsm",
    "task": "Audio-guided Grapheme-to-Phoneme (G2P)",
    "results": []
}

# Find all numbered subdirectories (sentence directories).
# A missing base directory is treated like an empty one instead of raising.
base_path = Path(base_dir)
if base_path.is_dir():
    sentence_dirs = sorted(
        (d for d in base_path.iterdir() if d.is_dir() and d.name.isdigit()),
        key=lambda d: int(d.name),
    )
else:
    sentence_dirs = []

if not sentence_dirs:
    print(f"No sentence directories found in {base_dir}")
else:
    print(f"Found {len(sentence_dirs)} sentence directory(ies) to process:\n")

    # Initialize models
    print("Loading models...")

    # ASR model (needed to get transcript for G2P)
    s2t_asr = Speech2Text.from_pretrained(
        "espnet/powsm",
        device=device,
        lang_sym="<eng>",
        task_sym="<asr>",
    )

    # G2P model
    s2t_g2p = Speech2Text.from_pretrained(
        "espnet/powsm",
        device=device,
        lang_sym="<eng>",
        task_sym="<g2p>",
    )

    for sentence_dir in sentence_dirs:
        sentence_num = sentence_dir.name
        text_file = sentence_dir / "text"

        # Read transcript.
        # NOTE(review): this ground-truth text is only stored in the report; the G2P
        # prompt below is the ASR hypothesis, not this transcript.
        transcript = ""
        if text_file.exists():
            with open(text_file, 'r', encoding='utf-8') as f:
                transcript = f.read().strip()

        print(f"\n{'='*70}")
        print(f"Sentence {sentence_num}")
        print(f"{'='*70}")
        if transcript:
            print(f"üìù Ground Truth Transcript: {transcript}\n")

        # Find all converted audio files (-r.wav), sorted for a stable report order
        audio_files = sorted(
            f for f in sentence_dir.iterdir()
            if f.is_file() and f.name.endswith('-r.wav')
        )

        if not audio_files:
            print(f"   No converted audio files found in sentence {sentence_num}\n")
            continue

        for audio_file in audio_files:
            filename = audio_file.name
            print(f"\nüé§ Processing: {filename}")

            # Load audio
            speech, rate = sf.read(str(audio_file))
            print(f"   Audio: {len(speech)/rate:.2f}s at {rate}Hz")

            # Step 1: Get ASR transcript (needed as prompt for G2P)
            try:
                print(f"\n   Step 1: Getting ASR transcript...")
                result_asr = s2t_asr(speech, text_prev="<na>")
                pred_asr = strip_special_tokens(result_asr[0][0])

                print(f"   üìù ASR Result: {pred_asr}")

            except Exception as e:
                print(f"   ‚ùå Error during ASR: {e}")
                traceback.print_exc()

                # Record the failure so the file still appears in the report,
                # then skip G2P because there is no prompt to give it.
                result_entry = {
                    "sentence_id": sentence_num,
                    "audio_file": filename,
                    "ground_truth_transcript": transcript,
                    "asr_transcript": "N/A",
                    "error": f"ASR failed: {e}"
                }
                results["results"].append(result_entry)
                continue

            # Step 2: Audio-guided G2P (uses both speech and text prompt)
            try:
                print(f"\n   Step 2: Audio-guided Grapheme-to-Phoneme conversion...")
                result_g2p = s2t_g2p(speech, text_prev=pred_asr)
                pred_g2p = strip_special_tokens(result_g2p[0][0])

                # Remove slashes for cleaner output
                pred_g2p_clean = pred_g2p.replace("/", "")

                print(f"\n   üî§ G2P Result (phones):")
                print(f"   {pred_g2p_clean}")

                # Also show with slashes for reference
                print(f"\n   üî§ G2P Result (with slashes):")
                print(f"   {pred_g2p}")

                # Store result
                result_entry = {
                    "sentence_id": sentence_num,
                    "audio_file": filename,
                    "ground_truth_transcript": transcript,
                    "asr_transcript": pred_asr,
                    "g2p_result": {
                        "with_slashes": pred_g2p,
                        "clean": pred_g2p_clean
                    }
                }
                results["results"].append(result_entry)

            except Exception as e:
                print(f"   ‚ùå Error during G2P: {e}")
                traceback.print_exc()

                # Store error; pred_asr is always set here because an ASR failure
                # skips to the next file before reaching this step.
                result_entry = {
                    "sentence_id": sentence_num,
                    "audio_file": filename,
                    "ground_truth_transcript": transcript,
                    "asr_transcript": pred_asr,
                    "error": str(e)
                }
                results["results"].append(result_entry)

        print()  # Empty line between sentences

# Write results to files
print("\n" + "="*70)
print("Writing results to files...")

# Write JSON file (ensure_ascii=False keeps non-ASCII phone symbols readable)
with open(output_file_json, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"‚úì JSON results saved to: {output_file_json}")

# Write human-readable text file
with open(output_file_txt, 'w', encoding='utf-8') as f:
    f.write("="*70 + "\n")
    f.write("POWSM Audio-guided Grapheme-to-Phoneme (G2P) Results\n")
    f.write("="*70 + "\n")
    f.write(f"Timestamp: {results['timestamp']}\n")
    f.write(f"Model: {results['model']}\n")
    f.write(f"Task: {results['task']}\n")
    f.write("="*70 + "\n\n")

    for entry in results["results"]:
        f.write(f"\nSentence ID: {entry['sentence_id']}\n")
        f.write(f"Audio File: {entry['audio_file']}\n")
        f.write(f"Ground Truth Transcript: {entry.get('ground_truth_transcript', 'N/A')}\n")
        f.write(f"ASR Transcript: {entry.get('asr_transcript', 'N/A')}\n")

        if 'error' in entry:
            f.write(f"Error: {entry['error']}\n")
        else:
            f.write(f"G2P Result (clean): {entry['g2p_result']['clean']}\n")
            f.write(f"G2P Result (with slashes): {entry['g2p_result']['with_slashes']}\n")

        f.write("-"*70 + "\n")

print(f"‚úì Text results saved to: {output_file_txt}")

print("\n" + "="*70)
print("Audio-guided Grapheme-to-Phoneme conversion complete!")
print("="*70)


Found 2 sentence directory(ies) to process:

Loading models...


Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 109552.72it/s]
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 118387.61it/s]



Sentence 12
üìù Ground Truth Transcript: The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.


üé§ Processing: umit12-r.wav
   Audio: 20.00s at 16000Hz

   Step 1: Getting ASR transcript...


  with autocast(False):


   üìù ASR Result: the weather is rather warm this turnsday  ‚Åá  think we should go to the theatre together think you for thinking about this toroli

   Step 2: Audio-guided Grapheme-to-Phoneme conversion...

   üî§ G2P Result (phones):
   ðəwɛðɜ˞ɪzɹəðɜ˞wɔɹmðɪstʰɜ˞nzdeɪaɪθɪ̃ŋkwiʃʊdkoʊtʰuðəθiətɜ˞tʰəɡɛðɜ˞θɪ̃ŋkjufɹɜ˞θɪ̃ŋkɪ̃ŋəbaʊtðɪstʰɔɹoʊl̴i

   üî§ G2P Result (with slashes):
   /ð//ə//w//ɛ//ð//ɜ˞//ɪ//z//ɹ//ə//ð//ɜ˞//w//ɔ//ɹ//m//ð//ɪ//s//tʰ//ɜ˞//n//z//d//e//ɪ//a//ɪ//θ//ɪ̃//ŋ//k//w//i//ʃ//ʊ//d//k//o//ʊ//tʰ//u//ð//ə//θ//i//ə//t//ɜ˞//tʰ//ə//ɡ//ɛ//ð//ɜ˞//θ//ɪ̃//ŋ//k//j//u//f//ɹ//ɜ˞//θ//ɪ̃//ŋ//k//ɪ̃//ŋ//ə//b//a//ʊ//t//ð//ɪ//s//tʰ//ɔ//ɹ//o//ʊ//l̴//i/

üé§ Processing: yusuf12-r.wav
   Audio: 20.00s at 16000Hz

   Step 1: Getting ASR transcript...
   üìù ASR Result: the weather is rather warm this Thursday  ‚Åá  think we she cause it the theater together think yo