In [1]:
from __future__ import annotations
import sys
from pathlib import Path
import json
import hashlib
import numpy as np
import soundfile as sf

# Put the repository's `mod/` directory on sys.path so that `assessment.*`
# (imported below) resolves when the notebook is run from a directory two
# levels below the repo root, e.g. sig/exp.
repo_root = Path("../../mod")
_repo_root_entry = str(repo_root.absolute())
# Guard keeps the cell idempotent: re-running it must not stack duplicate entries.
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

from espnet2.bin.s2t_inference import Speech2Text
from assessment.edit_distance import edit_operations

# Inference device handed to both model loaders below.
DEVICE = "cpu"  # set to "cuda" if available
# Language tag passed as `lang_sym` to both models.
LANG = "<eng>"
# Checkpoint identifiers given to `Speech2Text.from_pretrained`; both tasks
# currently point at the same checkpoint.
PR_MODEL = "espnet/powsm"
G2P_MODEL = "espnet/powsm"

# On-disk JSON cache location (relative to the working directory); created up
# front so later cells can write into it.
CACHE_DIR = Path("./cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Two separately loaded Speech2Text instances that differ in `task_sym`.
# NOTE(review): presumably "<pr>" selects phone recognition and "<g2p>"
# grapheme-to-phoneme — confirm against the model card.
pr = Speech2Text.from_pretrained(PR_MODEL, device=DEVICE, task_sym="<pr>", lang_sym=LANG)
g2p = Speech2Text.from_pretrained(G2P_MODEL, device=DEVICE, task_sym="<g2p>", lang_sym=LANG)

Failed to import Flash Attention, using ESPnet default: No module named 'flash_attn'


  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 32442.13it/s]
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 50972.44it/s]


In [2]:
# Input recording and the sentence the speaker was asked to read.
AUDIO_PATH = Path("./audio/12/umit12-r.wav")
TARGET_TEXT = "The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly."

# Load the waveform and validate it before it reaches the models.
speech, rate = sf.read(AUDIO_PATH)
if rate != 16000:
    raise ValueError(f"Expected 16kHz audio, got {rate}")
# `sf.read` returns a 2-D (frames, channels) array for multi-channel files.
# NOTE(review): the models are assumed to take a single-channel waveform —
# fail here with a clear message instead of deep inside inference.
if speech.ndim != 1:
    raise ValueError(f"Expected mono audio, got array of shape {speech.shape}")

In [3]:
def parse_pred(raw: str) -> tuple[str, list[str]]:
    """Split a raw model hypothesis into its cleaned text and phone tokens.

    If the string contains ``<notimestamps>``, only the segment directly after
    the first occurrence is kept. The kept text is whitespace-stripped, split
    on ``//``, and each non-empty piece has at most one leading and one
    trailing ``/`` removed.

    Returns:
        ``(cleaned_text, tokens)`` where ``tokens`` excludes empty pieces.
    """
    segments = raw.split("<notimestamps>")
    # More than one segment means the marker was present.
    body = (segments[1] if len(segments) > 1 else raw).strip()

    phones = []
    for chunk in (piece.strip() for piece in body.split("//")):
        if chunk == "":
            continue
        head = 1 if chunk.startswith("/") else 0
        chunk = chunk[head:]
        # Checked after the leading slash is gone, so a lone "/" becomes "".
        if chunk.endswith("/"):
            chunk = chunk[:-1]
        if chunk != "":
            phones.append(chunk)
    return body, phones


def run_pr(audio):
    """Run the `pr` model on `audio` and parse its first hypothesis.

    The model is called with ``text_prev="<na>"``; element ``[0][0]`` of its
    result is passed to `parse_pred`, whose ``(raw, tokens)`` pair is returned.
    """
    hypotheses = pr(audio, text_prev="<na>")
    first_text = hypotheses[0][0]
    return parse_pred(first_text)


def run_g2p(text: str, audio):
    """Run the `g2p` model with `text` as the previous-text prompt.

    When `audio` is ``None`` a buffer of 1600 float32 zeros is used in its
    place. Element ``[0][0]`` of the model result is parsed with `parse_pred`
    and the resulting ``(raw, tokens)`` pair is returned.
    """
    if audio is None:
        # Placeholder input so the model can still be called without a recording.
        audio_in = np.zeros(1600, dtype=np.float32)
    else:
        audio_in = audio
    hypotheses = g2p(audio_in, text_prev=text)
    return parse_pred(hypotheses[0][0])

In [None]:
import re
import cmudict

# ARPAbet base symbol (stress digit already removed) -> IPA string used for
# the target phone tokens.
# NOTE(review): "AH" maps to "ə" regardless of stress because the stress digit
# is stripped before lookup — confirm that collapsing stressed AH is intended.
# NOTE(review): the saved output of this notebook shows 'ɝ' where this table
# yields "ɜ˞" for ER; the displayed result may come from an older cache entry —
# confirm and clear ./cache if so.
ARPA_TO_IPA = {
    "AA": "ɑ", "AE": "æ", "AH": "ə", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
    "B": "b", "CH": "tʃ", "D": "d", "DH": "ð", "EH": "ɛ", "ER": "ɜ˞",
    "EY": "eɪ", "F": "f", "G": "ɡ", "HH": "h", "IH": "ɪ", "IY": "i",
    "JH": "dʒ", "K": "k", "L": "l", "M": "m", "N": "n", "NG": "ŋ",
    "OW": "oʊ", "OY": "ɔɪ", "P": "p", "R": "ɹ", "S": "s", "SH": "ʃ",
    "T": "t", "TH": "θ", "UH": "ʊ", "UW": "u", "V": "v", "W": "w",
    "Y": "j", "Z": "z", "ZH": "ʒ",
}

# Pronunciation dictionary, loaded once for the whole notebook.
# NOTE(review): assumed to map lowercase words to a list of ARPAbet
# pronunciations (each a list of symbols) — confirm against the cmudict package.
CMU_DICT = cmudict.dict()


def arpa_seq_to_ipa(arpa_seq):
    """Convert a sequence of ARPAbet symbols to a list of IPA strings.

    Trailing stress digits (``0``, ``1``, ``2``) are removed from each symbol
    before it is looked up in `ARPA_TO_IPA`. Symbols missing from the table
    fall back to their lowercased, stress-free form.
    """
    stress_free = (symbol.rstrip("012") for symbol in arpa_seq)
    return [ARPA_TO_IPA.get(phone, phone.lower()) for phone in stress_free]


def cmu_target_ipa_tokens(text: str) -> list[str]:
    """Build the target IPA token sequence for `text` from `CMU_DICT`.

    The text is lowercased and split into runs of letters and apostrophes.
    Each word is looked up in `CMU_DICT`; its first listed pronunciation is
    converted with `arpa_seq_to_ipa` and appended to the result.

    Words with no dictionary entry are still omitted from the result, but a
    `UserWarning` now names them: a silently shortened target would otherwise
    show up downstream as spurious edits against the speaker.

    Args:
        text: Sentence(s) to convert.

    Returns:
        Flat list of IPA tokens for all words that were found.
    """
    import warnings

    words = re.findall(r"[A-Za-z']+", text.lower())
    ipa_parts: list[str] = []
    missing: list[str] = []
    for w in words:
        # Try the word as written first (the dictionary may contain
        # apostrophe-initial forms), then without surrounding apostrophes so
        # quoted words like 'hello' are not treated as out-of-vocabulary.
        pron_list = CMU_DICT.get(w) or CMU_DICT.get(w.strip("'"))
        if not pron_list:
            missing.append(w)
            continue  # skip OOV
        ipa_parts.extend(arpa_seq_to_ipa(pron_list[0]))
    if missing:
        warnings.warn(
            f"No CMUdict entry for {missing}; these words are omitted from the target IPA",
            stacklevel=2,
        )
    return ipa_parts


def tokens_to_str(tokens: list[str]) -> str:
    """Render `tokens` as one string with single spaces between items."""
    separator = " "
    return separator.join(tokens)

In [5]:
def cache_path(kind: str, key: str) -> Path:
    """Return the JSON cache file path for `key` under `CACHE_DIR`.

    The key is made filesystem-friendly by replacing every run of characters
    outside ``[A-Za-z0-9_.-]`` with ``_`` and trimming ``_`` from both ends.
    An 8-hex-digit SHA-1 prefix of the original key is appended whenever that
    transformation lost information, so distinct keys do not share a file:

    * sanitised name empty            -> ``<digest>``
    * sanitised name longer than 80   -> ``<first 70 chars>_<digest>``
    * sanitised name differs from key -> ``<name>_<digest>``
    * otherwise                       -> ``<name>``

    The first two cases and the last produce the same names as before. Short
    keys containing replaced characters now get a digest suffix, so an older
    cache file for such a key is simply not reused.

    Args:
        kind: Filename prefix identifying the payload type.
        key: Arbitrary string identifying the cached item.
    """
    safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", key).strip("_")
    # SHA-1 is used only as a short fingerprint, not for security.
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()[:8]
    if not safe:
        safe = digest
    elif len(safe) > 80:
        safe = f"{safe[:70]}_{digest}"
    elif safe != key:
        # e.g. "a b" and "a, b" both sanitise to "a_b"; the digest separates them.
        safe = f"{safe}_{digest}"
    return CACHE_DIR / f"{kind}_{safe}.json"


def load_cache(path: Path):
    """Load a cached JSON payload, or return ``None`` if `path` does not exist.

    The file is read as UTF-8 explicitly: cached payloads contain raw IPA
    characters, and the platform default encoding is not guaranteed to be
    UTF-8.

    Args:
        path: Location of the JSON cache file.

    Returns:
        The decoded JSON value, or ``None`` when the file is absent.
    """
    if path.exists():
        with path.open(encoding="utf-8") as f:
            return json.load(f)
    return None


def save_cache(path: Path, payload) -> None:
    """Write `payload` to `path` as indented JSON, creating parent directories.

    Non-ASCII characters are written unescaped (``ensure_ascii=False``), so
    the file is opened as UTF-8 explicitly; with a non-UTF-8 platform default
    the IPA symbols could fail to encode.

    Args:
        path: Destination file.
        payload: JSON-serialisable value.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


In [6]:
def get_target_ipa(text: str, refresh: bool = False):
    """Return the dictionary-derived target IPA payload for `text`, cached on disk.

    A cached payload is reused only if it is a dict whose ``"text"`` field
    equals `text`, so a cache file that belongs to a different sentence is
    never returned by mistake. The cache does not track changes to the
    phone mapping or the dictionary; pass ``refresh=True`` after changing
    either to recompute and overwrite the entry.

    Args:
        text: Sentence(s) to convert.
        refresh: Ignore any existing cache entry and recompute.

    Returns:
        Dict with keys ``"text"``, ``"ipa_tokens"`` and ``"as_string"``.
    """
    path = cache_path("target_ipa", text)
    cached = None if refresh else load_cache(path)
    if isinstance(cached, dict) and cached.get("text") == text:
        return cached
    ipa_tokens = cmu_target_ipa_tokens(text)
    payload = {
        "text": text,
        "ipa_tokens": ipa_tokens,
        "as_string": tokens_to_str(ipa_tokens),
    }
    save_cache(path, payload)
    return payload


def get_pr_prediction(audio, audio_path: Path, refresh: bool = False):
    """Return the `run_pr` result for `audio`, cached on disk by resolved path.

    A cached payload is reused only if it is a dict whose ``"audio"`` field
    equals the resolved `audio_path`. The cache is keyed by path alone: it
    does not notice a changed file or a different model, so pass
    ``refresh=True`` to recompute and overwrite the entry in those cases.

    Args:
        audio: Waveform passed to `run_pr` on a cache miss.
        audio_path: Path of the recording; used only to build the cache key.
        refresh: Ignore any existing cache entry and recompute.

    Returns:
        Dict with keys ``"audio"``, ``"raw"`` and ``"tokens"``.
    """
    key = str(audio_path.resolve())
    path = cache_path("pr", key)
    cached = None if refresh else load_cache(path)
    if isinstance(cached, dict) and cached.get("audio") == key:
        return cached
    raw, tokens = run_pr(audio)
    payload = {"audio": key, "raw": raw, "tokens": tokens}
    save_cache(path, payload)
    return payload


def get_g2p_prediction(text: str, audio, audio_path: Path, refresh: bool = False):
    """Return the `run_g2p` result for `text` and `audio`, cached on disk.

    The cache key combines `text` with the resolved `audio_path`. A cached
    payload is reused only if it is a dict whose ``"text"`` and ``"audio"``
    fields both match the request. The cache does not notice a changed audio
    file or a different model; pass ``refresh=True`` to recompute and
    overwrite the entry in those cases.

    Args:
        text: Prompt text passed to `run_g2p`.
        audio: Waveform passed to `run_g2p` on a cache miss.
        audio_path: Path of the recording; used only for the cache key/payload.
        refresh: Ignore any existing cache entry and recompute.

    Returns:
        Dict with keys ``"text"``, ``"audio"``, ``"raw"`` and ``"tokens"``.
    """
    resolved_audio = str(audio_path.resolve())
    key = f"{text}::{resolved_audio}"
    path = cache_path("g2p", key)
    cached = None if refresh else load_cache(path)
    if (
        isinstance(cached, dict)
        and cached.get("text") == text
        and cached.get("audio") == resolved_audio
    ):
        return cached
    raw, tokens = run_g2p(text, audio)
    payload = {
        "text": text,
        "audio": resolved_audio,
        "raw": raw,
        "tokens": tokens,
    }
    save_cache(path, payload)
    return payload


In [7]:
# Resolve all three inputs through the JSON cache helpers (computed on a miss).
target = get_target_ipa(TARGET_TEXT)
pr_pred = get_pr_prediction(speech, AUDIO_PATH)
g2p_pred = get_g2p_prediction(TARGET_TEXT, speech, AUDIO_PATH)

# Compare the recognised phones with the dictionary-derived target phones.
# NOTE(review): arguments are passed as (prediction, target) — confirm this is
# the order `edit_operations` expects.
ops = edit_operations(pr_pred["tokens"], target["ipa_tokens"])

# Bundle everything for display here and for saving in a later cell.
# NOTE(review): "audio_path" is an absolute local path and ends up in the
# notebook output and the saved JSON.
results = {
    "target_text": TARGET_TEXT,
    "audio_path": str(AUDIO_PATH.resolve()),
    "target_ipa_tokens": target["ipa_tokens"],
    "target_ipa_str": target["as_string"],
    "g2p_tokens": g2p_pred["tokens"],
    "pr_tokens": pr_pred["tokens"],
    "edit_operations": ops,
}
results


{'target_text': 'The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.',
 'audio_path': '/Users/umitcanevleksiz/Documents/Programming/senior/sig/ipa/audio/12/umit12-r.wav',
 'target_ipa_tokens': ['ð',
  'ə',
  'w',
  'ɛ',
  'ð',
  'ɝ',
  'ɪ',
  'z',
  'ɹ',
  'æ',
  'ð',
  'ɝ',
  'w',
  'ɔ',
  'ɹ',
  'm',
  'ð',
  'ɪ',
  's',
  'θ',
  'ɝ',
  'z',
  'd',
  'eɪ',
  'aɪ',
  'θ',
  'ɪ',
  'ŋ',
  'k',
  'w',
  'i',
  'ʃ',
  'ʊ',
  'd',
  'ɡ',
  'oʊ',
  't',
  'u',
  'ð',
  'ə',
  'θ',
  'i',
  'ə',
  't',
  'ɝ',
  't',
  'ə',
  'ɡ',
  'ɛ',
  'ð',
  'ɝ',
  'θ',
  'æ',
  'ŋ',
  'k',
  'j',
  'u',
  'f',
  'ɔ',
  'ɹ',
  'θ',
  'ɪ',
  'ŋ',
  'k',
  'ɪ',
  'ŋ',
  'ə',
  'b',
  'aʊ',
  't',
  'ð',
  'ɪ',
  's',
  'θ',
  'ɝ',
  'oʊ',
  'l',
  'i'],
 'target_ipa_str': 'ð ə w ɛ ð ɝ ɪ z ɹ æ ð ɝ w ɔ ɹ m ð ɪ s θ ɝ z d eɪ aɪ θ ɪ ŋ k w i ʃ ʊ d ɡ oʊ t u ð ə θ i ə t ɝ t ə ɡ ɛ ð ɝ θ æ ŋ k j u f ɔ ɹ θ ɪ ŋ k ɪ ŋ ə b aʊ t ð ɪ s θ ɝ 

In [8]:
def save_results(payload: dict, text: str, audio_path: Path):
    """Persist `payload` as a ``results`` cache entry and return its path.

    The cache key is `text` and the resolved `audio_path` joined by ``::``.
    """
    destination = cache_path("results", f"{text}::{audio_path.resolve()}")
    save_cache(destination, payload)
    return destination


# Write the bundled results into the cache directory and display the file path.
results_path = save_results(results, TARGET_TEXT, AUDIO_PATH)
results_path


PosixPath('cache/results_The_weather_is_rather_warm_this_Thursday._I_think_we_should_go_to_the__32a476b1.json')