In [None]:
from __future__ import annotations
import sys
from pathlib import Path
import numpy as np
import soundfile as sf

# Put the `mod` directory (two levels above the notebook's working directory)
# at the front of sys.path so packages inside it, e.g. `assessment`, can be
# imported directly.
repo_root = Path("..") / ".." / "mod"
sys.path[:0] = [str(repo_root.absolute())]



In [None]:
from espnet2.bin.s2t_inference import Speech2Text
from assessment.edit_distance import edit_operations

# Inference settings used when the models are loaded.
DEVICE = "cpu"  # switch to "cuda" when a GPU is available
LANG = "<eng>"  # language symbol handed to the models
# Phone recognition and G2P are both loaded from the same checkpoint name.
PR_MODEL = G2P_MODEL = "espnet/powsm"



In [None]:
# Input recording and the text the speaker was asked to read.
# The recording is addressed relative to the notebook's working directory
# (sig/exp, the same assumption the sys.path setup above relies on) instead of
# an absolute path inside one user's home directory, so the notebook runs on
# other machines and does not embed a local username.
AUDIO_PATH = Path("audio/powsm/12/umit12-r.wav")
TARGET_TEXT = "The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly."

# Report the resolved location up front; a relative path that does not exist
# usually means the kernel was started from a different directory.
if not AUDIO_PATH.is_file():
    raise FileNotFoundError(f"Audio file not found: {AUDIO_PATH.resolve()}")

speech, rate = sf.read(AUDIO_PATH)
# Anything other than 16 kHz is rejected outright; no resampling is attempted.
if rate != 16000:
    raise ValueError(f"Expected 16kHz audio, got {rate}")



In [None]:
# Build one Speech2Text instance per task symbol: "<pr>" for `pr` and "<g2p>"
# for `g2p`. Device and language symbol are shared by both.
_shared_model_kwargs = {"device": DEVICE, "lang_sym": LANG}
pr = Speech2Text.from_pretrained(PR_MODEL, task_sym="<pr>", **_shared_model_kwargs)
g2p = Speech2Text.from_pretrained(G2P_MODEL, task_sym="<g2p>", **_shared_model_kwargs)



In [None]:
def parse_pred(raw: str) -> tuple[str, list[str]]:
    """Split a raw model prediction into its cleaned text and its tokens.

    When the ``<notimestamps>`` marker occurs in ``raw``, only the text that
    follows its first occurrence (up to the next occurrence, if any) is kept.
    The kept text is stripped, then split on ``//``. Each piece is stripped
    and loses at most one leading and one trailing ``/``; pieces that end up
    empty are discarded.

    Returns:
        A ``(cleaned_text, tokens)`` pair.
    """
    marker = "<notimestamps>"
    _, found, after_marker = raw.partition(marker)
    if found:
        # Keep only the segment between the first and second marker.
        raw = after_marker.split(marker, 1)[0]
    cleaned = raw.strip()

    def _unwrap(piece: str) -> str:
        # Remove surrounding whitespace, then a single "/" from each end.
        piece = piece.strip()
        if piece[:1] == "/":
            piece = piece[1:]
        if piece[-1:] == "/":
            piece = piece[:-1]
        return piece

    tokens = [tok for tok in map(_unwrap, cleaned.split("//")) if tok]
    return cleaned, tokens


def run_pr(audio):
    """Run the module-level ``pr`` model on ``audio`` and parse its output.

    The model is called with ``text_prev="<na>"``. Element ``[0][0]`` of its
    result (presumably the decoded text of the first hypothesis; confirm
    against the model API) is passed to ``parse_pred``.

    Returns:
        The ``(cleaned_text, tokens)`` pair produced by ``parse_pred``.
    """
    hypotheses = pr(audio, text_prev="<na>")
    first_text = hypotheses[0][0]
    return parse_pred(first_text)


def run_g2p(text: str, audio=None):
    """Run the module-level ``g2p`` model with ``text`` as ``text_prev``.

    Args:
        text: Text passed to the model as ``text_prev``.
        audio: Optional audio input. When ``None``, an array of 1600 float32
            zeros is used in its place.

    Returns:
        The ``(cleaned_text, tokens)`` pair produced by ``parse_pred`` from
        element ``[0][0]`` of the model result.
    """
    if audio is None:
        audio_in = np.zeros(1600, dtype=np.float32)
    else:
        audio_in = audio
    hypotheses = g2p(audio_in, text_prev=text)
    return parse_pred(hypotheses[0][0])



In [None]:
import re

# Lookup from ARPAbet symbols (without stress digits) to IPA strings.
ARPA_TO_IPA = {
    "AA": "ɑ",
    "AE": "æ",
    "AH": "ə",
    "AO": "ɔ",
    "AW": "aʊ",
    "AY": "aɪ",
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "EH": "ɛ",
    "ER": "ɝ",
    "EY": "eɪ",
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "IH": "ɪ",
    "IY": "i",
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "OW": "oʊ",
    "OY": "ɔɪ",
    "P": "p",
    "R": "ɹ",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "UH": "ʊ",
    "UW": "u",
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}


def arpa_seq_to_ipa(arpa_seq):
    """Convert a sequence of ARPAbet symbols into one concatenated IPA string.

    Trailing ``0``/``1``/``2`` characters are removed from each symbol before
    it is looked up in ``ARPA_TO_IPA``. A symbol with no table entry is
    emitted as its lower-cased, digit-stripped form.
    """
    stripped_symbols = (symbol.rstrip("012") for symbol in arpa_seq)
    return "".join(
        ARPA_TO_IPA.get(stripped, stripped.lower()) for stripped in stripped_symbols
    )


def cmu_target_ipa(text: str) -> str:
    """Build a reference IPA string for ``text`` from the CMU dictionary.

    ``text`` is lower-cased and split into runs of ASCII letters and
    apostrophes. Each word is looked up in ``cmudict``; the first listed
    pronunciation is converted with ``arpa_seq_to_ipa`` and the results are
    concatenated without separators.

    Words that have no dictionary entry are left out of the result. Because
    that shortens the reference transcription, they are reported in a single
    ``UserWarning`` instead of being dropped silently.

    Args:
        text: Orthographic text to transcribe.

    Returns:
        The concatenated IPA string for all words found in the dictionary.

    Raises:
        ImportError: If the ``cmudict`` package is not installed.
    """
    try:
        import cmudict
    except ImportError as e:
        raise ImportError("cmudict not installed; install with `pip install cmudict`") from e
    import warnings

    cmu = cmudict.dict()
    words = re.findall(r"[A-Za-z']+", text.lower())
    ipa_parts = []
    missing = []
    for w in words:
        pron_list = cmu.get(w)
        if not pron_list:
            missing.append(w)  # out-of-vocabulary: contributes nothing
            continue
        # Only the first listed pronunciation is used.
        ipa_parts.append(arpa_seq_to_ipa(pron_list[0]))
    if missing:
        warnings.warn(
            f"cmudict has no entry for {missing}; these words are missing from the target IPA",
            stacklevel=2,
        )
    return "".join(ipa_parts)

In [None]:
import json

# Directory holding one JSON cache file per audio file stem. It is created
# (relative to the current working directory) as soon as this cell runs.
CACHE_DIR = Path("cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def cache_path(audio_path: Path) -> Path:
    """Return the cache file for ``audio_path``: ``CACHE_DIR/<stem>.json``."""
    return CACHE_DIR / f"{audio_path.stem}.json"


def load_cached(audio_path: Path):
    """Return the decoded JSON cached for ``audio_path``, or ``None``.

    ``None`` means "no usable cache": the file is missing, cannot be read,
    is not valid UTF-8, or is not valid JSON.
    """
    p = cache_path(audio_path)
    try:
        # Read as UTF-8 explicitly: the cache stores non-ASCII IPA text and
        # the platform default encoding is not guaranteed to handle it.
        return json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return None


def save_cached(audio_path: Path, data: dict):
    """Write ``data`` as JSON to the cache file for ``audio_path``.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is always encoded as UTF-8 regardless of the platform default.
    """
    cache_path(audio_path).write_text(
        json.dumps(data, ensure_ascii=False), encoding="utf-8"
    )


In [None]:
# Directory for the guided-transcription cache, one JSON file per audio stem.
# It is created (relative to the current working directory) when this cell runs.
guided_CACHE_DIR = Path("cache/guided")
guided_CACHE_DIR.mkdir(parents=True, exist_ok=True)


def guided_cache_path(audio_path: Path) -> Path:
    """Return ``guided_CACHE_DIR/<stem>.guided.json`` for ``audio_path``."""
    return guided_CACHE_DIR / f"{audio_path.stem}.guided.json"


def load_cached_guided(audio_path: Path) -> str | None:
    """Return the cached ``"guided_ipa"`` value for ``audio_path``, or ``None``.

    ``None`` is returned when the cache file is missing, unreadable, not valid
    UTF-8 / JSON, does not hold a JSON object, or has no ``"guided_ipa"`` key.
    """
    p = guided_cache_path(audio_path)
    try:
        # Explicit UTF-8: the payload is IPA text written with ensure_ascii=False.
        data = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: bad JSON or bad UTF-8.
        return None
    if not isinstance(data, dict):
        return None
    return data.get("guided_ipa")


def save_cached_guided(audio_path: Path, guided_ipa: str):
    """Store ``guided_ipa`` under the ``"guided_ipa"`` key as UTF-8 JSON."""
    guided_cache_path(audio_path).write_text(
        json.dumps({"guided_ipa": guided_ipa}, ensure_ascii=False),
        encoding="utf-8",
    )



In [None]:
def _as_ipa_string(pred):
    """Normalise a prediction to a plain IPA string.

    ``run_pr`` / ``run_g2p`` return a ``(raw_text, tokens)`` pair, and cache
    files written before this normalisation store that pair as a two-element
    JSON list. Both forms are collapsed to the tokens joined without
    separators, which is the same delimiter-free shape as ``target_ipa``.
    A value that is already a string is returned unchanged.
    """
    if isinstance(pred, str):
        return pred
    _raw, tokens = pred
    return "".join(tokens)


target_ipa = cmu_target_ipa(TARGET_TEXT)

# Treat anything other than a JSON object as "no cache".
cached = load_cached(AUDIO_PATH)
if not isinstance(cached, dict):
    cached = {}

has_actual = "actual_ipa" in cached
has_guided = "guided_ipa" in cached

# Only run the (slow) models for values the cache does not already hold.
actual_ipa = _as_ipa_string(cached["actual_ipa"] if has_actual else run_pr(speech))
guided_ipa = _as_ipa_string(
    cached["guided_ipa"] if has_guided else run_g2p(TARGET_TEXT, speech)
)

# NOTE(review): tokenize_ipa is not defined or imported in any cell above;
# on a fresh kernel this raises NameError. It is assumed to take a plain IPA
# string (as target_ipa is) and return a token list; confirm and import it.
token_actual = tokenize_ipa(actual_ipa)
token_guided = tokenize_ipa(guided_ipa)
token_target = tokenize_ipa(target_ipa)

# Rewrite the cache whenever either prediction had to be recomputed.
if not (has_actual and has_guided):
    save_cached(AUDIO_PATH, {
        "actual_ipa": actual_ipa,
        "guided_ipa": guided_ipa,
        "token_actual": token_actual,
        "token_guided": token_guided,
        "token_target": token_target,
    })

# Edit operations between the recognised tokens and the reference tokens.
ops = edit_operations(token_actual, token_target)



In [None]:
import json

# Collect the inputs and results of this run in one JSON-serialisable record.
summary = {
    "target_text": TARGET_TEXT,
    "target_ipa": token_target,
    "actual_ipa": token_actual,
    "guided_ipa": token_guided,
    "ops": ops,
    "ops_len": len(ops),
}
print(summary)
# Save the summary as JSON. The file is opened as UTF-8 explicitly because
# ensure_ascii=False writes IPA characters verbatim, and the platform default
# encoding is not guaranteed to be able to encode them.
with open("powsm_minimal.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=4)