In [1]:
from pathlib import Path
import json#
import random

In [2]:
# Directory holding the single-turn evaluation prompt files (relative to the working directory).
data = Path("data", "evaluation", "single_turn")

In [6]:
# Collect the synthetic prompt files: JSON files whose name contains "syn" but not "fake".
# The filter looks at the file name only, so directory names can never influence
# the match, and the result is sorted because glob order depends on the filesystem.
paths = sorted(
    p for p in data.glob("*.json") if "fake" not in p.name and "syn" in p.name
)

# Map file stem -> parsed JSON content. Decode explicitly as UTF-8 instead of
# relying on the platform's locale encoding.
all_prompts = {p.stem: json.loads(p.read_text(encoding="utf-8")) for p in paths}

# Show how many entries each file contains.
{k: len(v) for k, v in all_prompts.items()}

{'closed_until_synthetic': 100,
 'opening_hours_synthetic': 100,
 'open_until_synthetic': 100,
 'open_website_synthetic': 100,
 'wheelchair_accessible_synthetic': 100,
 'navigation_link_synthetic': 50,
 'building_location_synthetic': 85,
 'open_now_synthetic': 100}

In [7]:
# Number of single-turn prompts drawn per category for each speaker.
K: int = 10

In [8]:

# Write one Markdown checklist per speaker. For every prompt category, draw K
# random entries and list them as "- [ ] (NN) <prompt>" under a "## <category>"
# heading, where NN is the entry's index within that category.
# NOTE: sampling is unseeded and write_text() overwrites existing files, so
# re-running this cell replaces the checklists with a different selection.
audio_dir = data.parent / "audio"
audio_dir.mkdir(parents=True, exist_ok=True)  # don't assume the folder exists yet

for name in ("frederik", "thomas"):
    sections = []

    for key, values in all_prompts.items():
        # random.sample raises ValueError when asked for more items than exist,
        # so cap the sample size at the category size.
        sampled_nums = random.sample(range(len(values)), min(K, len(values)))
        items = "\n".join(
            "- [ ] " + f"({num:02d}) " + values[num]["prompts"][0]["reformulated_prompt"]
            for num in sampled_nums
        )
        sections.append(f"## {key}\n\n" + items + "\n\n")

    # Explicit UTF-8 so non-ASCII characters in prompts do not depend on the locale.
    (audio_dir / f"{name}_audio_samples.md").write_text("".join(sections), encoding="utf-8")

In [10]:
# Number of multi-turn conversations selected per speaker.
K_multi = 15

# The multi-turn prompts live in a sibling folder of the single-turn directory.
multi_turn_p = data.parent / "multi_turn" / "multi_turns.json"
# Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
data_multi = json.loads(multi_turn_p.read_text(encoding="utf-8"))

In [19]:
# Draw K_multi conversation indices per speaker from disjoint ranges, so the two
# speakers never get the same conversation: the first from 0-99, the second from 100-199.
# NOTE(review): assumes data_multi holds at least 200 conversations — TODO confirm.
fst = random.sample(range(100), K_multi)
snd = [i + 100 for i in random.sample(range(100), K_multi)]

audio_dir = data.parent / "audio"
audio_dir.mkdir(parents=True, exist_ok=True)  # don't assume the folder exists yet

for name, indices in zip(("frederik", "thomas"), (fst, snd)):
    sections = []

    for ind in indices:
        # One checklist item per turn, under a heading that records the conversation index.
        turns = "\n".join(
            "- [ ] " + p["reformulated_prompt"] for p in data_multi[ind]["prompts"]
        )
        sections.append(f"## Multi Turn ({ind})\n\n" + turns + "\n\n")

    # Explicit UTF-8 so non-ASCII characters in prompts do not depend on the locale.
    (audio_dir / f"{name}_multi_turn_audio_samples.md").write_text(
        "".join(sections), encoding="utf-8"
    )

## Rename Audio Files

In [38]:
# Recorded multi-turn voice notes, ordered by path (sorted() already returns a list).
# NOTE(review): ordering is lexicographic — verify the file names sort in recording order.
multi_turn_audio_dir = Path("data/evaluation/audio/multi_turn")
voice_notes = sorted(multi_turn_audio_dir.glob("Voice*"))

In [36]:
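# If `fst` / `snd` are no longer defined in the kernel, recover the indices by
# parsing the "## Multi Turn (N)" headings of the generated markdown files.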

In [41]:
# Target file names for the recordings of the `snd` conversations: one
# "<conversation index>-<turn>.m4a" per turn, in the order the indices appear in `snd`.
new_names = [
    f"{conv_index:03d}-{turn}.m4a"
    for conv_index in snd
    for turn in range(data_multi[conv_index]["num_turns"])
]

# Display both counts side by side; they should agree before any renaming.
len(new_names), len(voice_notes)

(45, 45)

In [40]:
# Rename each recording to its "<index>-<turn>.m4a" target, pairing by position.
# zip() silently stops at the shorter sequence, so a count mismatch would go
# unnoticed; refuse to rename anything unless both lists have the same length.
if len(voice_notes) != len(new_names):
    raise ValueError(
        f"{len(voice_notes)} recordings but {len(new_names)} target names; nothing renamed"
    )

for old, new in zip(voice_notes, new_names):
    # with_name keeps the file in its directory and replaces only the file name.
    old.rename(old.with_name(new))

In [43]:
#for p in Path("data/evaluation/audio/multi_turn").glob("*"):
#   p.rename(p.with_name(p.name + ".m4a"))

In [49]:
import re

# Recover, per category, the prompt numbers listed in the generated checklist.
# Match headings that start with ##
heading_regex = r"^##\s+(.*)$"
# Match unchecked task items with a number in parentheses
task_regex = r"^-\s+\[\s*\]\s*\((\d+)\)\s+(.*)$"

# Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
text = Path("data/evaluation/audio/thomas_audio_samples.md").read_text(encoding="utf-8")

# Walk the file line by line and attach every task to the heading above it.
# This does not rely on each section holding a fixed number of items, so a
# shorter or longer section cannot shift numbers into the wrong category.
# NOTE: this rebinds `data`, which earlier cells use as the single-turn
# directory Path; re-run the cell defining that path before re-running them.
data = {}
current_numbers = None
for line in text.split("\n"):
    heading = re.match(heading_regex, line)
    if heading:
        current_numbers = data.setdefault(heading.group(1), [])
        continue
    task = re.match(task_regex, line)
    if task and current_numbers is not None:
        # Keep the number as the zero-padded string written in the checklist.
        current_numbers.append(task.group(1))
data

{'closed_until_synthetic': ['06',
  '53',
  '76',
  '86',
  '60',
  '30',
  '78',
  '47',
  '79',
  '45'],
 'opening_hours_synthetic': ['44',
  '80',
  '50',
  '55',
  '77',
  '58',
  '46',
  '13',
  '69',
  '05'],
 'open_until_synthetic': ['24',
  '33',
  '50',
  '98',
  '22',
  '36',
  '92',
  '38',
  '54',
  '04'],
 'open_website_synthetic': ['13',
  '93',
  '50',
  '43',
  '17',
  '11',
  '46',
  '27',
  '76',
  '98'],
 'wheelchair_accessible_synthetic': ['99',
  '10',
  '51',
  '04',
  '23',
  '08',
  '06',
  '46',
  '91',
  '57'],
 'navigation_link_synthetic': ['15',
  '05',
  '27',
  '36',
  '30',
  '21',
  '02',
  '28',
  '47',
  '08'],
 'building_location_synthetic': ['11',
  '12',
  '63',
  '67',
  '57',
  '64',
  '41',
  '82',
  '40',
  '83'],
 'open_now_synthetic': ['73',
  '42',
  '33',
  '58',
  '40',
  '67',
  '52',
  '31',
  '22',
  '39']}

In [54]:
# Flatten the category -> prompt-number mapping into "<category>-<number>.m4a"
# file names, keeping the mapping's category order and each list's item order.
s_new_names = [
    f"{category}-{prompt_num}.m4a"
    for category, prompt_nums in data.items()
    for prompt_num in prompt_nums
]

In [55]:
# Recorded single-turn voice notes, ordered by path (sorted() already returns a list).
# NOTE(review): ordering is lexicographic — verify the file names sort in recording order.
single_turn_audio_dir = Path("data/evaluation/audio/single_turn")
s_voice_notes = sorted(single_turn_audio_dir.glob("Voice*"))

# Display both counts side by side; they should agree before any renaming.
len(s_new_names), len(s_voice_notes)

(80, 80)

In [56]:
# Rename each recording to its "<category>-<number>.m4a" target, pairing by position.
# zip() silently stops at the shorter sequence, so a count mismatch would go
# unnoticed; refuse to rename anything unless both lists have the same length.
if len(s_voice_notes) != len(s_new_names):
    raise ValueError(
        f"{len(s_voice_notes)} recordings but {len(s_new_names)} target names; nothing renamed"
    )

for old, new in zip(s_voice_notes, s_new_names):
    # with_name keeps the file in its directory and replaces only the file name.
    old.rename(old.with_name(new))