Creating a lightweight preprocessing script is almost always the best approach when you want consistent, structured parsing — especially from voice-to-text.

How we did define a Controlled Natural Language (CNL)
Create a semi-natural format that's:

1. Easy to dictate or speak

2. Consistent enough for parsing


GOOD Example:

"Title Operation Coastal Shield. Description Allied forces retreat. Unit U1 British Infantry Infantry 85 Friendly 3 -2.5. Feature Bunker 0 2 size 10. Event 0.01 German Armor fires U3 Fire."

Bad Example:

"We need to pull back. The Brits are getting hit hard. The tanks are closing in. French guys are holding. There's a bunker north of them."
Parsing that consistently into structured data is... tough without some help.

In [4]:
!pip install gtts
!pip install pydub
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg

!pip install whisper

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.4
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-wliecetk
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-wliecetk
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktok

In [6]:
from pydantic import BaseModel
from typing import List, Optional, Tuple

class Unit(BaseModel):
    id: str
    name: str
    type: str  # e.g. "infantry", "armor", "air support"
    strength: int  # combat effectiveness, 0–100
    position: Optional[tuple[float, float]] = None  # (x, y) coords
    allegiance: str  # "friendly" or "enemy"
    status: str = "active"  # e.g. "active", "retreating", "destroyed"

class TerrainFeature(BaseModel):
    type: str  # e.g. "hill", "forest", "building"
    position: tuple[float, float]
    size: float  # area/radius in meters

class Terrain(BaseModel):
    type: str
    features: List[TerrainFeature]
    dimensions: Tuple[int, int]  # width x height in meters or grid
    map_size: Optional[str] = None  # e.g., "20x20 hexes"
    terrain_types: Optional[List[str]] = None  # forest, desert, urban...
    obstacles: Optional[List[str]] = None  # walls, bunkers, etc.
    elevation_features: Optional[List[str]] = None  # hills, ridges, etc.
    weather: Optional[str] = None  # clear, fog, night...
    lighting: Optional[str] = None  # daylight, artificial, etc.

class Objective(BaseModel):
    id: str
    description: str
    controlling_unit_ids: List[str] = []
    completed: bool = False
    location: Optional[tuple[float, float]]
    priority: int = 1  # Higher number = more critical

class BattleEvent(BaseModel):
    timestamp: object  # e.g. "00:05", "12:03 PM"
    description: str
    involved_units: List[str] = []
    event_type: str  # e.g. "move", "fire", "retreat", "reinforce"

class Scenario(BaseModel):
    title: str
    description: str
    terrain: Terrain
    units: List[Unit]
    objectives: List[Objective]
    timeline: List[BattleEvent]

In [60]:
import re
from collections import defaultdict
from pprint import pprint


# Define the keyword labels we want to tokenize by
KEYWORDS = ['Title', 'Description', 'Unit', 'Feature', 'Objective', 'Event']


def tokenize_by_keyword(text: str):
    text = text.replace("minus", "-")  # Normalize voice-to-text quirks
    pattern = r'\b(' + '|'.join(KEYWORDS) + r')\b(?:\s+is)?'
    tokens = re.split(pattern, text)

    # re.split gives us a list like: ['', 'Title', ' Operation X.', 'Unit', ' ID equals ...', ...]
    # We need to stitch it back together as {keyword: [chunks]}
    data = defaultdict(list)

    current_key = None
    for token in tokens:
        token = token.strip()
        if not token:
            continue
        if token in KEYWORDS:
            current_key = token
        elif current_key:
            data[current_key].append(token)

    return data


# Sample input
input_text = """Title is Operation Coastal Shield. Description is Allied Forces are retreating under fire.
    Unit is ID equals U1. Name equals British Infantry. Type equals Infantry. Strength equals 85. Allegiance equals Friendly. X equals 3. Y equals minus 2.5.
    Unit is ID equals U2. Name equals French Infantry. Type equals Infantry. Strength equals 80. Allegiance equals Friendly. X equals 1. Y equals minus 2.2.
    Unit is ID equals U3. Name equals German Armor. Type equals Armor. Strength equals 92. Allegiance equals Enemy. X equals 2. Y equals minus 1.8.
    Feature is Type equals Bunker. X equals 0. Y equals 2. Size equals 10.
    Objective is ID equals O1. Desk equals evacuate to Boats. X equals 4. Y equals 0.5. Priority equals 1.
    Event is Time equals 0.00. Desk equals British Infantry fallback. Units equals U1. Type equals Move.
    Event is Time equals 0.01. Desk equals German Armor fires. Units equals U3. Type equals Fire.
    Event is Time equals 0.01. Desk equals French Infantry Holds. Units equals U2. Type equals Hold."""

# Run the tokenizer
tokenized_data = tokenize_by_keyword(input_text)

# Pretty print the result
pprint(dict(tokenized_data))
for item in tokenized_data['Unit']:
  print(item)

{'Description': ['Allied Forces are retreating under fire.'],
 'Event': ['Time equals 0.00. Desk equals British Infantry fallback. Units '
           'equals U1. Type equals Move.',
           'Time equals 0.01. Desk equals German Armor fires. Units equals U3. '
           'Type equals Fire.',
           'Time equals 0.01. Desk equals French Infantry Holds. Units equals '
           'U2. Type equals Hold.'],
 'Feature': ['Type equals Bunker. X equals 0. Y equals 2. Size equals 10.'],
 'Objective': ['ID equals O1. Desk equals evacuate to Boats. X equals 4. Y '
               'equals 0.5. Priority equals 1.'],
 'Title': ['Operation Coastal Shield.'],
 'Unit': ['ID equals U1. Name equals British Infantry. Type equals Infantry. '
          'Strength equals 85. Allegiance equals Friendly. X equals 3. Y '
          'equals - 2.5.',
          'ID equals U2. Name equals French Infantry. Type equals Infantry. '
          'Strength equals 80. Allegiance equals Friendly. X equals 1. Y '
         

In [78]:
unit_id = re.compile(r"ID\s?\w+\s?(\w+)\.")
unit_name= re.compile(r"Name\s+\w+\s(.+?)\.")
unit_type= re.compile(r"Type\s+\w+\s(\w+)\.")
unit_ste= re.compile(r"Strength\s+\w+\s(\d+)\.")
unit_all= re.compile(r"Allegiance\s+\w+\s(\w+)\.")
unit_x= re.compile(r"X\s+\w+\s(-?\d+(?:\.\d+)?)\.")
unit_y= re.compile(r"Y\s+\w+\s(-?\d+(?:\.\d+)?)\.")
unit_status= re.compile(r"Status\s+\w+\s(-?\d+(?:\.\d+)?)\.")

def parse_units(chunks):
    parsed_units = []
    for chunk in chunks:
       # chunk = chunk.replace("minus", "-")  # handle speech-to-text quirks if an
        parsed_units.append(Unit(
               id=unit_id.search(chunk).group(0),
               name=unit_name.search(chunk).group(0),
               type=unit_type.search(chunk)).group(0),
               strength=int(unit_ste.search(chunk)).group(0),
               allegiance=unit_all.search(chunk).group(0),
               position=(float(unit_x.search(chunk).group(0)),
                         float(unit_y.search(chunk).group(0))))

    return parsed_units

# Run the parser
parse_units(tokenized_data['Unit'])

uii ="""U1 British Infantry Infantry 85 Friendly 3 -2.5 active.
U1 British Infantry Infantry 85 Friendly 3 -2.5 destroyed.
U2 French Infantry Infantry 80 Friendly 1 -2.2 active."""

ValidationError: 3 validation errors for Unit
type
  Input should be a valid string [type=string_type, input_value=<re.Match object; span=(4...'Type equals Infantry.'>, input_type=Match]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type
strength
  Field required [type=missing, input_value={'id': 'ID equals U1.', '...Type equals Infantry.'>}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
allegiance
  Field required [type=missing, input_value={'id': 'ID equals U1.', '...Type equals Infantry.'>}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing

In [42]:
def parse_unit(chunk: str) -> Unit:
  units = []
  pattern = re.compile(
        r"ID equals (.*)\. Name equals (.+?)\. Type equals (.+?)\. Strength equals (\d+)\. "
        r"Allegiance equals (.+?)\. X equals (-?\d+\.?\d*)\. Y equals (-?\d+\.?\d*)\.")
  match = pattern.search(chunk)
  return Unit(
        id=match.group(1),
        name=match.group(2),
        type=match.group(3),
        strength=int(match.group(4)),
        allegiance=match.group(5),
        position=(float(match.group(6)), float(match.group(7)))
    )


def parse_feature(chunk: str) -> TerrainFeature:
    pattern = re.compile(
        r"Type equals (.+?)\. X equals (-?\d+\.?\d*)\. Y equals (-?\d+\.?\d*)\. Size equals (\d+)\."
    )
    match = pattern.search(chunk)
    return TerrainFeature(
        type=match.group(1),
        position=(float(match.group(2)), float(match.group(3))),
        size=float(match.group(4))
    )


def parse_objective(chunk: str) -> Objective:
    pattern = re.compile(
        r"ID equals (\w+)\. Desk equals (.+?)\. X equals (-?\d+\.?\d*)\. "
        r"Y equals (-?\d+\.?\d*)\. Priority equals (\d+)\."
    )
    match = pattern.search(chunk)
    return Objective(
        id=match.group(1),
        description=match.group(2),
        location=(float(match.group(3)), float(match.group(4))),
        priority=int(match.group(5))
    )


def parse_event(chunk: str) -> BattleEvent:
    pattern = re.compile(
        r"Time equals (\d+\.\d+)\. Desk equals (.+?)\. Units equals (\w+)\. Type equals (\w+)\."
    )
    match = pattern.search(chunk)
    return BattleEvent(
        timestamp=float(match.group(1)),
        description=match.group(2),
        involved_units=[match.group(3)],
        event_type=match.group(4).lower()
    )


# -------------------- MAIN PARSER --------------------

def parse_scenario(text: str) -> Scenario:
    tokens = dict(tokenize_by_keyword(text))
    print(tokens['Unit'])
    title = tokens['Title'][0] if tokens['Title'] else "Untitled Scenario"
    description = tokens['Description'][0] if tokens['Description'] else ""
    for item in tokens['Unit']:
      units = [parse_unit(item)]
    features = [parse_feature(chunk) for chunk in tokens['Feature']]
    objectives = [parse_objective(chunk) for chunk in tokens['Objective']]
    timeline = [parse_event(chunk) for chunk in tokens['Event']]

    terrain = Terrain(features=features)

    return Scenario(
        title=title,
        description=description,
        terrain=terrain,
        units=units,
        objectives=objectives,
        timeline=timeline
    )
if __name__ == "__main__":
    input_text = """Title is Operation Coastal Shield. Description is Allied Forces are retreating under fire.
    Unit is ID equals U1. Name equals British Infantry. Type equals Infantry. Strength equals 85. Allegiance equals Friendly. X equals 3. Y equals minus 2.5.
    Unit is ID equals U2. Name equals French Infantry. Type equals Infantry. Strength equals 80. Allegiance equals Friendly. X equals 1. Y equals minus 2.2.
    Unit is ID equals U3. Name equals German Armor. Type equals Armor. Strength equals 92. Allegiance equals Enemy. X equals 2. Y equals minus 1.8.
    Feature is Type equals Bunker. X equals 0. Y equals 2. Size equals 10.
    Objective is ID equals O1. Desk equals evacuate to Boats. X equals 4. Y equals 0.5. Priority equals 1.
    Event is Time equals 0.00. Desk equals British Infantry fallback. Units equals U1. Type equals Move.
    Event is Time equals 0.01. Desk equals German Armor fires. Units equals U3. Type equals Fire.
    Event is Time equals 0.01. Desk equals French Infantry Holds. Units equals U2. Type equals Hold."""

    scenario = parse_scenario(input_text)
    print(scenario.json(indent=2))

['ID equals U1. Name equals British Infantry. Type equals Infantry. Strength equals 85. Allegiance equals Friendly. X equals 3. Y equals - 2.5.', 'ID equals U2. Name equals French Infantry. Type equals Infantry. Strength equals 80. Allegiance equals Friendly. X equals 1. Y equals - 2.2.', 'ID equals U3. Name equals German Armor. Type equals Armor. Strength equals 92. Allegiance equals Enemy. X equals 2. Y equals - 1.8.']


AttributeError: 'NoneType' object has no attribute 'group'

In [20]:
from gtts import gTTS
from pydub import AudioSegment
import openai
import os
from gtts import gTTS
from IPython.display import Audio, display

scenario_text = """Title is Operation Coastal Shield
Description is Allied forces are retreating under fire.
Unit is ID=U1, Name=British Infantry, Type=infantry, Strength=85, Allegiance=friendly, X=-3, Y=-2.5
Unit is ID=U2, Name=French Infantry, Type=infantry, Strength=80, Allegiance=friendly, X=-1, Y=-2.2
Unit is ID=U3, Name=German Armor, Type=armor, Strength=92, Allegiance=enemy, X=2, Y=-1.8
Feature is Type=Bunker, X=0, Y=-2, Size=10
Objective is ID=O1, Desc=Evacuate to boats, X=4, Y=0.5, Priority=1
Event is Time=00:00, Desc=British Infantry fallback, Units=U1, Type=move
Event is Time=00:01, Desc=German Armor fires, Units=U3, Type=fire
Event is Time=00:01, Desc=French Infantry holds, Units=U2, Type=hold
"""
#TODO:add delemeinatorto end of every line
tts = gTTS(scenario_text)
tts.save("scenario.mp3")
AudioSegment.from_mp3("scenario.mp3").export("scenario.wav", format="wav")


<_io.BufferedRandom name='scenario.wav'>

In [21]:
import whisper
import openai
import os
from gtts import gTTS
from IPython.display import Audio, display

In [18]:
import whisper
model = whisper.load_model("base")
result = model.transcribe("scenario.wav")
text = result["text"]


print(text)

 Title is Operation Coastal Shield. Description is Allied Forces are retreating under fire. Unit is ID equals U1. Name equals British Infantry. Type equals Infantry. Strength equals 85. Allegiance equals Friendly. X equals 3. Y equals minus 2.5. Unit is ID equals U2. Name equals French Infantry. Type equals Infantry. Strength equals 80. Allegiance equals Friendly. X equals 1. Y equals minus 2.2. Unit is ID equals U3. Name equals German Armor. Type equals Armor. Strength equals 92. Allegiance equals Enemy. X equals 2. Y equals minus 1.8. Feature is Type equals Bunker. X equals 0. Y equals 2. Size equals 10. Objective is ID equals O1. Desk equals evacuate to Boats. X equals 4. Y equals 0.5. Priority equals 1. Event is Time equals 0.00. Desk equals British Infantry fallback. Units equals U1. Type equals Move. Event is Time equals 0.01. Desk equals German Armor fires. Units equals U3. Type equals Fire. Event is Time equals 0.01. Desk equals French Infantry Holds. Units equals U2. Type equa

In [23]:
from google.colab import files
uploaded = files.upload("/content/scenario.mp3")
audio_path = next(iter(uploaded))

KeyboardInterrupt: 

In [None]:
text = text.replace("equals", "=")


 Title, Operation Coastal Shield, Description, Allied Forces are retreating under fire. Unit ID = U1, Name = British Infantry, Type = Infantry, Strength = 85, Allegiance = Friendly, X = 3, Y = minus 2.5, Unit, ID = U2, Name = French Infantry, Type = Infantry, Strength = 80, Allegiance = Friendly, X = 1, Y = minus 2.2, Unit, ID = U3, Name = German Armor, Type = Armor, Strength = 92, Allegiance = Enemy, X = 2, Y = minus 1.8, Feature, Type = Bunker, X = 0, Y = 2, Size = 10, Objective, ID = U1, Desk = Evacuate to Boats, X = 4, Y = 0.5, Priority = 1, Event, Time = 0, 100, Desk = British Infantry, Fullback, Units = U1, Type = Move, Event, Time = 0, 0, 1, Desk = German Armor, Fires, Units = U3, Type = Fire, Event, Time = 0, 0, 1, Desk = French Infantry Holds, Units = U2, Type = Hold,


In [14]:
mytext = parse_natural_scenario_text(text)

IndexError: list index out of range

In [19]:
import re

def parse_scenario_text(text: str) -> Scenario::
    text = text.replace("minus", "-")  # Normalize "minus" to "-"

    title = re.search(r"Title is (.+?)\.", text).group(1).strip()
    description = re.search(r"Description is (.+?)\.", text).group(1).strip()

    unit_pattern = re.compile(
        r"Unit is ID equals (\w+)\. Name equals (.+?)\. Type equals (.+?)\. Strength equals (\d+)\. "
        r"Allegiance equals (.+?)\. X equals (-?\d+(?:\.\d+)?)\. Y equals (-?\d+(?:\.\d+)?)\."
    )
    units = [
        Unit(
            id=m.group(1),
            name=m.group(2),
            type=m.group(3),
            strength=int(m.group(4)),
            allegiance=m.group(5),
            position=Position(x=float(m.group(6)), y=float(m.group(7)))
        )
        for m in unit_pattern.finditer(text)
    ]

    feature_pattern = re.compile(
        r"Feature is Type equals (.+?)\. X equals (-?\d+(?:\.\d+)?)\. "
        r"Y equals (-?\d+(?:\.\d+)?)\. Size equals (\d+)\."
    )
    features = [
        Feature(
            type=m.group(1),
            position=Position(x=float(m.group(2)), y=float(m.group(3))),
            size=int(m.group(4))
        )
        for m in feature_pattern.finditer(text)
    ]

    objective_pattern = re.compile(
        r"Objective is ID equals (\w+)\. Desk equals (.+?)\. X equals (-?\d+(?:\.\d+)?)\. "
        r"Y equals (-?\d+(?:\.\d+)?)\. Priority equals (\d+)\."
    )
    objectives = [
        Objective(
            id=m.group(1),
            description=m.group(2),
            position=Position(x=float(m.group(3)), y=float(m.group(4))),
            priority=int(m.group(5))
        )
        for m in objective_pattern.finditer(text)
    ]

    event_pattern = re.compile(
        r"Event is Time equals (\d+\.\d+)\. Desk equals (.+?)\. Units equals (\w+)\. Type equals (\w+)\."
    )
    events = [
        Event(
            time=float(m.group(1)),
            description=m.group(2),
            units=[m.group(3)],
            type=m.group(4)
        )
        for m in event_pattern.finditer(text)
    ]


NameError: name 're' is not defined

In [7]:
# Recreate the natural parser function
def parse_natural_scenario_text(text: str) -> Scenario:
    units = []
    features = []
    objectives = []
    timeline = []
    title = "Generated Scenario"
    description = ""
    terrain_type = "beach"
    terrain_dims = (1000, 1000)

    # Normalize and tokenize input
    text = text.replace("equals", "=")
    text = text.replace("minus", "-")
    entries = [e.strip() for e in text.split(",") if e.strip()]

    current = {}
    section = None

    def commit_unit():
       for i in range(0,len(entries)):
          if "unit" in entries[i].lower() and "ID" in entries[i+1]:
            unit = Unit(
                id=entries[i+1],
                name=entries[i+2],
                type=entries[i+3],
                strength=int(entries[i+4]),
                allegiance=entries[i+5].lower(),
                position=(float(entries[i+6]), float(entries[i+7]))
            )
            units.append(unit)

    def commit_feature():
        if all(k in current for k in ["Type", "X", "Y", "Size"]):
            features.append(TerrainFeature(
                type=current["Type"],
                position=(float(current["X"]), float(current["Y"])),
                size=float(current["Size"])
            ))

    def commit_objective():
        if all(k in current for k in ["ID", "Desc"]):
            objectives.append(Objective(
                id=current["ID"],
                description=current["Desc"],
                location=(float(current.get("X", 0)), float(current.get("Y", 0))),
                priority=int(current.get("Priority", 1))
            ))

    def commit_event():
        if all(k in current for k in ["Time", "Desc", "Units", "Type"]):
            timeline.append(BattleEvent(
                timestamp=current["Time"],
                description=current["Desc"],
                involved_units=current["Units"].split("="),
                event_type=current["Type"].lower()
            ))

    for entry in entries:
        if entry.lower().startswith("title"):
            section = "title"
            title = entry.split("=", 1)[1].strip()
        elif entry.lower().startswith("description"):
            section = "description"
            description = entry.split("=", 1)[1].strip()
        elif entry.lower().startswith("unit"):
            if section == "unit": commit_unit()
            current = {}
            section = "unit"
        elif entry.lower().startswith("feature"):
            if section == "unit": commit_unit()
            if section == "feature": commit_feature()
            current = {}
            section = "feature"
        elif entry.lower().startswith("objective"):
            if section == "feature": commit_feature()
            if section == "objective": commit_objective()
            current = {}
            section = "objective"
        elif entry.lower().startswith("event"):
            if section == "objective": commit_objective()
            if section == "event": commit_event()
            current = {}
            section = "event"
        else:
            if "=" in entry:
                key, value = entry.split("=", 1)
                current[key.strip().capitalize()] = value.strip()

    # Final commit
    if section == "unit": commit_unit()
    if section == "feature": commit_feature()
    if section == "objective": commit_objective()
    if section == "event": commit_event()

    return Scenario(
        title=title,
        description=description,
        terrain=Terrain(type=terrain_type, features=features, dimensions=terrain_dims),
        units=units,
        objectives=objectives,
        timeline=timeline
    )


In [None]:
text = text.replace("equals", "=")
text = text.replace("minus", "-")
entries = [e.strip() for e in text.split(",") if e.strip()]

In [None]:
#Unit, ID = U3, Name = German Armor, Type = Armor, Strength = 92, Allegiance = Enemy, X = 2, Y = minus 1.8,

for i in range(0,len(entries)):
  if "unit" in entries[i].lower() and "ID" in entries[i+1]:

    print(entries[i+1],entries[i+2],entries[i+3],entries[i+4],entries[i+5],entries[i+6],entries[i+7])

ID = U2 Name = French Infantry Type = Infantry Strength = 80 Allegiance = Friendly X = 1 Y = - 2.2
ID = U3 Name = German Armor Type = Armor Strength = 92 Allegiance = Enemy X = 2 Y = - 1.8
