In [1]:
import re
from dataclasses import dataclass
from typing import List, Optional, Any, Dict, Tuple

## Lexer

In [2]:
# Ordered token table for the score DSL: (group name, regex) pairs.
# Order matters: the pairs are joined into a single alternation, so at any
# position the first pattern that matches wins. Keywords therefore come before
# the generic IDENT rule, and UNKNOWN (any single character) comes last as the
# catch-all.
TokenSpec = [
    ("NUMBER",      r"\d+"),
    ("NOTE",        r"[A-G](?:#|b)?\d"),        # e.g., C4, F#3, Bb5
    # Trailing \b keeps a duration from matching as a prefix of a longer word
    # (e.g. "halfway" is one IDENT, not DURATION "half" + IDENT "way"),
    # consistent with the other keywords below.
    ("DURATION",    r"(?:whole|half|quarter|eighth|sixteenth)\b"), # common durations
    ("SET",         r"Set\b|set\b"),
    ("KEY",         r"key\b|Key\b"),
    ("TIME",        r"time\b|Time\b"),
    ("SIGNATURE",   r"signature\b|Signature\b"),
    ("IN",          r"In\b|in\b"),
    ("MEASURE",     r"measure\b|Measure\b"),
    ("TO",          r"to\b|To\b"),
    ("COLON",       r":"),
    ("COMMA",       r","),
    ("SLASH",       r"/"),
    ("IDENT",       r"[A-Za-z+#]+"),            # words like 'C', 'major', 'minor'
    ("NEWLINE",     r"\n"),
    ("WS",          r"[ \t]+"),
    ("UNKNOWN",     r"."),
]
# One master pattern with a named group per token type; after a match,
# `m.lastgroup` is the name of the rule that fired.
tok_regex = "|".join(f"(?P<{name}>{pattern})" for name, pattern in TokenSpec)

@dataclass
class Token:
    """A single lexical unit emitted by the lexer.

    Equality and ``repr`` come from the generated dataclass methods, so a
    token prints as ``Token(type='SET', value='Set', pos=5)``.
    """

    type: str   # token category: a TokenSpec group name (e.g. "NOTE") or "EOF"
    value: str  # exact source text that was matched; empty for "EOF"
    pos: int    # 0-based character offset of the match in the input text

In [3]:
class Lexer:
    """Turns score-DSL source text into a flat list of `Token` objects.

    Attributes:
        text: The source text being tokenized.
        pos: Initialised to 0; not consulted by `tokenize` and kept only for
            backward compatibility.
        tokens: The result of the most recent `tokenize` call (empty before
            the first call).
    """

    def __init__(self, text: str):
        self.text = text
        self.pos = 0
        self.tokens: List[Token] = []

    def tokenize(self) -> List[Token]:
        """Scan `self.text` and return its tokens, terminated by an EOF token.

        Whitespace and newlines are dropped. The call is idempotent: every
        invocation rebuilds the token list from scratch, so calling it again
        (for example by re-running a notebook cell) does not duplicate tokens.

        Returns:
            The token list, also stored on `self.tokens`. The final element is
            ``Token("EOF", "", len(self.text))``.

        Raises:
            SyntaxError: If a character matches no rule other than the
                catch-all UNKNOWN pattern.
        """
        # Build into a fresh local list so repeated calls never accumulate
        # onto the results of an earlier run.
        tokens: List[Token] = []
        for m in re.finditer(tok_regex, self.text):
            kind = m.lastgroup
            value = m.group()
            pos = m.start()
            if kind in ("WS", "NEWLINE"):
                # Layout is not significant to the grammar.
                continue
            if kind == "UNKNOWN":
                raise SyntaxError(f"Unknown token {value!r} at pos {pos}")
            tokens.append(Token(kind, value, pos))
        # Sentinel so a parser can detect end of input without bounds checks.
        tokens.append(Token("EOF", "", len(self.text)))
        self.tokens = tokens
        return self.tokens

### Testing tokens

In [4]:
# Smoke test for the lexer: a two-measure score with the header statements
# (key, time signature) followed by per-measure note lists.
sample = """
    Set key to C major
    Set time signature to 4/4

    In measure 1:
        C4 quarter, E4 quarter, G4 half

    In measure 2:
        F4 quarter, A4 quarter, C5 half
    """
# Lex the sample and print one Token per line. Whitespace and newlines are
# skipped by the lexer, and the stream ends with an EOF token.
lexer = Lexer(sample)
tokens = lexer.tokenize()
for token in tokens:
    print(token)

Token(type='SET', value='Set', pos=5)
Token(type='KEY', value='key', pos=9)
Token(type='TO', value='to', pos=13)
Token(type='IDENT', value='C', pos=16)
Token(type='IDENT', value='major', pos=18)
Token(type='SET', value='Set', pos=28)
Token(type='TIME', value='time', pos=32)
Token(type='SIGNATURE', value='signature', pos=37)
Token(type='TO', value='to', pos=47)
Token(type='NUMBER', value='4', pos=50)
Token(type='SLASH', value='/', pos=51)
Token(type='NUMBER', value='4', pos=52)
Token(type='IN', value='In', pos=59)
Token(type='MEASURE', value='measure', pos=62)
Token(type='NUMBER', value='1', pos=70)
Token(type='COLON', value=':', pos=71)
Token(type='NOTE', value='C4', pos=81)
Token(type='DURATION', value='quarter', pos=84)
Token(type='COMMA', value=',', pos=91)
Token(type='NOTE', value='E4', pos=93)
Token(type='DURATION', value='quarter', pos=96)
Token(type='COMMA', value=',', pos=103)
Token(type='NOTE', value='G4', pos=105)
Token(type='DURATION', value='half', pos=108)
Token(type='IN',