In [19]:
import re
from collections import namedtuple

# Defining a simple token structure
Token = namedtuple('Token', ['type', 'value'])

# Token definitions
TOKEN_REGEX = re.compile(r"""
    (\{\%\s*endmessage\s*\%\})|    # End of a message block
    (\{\%\s*message\s+[a-z]+\s*\%\})| # Start of a message block with role
    (\{\%\s*endfor\s*\%\})|        # End of a loop
    (\{\%\s*for\s+.+?\s+in\s+.+?\s*\%\})| # Start of a loop
    (\{\{.+?\}\})|                 # Expression
    (\{\%.*?\%\})|                 # Other tag (for future use)
    ([^{]+)                        # Text
    """, re.VERBOSE)

# Token types
TOKEN_TYPES = [
    ('ENDMESSAGE', 'endmessage'),
    ('MESSAGE_START', 'message'),
    ('ENDFOR', 'endfor'),
    ('FOR', 'for'),
    ('EXPRESSION', 'expression'),
    ('TAG', 'tag'),
    ('TEXT', 'text'),
]

def tokenize(text) -> list[Token]:
    tokens = []
    for match in TOKEN_REGEX.finditer(text):
        match_groups = match.groups()
        for i, group in enumerate(match_groups):
            if group:
                # Determine the type based on which group matched
                token_type = TOKEN_TYPES[i][0]
                token_value = group.strip()
                
                # Special handling for message and for tags to include additional info
                if token_type in ['MESSAGE_START', 'FOR']:
                    # Extracting role for MESSAGE_START or loop variables for FOR
                    token_value = re.sub(r"^\{\%\s*|\s*\%\}$", "", token_value) # Remove the tag delimiter
                    if token_type == 'MESSAGE_START':
                        token_value = token_value.split(' ')[1]
                
                if token_type == 'EXPRESSION':
                    # remove the {{ and }} delimiters
                    token_value = token_value[2:-2].strip()

                tokens.append(Token(token_type, token_value))
                break
    # filter out text tokens that are empty
    tokens = [token for token in tokens if token.type != 'TEXT' or token.value]
    return tokens

# Example usage
example_text = """
{% message system %}
You are a genius
{% endmessage %}

{% message user %}
My name is {{name}}. What is my name?
{% endmessage %}
"""

tokens = tokenize(example_text)
for token in tokens:
    print(token)

Token(type='MESSAGE_START', value='system')
Token(type='TEXT', value='You are a genius')
Token(type='ENDMESSAGE', value='{% endmessage %}')
Token(type='MESSAGE_START', value='user')
Token(type='TEXT', value='My name is')
Token(type='EXPRESSION', value='name')
Token(type='TEXT', value='. What is my name?')
Token(type='ENDMESSAGE', value='{% endmessage %}')


In [23]:
from prooompt.datatypes import Template, TemplateItem, TemplateItemUnion, TemplateMessage, MessageRole, TemplateText, TemplateEval, TemplateLoop

def parse(tokens: list[Token]) -> Template:
    items = parse_template_items(tokens)
    return Template(items=items)

def parse_template_items(tokens: list[Token]) -> list[TemplateItemUnion]:
    token_types = [token.type for token in tokens]
    parsed = []
    for token_idx in range(len(tokens)):
        token = tokens[token_idx]
        if token.type == "MESSAGE_START":
            # Extract the role from the message start token
            role = token.value

            # Get the tokens between the message start and endmessage tokens
            message_end_idx = token_types[token_idx:].index("ENDMESSAGE")
            message_tokens = tokens[token_idx+1:token_idx+message_end_idx]

            message = TemplateMessage(role=MessageRole(role), template=parse_template_items(message_tokens))
            parsed.append(message)
        
        elif token.type == "FOR":
            raise NotImplementedError("For loops are not yet supported")
    
        elif token.type == "EXPRESSION":
            parsed.append(TemplateEval(value=token.value))
        
        elif token.type == "TEXT":
            parsed.append(TemplateText(content=token.value))

    return parsed


print(parse(tokens).model_dump_json(indent=2))

{
  "items": [
    {
      "type": "message",
      "role": "system",
      "template": [
        {
          "type": "text",
          "content": "You are a genius"
        }
      ]
    },
    {
      "type": "text",
      "content": "You are a genius"
    },
    {
      "type": "message",
      "role": "user",
      "template": [
        {
          "type": "text",
          "content": "My name is"
        },
        {
          "type": "eval",
          "value": "name"
        },
        {
          "type": "text",
          "content": ". What is my name?"
        }
      ]
    },
    {
      "type": "text",
      "content": "My name is"
    },
    {
      "type": "eval",
      "value": "name"
    },
    {
      "type": "text",
      "content": ". What is my name?"
    }
  ]
}


In [4]:
from prooompt.datatypes import Template, TemplateItem, TemplateItemUnion, TemplateMessage, MessageRole, TemplateText, TemplateEval, TemplateLoop
from typing import List
import re

# Assuming 'tokens' is the list of tokens produced by the lexer for the given text

def parse(tokens):
    """
    Parses the list of tokens and returns the structured Pydantic model.
    """
    template_items = parse_template_items(tokens)
    return Template(items=template_items)

def parse_template_items(tokens) -> List[TemplateItemUnion]:
    """
    Parses template items, recursively handling nested structures.
    """
    items = []
    while tokens:
        token = tokens.pop(0)
        
        if token.type == 'MESSAGE_START':
            message, consumed_tokens = parse_message(tokens)
            items.append(message)
            tokens = tokens[consumed_tokens:]
        
        elif token.type == 'FOR':
            loop, consumed_tokens = parse_for_loop(tokens)
            items.append(loop)
            tokens = tokens[consumed_tokens:]
        
        elif token.type == 'TEXT':
            if token.value.strip():  # Ignore purely whitespace texts
                items.append(TemplateText(content=token.value))
        
        elif token.type == 'EXPRESSION':
            expression_value = re.sub(r"^\{\{|\}\}$", "", token.value)  # Remove the expression delimiters
            items.append(TemplateEval(value=expression_value))
    
    return items

def parse_message(tokens) -> tuple[TemplateMessage, int]:
    """
    Parses a message block, collecting contained template items.
    """
    role = tokens.pop(0).value.split()[1]  # Extract the role from the 'message <role>' format
    message_items = []
    consumed_tokens = 0
    
    while tokens and tokens[0].type != 'ENDMESSAGE':
        token = tokens.pop(0)
        consumed_tokens += 1
        
        if token.type == 'TEXT' or token.type == 'EXPRESSION':
            # Place the token back and let parse_template_items handle it
            tokens.insert(0, token)
            nested_items = parse_template_items(tokens)
            message_items.extend(nested_items)
            # Update consumed_tokens based on the number of tokens processed inside parse_template_items
            consumed_tokens += len(nested_items)
        # Extend this to handle nested loops or other structures
    
    # Skip the ENDMESSAGE token
    if tokens and tokens[0].type == 'ENDMESSAGE':
        consumed_tokens += 1
        tokens.pop(0)
    
    return (TemplateMessage(role=MessageRole[role.upper()], template=message_items), consumed_tokens)

def parse_for_loop(tokens) -> tuple[TemplateLoop, int]:
    """
    Parses a for-loop, including its templated content.
    """
    loop_start_token = tokens.pop(0)
    iterator, iterable = re.match(r"for (.+?) in (.+)", loop_start_token.value).groups()
    loop_items = []
    consumed_tokens = 1  # Already consumed the FOR token
    
    while tokens and tokens[0].type != 'ENDFOR':
        token = tokens.pop(0)
        consumed_tokens += 1
        
        if token.type == 'TEXT' or token.type == 'EXPRESSION':
            # Similar handling as in parse_message
            tokens.insert(0, token)
            nested_items = parse_template_items(tokens)
            loop_items.extend(nested_items)
            consumed_tokens += len(nested_items)
        # Extend this to handle nested structures
    
    # Skip the ENDFOR token
    if tokens and tokens[0].type == 'ENDFOR':
        consumed_tokens += 1
        tokens.pop(0)
    
    return (TemplateLoop(iterator=iterator, iterable=iterable, template=loop_items), consumed_tokens)

# Now, use the parser function on the list of tokens from step 1
# Example:
template_structure = parse(tokens)
print(template_structure.model_dump_json(indent=2))

KeyError: 'ARE'