# Tutorial - parse_yaml
- Modified: 13 Mar 2025
- Parses the LLM output as YAML and converts it to a `dict`
- Uses concise `output_format` to save tokens
- Converts `output_format` into pydantic schema automatically, and uses pydantic to validate output
- Able to process datatypes: `int`, `float`, `str`, `bool`, `list`, `dict`, `date`, `datetime`, `time`, `UUID`, `Decimal`
- Able to process: `None`, `Any`, `Union`, `Optional`
- Default datatype when not specified is `Any`
- Error correction for up to `num_tries` attempts (default: 3)

- Tested on:
    - Claude 3.5 Sonnet
    - Claude 3.7 Sonnet
    - o3-mini
    - o1-mini
    - gpt-4o-mini
    - gpt-4o
    - Meta Llama 3.3 70B
    - Meta Llama 3.2 90B (Note: Smaller versions of Llama 3.2 do not work well with YAML)
    - Meta Llama 3.1 70B (Note: Smaller versions of Llama 3.1 do not work well with YAML)
    - DeepSeek-V3
    - DeepSeek-R1
    - QwQ 32B
    - Gemini 2.0 Flash
    - Gemini 2.0 Flash-Lite

- Let me know if you would like me to try out more models available on OpenRouter (https://openrouter.ai/) via discord:
https://discord.gg/bzp87AHJy5

In [1]:
import os
from strictjson import parse_yaml, parse_yaml_async, convert_schema_to_pydantic

In [2]:
from dotenv import load_dotenv
# put your secret keys in your .env
# For instance, if you are using OpenAI, your .env should contain
# export OPENAI_API_KEY = "sk-......."
# The OpenRouter-based `llm` / `llm_async` helpers defined in this notebook
# read OPENROUTER_API_KEY from the environment, so add that key as well
# if you run them.
load_dotenv()

True

# Define LLMs
- LLMs take in a `system_prompt` and a `user_prompt` and output a `str`

### Define LLM using OpenRouter - https://openrouter.ai/
- This enables rapid testing of various LLMs

In [3]:
# OpenRouter model identifier; `llm` and `llm_async` pass it as `model=`.
# Keep exactly one assignment uncommented to choose the model under test.
MODEL = "anthropic/claude-3.7-sonnet"
# MODEL = "anthropic/claude-3.5-sonnet"
# MODEL = "openai/o3-mini"
# MODEL = "openai/o1-mini"
# MODEL = "openai/gpt-4o"
# MODEL = "openai/gpt-4o-mini" 
# MODEL = "meta-llama/llama-3.3-70b-instruct"
# MODEL = "meta-llama/llama-3.2-90b-vision-instruct"
# MODEL = "meta-llama/llama-3.1-70b-instruct"
# MODEL = "deepseek/deepseek-chat"
# MODEL = "deepseek/deepseek-r1-zero:free"
# MODEL = "qwen/qwq-32b"
# MODEL = "google/gemini-2.0-flash-001"
# MODEL = "google/gemini-2.0-flash-lite-001"
# MODEL = "google/gemini-2.0-flash-thinking-exp:free"

In [4]:
def llm(system_prompt: str, user_prompt: str, **kwargs):
    """Send one chat-completion request through OpenRouter and return the reply.

    Use your favourite LLM here - OpenRouter is used so that various LLMs can
    be tested by changing ``MODEL``.

    Args:
        system_prompt: Instructions for the model. When falsy, no system
            message is included in the request.
        user_prompt: Content sent as the user message.
        **kwargs: Accepted but not used by this implementation.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        KeyError: If ``OPENROUTER_API_KEY`` is not set in the environment.
    """
    from openai import OpenAI

    openrouter = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
    )

    # The system message is optional; the user message is always sent last.
    if system_prompt:
        system_messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]}
        ]
    else:
        system_messages = []
    conversation = system_messages + [{"role": "user", "content": user_prompt}]

    # change the model as you wish
    completion = openrouter.chat.completions.create(
        model=MODEL,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [5]:
async def llm_async(system_prompt: str, user_prompt: str, **kwargs):
    """Asynchronously send one chat-completion request through OpenRouter.

    Use your favourite LLM here - OpenRouter is used so that various LLMs can
    be tested by changing ``MODEL``.

    Args:
        system_prompt: Instructions for the model. When falsy, no system
            message is included in the request.
        user_prompt: Content sent as the user message.
        **kwargs: Accepted but not used by this implementation.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        KeyError: If ``OPENROUTER_API_KEY`` is not set in the environment.
    """
    from openai import AsyncOpenAI

    openrouter = AsyncOpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
    )

    # The system message is optional; the user message is always sent last.
    if system_prompt:
        system_messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]}
        ]
    else:
        system_messages = []
    conversation = system_messages + [{"role": "user", "content": user_prompt}]

    # change the model as you wish
    completion = await openrouter.chat.completions.create(
        model=MODEL,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Basic Syntax
- `system_prompt`: Instructions for the LLM
- `user_prompt`: Content to perform instructions on
- `output_format`: Concise dictionary describing the format the LLM should output in; each value combines a description with an optional type. If a type is given, specify it as a standalone string or after a comma (e.g. `"int"` or `"an age, int"` or `"type: int"` or `"an age, type: int"`)
- `llm`: Your llm that takes in `system_prompt` and `user_prompt` and outputs a `str`

In [6]:
# Sync - the cell's value is the parsed result, keyed like `output_format`
parse_yaml(
    system_prompt="Give me 5 names on a topic",
    user_prompt="weather",
    output_format={
        "Names": "Great sounding names, List[str]",
        "Meanings": "Name and meaning, dict",
        "Chinese Meanings": "Name and meaning in chinese, dict",
        "Lucky Name or Number": "List[Union[int, str]]",
        "Code": "Python code to generate 5 names",
        "Sentiment": "Enum['Happy', 'Sad', 'Other']",
    },
    llm=llm,
)

{'Names': ['Tempest', 'Zephyr', 'Aurora', 'Gale', 'Nimbus'],
 'Meanings': {'Tempest': 'Violent windstorm or thunderstorm',
  'Zephyr': 'Gentle breeze or west wind',
  'Aurora': 'Dawn light often with colorful atmospheric display',
  'Gale': 'Very strong wind',
  'Nimbus': 'Rain cloud or cloud formation'},
 'Chinese Meanings': {'暴风雨': 'Violent storm',
  '微风': 'Gentle breeze',
  '曙光': 'Dawn light',
  '强风': 'Strong wind',
  '雨云': 'Rain cloud'},
 'Lucky Name or Number': ['Zephyr', 7, 'Aurora', 22, 'Nimbus'],
 'Code': 'import random\n\nweather_names = [\n    "Tempest", "Zephyr", "Aurora", "Gale", "Nimbus", \n    "Storm", "Breeze", "Cyclone", "Thunder", "Mist"\n]\n\ndef generate_weather_names():\n    return random.sample(weather_names, 5)\n    \nprint(generate_weather_names())\n',
 'Sentiment': 'Happy'}

In [7]:
# Async
await parse_yaml_async(system_prompt = "Give me 5 names on a topic", 
           user_prompt = "weather",
           output_format = {"Names": "Great sounding names, List[str]",
                            "Meanings": "Name and meaning, dict", 
                            "Chinese Meanings": "Name and meaning in chinese, dict",
                            "Lucky Name or Number": "List[Union[int, str]]",
                            "Code": "Python code to generate 5 names"},
           llm = llm_async)

{'Names': ['Tempest', 'Aurora', 'Zephyr', 'Nimbus', 'Gale'],
 'Meanings': {'Tempest': 'A violent windy storm',
  'Aurora': 'Dawn, often refers to colorful atmospheric displays',
  'Zephyr': 'A gentle, mild breeze',
  'Nimbus': 'A rain cloud',
  'Gale': 'A very strong wind'},
 'Chinese Meanings': {'暴风雨': 'Violent storm (Tempest)',
  '曙光': 'Dawn light (Aurora)',
  '和风': 'Gentle breeze (Zephyr)',
  '雨云': 'Rain cloud (Nimbus)',
  '大风': 'Strong wind (Gale)'},
 'Lucky Name or Number': ['Zephyr', 7, 'Aurora', 22, 'Nimbus'],
 'Code': 'import random\n\nweather_terms = [\n    "Storm", "Rain", "Thunder", "Lightning", "Cyclone", \n    "Typhoon", "Breeze", "Blizzard", "Fog", "Mist"\n]\n\ndef generate_weather_names(count=5):\n    return random.sample(weather_terms, count)\n    \nweather_names = generate_weather_names()\nprint(weather_names)\n'}

# Long Context processing

In [8]:
text = '''SINGAPORE – Singapore will study the potential deployment of nuclear power here and take further steps to systematically build up capabilities in this area, Prime Minister Lawrence Wong said on Feb 18.

Noting that interest in nuclear energy is increasing worldwide, with several countries within the region planning to include nuclear in their energy mix, PM Wong said Singapore will need new capabilities to evaluate options and consider if there is a solution that the island-state can deploy in a safe, cost-effective way. 

Malaysia and Indonesia, for example, have operated research reactors for some time, he noted. 


ADVERTISING


“These capabilities will also be needed for nuclear safety, which will become more salient given the growing regional interest in nuclear power,” said PM Wong. 

The Government will also pump in another $5 billion into its existing Future Energy Fund to support Singapore’s efforts to secure clean power, he said in his Budget speech.

“Be it electricity imports, hydrogen or nuclear, we will need to make major investments in new infrastructure,” added PM Wong, who is also Finance Minister.


The Future Energy Fund was announced during Budget 2024 with an initial $5 billion investment. It was set up to catalyse investments into clean energy technology that may involve high upfront costs and significant commercial, technological and geopolitical risks. 


Catch up on the news that everyone’s talking about
Enter your e-mail
 Sign up
By signing up, I accept SPH Media's Terms & Conditions and Privacy Policy as amended from time to time.


Yes, I would also like to receive SPH Media Group's SPH Media Limited, its related corporations and affiliates as well as their agents and authorised service providers. marketing and promotions.
The fund is part of Singapore’s efforts to address its resource constraints. 

PM Wong cited how the Republic overcame its water challenges through innovations such as recycling used water to form Newater, and building up its water industry. 


“Today, we face a different challenge. The industries of the future – artificial intelligence, semiconductors, biopharmaceuticals – are highly energy-intensive. To meet these growing energy needs and to bring down our carbon emissions at the same time, we will need more clean power,” he said.

“Expanding access to clean energy is therefore a major national imperative.” 

Singapore has not made a decision to adopt nuclear energy. But given that the Republic has limitations in accessing renewable energy, nuclear is among various low-carbon sources that the country is looking into amid considerations of the nation’s energy security, affordability and carbon footprint. 

“Our options are inherently limited because we do not have the natural resources nor the land to meet our needs using hydro, wind or solar power,” PM Wong said.


Singapore now relies on natural gas, a fossil fuel, for some 95 per cent of its energy needs. The power sector contributes about 40 per cent of the country’s total emissions.

Achieving Singapore’s long-term climate target of reaching net-zero emissions by 2050 would require reducing carbon emissions from this sector. 

On Feb 10, the Republic published its 2035 climate target – to reduce its emissions to between 45 million tonnes and 50 million tonnes, down from the 60 million tonnes it expects to emit in 2030. 

PM Wong said that while Singapore had earlier assessed that conventional nuclear technologies were not suitable for Singapore, the country had continued to keep a close watch on developments in this space to keep its options open. 

“Since then, we have seen significant advancements in nuclear technologies,” he added, citing small modular reactors (SMRs) as one advanced nuclear technology that has better safety features than conventional, large plants. 

SMRs are compact systems that can be factory-assembled and installed in dense urban areas. The power capacity of one SMR is about a third of that of a traditional reactor.

PM Wong added that a few SMRs have been deployed elsewhere, and more could become operational by the end of the decade. 

Over the past couple of years, Singapore’s exploration of nuclear energy has been hotting up. 

It started around 2022, when a local report on future energy scenarios mentioned that emerging energy technologies, including nuclear and geothermal, could potentially supply around 10 per cent of Singapore’s energy needs by 2050.

More on this Topic
PM Wong unveils bumper SG60 Budget for all Singaporeans
Singapore Budget 2025: Read more
In July 2024, the Republic inked the 123 Agreement on Nuclear Cooperation with the US, which will allow Singapore to learn more about nuclear technologies and scientific research from American organisations.

PM Wong noted that Singapore is working on similar cooperation with other countries that have capabilities and experience in civilian nuclear power, particularly SMRs. 

In the nearer-term, PM Wong said one immediate solution to green the country’s energy mix is to import low-carbon electricity from the region, and the Republic has been progressing on this front. 

Singapore has inked deals with Indonesia, Cambodia and Vietnam to import 5.6 gigawatts of low-carbon electricity by 2035, and much of the green electricity is expected to come from solar, hydropower and wind.

Under a pilot that was expanded in 2024, Singapore is importing hydropower from Laos via Thailand and Malaysia. In late 2024, it was said that additional energy supply will come from Malaysia, increasing the total electricity import capacity to 200MW from 100MW. Malaysia’s grid comprises coal and natural gas. 

“By 2035, we expect that about one-third of our projected electricity demand can be met through electricity imports,” said PM Wong. 

On low-carbon hydrogen – an emerging fuel that does not produce planet-warming emissions when burned – PM Wong said that Singapore has been closely evaluating its use. 

But there are inherent challenges in its production, storage and transportation, he said, which makes it hard to scale up in a commercially viable manner.'''

In [9]:
# Extract structured fields from the long article stored in `text`.
parse_yaml(
    "Extract information from text",
    text,
    output_format={
        "Entities": "organisations only, list[str]",
        "Sentiment": "Enum['Happy', 'Sad', 'Neutral']",
        "News about DeepSeek": "if any, Optional[str]",
        "Summary": "str",
        "Code": "code to print out entities",
        "Latex": "latex code to write article in latex",
    },
    llm=llm,
)

{'Entities': ['Singapore',
  'Malaysia',
  'Indonesia',
  'US',
  'Thailand',
  'Vietnam',
  'Cambodia',
  'Laos'],
 'Sentiment': 'Neutral',
 'News about DeepSeek': None,
 'Summary': 'Singapore is exploring nuclear power deployment as part of its clean energy strategy. Prime Minister Lawrence Wong announced an additional $5 billion for the Future Energy Fund to support clean power initiatives. Singapore faces limitations in renewable energy sources and currently relies on natural gas for 95% of its energy needs. The country is considering small modular reactors (SMRs) as a potentially safer nuclear technology option, while also pursuing electricity imports from neighboring countries and evaluating low-carbon hydrogen. By 2035, Singapore aims to meet one-third of its electricity demand through imports, supporting its goal to reach net-zero emissions by 2050.\n',
 'Code': 'def print_entities():\n    entities = [\n        "Singapore", "Malaysia", "Indonesia", "US", \n        "Thailand", "

# Previous StrictJSON tests

In [10]:
# No types given in output_format: each field is described in words only.
parse_yaml(
    system_prompt='You are to process the user prompt',
    user_prompt='It is a beautiful and sunny day',
    output_format={
        'Sentiment': 'Type of Sentiment',
        'Adjectives': 'Array of adjectives',
        'Words': 'Number of words',
    },
    llm=llm,
)

{'Sentiment': 'Positive', 'Adjectives': ['beautiful', 'sunny'], 'Words': 6}

In [11]:
# Same prompt, but every field now carries an explicit `type: ...` suffix.
parse_yaml(
    system_prompt='You are to process the user prompt',
    user_prompt='It is a beautiful and sunny day',
    output_format={
        'Sentiment': 'Type of Sentiment, type: Enum["Pos", "Neg", "Other"]',
        'Adjectives': 'Array of Adjectives, type: List[str]',
        'Words': 'Number of words, type: int',
        'In English': 'Whether sentence is in English, type: bool',
    },
    llm=llm,
)

{'Sentiment': 'Pos',
 'Adjectives': ['beautiful', 'sunny'],
 'Words': 6,
 'In English': True}

In [12]:
# Code generation: free-text elaboration plus one code field per language.
parse_yaml(
    system_prompt='You are a code generator, generating code to fulfil a task',
    user_prompt='Given array p, output a function named func_sum to return its sum',
    output_format={
        'Elaboration': 'How you would do it',
        'C': 'Code',
        'Python': 'Code',
    },
    llm=llm,
)

{'Elaboration': 'To calculate the sum of an array, I need to:\n1. Create a function that takes an array as input\n2. Use a loop to iterate through each element and add it to a running total\n3. Return the final sum\n',
 'C': 'int func_sum(int p[], int size) {\n    int sum = 0;\n    for (int i = 0; i < size; i++) {\n        sum += p[i];\n    }\n    return sum;\n}\n',
 'Python': 'def func_sum(p):\n    return sum(p)\n'}

In [13]:
# Nested output_format: dict values may themselves be dicts of field specs.
parse_yaml(
    system_prompt='You are to process the user prompt',
    user_prompt='It is a beautiful and sunny day',
    output_format={
        'Sentiment': 'Type of Sentiment',
        'Strength of Sentiment': 'Enum[1, 2, 3, 4, 5]',
        'Adjectives': "Name and Description as separate keys, type: List[Dict[str, str]]",
        'Words': {
            'Number of words': 'Word count',
            'Language': {
                'English': 'Whether it is English, type: bool',
                'Chinese': 'Whether it is Chinese, type: bool',
            },
            'Proper Words': 'Whether the words are proper in the native language, type: bool',
        },
        'Sentences': 'Each word of text as key, Sentence with word as value, Dict[str, str]',
    },
    llm=llm,
)

{'Sentiment': 'Positive',
 'Strength of Sentiment': 4,
 'Adjectives': [{'Name': 'beautiful',
   'Description': 'pleasing to the senses or mind aesthetically'},
  {'Name': 'sunny', 'Description': 'bright with sunlight'}],
 'Words': {'Number of words': 6,
  'Language': {'English': True, 'Chinese': False},
  'Proper Words': True},
 'Sentences': {'It': 'It is a beautiful and sunny day',
  'is': 'It is a beautiful and sunny day',
  'a': 'It is a beautiful and sunny day',
  'beautiful': 'It is a beautiful and sunny day',
  'and': 'It is a beautiful and sunny day',
  'sunny': 'It is a beautiful and sunny day',
  'day': 'It is a beautiful and sunny day'}}

# How to use Structured Output with models that provide it
- If your LLM has Structured Output, you can also opt to use it instead of the Pydantic-based error correcting done in `parse_yaml`
- Method 1: Specify `output_format` in the inputs to `parse_yaml`, which will be automatically converted to `pydantic_model`, and then this can be referenced in the `llm()` function
- Method 2: Just directly specify `pydantic_model` in place of `output_format` in the inputs to `parse_yaml`, which can be referenced in the `llm()` function

In [14]:
def llm(system_prompt: str, user_prompt: str, **kwargs) -> str:
    """Query OpenAI's ``gpt-4o-mini``, optionally with Structured Output.

    Here, OpenAI is used for illustration; you can change it to your own LLM.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        **kwargs: If ``pydantic_model`` is present and not ``None``, it is
            sent as ``response_format`` and its JSON schema is printed for
            debugging. Other keys are ignored.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # ensure your LLM imports are all within this function
    from openai import OpenAI

    client = OpenAI()
    request = dict(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    # Only add 'response_format' if a pydantic_model is provided.
    schema_model = kwargs.get("pydantic_model")
    if schema_model is not None:
        request["response_format"] = schema_model

        print("For debugging purposes, this is the json schema for the Pydantic Model:")
        print(schema_model.model_json_schema())

    completion = client.beta.chat.completions.parse(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

### Method 1: Using the pydantic model automatically generated via output_format

In [15]:
# Method 1: pass `output_format`; the `llm` above receives the generated
# model as `pydantic_model` and prints its JSON schema.
parse_yaml(
    system_prompt="You are a helpful assistent",
    user_prompt="Generate a birthday event for Alex",
    output_format={
        "name": "str",
        "date": "str",
        "participants": "only male names, list[str]",
    },
    llm=llm,
)

For debugging purposes, this is the json schema for the Pydantic Model:
{'properties': {'name': {'title': 'Name', 'type': 'string'}, 'date': {'title': 'Date', 'type': 'string'}, 'participants': {'description': 'only male names', 'items': {'type': 'string'}, 'title': 'Participants', 'type': 'array'}}, 'required': ['name', 'date', 'participants'], 'title': 'Yaml_Schema', 'type': 'object'}


{'name': 'Alex',
 'date': '2023-10-15',
 'participants': ['John', 'Michael', 'David', 'Chris', 'James']}

### Method 2: Using the pydantic model specified in `parse_yaml` input

In [16]:
from pydantic import BaseModel, Field

# Pydantic schema handed to `parse_yaml` via `pydantic_model` in the next cell.
# NOTE: documented with `#` comments rather than a class docstring on purpose -
# pydantic copies a model's docstring into `model_json_schema()` as a
# top-level "description", which would change the schema sent to the LLM.
class CalendarEvent(BaseModel):
    name: str
    date: str  # plain string here, not a date type
    # `...` makes the field required; the description is emitted in the JSON schema.
    participants: list[str] = Field(..., description = "only male names")

In [17]:
# Method 2: hand the Pydantic model to `parse_yaml` directly instead of
# an `output_format` dictionary.
parse_yaml(
    system_prompt="You are a helpful assistent",
    user_prompt="Generate a birthday event for Alex",
    pydantic_model=CalendarEvent,
    llm=llm,
)

For debugging purposes, this is the json schema for the Pydantic Model:
{'properties': {'name': {'title': 'Name', 'type': 'string'}, 'date': {'title': 'Date', 'type': 'string'}, 'participants': {'description': 'only male names', 'items': {'type': 'string'}, 'title': 'Participants', 'type': 'array'}}, 'required': ['name', 'date', 'participants'], 'title': 'CalendarEvent', 'type': 'object'}


{'name': "Alex's Birthday Party",
 'date': '2023-11-15',
 'participants': ['John', 'Michael', 'David', 'Chris']}

# How `parse_yaml` works under the hood
- Converts `output_format` into a Pydantic model, then uses Pydantic's validation errors to check and error-correct the LLM output
- You can also use `convert_schema_to_pydantic` to convert the `output_format` to a Pydantic model, so you can use it for structured outputs

In [18]:
from strictjson import convert_schema_to_pydantic

In [19]:
# Build the Pydantic model from the concise schema and show its JSON schema.
convert_schema_to_pydantic(
    {
        "name": "str",
        "date": "str",
        "participants": "only male names, list[str]",
    }
).model_json_schema()

{'properties': {'name': {'title': 'Name', 'type': 'string'},
  'date': {'title': 'Date', 'type': 'string'},
  'participants': {'description': 'only male names',
   'items': {'type': 'string'},
   'title': 'Participants',
   'type': 'array'}},
 'required': ['name', 'date', 'participants'],
 'title': 'Yaml_Schema',
 'type': 'object'}