# Pydantic parser
This output parser allows users to specify an arbitrary Pydantic Model and query LLMs for outputs that conform to that schema.

Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed JSON. In the OpenAI family, DaVinci can do this reliably, but Curie's ability already drops off dramatically.

Use Pydantic to declare your data model. Pydantic's BaseModel is like a Python dataclass, but with actual type checking + coercion.

In [1]:
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI

In [2]:
# Chat model that the chains below are built on.
model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.9,
)

In [4]:
# Define your desired data structure.
# A joke split into its setup question and its punchline.
class Joke(BaseModel):
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        """Reject a setup that does not end with a question mark.

        Args:
            field: The candidate value for ``setup``.

        Returns:
            The value unchanged when it ends with "?".

        Raises:
            ValueError: If the value does not end with "?", including
                when it is empty.
        """
        # endswith() also covers the empty string, where field[-1] would
        # raise IndexError instead of the ValueError used to report a
        # badly formed question.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field


# A query intended to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

# Build a parser for the Joke schema, then a prompt template that carries
# the parser's format instructions.
parser = PydanticOutputParser(pydantic_object=Joke)

prompt = PromptTemplate(
    input_variables=["query"],
    # Partial variables are injected into the template.
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
    },
    template="Answer the user query.\n{format_instructions}\n{query}\n",
)

# Pipe the prompt into the model, and the model's output into the parser.
chain = prompt | model | parser

In [5]:
chain.invoke({"query": joke_query})

Joke(setup="Why don't scientists trust atoms?", punchline='Because they make up everything!')

In [6]:
from langchain_community.callbacks.manager import get_openai_callback

# Chat model configured with stream_usage=True for the token-usage cells below.
llm = ChatOpenAI(model="gpt-4o-mini", stream_usage=True)

# Build the chain on `llm` (not the earlier `model`) so that the instance
# configured in this cell is the one actually invoked.
chain = prompt | llm | parser

In [7]:
# Invoke the chain inside the OpenAI callback context, then print the parsed
# result followed by the callback's summary, each on its own line.
with get_openai_callback() as callback:
    result = chain.invoke({"query": joke_query})
    print(result, callback, sep="\n")

setup='Why did the scarecrow win an award?' punchline='Because he was outstanding in his field!'
Tokens Used: 236
	Prompt Tokens: 206
	Completion Tokens: 30
Successful Requests: 1
Total Cost (USD): $4.8899999999999996e-05


In [12]:
# Total, completion and prompt token counts from the callback, one per line.
print(
    callback.total_tokens,
    callback.completion_tokens,
    callback.prompt_tokens,
    sep="\n",
)

236
30
206


### Example from [【Day 15】LCEL 結合自訂 & 原生 Runnable 實戰](https://ithelp.ithome.com.tw/articles/10345264)

In [5]:
# Define the data structure.
class Kpop(BaseModel):
    singer: str = Field(description="Name of the singer")
    gender: str = Field(description="Gender of the singer")
    group: str = Field(description="Group of the singer")
    age: int = Field(description="Age of the singer")
    song: List[str] = Field(description="Song of the singer")

    # Pydantic lets you add simple custom validation logic.
    @validator("gender")
    def validate_gender(cls, field):
        """Return ``field`` when it is "Male" or "Female"; otherwise raise ValueError."""
        if field in ("Male", "Female"):
            return field
        raise ValueError("Invalid gender value")

# Hand the Kpop schema to the Pydantic parser.
parser = PydanticOutputParser(pydantic_object=Kpop)

# The template has two variables: the one supplied as input, and the
# parser's description of the Pydantic structure.
prompt = PromptTemplate(
    input_variables=["company"],
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
    },
    template="Tell me a girl Kpop idols from {company}, {format_instructions}",
)

# Chain the pieces together and show the result.
chain = prompt | model | parser
print(chain.invoke("SM Entertainment"))

singer='Taeyeon' gender='Female' group="Girls' Generation" age=32 song=['Into the New World', 'Gee', 'I', 'Rain']


In [8]:
# Here's another example, but with a compound typed field.
# Schema for one actor: a name plus the titles of their films.
class Actor(BaseModel):
    name: str = Field(description="name of an actor")
    # The compound-typed field: a list of strings rather than a single value.
    film_names: List[str] = Field(description="list of names of films they starred in")


# Query used to ask the model for an actor and their films.
actor_query = "Generate the filmography for a random actor."

parser = PydanticOutputParser(pydantic_object=Actor)

# Adjacent string literals keep the template readable across several lines
# without embedding the source indentation in the prompt, which a backslash
# continuation inside a single literal would do.
prompt = PromptTemplate(
    template=(
        "Answer the user query.\n"
        "{format_instructions}\n"
        "{query}\n"
    ),
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipe the prompt into the model, and the model's output into the parser.
chain = prompt | model | parser

chain.invoke({"query": actor_query})

Actor(name='Tom Hanks', film_names=['Forrest Gump', 'Cast Away', 'Saving Private Ryan', 'Toy Story', 'Apollo 13', 'The Green Mile'])

In [15]:
# Schema for the key-word extraction example: a single list-of-strings field.
class Output(BaseModel):
    key_words: List[str] = Field(description="list of key words from user's input")

parser = PydanticOutputParser(pydantic_object=Output)

# Adjacent string literals keep the template readable across several lines
# without embedding the source indentation in the prompt, which a backslash
# continuation inside a single literal would do.
prompt = PromptTemplate(
    template=(
        "Distill the key words from the user's input, if no available return empty.\n"
        "{format_instructions}\n"
        "User:{query}\n"
    ),
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipe the prompt into the model, and the model's output into the parser.
chain = prompt | model | parser

# Alternative input to try:
# res = chain.invoke({"query": "I want to know the weather in New York."})
res = chain.invoke({"query": "Hello, how are you doing today?"})
# The parser returns an Output instance, so show its type and the parsed list.
print(type(res))
print(res.key_words)

<class '__main__.Output'>
['Hello', 'doing', 'today']
