# [Build an Extraction Chain](https://python.langchain.com/docs/tutorials/extraction/)

In [None]:
from dotenv import load_dotenv
# Load settings from a local `.env` file into the environment before any client
# is created; the `True` shown as this cell's output is the call's return value.
# NOTE(review): presumably this is where the OpenAI API key comes from -- confirm.
load_dotenv()

True

## The Schema
First, we need to describe what information we want to extract from the text.

We'll use Pydantic to define an example schema to extract personal information.

In [3]:
from typing import Optional

from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # The docstring above is more than documentation: it is sent to the LLM as
    # the description of the `Person` schema, so its wording can affect
    # extraction results.

    # Conventions followed by every field below:
    # 1. Each field is `Optional` with `default=None` -- this allows the model
    #    to decline to extract a value instead of inventing one.
    # 2. Each field has a `description` -- the LLM sees it too, so a good
    #    description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    # Declared as a string rather than a number, so an extracted height comes
    # back as text such as '1.83'.
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )

There are two best practices when defining a schema:

1. Document the attributes and the schema itself: This information is sent to the LLM and is used to improve the quality of information extraction.
2. Do not force the LLM to make up information! Above, we used `Optional` for the attributes, which allows the LLM to output `None` if it doesn't know the answer.

## The Extractor
Let's create an information extractor using the schema we defined above.

In [15]:
# from typing import Optional

# from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# from pydantic import BaseModel, Field

# # Define a custom prompt to provide instructions and any additional context.
# # 1) You can add examples into the prompt template to improve extraction quality
# # 2) Introduce additional parameters to take context into account (e.g., include metadata
# #    about the document from which the text was extracted.)
# prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "You are an expert extraction algorithm. "
#             "Only extract relevant information from the text. "
#             "If you do not know the value of an attribute asked to extract, "
#             "return null for the attribute's value.",
#         ),
#         # Please see the how-to about improving performance with
#         # reference examples.
#         # MessagesPlaceholder('examples'),
#         ("human", "{text}"),
#     ]
# )

In [25]:
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.prompts import HumanMessagePromptTemplate, AIMessagePromptTemplate, StringPromptTemplate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples to the prompt template to improve extraction quality.
# 2) You can introduce additional parameters to take context into account
#    (e.g., metadata about the document from which the text was extracted).
prompt = ChatPromptTemplate.from_messages(
    [
        # Adjacent string literals are concatenated into a single line of text.
        # A triple-quoted block here would also send the source indentation,
        # line breaks and trailing spaces to the model as part of the prompt.
        SystemMessage(
            content=(
                "You are an expert extraction algorithm. "
                "Only extract relevant information from the text. "
                "If you do not know the value of an attribute asked to extract, "
                "return null for the attribute's value."
            )
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        # The text to extract from is supplied through the `text` input variable.
        HumanMessagePromptTemplate.from_template("{text}"),
    ]
)

In [26]:
from langchain_openai import ChatOpenAI

# Chat model that performs the extraction.
llm = ChatOpenAI(temperature=0.2, model="gpt-4o-mini")

# Bind the `Person` schema to the model first, then pipe the prompt into it.
person_extractor = llm.with_structured_output(schema=Person)
runnable = prompt | person_extractor

In [27]:
# Sample sentence: the height is given in feet, while the schema asks for meters.
text = "Alan Smith is 6 feet tall and has blond hair."
# Kept as the cell's last expression so the notebook displays the extracted `Person`.
runnable.invoke({"text": text})

Person(name='Alan Smith', hair_color='blond', height_in_meters='1.83')

## Multiple Entities
In most cases, you should be extracting a list of entities rather than a single entity.

This can be easily achieved with Pydantic by nesting models inside one another.

In [28]:
from typing import List, Optional

from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # The docstring above is more than documentation: it is sent to the LLM as
    # the description of the `Person` schema, so its wording can affect
    # extraction results.

    # Conventions followed by every field below:
    # 1. Each field is `Optional` with `default=None` -- this allows the model
    #    to decline to extract a value instead of inventing one.
    # 2. Each field has a `description` -- the LLM sees it too, so a good
    #    description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    # Declared as a string rather than a number, so an extracted height comes
    # back as text such as '1.83'.
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )


class Data(BaseModel):
    """Extracted data about people."""

    # Wrapper schema: holding a list of `Person` lets a single extraction
    # return several entities. Unlike the fields on `Person`, `people` is
    # declared without a default value.
    people: List[Person]

In [29]:
# Rebuild the chain around `Data` so that one call can return several people.
runnable = prompt | llm.with_structured_output(schema=Data)
# Two people are mentioned; Anna's hair color is only given relative to Jeff's,
# and her height is not given at all.
text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."
# Kept as the cell's last expression so the notebook displays the extracted `Data`.
runnable.invoke({"text": text})

Data(people=[Person(name='Jeff', hair_color='black', height_in_meters='1.83'), Person(name='Anna', hair_color='black', height_in_meters=None)])