In [1]:
# Load IPython's autoreload extension so edited modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell's code is executed.
%autoreload 2

In [4]:
from typing import Optional

from langchain_core.documents import Document
from pydantic import BaseModel, Field, RootModel

from langchain.chains.summarization.map_reduce import create_map_reduce_extractor

# from langchain.extraction import create_batch_extractor, create_parallel_extractor
from langchain.chat_models import init_chat_model


class Person(BaseModel):
    """Person to extract."""

    # `name` is the only required field; every other field has a default.
    name: str
    # Kept as a string, not a number (the recorded output shows age='10').
    age: Optional[str] = None
    hair_color: Optional[str] = None
    # Provenance: which document IDs the extracted facts came from.
    # Defaults to an empty list when nothing is reported.
    source_doc_ids: list[str] = Field(
        default=[],
        description="The IDs of the documents where the information was found.",
    )


class PeopleRoot(BaseModel):
    # Top-level container for every extracted Person; this is the class handed
    # to the extractor as `response_format`. Defaults to an empty list.
    people: list[Person] = []

In [5]:
# Parametrize RootModel with square brackets to obtain a model *class* whose
# root is a list of Person. Calling RootModel(list[Person]) would instead build
# an *instance* whose root value is the type object itself.
People = RootModel[list[Person]]

# Sample corpus for the extraction demo. Facts about people are embedded in
# otherwise unrelated text: Bobby Luka's age is in document "1" and his hair
# color is in document "3", alongside Jack Johnson.
documents = [
    Document(
        id="1",
        page_content="""Bobby Luka was 10 years old.
Synthetic fuels—produced from captured carbon and green hydrogen—are gaining traction in aviation. The EU’s “ReFuelEU” mandate requires increasing blends of sustainable aviation fuel (SAF) starting in 2025. Airbus and Rolls-Royce have completed long-haul test flights powered entirely by synthetic kerosene. However, current production is less than 0.1% of global jet fuel demand, and costs remain 3–5x higher than fossil alternatives.

New refineries are under construction in Norway, Chile, and Texas. Analysts expect rapid scaling between 2026–2030 as demand certainty from mandates intersects with cheaper electrolyzer technology. Airlines like Lufthansa and United are signing multi-year offtake agreements, often at fixed above-market prices, to secure early access.
""",
        metadata={"source": "synthetic_fuel_aviation"},
    ),
    # Document "2" mentions no people.
    Document(
        id="2",
        page_content="""
AI is accelerating early-stage drug discovery, especially in target identification and molecule generation. Platforms like BenevolentAI and Insilico Medicine have generated preclinical candidates using generative models trained on biological and chemical data. However, the biggest bottleneck is not compute—it’s high-quality training data. Much of the biomedical literature is unstructured, biased, or contradictory.

To address this, some startups are partnering with CROs (contract research organizations) to generate clean, proprietary datasets. Others are building large-scale knowledge graphs across modalities—genomics, imaging, EMRs—enabling more robust model fine-tuning. The FDA has begun pilot programs to assess how AI-generated candidates can be validated through adaptive trial design.
""",
        metadata={"source": "ai_drug_discovery"},
    ),
    # This document is created without an explicit `id` and mentions no people.
    Document(
        page_content="""
I love to eat tomatoes!!!!!
""",
        metadata={"source": "ai_drug_discovery"},
    ),
    # Document "3" introduces Jack Johnson and adds a second fact about Bobby Luka.
    Document(
        id="3",
        page_content="""Jack Johnson was 23 years old and blonde.
AI is accelerating early-stage drug discovery, especially in target identification and molecule generation. Platforms like BenevolentAI and Insilico Medicine have generated preclinical candidates using generative models trained on biological and chemical data. However, the biggest bottleneck is not compute—it’s high-quality training data. Much of the biomedical literature is unstructured, biased, or contradictory.

To address this, some startups are partnering with CROs (contract research organizations) to generate clean, proprietary datasets. Others are building large-scale knowledge graphs across modalities—genomics, imaging, EMRs—enabling more robust model fine-tuning. The FDA has begun pilot programs to assess how AI-generated candidates can be validated through adaptive trial design.

Bobby Luka's hair is brown.
""",
        metadata={"source": "ai_drug_discovery"},
    ),
]


# Chat model used for the extraction calls.
model = init_chat_model("claude-opus-4-20250514")

# Build the map-reduce extractor first, then compile it under a readable name.
extractor_builder = create_map_reduce_extractor(model, response_format=PeopleRoot)
summarizer = extractor_builder.compile(name="MapReducerExtractor")

# Run the extractor over the sample documents; the cells below inspect the
# "results" and "result" keys of the returned mapping.
output = summarizer.invoke({"documents": documents})

In [6]:
# Intermediate extractions: each entry pairs an `indexes` list with a `result`.
output["results"]

[{'indexes': [0],
  'result': PeopleRoot(people=[Person(name='Bobby Luka', age='10', hair_color=None, source_doc_ids=['1'])])},
 {'indexes': [1], 'result': PeopleRoot(people=[])},
 {'indexes': [2], 'result': PeopleRoot(people=[])},
 {'indexes': [3],
  'result': PeopleRoot(people=[Person(name='Jack Johnson', age='23', hair_color='blonde', source_doc_ids=['3']), Person(name='Bobby Luka', age=None, hair_color='brown', source_doc_ids=['3'])])}]

In [8]:
# Combined extraction; in the recorded output Bobby Luka's facts from
# documents "1" and "3" appear on a single Person.
output["result"]

PeopleRoot(people=[Person(name='Bobby Luka', age='10', hair_color='brown', source_doc_ids=['1', '3']), Person(name='Jack Johnson', age='23', hair_color='blonde', source_doc_ids=['3'])])

# Sequential extraction

In [15]:
# Sample corpus for the extraction demo. Facts about people are embedded in
# otherwise unrelated text: Bobby Luka's age is in document "1" and his hair
# color is in document "3", alongside Jack Johnson.
documents = [
    Document(
        id="1",
        page_content="""Bobby Luka was 10 years old.
Synthetic fuels—produced from captured carbon and green hydrogen—are gaining traction in aviation. The EU’s “ReFuelEU” mandate requires increasing blends of sustainable aviation fuel (SAF) starting in 2025. Airbus and Rolls-Royce have completed long-haul test flights powered entirely by synthetic kerosene. However, current production is less than 0.1% of global jet fuel demand, and costs remain 3–5x higher than fossil alternatives.

New refineries are under construction in Norway, Chile, and Texas. Analysts expect rapid scaling between 2026–2030 as demand certainty from mandates intersects with cheaper electrolyzer technology. Airlines like Lufthansa and United are signing multi-year offtake agreements, often at fixed above-market prices, to secure early access.
""",
        metadata={"source": "synthetic_fuel_aviation"},
    ),
    # Document "2" mentions no people.
    Document(
        id="2",
        page_content="""
AI is accelerating early-stage drug discovery, especially in target identification and molecule generation. Platforms like BenevolentAI and Insilico Medicine have generated preclinical candidates using generative models trained on biological and chemical data. However, the biggest bottleneck is not compute—it’s high-quality training data. Much of the biomedical literature is unstructured, biased, or contradictory.

To address this, some startups are partnering with CROs (contract research organizations) to generate clean, proprietary datasets. Others are building large-scale knowledge graphs across modalities—genomics, imaging, EMRs—enabling more robust model fine-tuning. The FDA has begun pilot programs to assess how AI-generated candidates can be validated through adaptive trial design.
""",
        metadata={"source": "ai_drug_discovery"},
    ),
    # This document is created without an explicit `id` and mentions no people.
    Document(
        page_content="""
I love to eat tomatoes!!!!!
""",
        metadata={"source": "ai_drug_discovery"},
    ),
    # Document "3" introduces Jack Johnson and adds a second fact about Bobby Luka.
    Document(
        id="3",
        page_content="""Jack Johnson was 23 years old and blonde.
AI is accelerating early-stage drug discovery, especially in target identification and molecule generation. Platforms like BenevolentAI and Insilico Medicine have generated preclinical candidates using generative models trained on biological and chemical data. However, the biggest bottleneck is not compute—it’s high-quality training data. Much of the biomedical literature is unstructured, biased, or contradictory.

To address this, some startups are partnering with CROs (contract research organizations) to generate clean, proprietary datasets. Others are building large-scale knowledge graphs across modalities—genomics, imaging, EMRs—enabling more robust model fine-tuning. The FDA has begun pilot programs to assess how AI-generated candidates can be validated through adaptive trial design.

Bobby Luka's hair is brown.
""",
        metadata={"source": "ai_drug_discovery"},
    ),
]

In [16]:
from typing import Optional

from langchain_core.documents import Document
from pydantic import BaseModel, Field

from langchain.chains.summarization import create_summarizer
from langchain.chat_models import init_chat_model


# NOTE(review): an identical `Person` class is defined in an earlier cell of
# this notebook; running this cell rebinds the name.
class Person(BaseModel):
    """Person to extract."""

    # `name` is the only required field; every other field has a default.
    name: str
    # Kept as a string, not a number (the recorded output shows age='10').
    age: Optional[str] = None
    hair_color: Optional[str] = None
    # Provenance: which document IDs the extracted facts came from.
    # Defaults to an empty list when nothing is reported.
    source_doc_ids: list[str] = Field(
        default=[],
        description="The IDs of the documents where the information was found.",
    )


class PeopleRoot(BaseModel):
    # Top-level container for every extracted Person; this is the class handed
    # to `create_summarizer` as `response_format`.
    # NOTE(review): unlike the earlier PeopleRoot in this notebook, `people`
    # has no default here, so it is a required field — confirm this is intended.
    people: list[Person]


# Chat model used for the extraction calls.
model = init_chat_model("claude-opus-4-20250514")

# Build and compile the summarizer in structured-extraction mode:
# - `initial_prompt` is the instruction for the first extraction pass,
# - `refine_prompt` is the instruction for updating an existing extraction with
#   new text (typo "ne winformation" corrected so the model receives a clean
#   instruction),
# - `response_format` makes the result a PeopleRoot instance, as the recorded
#   output in the next cell shows.
summarizer = create_summarizer(
    model,
    initial_prompt="Extract information from the text to match the expected output format.",
    refine_prompt=(
        "You are responsible for updating extracted information. "
        "The currently extracted information is provided in the context. "
        "Make sure to carry it forward and / or update it with any new found facts in the given text. "
        "Also if there's new information that should be extracted in the given text please do so."
    ),
    response_format=PeopleRoot,
    strategy="batch",
).compile(name="Summarizer")

# Run over the sample documents and display the returned mapping.
output = summarizer.invoke({"documents": documents})
output

In [17]:
# Re-display the mapping returned by the summarizer; its "result" key holds the PeopleRoot.
output

{'result': PeopleRoot(people=[Person(name='Bobby Luka', age='10', hair_color='brown', source_doc_ids=['1', '3']), Person(name='Jack Johnson', age='23', hair_color='blonde', source_doc_ids=['3'])])}