In [1]:
import os
import getpass

os.environ['OPENAI_API_KEY'] = getpass.getpass("Enter your Open API key: ")

Enter your Open API key: ········


In [2]:
# Use getpass.getpass() to prompt for the API key securely
os.environ["LLAMA_CLOUD_API_KEY"] = getpass.getpass("Enter your Llama Cloud API key: ")

Enter your Llama Cloud API key: ········


In [3]:
spider_api_key = getpass.getpass("Enter your Spider API key: ")

Enter your Spider API key: ········


In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import SummaryIndex
from llama_index.core import Document
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.openai import OpenAI

from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.postprocessor import KeywordNodePostprocessor

from llama_index.core.tools import QueryEngineTool
from llama_index.core.tools import FunctionTool

from llama_index.core.agent import ReActAgent


from llama_index.readers.web import SimpleWebPageReader
from llama_index.readers.web import SpiderWebReader

from IPython.display import Markdown, display

from llama_parse import LlamaParse

import logging
import sys
import nest_asyncio


In [11]:
import json

In [7]:
spider_reader = SpiderWebReader(
    api_key= spider_api_key,  # Get one at https://spider.cloud
    mode="scrape",
    # params={} # Optional parameters see more on https://spider.cloud/docs/api
)


documents_delta = spider_reader.load_data(url="https://r.jina.ai/https://www.delta.com/us/en/baggage/carry-on-baggage?srsltid=AfmBOopJ1ha7OEiwm46qaLxEQ_tvi6lrtK7NOxt0dttocsarTY1-pm1V")
documents_delta

[Document(id_='a334a475-294c-47a5-b377-093ce44b216c', embedding=None, metadata={'description': '', 'domain': 'r.jina.ai', 'extracted_data': None, 'file_size': 8634, 'keywords': None, 'pathname': '/https://www.delta.com/us/en/baggage/carry-on-baggage?srsltid=AfmBOopJ1ha7OEiwm46qaLxEQ_tvi6lrtK7NOxt0dttocsarTY1-pm1V', 'resource_type': '.md', 'title': '', 'url': None, 'user_id': '5600b1d3-d5ea-4943-8739-c84ddd149cae'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Title: Carry-On Baggage | Delta Air Lines\n\nURL Source: https://www.delta.com/us/en/baggage/carry-on-baggage?srsltid=AfmBOopJ1ha7OEiwm46qaLxEQ_tvi6lrtK7NOxt0dttocsarTY1-pm1V\n\nMarkdown Content:\n* * *\n\nNot sure whether to check your bag or carry it on? Take a few minutes to learn the can-do and no-can-do rules of carry-on baggage. If you don’t want to carry-on, it’s sti

# Pydantic Class

In [8]:
from pydantic import BaseModel
from typing import List, Optional

# Define Pydantic model for a rule entry
class Rule(BaseModel):
    item: str
    allowed_quantity: str
    exemption: Optional[str] = None

# Define a model for a collection of rules
class RulesDocument(BaseModel):
    rules: List[Rule]


In [9]:
from pydantic import ValidationError

# Set up LLM
llm = OpenAI(model_name="gpt-4o-mini", temperature=0.1)

# Step 3: Use documents_delta to create an index
index = VectorStoreIndex.from_documents(documents_delta)
query_engine = index.as_query_engine()

# Step 4: Query the document for structured data
query = """
Extract the following structured data from the rules document as a JSON array:
- "item": name of the item
- "allowed_quantity": the allowed quantity
- "exemption": any relaxation/exemption condition (if applicable).
"""
response = query_engine.query(query)

In [13]:
import json

# Validate and print the structured data
try:
    validated_data = RulesDocument(rules=extracted_data)
    print("Validated Structured Data:")
    print(json.dumps(validated_data.model_dump(), indent=2))  # Correct way to format the output
except ValidationError as e:
    print("Validation Error:", e)
    print("Raw Extracted Data:")
    print(response.response)


Validated Structured Data:
{
  "rules": [
    {
      "item": "liquids, gels, aerosols and pastes",
      "allowed_quantity": "maximum of 3.4 ounces or 100 milliliters",
      "exemption": "Special provisions for necessary items such as medication, breast milk, and juice or formula for infants"
    },
    {
      "item": "personal items",
      "allowed_quantity": "1 purse, briefcase, small backpack, camera bag or diaper bag; 1 laptop bag; 1 item of similar or smaller size",
      "exemption": "Flight attendants can assist with larger items like strollers"
    },
    {
      "item": "free items to carry on",
      "allowed_quantity": "A jacket and/or umbrella; Food or drink purchased past the security checkpoint; Duty-free merchandise; Special items like strollers, wheelchairs, child safety seats, assistive devices like crutches",
      "exemption": null
    }
  ]
}


In [16]:
# Save validated data to a file
with open("validated_rules.json", "w") as f:
    json.dump(validated_data.model_dump(), f, indent=2)  # Properly format and save the JSON


# Using Structured LLM

#### What It Does:
The as_structured_llm() method creates a wrapper around the LLM that allows it to produce outputs directly mapped to a predefined structured schema (e.g., a Pydantic class like RulesDocument).
This eliminates the need to manually parse or validate the LLM’s output because the output is automatically converted to the specified schema.

#### Use Case in Your Scenario:
If you are frequently using a specific schema (like RulesDocument), this feature can:

1. Simplify the Workflow:
2. Directly enforce structured responses without manually validating the output with Pydantic.
3. Reduce Errors: Automatically ensure the response conforms to the schema, reducing the chance of malformed data

In [23]:
from llama_index.core.prompts import PromptTemplate



In [25]:
# Step 2: Define the LLM and structured LLM
llm = OpenAI(model_name="gpt-4o", temperature=0.1)
sllm = llm.as_structured_llm(RulesDocument)

# Step 3: Define the prompt template
prompt_template = PromptTemplate(
    template="""
    Extract structured data about items, allowed quantities, and exemptions from the given document.
    Provide the data in JSON format conforming to the schema:
    - "item": name of the item
    - "allowed_quantity": the allowed quantity
    - "exemption": any relaxation/exemption condition (if applicable).
    Document: {documents}
    """
)

# Step 4: Use the structured LLM for prediction
try:
    response = sllm.structured_predict(
        documents=documents_delta,
        prompt=prompt_template,  # Use the structured prompt template
        output_cls=RulesDocument  # Ensure output conforms to this schema
    )
    print("Validated Structured Data:")
    print(response.model_dump(indent=2))  # Output validated data
except ValidationError as e:
    print("Validation Error:", e)

IndexError: list index out of range

In [28]:
# Define the structured prompt
prompt_template = PromptTemplate(
    template="""
    Extract the following structured data from the document:
    - "item": The name of the item.
    - "allowed_quantity": The allowed quantity.
    - "exemption": Any exemption condition.
    Return the data in JSON format according to the schema.

    Document: {documents}
    """
)

# Initialize the LLM
llm = OpenAI(model_name="gpt-4o", temperature=0.1)
    

# Step 4: Use the structured prediction
try:
    response = llm.structured_predict(
        prompt=prompt_template,         # The structured prompt
        documents=documents_delta,      # The input document
        output_cls=RulesDocument        # The Pydantic schema for validation
    )
    print("Validated Structured Data:")
    print(json.dumps(response.model_dump(), indent=2))  # Pretty-print the output
except ValidationError as e:
    print("Validation Error:", e)
except Exception as e:
    print("Error during structured prediction:", e)


Validated Structured Data:
{
  "rules": [
    {
      "item": "Carry-On Bag",
      "allowed_quantity": "1",
      "exemption": null
    },
    {
      "item": "Personal Item",
      "allowed_quantity": "1",
      "exemption": "Passengers traveling on Delta Connection flights with 50 seats or less are permitted to carry personal items onboard the aircraft"
    }
  ]
}
