> **NOTE**: This notebook follows the [structured data extraction use case from the LlamaIndex documentation](https://docs.llamaindex.ai/en/stable/understanding/extraction/).

# Create Data Structure

- Use class docstrings and `pydantic.Field(description="...")` to attach descriptive metadata to each model and field.

In [3]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by later cells — confirm.
# Kept as the cell's final bare expression so its return value is displayed.
load_dotenv()

True

In [4]:
from pydantic import BaseModel, Field

from datetime import datetime


class LineItem(BaseModel):
    """A line item in an invoice."""

    # The docstring above and every `description` below are schema metadata,
    # not plain comments: changing their wording changes the generated schema.

    # Required: no default is declared.
    item_name: str = Field(
        description="The name of this item",
    )
    # Required; stored as a float, no currency or unit is recorded here.
    price: float = Field(
        description="The price of this item",
    )


class Invoice(BaseModel):
    """A representation of information from an invoice."""

    # The docstring above and every `description` below are schema metadata,
    # not plain comments: changing their wording changes the generated schema.
    # All three fields are required (none declares a default).

    # Typed as a string even though the description says it is often a number.
    invoice_id: str = Field(description="A unique identifier for this invoice, often a number")
    # Parsed into a `datetime`, not kept as free text.
    date: datetime = Field(
        description="The date this invoice was created",
    )
    # Nested models: each entry is validated as a LineItem.
    line_items: list[LineItem] = Field(description="A list of all the items in this invoice")

class FullName(BaseModel):
    """A representation of a full name."""

    # The docstring above and every `description` below are schema metadata,
    # not plain comments: changing their wording changes the generated schema.

    # Required name parts (no default declared).
    last_name: str = Field(description="The last name")
    first_name: str = Field(description="The first name")

    # Optional name parts: each falls back to an empty string.
    middle_name: str = Field(
        default="",
        description="The middle name or initial",
    )
    title: str = Field(
        default="",
        description="The title of this contact",
    )
    cadency: str = Field(
        default="",
        description="The cadency of this contact",
    )

class PostInfo(BaseModel):
    """A representation of a post information."""

    # The docstring above and every `description` below are schema metadata,
    # not plain comments: changing their wording changes the generated schema.
    # Every field is optional: strings default to "", the flag to False.

    org: str = Field(
        default="",
        description="The organization this contact belongs to",
    )
    service_branch: str = Field(
        default="",
        description="The service branch of this contact",
    )
    is_contractor: bool = Field(
        default=False,
        description="Is this contact a contractor?",
    )

class Contact(BaseModel):
    """A representation of a contact."""

    # The docstring above and every `description` below are schema metadata,
    # not plain comments: changing their wording changes the generated schema.

    # Nested models; both are required (no default declared).
    full_name: FullName = Field(
        description="The full name of this contact",
    )
    post_info: PostInfo = Field(
        description="The post information of this contact",
    )

    # Required strings (no default declared).
    email: str = Field(
        description="The email address of this contact",
    )
    phone: str = Field(
        description="The phone number of this contact",
    )

    # Optional: falls back to an empty string.
    contact_reason: str = Field(
        default="",
        description="Project involved",
    )

In [5]:
from llama_index.core import SimpleDirectoryReader

# Build a reader over the email directory, then load its files as documents.
email_reader = SimpleDirectoryReader("../data/email")
documents = email_reader.load_data()

In [6]:
from llama_index.llms.openai import OpenAI

# Base LLM client.
# NOTE(review): timeout is presumably in seconds — confirm against the OpenAI LLM docs.
llm = OpenAI(
    model="gpt-4o",
    timeout=300,
)

# Structured wrapper: completions are produced against the Contact model.
sllm = llm.as_structured_llm(Contact)

> NOTE: 
> 
> This method can ONLY handle a single structure per call. It CANNOT split a text into several records and then extract a structure from each, so the text must be split before it is passed in.

In [16]:
# Run one structured completion per non-blank line of the first document.
# NOTE(review): assumes each line describes exactly one contact — confirm against the data.
email_text = documents[0].text

responses = []
for person in email_text.split("\n"):
    # Blank or whitespace-only lines carry nothing to extract.
    if not person.strip():
        continue
    # Appending as we go keeps earlier results available if a later call fails.
    responses.append(sllm.complete(person))

In [None]:
# Imported here because no visible cell imports json before this one uses it.
import json

# Pretty-print the JSON text of every structured response.
# The loop variable is deliberately named `contact`: later cells reuse it.
for contact in responses:
    parsed = json.loads(contact.text)
    # ensure_ascii=False keeps non-ASCII characters (e.g. accented names) readable.
    print(json.dumps(parsed, indent=2, ensure_ascii=False))

In [26]:
# Convert into a Pydantic object.
# model_validate_json parses and validates the JSON text in one step, so this
# cell does not need the json module. It reads the last response straight from
# `responses` instead of relying on a loop variable left over from another cell.
# NOTE(review): the response also exposes `raw`, which may already hold the
# parsed Contact — confirm against the llama-index docs.
Contact.model_validate_json(responses[-1].text)

Contact(full_name=FullName(last_name='Jones', first_name='Dominic', middle_name='R', title='', cadency='01'), post_info=PostInfo(org='FAA', service_branch='', is_contractor=False), email='Dominic.R.Jones01@faa.gov', phone='', contact_reason='')

In [20]:
# List the public attributes of the response object, one name per line.
# `contact` is whatever the earlier print loop last bound it to.
for att in dir(contact):
    # Names with a leading underscore are private/dunder members; skip them.
    if att.startswith("_"):
        continue
    print(att)

additional_kwargs
construct
copy
delta
dict
from_orm
json
logprobs
model_computed_fields
model_config
model_construct
model_copy
model_dump
model_dump_json
model_extra
model_fields
model_fields_set
model_json_schema
model_parametrized_name
model_post_init
model_rebuild
model_validate
model_validate_json
model_validate_strings
parse_file
parse_obj
parse_raw
raw
schema
schema_json
text
update_forward_refs
validate


In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader
from pathlib import Path

# Load the receipt PDF and keep the text of its first returned document.
receipt_path = Path("./uber_receipt.pdf")
pdf_reader = PDFReader()
documents = pdf_reader.load_data(file=receipt_path)
# NOTE(review): only documents[0] is used; if the reader returns one document
# per page, later pages are ignored — confirm this is intended.
text = documents[0].text

In [5]:
# NOTE(review): `reader` is not defined in any visible cell (only `pdf_reader` is),
# so this would raise NameError on a top-to-bottom run unless it is defined in a
# cell not shown here. Looks like a stale cell — confirm which reader was intended.
# Running it also overwrites the `documents` loaded from the PDF above.
documents = reader.load_data()