> **NOTE**: This notebook follows the [structured data extraction use case from the LlamaIndex documentation](https://docs.llamaindex.ai/en/stable/understanding/extraction/).

# Create Data Structure

- Use class docstrings and `pydantic.Field(description='')` to attach metadata to each model and field

In [1]:
from pydantic import BaseModel, Field

from datetime import datetime

In [2]:
class LineItem(BaseModel):
    """A line item in an invoice."""

    # The class docstring and the Field descriptions are the metadata this
    # notebook attaches to the model, so their wording is kept verbatim.
    # Both fields are required (no default is supplied).
    item_name: str = Field(
        description="The name of this item",
    )
    price: float = Field(
        description="The price of this item",
    )


class Invoice(BaseModel):
    """A representation of information from an invoice."""

    # All three fields are required (no default is supplied). The docstring
    # and descriptions are metadata strings and are kept verbatim.
    invoice_id: str = Field(
        description="A unique identifier for this invoice, often a number",
    )
    date: datetime = Field(
        description="The date this invoice was created",
    )
    # Nested model: each entry is validated as a LineItem.
    line_items: list[LineItem] = Field(
        description="A list of all the items in this invoice",
    )

class FullName(BaseModel):
    """A representation of a full name."""

    # Required parts of the name.
    last_name: str = Field(
        description="The last name",
    )
    first_name: str = Field(
        description="The first name",
    )

    # Optional parts; each falls back to an empty string when absent.
    middle_name: str = Field(
        default="",
        description="The middle name or initial",
    )
    title: str = Field(
        default="",
        description="The title of this contact",
    )
    # NOTE(review): "cadency" presumably means a generational suffix
    # (e.g. Jr./Sr.) -- confirm with the author.
    cadency: str = Field(
        default="",
        description="The cadency of this contact",
    )

class PostInfo(BaseModel):
    """A representation of a post information."""

    # Every field is optional: strings default to "" and the contractor
    # flag defaults to False.
    org: str = Field(
        default="",
        description="The organization this contact belongs to",
    )
    service_branch: str = Field(
        default="",
        description="The service branch of this contact",
    )
    is_contractor: bool = Field(
        default=False,
        description="Is this contact a contractor?",
    )

class Contact(BaseModel):
    """A representation of a contact."""

    # Nested models (both required).
    full_name: FullName = Field(
        description="The full name of this contact",
    )
    post_info: PostInfo = Field(
        description="The post information of this contact",
    )

    # Direct contact details (both required, stored as plain strings).
    email: str = Field(
        description="The email address of this contact",
    )
    phone: str = Field(
        description="The phone number of this contact",
    )

    # Optional free-text field; defaults to an empty string.
    contact_reason: str = Field(
        default="",
        description="Project involved",
    )

In [4]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader
from pathlib import Path

# Alternative kept for reference (disabled): read a single PDF with PDFReader
# and take the text of the first returned document.
# pdf_reader = PDFReader()
# documents = pdf_reader.load_data(file=Path("./uber_receipt.pdf"))
# text = documents[0].text

# Active approach: build a directory reader over the relative path ../data.
reader = SimpleDirectoryReader(input_dir=Path("../data"))

In [5]:
# Load the reader's input into `documents`; later cells index it as a
# sequence (documents[0]).
documents = reader.load_data()

In [11]:
# Print the public attribute names of the first loaded document, one per
# line, skipping anything that starts with an underscore.
for att in dir(documents[0]):
    if att.startswith("_"):
        continue
    print(att)

as_related_node_info
audio_resource
child_nodes
class_name
construct
copy
custom_model_dump
dict
doc_id
embedding
example
excluded_embed_metadata_keys
excluded_llm_metadata_keys
extra_info
from_cloud_document
from_dict
from_embedchain_format
from_haystack_format
from_json
from_langchain_format
from_orm
from_semantic_kernel_format
get_content
get_doc_id
get_embedding
get_metadata_str
get_type
hash
id_
image_resource
json
metadata
metadata_separator
metadata_template
model_computed_fields
model_config
model_construct
model_copy
model_dump
model_dump_json
model_extra
model_fields
model_fields_set
model_json_schema
model_parametrized_name
model_post_init
model_rebuild
model_validate
model_validate_json
model_validate_strings
next_node
node_id
parent_node
parse_file
parse_obj
parse_raw
prev_node
ref_doc_id
relationships
schema
schema_json
set_content
source_node
text
text_resource
text_template
to_cloud_document
to_dict
to_embedchain_format
to_haystack_format
to_json
to_langchain_format
to_