In [35]:
import os
from dotenv import load_dotenv, find_dotenv

from langchain_core.documents import Document

from unstructured.partition.pdf import partition_pdf

import unstructured_client
from unstructured_client.models import operations, shared

# Locate a .env file and load its variables into the process environment so
# later cells can read them with os.getenv (e.g. UNSTRUCTURED_API_KEY).
load_dotenv(find_dotenv())

True

### 1.0 Using the Unstructured API

Lesson: https://youtu.be/gvY4FgMjZUE?si=_8VXysvehyvjUG7L

Here we'll use the `unstructured-client` [library](https://docs.unstructured.io/api-reference/api-services/sdk-python). For production or more specialized use cases, however, the authors recommend the `unstructured-ingest` library instead; we'll note that for future exploration.

In [36]:
# Pull the API credentials from the environment, then build the SDK client.
unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
unstructured_api_url = os.getenv("UNSTRUCTURED_API_URL")

client = unstructured_client.UnstructuredClient(
    api_key_auth=unstructured_api_key,
    server_url=unstructured_api_url,
)

In [37]:
filename = "../data/gpt4all.pdf"

# Read the PDF inside a context manager so the file handle is always closed;
# the request carries the raw bytes rather than an open file object.
with open(filename, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()

req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=shared.Files(
            content=pdf_bytes,
            file_name=filename,
        ),
        strategy=shared.Strategy.HI_RES,
        languages=['eng'],
        split_pdf_page=True,            # If True, splits the PDF file into smaller chunks of pages.
        split_pdf_allow_failed=True,    # If True, the partitioning continues even if some pages fail.
        split_pdf_concurrency_level=15  # Set the number of concurrent requests to the maximum value: 15.
    ),
)

try:
    res = client.general.partition(request=req)
except Exception as e:
    # Show the error, then re-raise: later cells depend on `element_dicts`,
    # so continuing would either fail with a NameError or silently reuse
    # stale results from a previous run.
    print(e)
    raise

# `res.elements` may be None when nothing was returned; normalise to a list.
element_dicts = list(res.elements or [])

# Print the processed data's first element only.
if element_dicts:
    print(element_dicts[0])
else:
    print(f"No elements were returned for {filename}")

INFO: HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


{'type': 'Title', 'element_id': '35611b6c4191eb4f3163beed2e924f73', 'text': 'GPT4All: An Ecosystem of Open Source Compressed Language Models', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'gpt4all.pdf'}}


In [39]:
# Tally how many elements of each 'type' the API returned.
category_counts = {}

for element in element_dicts:
    category = element['type']
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct element types seen in this document.
unique_categories = set(category_counts)
category_counts

{'Title': 17,
 'NarrativeText': 80,
 'Image': 5,
 'Table': 1,
 'UncategorizedText': 3}

### 2.0 Table Extraction from PDF (Non-API)

Lesson: https://youtu.be/m_3q3XnLlTI?si=FlQjVgCDfc0mAzXO

In [22]:
filename = "../data/llama3technicalreport.pdf"

# Partition the PDF locally (no API call) and chunk the result in one pass.
pdf_elements = partition_pdf(
    filename=filename,
    # Layout detection
    strategy="hi_res",
    hi_res_model_name="yolox",
    # Table / image handling
    infer_table_structure=True,
    extract_images_in_pdf=False,
    # Chunking: https://docs.unstructured.io/api-reference/api-services/chunking
    chunking_strategy="by_title",
    max_characters=3000,
    combine_text_under_n_chars=200,
)

In [23]:
# Inspect the first chunk as a plain dict (type, element_id, text, ...).
pdf_elements[0].to_dict()

{'type': 'CompositeElement',
 'element_id': 'b88de6484c5b9e42df1a145f60d240b8',
 'text': '4\n\n2024\n\n2\n\n0\n\n2 v o N 3 2 ] I A . s c [ 3 v 3 8 7 1 2 . 7 0 4 2 :\n\nv\n\narXiv\n\ni\n\nX\n\nr\n\na\n\n© Meta\n\nThe Llama 3 Herd of Models\n\nLlama Team, AI @ Meta1\n\n1A detailed contributor list can be found in the appendix of this paper.\n\nModern artificial intelligence (AI) systems are powered by foundation models. This paper presents a new set of foundation models, called Llama 3. It is a herd of language models that natively support multilinguality, coding, reasoning, and tool usage. Our largest model is a dense Transformer with 405B parameters and a context window of up to 128K tokens. This paper presents an extensive empirical evaluation of Llama 3. We find that Llama 3 delivers comparable quality to leading language models such as GPT-4 on a plethora of tasks. We publicly release Llama 3, including pre-trained and post-trained versions of the 405B parameter language model and o

In [24]:
# Tally the chunks by their Python class name (as a string).
category_counts = {}

for element in pdf_elements:
    category = str(type(element))
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct element classes produced by the chunked partition.
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 194,
 "<class 'unstructured.documents.elements.Table'>": 1}

In [27]:
# Keep only the elements whose category is "Table".
tables = [el for el in pdf_elements if el.category == "Table"]
tables

[<unstructured.documents.elements.Table at 0x7f157189f790>]

In [28]:
# The first table's metadata carries its content as an HTML <table> string.
table_html = tables[0].metadata.text_as_html
table_html

'<table><tr><td/><td>Contam.</td><td>Performance gain est</td></tr><tr><td/><td/><td>8B</td><td>70B</td><td>405B</td></tr><tr><td>AGIEval</td><td>98</td><td>8.5</td><td>19.9</td><td>16.3</td></tr><tr><td>BIG-Bench Hard</td><td>95</td><td>26.0</td><td>36.0</td><td>41.0</td></tr><tr><td>BoolQ</td><td>96</td><td>4.0</td><td>47</td><td>3.9</td></tr><tr><td>CommonSenseQA</td><td>30</td><td>0.1</td><td>0.8</td><td>0.6</td></tr><tr><td>DROP</td><td/><td/><td/><td/></tr><tr><td>GSM8k</td><td>41</td><td>0.0</td><td>0.1</td><td>1.3</td></tr><tr><td>HellaSwag</td><td>85</td><td>14.8</td><td>14.8</td><td>14.3</td></tr><tr><td>HumanEval</td><td/><td/><td/><td/></tr><tr><td>MATH</td><td>1</td><td>0.0</td><td>-0.1</td><td>-0.2</td></tr><tr><td>MBPP</td><td/><td/><td/><td/></tr><tr><td>MMLU</td><td/><td/><td/><td/></tr><tr><td>MMLU-Pro</td><td/><td/><td/><td/></tr><tr><td>NaturalQuestions</td><td>52</td><td>16</td><td>0.9</td><td>0.8</td></tr><tr><td>OpenBookQA</td><td>21</td><td>3.0</td><td>3.3</td><

In [29]:
# Convert every partitioned element into a LangChain Document, carrying the
# element's metadata along and recording the originating file as "source".
documents = []
for element in pdf_elements:
    metadata = element.metadata.to_dict()
    # Drop the list-valued "languages" entry (presumably because downstream
    # stores expect scalar metadata values -- confirm). pop() with a default
    # avoids a KeyError for elements that carry no language information.
    metadata.pop("languages", None)
    # Fall back to the input file's base name if the element has no filename.
    metadata["source"] = metadata.get("filename", os.path.basename(filename))
    documents.append(Document(page_content=element.text, metadata=metadata))
    

In [30]:
# Sanity check: there should be one Document per partitioned element.
len(documents)

195

In [31]:
# Inspect the first Document, including the metadata copied from its element.
documents[0]

Document(metadata={'filetype': 'application/pdf', 'last_modified': '2024-12-22T19:47:51', 'page_number': 1, 'orig_elements': 'eJzNWG1v3LgR/ivEfuoBpkJSlEj6U1MUuEvb3AWoizvANQy+jLxCtJIqUX5pcP+9Q2nX2WTtqxNgjf1gr4acIYfPPDND6fLTChrYQBuv67A6JysjczAaClpI5qisTEmd0RX+88KEUlkd2OqMrDYQbbDRos2nle+6IdStjTDOcmMfuiler6G+WUccEYIxtNkO39UhrnGUq3m07+o2JrvLS1lk8owURmTq6oxsxVKwrEgiZywzh/KijgOr8WGMsEmn+FDfQ/PP3npY/Y4TVd1AfOhhnvrwfjX70t5M9mZ2+HIF7c3qah4d4/WmC3VVwwyHYEJSLqgQF9ycS3Ve8GTdo+V1O20cDOkgaY8I9+moK5nmd5v9q/UIyk031P+FcJE0UPUAcla44LikebCKSm9z6oL3FLixwAPDINijQV7wjCUM84wlTBcRtRZR6yfERflkAE8W34w5V85XygGtHCuQ5lpQo0pPg2WssJgCBviRaf7I460odZbv0/xreVY/HdS/GXJnc8EBSuoBsLKYylCtraRgK1YUzJbW6mNDvsN0K6oy4/sQfy3P6icDOftmyAXLZVGpQEuZCyqVCFRXSlNWes6UDo7Zo7O8MJn4DDmXUmTlHsYHA4vBH4IeIIKPdddee4R1vO6HzqEay6Q2uXjtRCC3pCM/k5wIckXekbckIyPx5BJHbvFPE0U4zmX4y4jEp/P9OP4ENuCiTwSPFcFjvw1UMQ9UFgpLVFloCsoJ5w3jPD92vnApzJIRO7koM/1F9A4GZouTyZnbfawv6tjAk93A5MFjIKhVwlDJvMU8ySWVsmIMtHYg3fE6sEhNdQZuacGLXBT5ftM9kBf9kwHaDr/VLwNbc4dtwOKVUoZ03a