In [78]:
from llmsherpa.readers import LayoutPDFReader


llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
pdf = pdf_reader.read_pdf("../../data/apple/AAPL.pdf")

In [79]:
documents = pdf.chunks()

for doc in documents:
    
    print(doc.to_html())
    print("---------------------------------------------------")


<p>UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K</p>
---------------------------------------------------
<p>☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</p>
---------------------------------------------------
<p>For the fiscal year ended September 24, 2022 or</p>
---------------------------------------------------
<li>☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</li>
---------------------------------------------------
<p>For the transition period from to.
Commission File Number: 001-36743 (Exact name of Registrant as specified in its charter) Apple Inc.</p>
---------------------------------------------------
<table><th><td colSpan=1>California</td><td colSpan=1>94-2404110</td></th><tr><td colSpan=1>(State or other jurisdiction of incorporation or organization)</td><td colSpan=1>(I.R.S. Employer Identification No.)</td></tr><tr><td colSpan=1>One Apple Park Way Cupe

In [80]:
sections = pdf.sections()

for section in sections:
        print(section.to_html(include_children=True, recurse=True))
        print("---------------------------------------------------")

<h2>(Mark One)</h2><p>☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</p><p>For the fiscal year ended September 24, 2022 or</p><li>☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</li><p>For the transition period from to.
Commission File Number: 001-36743 (Exact name of Registrant as specified in its charter) Apple Inc.</p><table><th><td colSpan=1>California</td><td colSpan=1>94-2404110</td></th><tr><td colSpan=1>(State or other jurisdiction of incorporation or organization)</td><td colSpan=1>(I.R.S. Employer Identification No.)</td></tr><tr><td colSpan=1>One Apple Park Way Cupertino, California</td><td colSpan=1>95014</td></tr><tr><td colSpan=1>(Address of principal executive offices)</td><td colSpan=1>(Zip Code)</td></tr></table><p>(Registrant’s telephone number, including area code) (408) 996-1010 Securities registered pursuant to Section 12(b) of the Act:</p><table><th><td colSpan=1>Title of each class

In [81]:
from bs4 import BeautifulSoup
def get_tables():
    str_tables = pdf.tables()
    str_tables = [table.to_html() for table in str_tables]
    if str_tables:
        return str_tables
    else:
        documents = pdf.chunks()
        html_docs = []
        for doc in documents:
            html_docs.append(doc.to_html())

        def contains_table(html):
            soup = BeautifulSoup(html, "html.parser")
            return bool(soup.find("table"))

        # Identify documents that contain tables
        str_tables = [html for html in html_docs if contains_table(html)]
        return str_tables

In [82]:
from bs4 import BeautifulSoup

def extract_items_table():
    all_tables = get_tables()
    items_table = None
    for table in all_tables:
        soup = BeautifulSoup(table, 'html.parser')
        if soup.find('td', string='Risk Factors'):
            items_table = table
        if items_table:
            return items_table


In [83]:
items_table = extract_items_table()
items_table

'<table><th><td colSpan=1>Item 1.</td><td colSpan=1>Business</td><td colSpan=1>1</td></th><tr><td colSpan=1>Item 1A.</td><td colSpan=1>Risk Factors</td><td colSpan=1>5</td></tr><tr><td colSpan=1>Item 1B.</td><td colSpan=1>Unresolved Staff Comments</td><td colSpan=1>17</td></tr><tr><td colSpan=1>Item 2.</td><td colSpan=1>Properties</td><td colSpan=1>17</td></tr><tr><td colSpan=1>Item 3.</td><td colSpan=1>Legal Proceedings</td><td colSpan=1>17</td></tr><tr><td colSpan=1>Item 4.</td><td colSpan=1>Mine Safety Disclosures</td><td colSpan=1>17</td></tr><tr><td>Part II</td></tr><tr><td colSpan=1>Item 5.</td><td colSpan=1>Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity</td><td colSpan=1></td></tr><tr><td>Securities</td></tr><tr><td>18</td></tr><tr><td colSpan=1>Item 6.</td><td colSpan=1>[Reserved]</td><td colSpan=1>19</td></tr><tr><td colSpan=1>Item 7.</td><td colSpan=1>Management’s Discussion and Analysis of Financial Condition and Results of 

In [84]:
from bs4 import BeautifulSoup


def extract_items(items_table):
    soup = BeautifulSoup(items_table, 'html.parser')
    table_rows = soup.find_all('tr')

    items = []

    for row in table_rows:
        row_content = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if len(row_content) > 1:
            row_content = f"{row_content[0]} {row_content[1]}"
            items.append(row_content)

    return items

items = extract_items(items_table)
items

['Item 1A. Risk Factors',
 'Item 1B. Unresolved Staff Comments',
 'Item 2. Properties',
 'Item 3. Legal Proceedings',
 'Item 4. Mine Safety Disclosures',
 'Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity',
 'Item 6. [Reserved]',
 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Item 7A. Quantitative and Qualitative Disclosures About Market Risk',
 'Item 8. Financial Statements and Supplementary Data',
 'Item 9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure',
 'Item 9A. Controls and Procedures',
 'Item 9B. Other Information',
 'Item 9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections',
 'Item 10. Directors, Executive Officers and Corporate Governance',
 'Item 11. Executive Compensation',
 'Item 12. Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters',
 'Item 13. Certain Relationsh

In [85]:
from langchain.text_splitter import HTMLHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.text_splitter import SentenceSplitter


def get_documents(pdf):
    headers = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3")
    ]

    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers, return_each_element=True)
    documents = []
    
    sections = pdf.sections()
    for section in sections:
        content = section.to_html(include_children=True, recurse=True)
        splits = splitter.split_text(content)
        splits
        documents.extend(splits)
    
    return documents

In [86]:
documents = get_documents(pdf)
print(len(documents))
documents

1248


[Document(page_content='☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='For the fiscal year ended September 24, 2022 or', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='For the transition period from to. Commission File Number: 001-36743 (Exact name of Registrant as specified in its charter) Apple Inc.', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='(Registrant’s telephone number, including area code) (408) 996-1010 Securities registered pursuant to Section 12(b) of the Act:', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.', metadata={'Header 2': '(Mark One)', 'Header 3': 'Securities registered pursuant to Section 12(g) of the Act: None'}),
 Document(page_content='Indicate by check mark if the Registrant is not required t

In [87]:
def extract_specific_tags(html_doc):
    """
    Extract specific tags (header and list item tags) from the HTML document.
    Return a list of these tags in their HTML representation.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    extracted_tags = []

    # Extract header tags (h1, h2, ..., h6)
    for header_tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        extracted_tags.append(str(header_tag))
    
    # Extract list item tags (li)
    for li_tag in soup.find_all('li'):
        extracted_tags.append(str(li_tag))

    return extracted_tags

In [88]:

def get_mapped_sections(pdf):
    all_headers = []

    sections = pdf.sections()
    for header in sections:
        all_headers.append(header.to_html(recurse=True))

    def strip_html_tags(html):
        return BeautifulSoup(html, "html.parser").get_text()

    mapped_sections = {}
    current_item = None
    for header in all_headers:
        text = strip_html_tags(header)
        if 'Item' in text or 'ITEM' in text:
            current_item = text
        else:
            if current_item:
                mapped_sections[text] = current_item

    return mapped_sections

mapped_sections = get_mapped_sections(pdf)
mapped_sections

{'Fiscal Year Highlights': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Fiscal 2022 Highlights': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'First Quarter 2022:': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Second Quarter 2022:': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Third Quarter 2022:': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Fourth Quarter 2022:': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'COVID-19': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Products and Services Performance': 'Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'Mac': 'Item 7. Mana

In [89]:

def add_item_metadata(documents, mapped_sections):
    final_documents = []
    for doc in documents:
        
        if 'Header 1' in doc.metadata:
            header1 = doc.metadata['Header 1']
            if header1 in mapped_sections:
                print(f"{header1} : {mapped_sections[header1]}")
                doc.metadata['Item'] = mapped_sections[header1]
        elif 'Header 2' in doc.metadata:
            header2 = doc.metadata['Header 2']
            if header2 in mapped_sections:
                print(f"{header2}: {mapped_sections[header2]}")
                doc.metadata['Item'] = mapped_sections[header2]
        elif 'Header 3' in doc.metadata:
            header3 = doc.metadata['Header 3']
            if header3 in mapped_sections:
                print(f"{header3}: {mapped_sections[header3]}")
                doc.metadata['Item'] = mapped_sections[header3]

        final_documents.append(doc)
    return final_documents

final_documents = add_item_metadata(documents, mapped_sections)
final_documents
            

Fiscal Year Highlights : Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal Year Highlights : Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal Year Highlights : Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal Year Highlights : Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal Year Highlights : Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal 2022 Highlights: Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal 2022 Highlights: Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fiscal 2022 Highlights: Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Fourth Quarter 2022:: Item 7. Management’s Discussi

[Document(page_content='☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='For the fiscal year ended September 24, 2022 or', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='For the transition period from to. Commission File Number: 001-36743 (Exact name of Registrant as specified in its charter) Apple Inc.', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='(Registrant’s telephone number, including area code) (408) 996-1010 Securities registered pursuant to Section 12(b) of the Act:', metadata={'Header 2': '(Mark One)'}),
 Document(page_content='Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.', metadata={'Header 2': '(Mark One)', 'Header 3': 'Securities registered pursuant to Section 12(g) of the Act: None'}),
 Document(page_content='Indicate by check mark if the Registrant is not required t

In [90]:
for doc in final_documents:
    print(f"Metadata: {doc.metadata}")
    print(doc.page_content)
    print("-"*50)

Metadata: {'Header 2': '(Mark One)'}
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
--------------------------------------------------
Metadata: {'Header 2': '(Mark One)'}
For the fiscal year ended September 24, 2022 or
--------------------------------------------------
Metadata: {'Header 2': '(Mark One)'}
For the transition period from to. Commission File Number: 001-36743 (Exact name of Registrant as specified in its charter) Apple Inc.
--------------------------------------------------
Metadata: {'Header 2': '(Mark One)'}
(Registrant’s telephone number, including area code) (408) 996-1010 Securities registered pursuant to Section 12(b) of the Act:
--------------------------------------------------
Metadata: {'Header 2': '(Mark One)', 'Header 3': 'Securities registered pursuant to Section 12(g) of the Act: None'}
Indicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.
-------------

In [91]:
import pandas as pd


doc_len = []
for doc in final_documents:
    doc_len.append(len(doc.page_content.split()))

doc_len = pd.Series(doc_len)
doc_len.describe()

count    1248.000000
mean       72.069712
std        64.572115
min         1.000000
25%        27.000000
50%        56.500000
75%        96.000000
max       533.000000
dtype: float64