In [175]:
from llmsherpa.readers import LayoutPDFReader


llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

company = "Meta"

if company == "Meta":
    path = "../../data/meta/meta.pdf"
elif company == "Apple":
    path = "../../data/apple/AAPL.pdf"
elif company == "Microsoft":
    path = "../../data/microsoft/MSFT.pdf"

pdf = pdf_reader.read_pdf(path)

In [176]:
documents = pdf.chunks()

for doc in documents:
    
    print(doc.to_html())
    print("---------------------------------------------------")


<p>UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-K</p>
---------------------------------------------------
<p>(Mark One) ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31, 2022 or ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to Commission File Number: 001-35551</p>
---------------------------------------------------
<p>(Exact name of registrant as specified in its charter) Meta Platforms, Inc.</p>
---------------------------------------------------
<p>Delaware</p>
---------------------------------------------------
<p>20-1665019</p>
---------------------------------------------------
<p>(Address of principal executive offices and Zip Code) 1601 Willow Road, Menlo Park, California 94025 (Registrant's telephone number, including area code) (650) 543-4800 Securities registered pursuant to Section 12(b) of 

In [177]:
sections = pdf.sections()

for section in sections:
        print(section.to_html(include_children=True, recurse=True))
        print("---------------------------------------------------")

<h2>(State or other jurisdiction of incorporation or organization)</h2><p>Delaware</p>
---------------------------------------------------
<h2>(I.R.S. Employer Identification Number)</h2><p>20-1665019</p><p>(Address of principal executive offices and Zip Code) 1601 Willow Road, Menlo Park, California 94025 (Registrant's telephone number, including area code) (650) 543-4800 Securities registered pursuant to Section 12(b) of the Act:</p><table><th><td colSpan=1>Title of each class</td><td colSpan=1>Trading symbol(s)</td><td colSpan=1>Name of each exchange on which registered</td></th><tr><td colSpan=1>Class A Common Stock, $0.000006 par value</td><td colSpan=1>META</td><td colSpan=1>The Nasdaq Stock Market LLC</td></tr></table><p>Securities registered pursuant to Section 12(g) of the Act: None</p><table><th><td colSpan=1>Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.</td><td colSpan=1>Yes ☒</td><td colSpan=1>No ☐</t

In [178]:
from bs4 import BeautifulSoup
def get_tables():
    str_tables = pdf.tables()
    str_tables = [table.to_html() for table in str_tables]
    if str_tables:
        return str_tables
    else:
        documents = pdf.chunks()
        html_docs = []
        for doc in documents:
            html_docs.append(doc.to_html())

        def contains_table(html):
            soup = BeautifulSoup(html, "html.parser")
            return bool(soup.find("table"))

        # Identify documents that contain tables
        str_tables = [html for html in html_docs if contains_table(html)]
        return str_tables

In [179]:
from bs4 import BeautifulSoup

def extract_items_table():
    all_tables = get_tables()
    items_table = None
    for table in all_tables:
        soup = BeautifulSoup(table, 'html.parser')
        if soup.find('td', string='Risk Factors'):
            items_table = table
        if items_table:
            return items_table


In [180]:
items_table = extract_items_table()
items_table

'<table><th><td colSpan=1>Item 1.</td><td colSpan=1>Business</td><td colSpan=1>7</td></th><tr><td colSpan=1>Item 1A.</td><td colSpan=1>Risk Factors</td><td colSpan=1>14</td></tr><tr><td colSpan=1>Item 1B.</td><td colSpan=1>Unresolved Staff Comments</td><td colSpan=1>48</td></tr><tr><td colSpan=1>Item 2.</td><td colSpan=1>Properties</td><td colSpan=1>48</td></tr><tr><td colSpan=1>Item 3.</td><td colSpan=1>Legal Proceedings</td><td colSpan=1>48</td></tr><tr><td colSpan=1>Item 4.</td><td colSpan=1>Mine Safety Disclosures</td><td colSpan=1>51</td></tr></table>'

In [181]:
from bs4 import BeautifulSoup


def extract_items(items_table):
    soup = BeautifulSoup(items_table, 'html.parser')
    table_rows = soup.find_all('tr')

    items = []

    for row in table_rows:
        row_content = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if len(row_content) > 1:
            row_content = f"{row_content[0]} {row_content[1]}"
            items.append(row_content)

    return items

items = extract_items(items_table)
items

['Item 1A. Risk Factors',
 'Item 1B. Unresolved Staff Comments',
 'Item 2. Properties',
 'Item 3. Legal Proceedings',
 'Item 4. Mine Safety Disclosures']

In [182]:
from langchain.text_splitter import HTMLHeaderTextSplitter, TokenTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.text_splitter import SentenceSplitter


def get_documents(pdf):
    headers = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3")
    ]

    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers)
    documents = []

    recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 100,
    is_separator_regex = False,
    )

    token_splitter = TokenTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 100,
        )
    
    sections = pdf.sections()
    for section in sections:
        content = section.to_html(include_children=True, recurse=True)
        splits = splitter.split_text(content)
        # splits = recursive_splitter.split_documents(splits)
        splits = token_splitter.split_documents(splits)
        documents.extend(splits)
    
    return documents

In [183]:
documents = get_documents(pdf)
print(len(documents))
documents

844


[Document(page_content='Delaware', metadata={'Header 2': '(State or other jurisdiction of incorporation or organization)'}),
 Document(page_content='20-1665019  \n(Address of principal executive offices and Zip Code) 1601 Willow Road, Menlo Park, California 94025 (Registrant\'s telephone number, including area code) (650) 543-4800 Securities registered pursuant to Section 12(b) of the Act:  \nSecurities registered pursuant to Section 12(g) of the Act: None  \nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 (Exchange Act) during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐  \nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation

In [184]:
def extract_specific_tags(html_doc):
    """
    Extract specific tags (header and list item tags) from the HTML document.
    Return a list of these tags in their HTML representation.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    extracted_tags = []

    # Extract header tags (h1, h2, ..., h6)
    for header_tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        extracted_tags.append(str(header_tag))
    
    # Extract list item tags (li)
    for li_tag in soup.find_all('li'):
        extracted_tags.append(str(li_tag))

    return extracted_tags

In [185]:

def get_mapped_sections(pdf):
    all_headers = []

    sections = pdf.sections()
    for header in sections:
        all_headers.append(header.to_html(recurse=True))

    def strip_html_tags(html):
        return BeautifulSoup(html, "html.parser").get_text()

    mapped_sections = {}
    current_item = None
    for header in all_headers:
        text = strip_html_tags(header)
        if 'Item' in text or 'ITEM' in text:
            current_item = text
        else:
            if current_item:
                mapped_sections[text] = current_item

    return mapped_sections

mapped_sections = get_mapped_sections(pdf)
mapped_sections

{'Family of Apps Products': 'Item 1. Business Overview',
 'Reality Labs Products': 'Item 1. Business Overview',
 'Competition': 'Item 1. Business Overview',
 'Technology': 'Item 1. Business Overview',
 'Sales and Operations': 'Item 1. Business Overview',
 'Marketing': 'Item 1. Business Overview',
 'Intellectual Property': 'Item 1. Business Overview',
 'Government Regulation': 'Item 1. Business Overview',
 'Human Capital': 'Item 1. Business Overview',
 'Employee Learning and Development': 'Item 1. Business Overview',
 'The Pulse of Our Workforce': 'Item 1. Business Overview',
 'Health and Well-being': 'Item 1. Business Overview',
 'Diversity, Equity and Inclusion': 'Item 1. Business Overview',
 'Compensation and Benefits': 'Item 1. Business Overview',
 'Corporate Information': 'Item 1. Business Overview',
 'Available Information': 'Item 1. Business Overview',
 'Summary Risk Factors': 'Item 1A. Risk Factors',
 'Risks Related to Our Product Offerings': 'Item 1A. Risk Factors',
 'Risks Rel

In [186]:

def add_item_metadata(documents, mapped_sections):
    final_documents = []
    for doc in documents:
        
        if 'Header 1' in doc.metadata:
            header1 = doc.metadata['Header 1']
            if header1 in mapped_sections:
                print(f"{header1} : {mapped_sections[header1]}")
                doc.metadata['Item'] = mapped_sections[header1]
        elif 'Header 2' in doc.metadata:
            header2 = doc.metadata['Header 2']
            if header2 in mapped_sections:
                print(f"{header2}: {mapped_sections[header2]}")
                doc.metadata['Item'] = mapped_sections[header2]
        elif 'Header 3' in doc.metadata:
            header3 = doc.metadata['Header 3']
            if header3 in mapped_sections:
                print(f"{header3}: {mapped_sections[header3]}")
                doc.metadata['Item'] = mapped_sections[header3]

        final_documents.append(doc)
    return final_documents

final_documents = add_item_metadata(documents, mapped_sections)
final_documents
            

PART II: Item 4. Mine Safety Disclosures
PART II: Item 4. Mine Safety Disclosures
PART II: Item 4. Mine Safety Disclosures
PART II: Item 4. Mine Safety Disclosures
Reality Labs Products: Item 1. Business Overview
Competition: Item 1. Business Overview
Technology: Item 1. Business Overview
Sales and Operations: Item 1. Business Overview
Marketing: Item 1. Business Overview
Intellectual Property: Item 1. Business Overview
Government Regulation: Item 1. Business Overview
Government Regulation: Item 1. Business Overview
Government Regulation: Item 1. Business Overview
Human Capital: Item 1. Business Overview
Human Capital: Item 1. Business Overview
Human Capital: Item 1. Business Overview
Human Capital: Item 1. Business Overview
Human Capital: Item 1. Business Overview
Human Capital: Item 1. Business Overview
Employee Learning and Development: Item 1. Business Overview
The Pulse of Our Workforce: Item 1. Business Overview
Health and Well-being: Item 1. Business Overview
Diversity, Equity a

[Document(page_content='Delaware', metadata={'Header 2': '(State or other jurisdiction of incorporation or organization)'}),
 Document(page_content='20-1665019  \n(Address of principal executive offices and Zip Code) 1601 Willow Road, Menlo Park, California 94025 (Registrant\'s telephone number, including area code) (650) 543-4800 Securities registered pursuant to Section 12(b) of the Act:  \nSecurities registered pursuant to Section 12(g) of the Act: None  \nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 (Exchange Act) during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐  \nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation

In [187]:
import pandas as pd


doc_len = []
for doc in final_documents:
    doc_len.append(len(doc.page_content.split()))

doc_len = pd.Series(doc_len)
doc_len.describe()

count    844.000000
mean     286.296209
std      292.809907
min        1.000000
25%       65.000000
50%      158.500000
75%      436.500000
max      892.000000
dtype: float64

In [188]:
import json
doc_json = []

for doc in final_documents:
    if len(doc.page_content.split()) < 30:
        continue
    d = {}
    d['content'] = doc.page_content
    d['metadata'] = doc.metadata
    doc_json.append(d)

with open(f"../data/chunks/{company}.json", "w") as f:
    json.dump(doc_json, f, indent=4)

In [189]:
tables = pdf.tables()

In [190]:
print(tables[15].to_context_text())

PART II > Results of Operations > Foreign Exchange Impact on Revenue
 |  | Year Ended December 31,
 | --- | ---
 | 2022 |  | 2021 | 2020 2022 vs 2021 % change |  | 2021 vs 2020 % change | 
 | --- | --- | --- | --- | --- | --- | ---
 | (in millions, except percentages)
 | Marketing and sales $ 15,262 |  | $ 14,043 | $ 11,591 | 9 % |  | 21 %
 | Percentage of revenue | 13 % | 12 % | 13 % |  |  | 

