In [5]:
"""
 Copyright 2024 Adobe
 All Rights Reserved.

 NOTICE: Adobe permits you to use, modify, and distribute this file in
 accordance with the terms of the Adobe license agreement accompanying it.
"""

import logging
import os
from datetime import datetime

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult

# Initialize the logger
logging.basicConfig(level=logging.INFO)

# load_dotenv() (python-dotenv) loads variables from a .env file, when one is
# found, into the process environment so os.getenv can read them below.
from dotenv import load_dotenv
load_dotenv()

# Read the Adobe PDF Services credentials from the environment.
client_id = os.getenv("ADOBE-client-id")
client_secret = os.getenv("ADOBE-client-secret")

# os.getenv returns None for an unset variable. Stop here, naming the missing
# variable, instead of letting None travel into the credentials object that a
# later cell builds from these two values.
_missing_credentials = [
    name
    for name, value in (
        ("ADOBE-client-id", client_id),
        ("ADOBE-client-secret", client_secret),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Define them in the environment or in a .env file."
    )


In [6]:

# Read the whole input PDF into memory. The context manager closes the file
# handle even when read() raises, which the manual open()/close() pair did not.
# Note: despite its name, input_stream holds the file's bytes, not a stream.
with open('extractPdfInput.pdf', 'rb') as pdf_file:
    input_stream = pdf_file.read()

# Initial setup: create the credentials instance from the values read from
# the environment in the configuration cell
credentials = ServicePrincipalCredentials(
    client_id=client_id,
    client_secret=client_secret
)

# Create a PDF Services instance
pdf_services = PDFServices(credentials=credentials)

# Upload the source PDF bytes and get back an asset handle for the job
input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

# Create parameters for the job: only text elements are requested
extract_pdf_params = ExtractPDFParams(
    elements_to_extract=[ExtractElementType.TEXT],
)

# Create a new job instance
extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

# Submit the job, then fetch the job result from the returned location
location = pdf_services.submit(extract_pdf_job)
pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

# Get content from the resulting asset; stream_asset is what later cells read
result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
stream_asset: StreamAsset = pdf_services.get_content(result_asset)

INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished uploading asset
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started submitting EXTRACT_PDF job
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished polling for status
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting job result
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Started getting content
INFO:adobe.pdfservices.operation.internal.pdf_services_helper:Finished getting content


In [7]:
# The payload printed here is raw bytes: the recorded output starts with the
# ZIP signature (b'PK\x03\x04') and names structuredData.json, so this is the
# result archive rather than readable text.
extracted_payload = stream_asset.get_input_stream()
print("Extracted text: ")
print(extracted_payload)

Extracted text: 
b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x16w\x14Y\x9f\xbbA\x11\xd4)\x00\x00\x11.\x01\x00\x13\x00\x00\x00structuredData.json\xed}[s\xdb\xc6\xb6\xe6_A\xf9\xc9\xaeQ\xe0\xbe_\xf6<\xf9\x92\x1c\xfb\x8cw\xa2\xb1\x9d\x9d]\xc9\xa4\\\x10\xd9\x12\xb1\r\x02<\x00hY\x93\xda\xff}\xd6\xea\x06H\x90\x84$\x10\xa23G\x15\xf8A\x96pm\xf4\xb7\xeek\xf5\xea?\x9e|qe\x95\x16\xf9\x93\xbfE\x7f<\xf9WU\xe4\x9f\xdc\xd7UQ\xd6\xf0\xf7\x13F\xf8\x93\xb3\xe8\xc9*\xb9r\x9f*w\xb5ty\x9d\xd4\xe1\xda\'\x12\xcfT\xb3\x85[&\xf8\'\x8diL\xfc\xa1\xba\\\xcf\xeau\xe9\x9a\xa3\xc4\xdap\xa2N.2xL\xf7\xb4|\xf2o8\xe1\xbe\xd6.\x9f\xbb\xf9\xa7\xa5\xab\x93yR\'~(o_\x7fJ\xf3\xaaN\xf2\x99\xbf\xf4\x15\x89^\x8b\xc8\xd8\xe8\x07\x12\xbd\xa0\xd1\xcb\x17\xd1K\x16Q\x1a\x91\x17\x11!\x91\xd2\x91\x90\x91y\x19\xa9\x97\xd1+\x151\x1e\xe1\x1b\xe1\x11+W.\x93\x1c\xc6\x8d\xcf\xe0"\x12p\x92\xe0\xb5\xe1\x17n".\xfd\x11\x15q\xe2\x7f\xa1\x11\xa7\xfe\x17\x86\xbf\x87_8\xf3\x17\xb7\xa78iNq\xd2\xdc\xc5\xe1v\xed\x7f\x11\xcd\x03\xe1\xc9\x825\xa7\xc2/\x82\xe3\xe

In [54]:
import zipfile
import io
import json

# get_input_stream() hands back the raw bytes of the result archive, so wrap
# them in an in-memory buffer that zipfile can read from.
input_stream = stream_asset.get_input_stream()
bytes_io = io.BytesIO(input_stream)

# Read the structuredData.json member out of the archive.
with zipfile.ZipFile(bytes_io, 'r') as zip_ref, zip_ref.open('structuredData.json') as f:
    content = f.read()

# Decode once and reuse the text for both the raw dump and the parse.
json_text = content.decode('utf-8')
print(json_text)  # the document exactly as stored in the archive
parsed = json.loads(json_text)
print(json.dumps(parsed, indent=4))  # the same document, pretty-printed

{"version": {"json_export": "203", "page_segmentation": "5", "schema": "1.1.0", "structure": "1.1099.0", "table_structure": "5"}, "extended_metadata": {"ID_instance": "C0 D4 89 F0 A1 BA B2 11 0A 00 67 45 8B 6B C6 23 ", "ID_permanent": "34 46 20 45 46 20 38 35 20 46 30 20 41 31 20 42 41 20 42 32 20 31 31 20 30 41 20 30 30 20 36 37 20 34 35 20 38 42 20 36 42 20 43 36 20 32 33 20 ", "has_acroform": false, "has_embedded_files": false, "is_XFA": false, "is_certified": false, "is_encrypted": false, "is_digitally_signed": false, "language": "EN-US", "page_count": 3, "pdf_version": "1.6", "pdfa_compliance_level": "", "pdfua_compliance_level": ""}, "elements": [{"Bounds": [44.8800048828125, 757.0929565429688, 244.90650939941406, 767.5725555419922], "Font": {"alt_family_name": "Clean", "embedded": true, "encoding": "WinAnsiEncoding", "family_name": "Adobe Clean", "font_type": "Type1", "italic": false, "monospaced": false, "name": "UVJLFE+AdobeClean-Regular", "subset": true, "weight": 400}, "HasC

In [55]:
# Per-element keys to drop so that only the remaining fields (such as Page and
# Text) are kept for each element.
keys_to_remove = ['Bounds', 'Font', "HasClip", "Lang", "ObjectID", "attributes", "Path", "TextSize"]

# Text value of an element that contains nothing but a list bullet.
bullet_only_text = "\u2022 "

# Single pass: keep only elements that carry a 'Text' value other than a bare
# bullet, and rebuild each kept element without the unwanted keys. This
# replaces the earlier strip-then-delete-by-index approach, which shifted the
# list on every deletion. Re-running the cell yields the same result.
parsed['elements'] = [
    {key: value for key, value in element.items() if key not in keys_to_remove}
    for element in parsed['elements']
    if 'Text' in element and element['Text'] != bullet_only_text
]

print(json.dumps(parsed, indent=4))

{
    "version": {
        "json_export": "203",
        "page_segmentation": "5",
        "schema": "1.1.0",
        "structure": "1.1099.0",
        "table_structure": "5"
    },
    "extended_metadata": {
        "ID_instance": "C0 D4 89 F0 A1 BA B2 11 0A 00 67 45 8B 6B C6 23 ",
        "ID_permanent": "34 46 20 45 46 20 38 35 20 46 30 20 41 31 20 42 41 20 42 32 20 31 31 20 30 41 20 30 30 20 36 37 20 34 35 20 38 42 20 36 42 20 43 36 20 32 33 20 ",
        "has_acroform": false,
        "has_embedded_files": false,
        "is_XFA": false,
        "is_certified": false,
        "is_encrypted": false,
        "is_digitally_signed": false,
        "language": "EN-US",
        "page_count": 3,
        "pdf_version": "1.6",
        "pdfa_compliance_level": "",
        "pdfua_compliance_level": ""
    },
    "elements": [
        {
            "Page": 0,
            "Text": "Adobe Vendor Security Review Program White Paper "
        },
        {
            "Page": 0,
            "Text": 