In [1]:
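# %pip install -U "pdfservices-sdk<4"  # NOTE(review): the imports below use the `operation.pdfops` API; newer major versions may not provide it - confirm the pin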

In [2]:
# Load key/value pairs from a local .env file into the process environment.
# The credentials cell below reads PDF_SERVICES_CLIENT_ID and
# PDF_SERVICES_CLIENT_SECRET from the environment via os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os
import re

import time 
import glob
import json
import shutil
from tqdm.notebook import tqdm
from zipfile import ZipFile


In [4]:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.table_structure_type import TableStructureType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation

In [5]:
# Initial setup: build the credentials instance from environment variables.
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty, so that a missing
            credential is reported here rather than as an SDK error later.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; define it in .env or "
            "in the shell before running this notebook."
        )
    return value


credentials = (
    Credentials.service_principal_credentials_builder()
    .with_client_id(_require_env('PDF_SERVICES_CLIENT_ID'))
    .with_client_secret(_require_env('PDF_SERVICES_CLIENT_SECRET'))
    .build()
)

# Create an ExecutionContext using credentials and create a new operation instance.
execution_context = ExecutionContext.create(credentials)
extract_pdf_operation = ExtractPDFOperation.create_new()

In [6]:
# Collect every PDF that sits directly inside the data directory, sorted by path,
# then show the list and how many files were found.
datadir = "data"
filenames = glob.glob(os.path.join(datadir, "*.pdf"))
filenames.sort()
display(filenames, len(filenames))

['data/Corona-Figueroa et al. - 2022 - MedNeRF Medical Neural Radiance Fields for Recons.pdf',
 'data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.pdf',
 'data/Ge et al. - 2022 - X-CTRSNet 3D cervical vertebra CT reconstruction .pdf',
 'data/Jiang et al. - 2021 - Reconstruction of 3D CT from A Single X-ray Projec.pdf',
 'data/Lin et al. - 2021 - BARF Bundle-Adjusting Neural Radiance Fields.pdf',
 'data/Loyen et al. - 2023 - Patient-specific three-dimensional image reconstru.pdf',
 'data/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.pdf',
 'data/Muller et al. - 2022 - Instant neural graphics primitives with a multires.pdf',
 'data/Ratul et al. - 2021 - CCX-rayNet A Class Conditioned Convolutional Neur.pdf',
 'data/Shen et al. - 2019 - Harnessing the power of deep learning for volumetr.pdf',
 'data/Shen et al. - 2019 - Patient-specific reconstruction of volumetric comp.pdf',
 'data/Shen et al. - 2022 - Novel-view X-ray pr

21

In [7]:
def extract_text_from_pdf(filename):
    """Run the Adobe Extract PDF operation (text elements only) on one PDF.

    The service result is saved as a zip archive next to the PDF (same path
    with the final extension replaced by ``.zip``), and the archive's
    ``structuredData.json`` member is written next to the PDF with a
    ``.json`` extension.

    Any exception is reported on stdout and swallowed, so one failing
    document does not abort a batch loop over many files.

    Relies on the notebook-level ``execution_context``.

    Args:
        filename (str): Path to the source PDF.

    Returns:
        None
    """
    # splitext only replaces the final extension; str.replace(".pdf", ...)
    # would rewrite every ".pdf" occurrence in the path and leave ".PDF"
    # untouched (making the zip path equal to the source path).
    stem, _ = os.path.splitext(filename)
    zip_path = stem + ".zip"
    json_path = stem + ".json"

    try:
        print(filename)
        # Pause between submissions (kept from the original; presumably a
        # simple throttle for the remote service).
        time.sleep(1)

        # Use a fresh operation per document instead of mutating the shared
        # notebook-level operation object on every call.
        operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(filename)
        operation.set_input(source)

        options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_element_to_extract(ExtractElementType.TEXT) \
            .build()
        operation.set_options(options)

        # Execute the operation.
        result: FileRef = operation.execute(execution_context)

        # Save the result archive next to the PDF.
        result.save_as(zip_path)

        # Read the JSON member straight out of the archive instead of
        # unpacking everything into the current working directory and moving
        # a fixed-name file afterwards.
        with ZipFile(zip_path, 'r') as archive:
            structured_data = archive.read('structuredData.json')
        with open(json_path, 'wb') as out:
            out.write(structured_data)
    except Exception as e:
        # Best-effort batch behaviour: report which file failed and carry on.
        print(f"Failed to extract {filename}: {e!r}")

In [8]:
# Run the PDF extraction for every path currently in `filenames`, with a
# notebook progress bar. extract_text_from_pdf prints each path and catches
# its own exceptions, so a failing file does not stop the loop.
for filename in tqdm(filenames):
    extract_text_from_pdf(filename)

  0%|          | 0/21 [00:00<?, ?it/s]

data/Corona-Figueroa et al. - 2022 - MedNeRF Medical Neural Radiance Fields for Recons.pdf
data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.pdf
data/Ge et al. - 2022 - X-CTRSNet 3D cervical vertebra CT reconstruction .pdf
data/Jiang et al. - 2021 - Reconstruction of 3D CT from A Single X-ray Projec.pdf
data/Lin et al. - 2021 - BARF Bundle-Adjusting Neural Radiance Fields.pdf
data/Loyen et al. - 2023 - Patient-specific three-dimensional image reconstru.pdf
data/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.pdf
data/Muller et al. - 2022 - Instant neural graphics primitives with a multires.pdf
data/Ratul et al. - 2021 - CCX-rayNet A Class Conditioned Convolutional Neur.pdf
data/Shen et al. - 2019 - Harnessing the power of deep learning for volumetr.pdf
data/Shen et al. - 2019 - Patient-specific reconstruction of volumetric comp.pdf
data/Shen et al. - 2022 - Novel-view X-ray projection synthesis through geom.pdf
data/Sun e

In [9]:
# Collect the JSON files in the data directory, sorted by path, then show the
# list and its length.
# NOTE: this rebinds `filenames` (previously the PDF list); re-running the PDF
# extraction loop after this cell would iterate over JSON paths instead.
datadir = "data"
filenames = glob.glob(os.path.join(datadir, "*.json"))
filenames.sort()
display(filenames, len(filenames))

['data/Corona-Figueroa et al. - 2022 - MedNeRF Medical Neural Radiance Fields for Recons.json',
 'data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.json',
 'data/Ge et al. - 2022 - X-CTRSNet 3D cervical vertebra CT reconstruction .json',
 'data/Jiang et al. - 2021 - Reconstruction of 3D CT from A Single X-ray Projec.json',
 'data/Lin et al. - 2021 - BARF Bundle-Adjusting Neural Radiance Fields.json',
 'data/Loyen et al. - 2023 - Patient-specific three-dimensional image reconstru.json',
 'data/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.json',
 'data/Muller et al. - 2022 - Instant neural graphics primitives with a multires.json',
 'data/Ratul et al. - 2021 - CCX-rayNet A Class Conditioned Convolutional Neur.json',
 'data/Shen et al. - 2019 - Harnessing the power of deep learning for volumetr.json',
 'data/Shen et al. - 2019 - Patient-specific reconstruction of volumetric comp.json',
 'data/Shen et al. - 2022 - Novel-vi

20

In [10]:
# Title, images, headers, paragraphs, lists, tables
# JSON, PNG, CSV

In [29]:
def extract_text_from_json(filename, include_paragraphs=False):
    """Write the title and heading text of an Adobe Extract JSON file to a .txt file.

    Reads ``filename``, walks its ``"elements"`` list and keeps the ``"Text"``
    of every element whose ``"Path"`` starts with ``//Document/Title`` or
    ``//Document/H``. Each kept text gets a trailing newline, the pieces are
    joined with newlines, non-ASCII characters are dropped, and the result is
    written next to the input with the final extension replaced by ``.txt``.

    Elements without a ``"Text"`` or ``"Path"`` key are skipped. Any other
    error is reported on stdout and swallowed so a batch loop keeps going.

    Args:
        filename (str): Path to the Adobe Extract JSON file.
        include_paragraphs (bool): When True, also keep elements whose path
            starts with ``//Document/P``. Defaults to False (paragraphs are
            skipped).

    Returns:
        None
    """
    wanted_prefixes = ("//Document/Title", "//Document/H")
    if include_paragraphs:
        wanted_prefixes += ("//Document/P",)

    try:
        print(filename)

        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        texts = []
        for item in data["elements"]:
            path = item.get("Path", "")
            content = item.get("Text")
            # Skip elements that carry no text instead of letting a KeyError
            # abort the whole file.
            if content is None or not path.startswith(wanted_prefixes):
                continue
            texts.append(content + "\n")

        text = "\n".join(texts)
        # Keep ASCII characters only.
        text = "".join(c for c in text if ord(c) < 128)

        # splitext replaces only the final extension, unlike
        # str.replace(".json", ...), which rewrites every occurrence.
        output_path = os.path.splitext(filename)[0] + ".txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except Exception as e:
        # Best-effort batch behaviour: report which file failed and why.
        print(f"Failed to process {filename}: {e!r}")

In [30]:
# Convert Adobe Extract JSON output to plain-text files.
# NOTE(review): the [1:2] slice limits this run to the second entry of
# `filenames` only - this looks like a debugging leftover; confirm, and drop
# the slice to process every JSON file.
for filename in tqdm(filenames[1:2]):
    extract_text_from_json(filename)

  0%|          | 0/1 [00:00<?, ?it/s]

data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.json
