In [1]:
# ! pip install pdfservices-sdk -U
# ! pip install pylatexenc -U

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os
import re

import time 
import glob
import json
import shutil
from pylatexenc.latexencode import UnicodeToLatexEncoder
from tqdm.notebook import tqdm
from zipfile import ZipFile


In [4]:
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.table_structure_type import TableStructureType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation

In [5]:
# Build service-principal credentials from the client id / secret read from the environment.
credentials = (
    Credentials.service_principal_credentials_builder()
    .with_client_id(os.getenv('PDF_SERVICES_CLIENT_ID'))
    .with_client_secret(os.getenv('PDF_SERVICES_CLIENT_SECRET'))
    .build()
)

# The execution context wraps the credentials; the operation object is
# configured and executed later by the extraction helper.
execution_context = ExecutionContext.create(credentials)
extract_pdf_operation = ExtractPDFOperation.create_new()

In [6]:
# Collect every PDF that sits directly inside the data directory, sorted by path.
datadir = "data"
filenames = glob.glob(os.path.join(datadir, r"*.pdf"))
filenames.sort()
display(filenames, len(filenames))

['data/Corona-Figueroa et al. - 2022 - MedNeRF Medical Neural Radiance Fields for Recons.pdf',
 'data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.pdf',
 'data/Ge et al. - 2022 - X-CTRSNet 3D cervical vertebra CT reconstruction .pdf',
 'data/Jiang et al. - 2021 - Reconstruction of 3D CT from A Single X-ray Projec.pdf',
 'data/Lin et al. - 2021 - BARF Bundle-Adjusting Neural Radiance Fields.pdf',
 'data/Loyen et al. - 2023 - Patient-specific three-dimensional image reconstru.pdf',
 'data/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.pdf',
 'data/Muller et al. - 2022 - Instant neural graphics primitives with a multires.pdf',
 'data/Ratul et al. - 2021 - CCX-rayNet A Class Conditioned Convolutional Neur.pdf',
 'data/Shen et al. - 2019 - Harnessing the power of deep learning for volumetr.pdf',
 'data/Shen et al. - 2019 - Patient-specific reconstruction of volumetric comp.pdf',
 'data/Shen et al. - 2022 - Novel-view X-ray pr

20

In [7]:
def extract_text_from_pdf(filename):
    """Run the Adobe PDF Extract operation on one PDF and keep its structured JSON.

    The service result is saved as ``<stem>.zip`` next to the source file, and the
    ``structuredData.json`` member of that archive is moved to ``<stem>.json``.

    Args:
        filename (str): Path to the local PDF file to process.

    Returns:
        None. Failures are printed and not re-raised, so a batch loop carries on
        with the next file.
    """
    import tempfile  # function-scope import: not among the notebook's top-level imports

    try:
        print(filename)
        # NOTE(review): presumably spaces out consecutive service calls — confirm it is still needed.
        time.sleep(1)

        # Derive sibling paths from the real extension. str.replace(".pdf", ...) would
        # rewrite every occurrence in the path and, for e.g. "x.PDF", leave the path
        # unchanged so the zip would target the source PDF itself.
        stem = os.path.splitext(filename)[0]
        zip_path = stem + ".zip"
        json_path = stem + ".json"

        # Use a fresh operation per file instead of the shared module-level one.
        # NOTE(review): SDK operation objects appear to be single-use — confirm against the SDK docs.
        operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        operation.set_input(FileRef.create_from_local_file(filename))

        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_element_to_extract(ExtractElementType.TEXT) \
            .build()
        operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = operation.execute(execution_context)

        # Save the result to the specified location.
        result.save_as(zip_path)

        # Pull only structuredData.json, and do it in a scratch directory: extracting
        # everything into the working directory would overwrite any file already there
        # with the same name and leave other archive members behind.
        with tempfile.TemporaryDirectory() as scratch_dir:
            with ZipFile(zip_path, 'r') as archive:
                extracted = archive.extract('structuredData.json', scratch_dir)
            shutil.move(extracted, json_path)
    except Exception as e:
        # Best-effort batch processing: report which file failed and why, then continue.
        print(f"Failed to extract {filename}: {e!r}")

In [8]:
# for filename in tqdm(filenames):
#     extract_text_from_pdf(filename)

In [9]:
# Collect every JSON file that sits directly inside the data directory, sorted by path.
datadir = "data"
filenames = glob.glob(os.path.join(datadir, r"*.json"))
filenames.sort()
display(filenames, len(filenames))

['data/Corona-Figueroa et al. - 2022 - MedNeRF Medical Neural Radiance Fields for Recons.json',
 'data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.json',
 'data/Ge et al. - 2022 - X-CTRSNet 3D cervical vertebra CT reconstruction .json',
 'data/Jiang et al. - 2021 - Reconstruction of 3D CT from A Single X-ray Projec.json',
 'data/Lin et al. - 2021 - BARF Bundle-Adjusting Neural Radiance Fields.json',
 'data/Loyen et al. - 2023 - Patient-specific three-dimensional image reconstru.json',
 'data/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.json',
 'data/Muller et al. - 2022 - Instant neural graphics primitives with a multires.json',
 'data/Ratul et al. - 2021 - CCX-rayNet A Class Conditioned Convolutional Neur.json',
 'data/Shen et al. - 2019 - Harnessing the power of deep learning for volumetr.json',
 'data/Shen et al. - 2019 - Patient-specific reconstruction of volumetric comp.json',
 'data/Shen et al. - 2022 - Novel-vi

20

In [10]:
# Title, images, headers, paragraphs, lists, tables
# JSON, PNG, CSV

In [11]:
def extract_text_from_json(filename):
    """Convert an Adobe PDF Extract JSON file into a LaTeX-escaped text file.

    Reads ``filename``, collects the ``Text`` of every element whose ``Path``
    matches one of the patterns defined below, encodes the joined text with
    ``UnicodeToLatexEncoder`` and writes it next to the input as ``<stem>.txt``.

    Args:
        filename (str): Path to the Adobe Extract JSON file.

    Returns:
        None. The text is written to disk; errors are printed and not re-raised,
        so a batch loop carries on with the next file.
    """
    try:
        print(filename)

        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        data_elements = data['elements']
        # https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/howtos/extract-api/
        # ISO standard , a summary is included below for convenience :
        # Aside : Content which is not part of regular content flow of the document
        # Figure : Non-reflowable constructs like graphs, images, flowcharts
        # Footnote : FootNote
        # H, H1, H2, etc : Heading Level
        # L : List
        # Li : List Item
        # Lbl : List Item label
        # Lbody : List item body
        # P : Paragraph
        # ParagraphSpan : Denotes part of a paragraph. Reported when paragraph is broken (generally due to page break or column break)
        # Reference : Link
        # Sect : Logical section of the document
        # StyleSpan : Denotes difference in styling of text relative to the parent container
        # Sub : Single line of a multiline paragraph (e.g. addresses). Such paras are created in html using \<br> inside \<p> tags
        # Table : Table
        # TD : Table cell
        # TH : Table header cell
        # TR : Table row
        # Title : Title of the document. This is the most prominent heading which can define the whole document.
        # TOC : Table of contents
        # TOCI : Table of contents item
        # Watermark : Watermark

        # Define regex patterns for different key types.
        # NOTE(review): in a regex "/*" means "zero or more slashes", not a wildcard, so
        # these only match elements directly under //Document, and (except Title) only
        # when the path ends with an explicit "[n]" index. Confirm whether nested or
        # un-indexed paths are meant to be skipped before changing the patterns.
        key_patterns = {
            "Title": r"^//Document/*Title$",
            "Heading": r"^//Document/*H1\[\d+\]$",
            "List": r"^//Document/*L\[\d+\]$",
            "ListItem": r"^//Document/*Li\[\d+\]$",
            "Label": r"^//Document/*Lbl\[\d+\]$",
            "Lbody": r"^//Document/*Lbody\[\d+\]$",
            "P": r"^//Document/*P\[\d+\]$",
            "ParagraphSpan": r"^//Document/*ParagraphSpan\[\d+\]$",
            # "Sect": r"^//Document/*Sect\[\d+\]$",
            # "Sub": r"^//Document/*Sub\[\d+\]$",
            # "Table": r"^//Document/*Table\[\d+\]$",
            # "Figure": r"^//Document/*Figure\[\d+\]$",
            "Reference": r"^//Document/*Link\[\d+\]$"
        }

        # Text blocks in document order.
        combined_text = []

        # Consecutive ParagraphSpan fragments waiting to be merged into one block.
        pending_spans = []

        for element in data_elements:
            fragment = element.get('Text')
            if fragment is None:
                # An element without a 'Text' key has nothing to contribute; indexing it
                # directly raised KeyError and abandoned the whole file.
                continue

            path = element.get('Path', '')
            key_type = next(
                (name for name, pattern in key_patterns.items() if re.search(pattern, path)),
                None,
            )
            if key_type is None:
                continue

            if key_type == "ParagraphSpan":
                pending_spans.append(fragment)
                continue

            # A non-span element ends the current run of spans: emit the merged
            # paragraph first so document order is preserved. (Previously the flush
            # flag was reset on every element, so buffered span text was never written.)
            if pending_spans:
                combined_text.append(" ".join(pending_spans).strip())
                pending_spans = []
            combined_text.append(fragment + "\n")

        # Spans at the very end of the document have no following element to trigger a flush.
        if pending_spans:
            combined_text.append(" ".join(pending_spans).strip())

        text = "\n".join(combined_text)

        # Map non-ASCII characters to LaTeX; characters without a known
        # representation are handled according to the 'ignore' policy.
        text = UnicodeToLatexEncoder(unknown_char_policy='ignore').unicode_to_latex(text)

        # Swap only the real extension; str.replace('.json', ...) could rewrite other parts
        # of the path, or return the input path itself and overwrite the JSON.
        output_path = os.path.splitext(filename)[0] + '.txt'
        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write(text)

    except Exception as e:
        # Best-effort batch processing: report which file failed and why, then continue.
        print(f"Failed to convert {filename}: {e!r}")

In [12]:
# Convert every path in `filenames` (the JSON listing built earlier) to a .txt file,
# showing a notebook progress bar while the batch runs.
for filename in tqdm(filenames):
    extract_text_from_json(filename)

  0%|          | 0/20 [00:00<?, ?it/s]

data/Corona-Figueroa et al. - 2022 - MedNeRF Medical Neural Radiance Fields for Recons.json
data/Fridovich-Keil et al. - 2022 - Plenoxels Radiance Fields without Neural Networks.json


No known latex representation for character: U+21E4 - ‘⇤’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex

data/Ge et al. - 2022 - X-CTRSNet 3D cervical vertebra CT reconstruction .json
data/Jiang et al. - 2021 - Reconstruction of 3D CT from A Single X-ray Projec.json
data/Lin et al. - 2021 - BARF Bundle-Adjusting Neural Radiance Fields.json


No known latex representation for character: U+E03E - ‘’
No known latex representation for character: U+E03E - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’


data/Loyen et al. - 2023 - Patient-specific three-dimensional image reconstru.json


No known latex representation for character: U+FFFD - ‘�’
No known latex representation for character: U+FFFD - ‘�’
No known latex representation for character: U+FFFD - ‘�’


data/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.json


No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E00F - ‘’
No known latex representation for character: U+E03E - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E03E - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex representation for character: U+E030 - ‘’
No known latex

data/Muller et al. - 2022 - Instant neural graphics primitives with a multires.json


No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D709 - ‘𝜉’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D70B - ‘𝜋’
No known latex representation for character: U+1D70B - ‘𝜋’
No known latex representation for character: U+1D70B - ‘𝜋’
No known latex representation for character: U+1D70B - ‘𝜋’
No known latex representation for character: U+1D709 - ‘𝜉’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+E000 - ‘’
No known latex representation for character: U+E001 - ‘’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D703 - ‘𝜃’
No known latex representation for character: U+1D715 - ‘𝜕’

data/Ratul et al. - 2021 - CCX-rayNet A Class Conditioned Convolutional Neur.json


No known latex representation for character: U+E00C - ‘’


data/Shen et al. - 2019 - Harnessing the power of deep learning for volumetr.json
data/Shen et al. - 2019 - Patient-specific reconstruction of volumetric comp.json


No known latex representation for character: U+E001 - ‘’
No known latex representation for character: U+E001 - ‘’
No known latex representation for character: U+E001 - ‘’


data/Shen et al. - 2022 - Novel-view X-ray projection synthesis through geom.json
data/Sun et al. - 2022 - Direct Voxel Grid Optimization Super-fast Converg.json


No known latex representation for character: U+E071 - ‘’
No known latex representation for character: U+E064 - ‘’
No known latex representation for character: U+E06B - ‘’
No known latex representation for character: U+E06B - ‘’
No known latex representation for character: U+E065 - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E063 - ‘’


data/Tan et al. - 2022 - XctNet Reconstruction network of volumetric image.json


No known latex representation for character: U+FF0C - ‘，’


data/Tan et al. - 2023 - Semi-XctNet Volumetric images reconstruction netw.json


No known latex representation for character: U+FF1A - ‘：’
No known latex representation for character: U+FF0C - ‘，’


data/Tancik et al. - 2022 - Block-NeRF Scalable Large Scene Neural View Synth.json


No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+21E5 - ‘⇥’
No known latex representation for character: U+E022 - ‘’
No known latex representation for character: U+E021 - ‘’
No known latex representation for character: U+21E5 - ‘⇥’


data/Wang et al. - 2022 - Neural Rendering for Stereo 3D Reconstruction of D.json


No known latex representation for character: U+E06B - ‘’
No known latex representation for character: U+E06B - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E06B - ‘’
No known latex representation for character: U+E06B - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E00F - ‘’
No known latex representation for character: U+E062 - ‘’
No known latex representation for character: U+E062 - ‘’


data/Yen-Chen et al. - 2021 - iNeRF Inverting Neural Radiance Fields for Pose E.json
data/Ying et al. - 2019 - X2CT-GAN Reconstructing CT From Biplanar X-Rays W.json


No known latex representation for character: U+E001 - ‘’
No known latex representation for character: U+E004 - ‘’
No known latex representation for character: U+E002 - ‘’


data/Yu et al. - pixelNeRF Neural Radiance Fields From One or Few .json


No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E001 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex representation for character: U+E020 - ‘’
No known latex