# 1. PDF to text

Two readers:

1. PdfReader (from `pypdf`)
2. Document Intelligence

In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment (python-dotenv).
# Later cells read AZDOCINT_ENDPOINT and AZDOCINT_KEY from the environment via os.getenv.
load_dotenv()

True

In [2]:
import tiktoken
# Tokenizer used by token_size() below.
encoding = tiktoken.get_encoding("o200k_base")


def token_size(text):
    """Return the number of tokens `encoding` produces for `text`."""
    tokens = encoding.encode(text)
    return len(tokens)

In [3]:
# Path of the PDF that both readers in this notebook convert.
sample_file = "../examples/KFE_paper_sample.pdf"

## PdfReader (pypdf)

In [4]:
from pypdf import PdfReader

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [5]:
# Base name of the sample PDF without its directory or extension; used as the
# prefix of the output files written by this notebook.
# Path.stem removes only the final suffix, so a name such as "report.v2.pdf"
# keeps "report.v2" (splitting on the first "." would cut it down to "report").
from pathlib import Path

filename = Path(sample_file).stem
filename

'KFE_paper_sample'

In [6]:
# Extract the text of every page with pypdf and join the pages with a blank line.
# Extraction happens inside the `with` block, while the PDF file is still open.
with open(sample_file, "rb") as f:
    reader = PdfReader(f)
    page_texts = [page.extract_text() for page in reader.pages]
    text = "\n\n".join(page_texts)

# Write the extracted text to "<filename>_pdfreader.txt" in the working directory.
with open(f"{filename}_pdfreader.txt", mode="w", encoding="utf-8") as f:
    f.write(text)

In [7]:
# Token count of the pypdf-extracted text.
pdf_token_count = token_size(text)
print(f"token size: {pdf_token_count}")

token size: 5430


## Azure AI Services - Document Intelligence

Document Intelligence documentation:
- [Extract Layout](https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-preview&preserve-view=true#extract-layout)
- [Extract Figures from Documents](https://learn.microsoft.com/en-us/python/api/overview/azure/ai-documentintelligence-readme?view=azure-python-preview&preserve-view=true#extract-figures-from-documents)

In [8]:
import os

# Endpoint and key of the Azure AI Document Intelligence resource, taken from
# environment variables (load_dotenv() ran in the first cell).
AZDOCINT_ENDPOINT = os.getenv("AZDOCINT_ENDPOINT")
AZDOCINT_KEY = os.getenv("AZDOCINT_KEY")

# os.getenv returns None for an unset variable. Stop here with a readable
# message instead of passing None on to the client constructor in the next cell.
missing_settings = [
    name
    for name, value in (
        ("AZDOCINT_ENDPOINT", AZDOCINT_ENDPOINT),
        ("AZDOCINT_KEY", AZDOCINT_KEY),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_settings)
    )
    

In [9]:
import base64

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult

# Key-authenticated client for the Document Intelligence endpoint.
credential = AzureKeyCredential(AZDOCINT_KEY)
document_analysis_client = DocumentIntelligenceClient(
    endpoint=AZDOCINT_ENDPOINT,
    credential=credential,
)

In [10]:
print(f"converting `{sample_file}`...")
# Document Intelligence with a local file: the PDF bytes are sent inline,
# base64-encoded, under the "base64Source" key of the request.
with open(sample_file, "rb") as f:
    pdf_bytes = f.read()

analyze_request = {"base64Source": base64.b64encode(pdf_bytes).decode("utf-8")}

# Start the layout analysis; ask for figure output and Markdown-formatted content.
poller = document_analysis_client.begin_analyze_document(
    "prebuilt-layout",
    analyze_request,
    output=[AnalyzeOutputOption.FIGURES],
    output_content_format="markdown",
)


converting `../examples/KFE_paper_sample.pdf`...


In [11]:
# Wait for the analysis to finish, then measure the Markdown content it returned.
result: AnalyzeResult = poller.result()
md_content = result["content"]
md_token_count = token_size(md_content)
print(f"token size: {md_token_count}")

token size: 6655


In [12]:
# Save the Document Intelligence Markdown as "<filename>_docint.md" in the working directory.
docint_md_path = f"{filename}_docint.md"
with open(docint_md_path, mode="w", encoding="utf-8") as f:
    f.write(md_content)

### Extra: Save figures

In [13]:
# Operation id of the analysis request, taken from the poller's details.
# NOTE(review): presumably needed to fetch the figure images later — confirm.
details = poller.details
operation_id = details["operation_id"]
print(f"operation_id: {operation_id}")

operation_id: 3e64eeca-4539-4110-b640-2806726efd8a


In [14]:
# Figures reported by the layout analysis (id, bounding regions, spans, elements, optional caption).
result["figures"]

[{'id': '1.1', 'boundingRegions': [{'pageNumber': 1, 'polygon': [0.5176, 0.8314, 1.3586, 0.8313, 1.3585, 1.7936, 0.5174, 1.7936]}], 'spans': [{'offset': 75, 'length': 29}], 'elements': ['/paragraphs/1']},
 {'id': '1.2', 'boundingRegions': [{'pageNumber': 1, 'polygon': [6.944, 2.274, 7.3601, 2.274, 7.3601, 2.6827, 6.944, 2.6826]}], 'spans': [{'offset': 405, 'length': 38}], 'elements': ['/paragraphs/6']},
 {'id': '2.1', 'boundingRegions': [{'pageNumber': 2, 'polygon': [0.5763, 0.7661, 3.9334, 0.7663, 3.9337, 3.3616, 0.5766, 3.3615]}], 'spans': [{'offset': 5335, 'length': 139}], 'elements': ['/paragraphs/25', '/paragraphs/26', '/paragraphs/27', '/paragraphs/28', '/paragraphs/29'], 'caption': {'content': 'Fig. 1. Schematic of K-DEMO tokamak design [15,18,19].', 'boundingRegions': [{'pageNumber': 2, 'polygon': [0.9527, 3.4362, 3.5587, 3.437, 3.5587, 3.5755, 0.9527, 3.5747]}], 'spans': [{'offset': 5356, 'length': 54}], 'elements': ['/paragraphs/30']}},
 {'id': '2.2', 'boundingRegions': [{'pa

In [15]:
# Make sure the "fig" output directory exists.
# exist_ok=True makes this safe to re-run and avoids the check-then-create race;
# it still raises if "fig" exists but is not a directory.
os.makedirs("fig", exist_ok=True)