In [None]:
import os
import sys
import pprint
import pretty_errors
import PyPDF2
from unstructured.partition.pdf import partition_pdf

The PDF contains several pages with irrelevant or uninformative content. To optimize processing, we will first identify and select only the relevant pages before extracting elements from them.

In [76]:
import tempfile
from io import BytesIO

# 0-based indices of the pages to drop: indices 0-7 and 51-53.
irrelevant_content_pages = list(range(0, 8)) + list(range(51, 54))

with open('../see_report.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    total_pages = len(reader.pages)
    writer = PyPDF2.PdfWriter()

    # Set membership keeps the lookup cheap; indices beyond the end of the
    # document are simply never visited because we iterate over the real pages.
    pages_to_skip = set(irrelevant_content_pages)
    remaining_pages = [
        page_num for page_num in range(total_pages) if page_num not in pages_to_skip
    ]

    for page_num in remaining_pages:
        writer.add_page(reader.pages[page_num])

    # delete=False leaves the file on disk after it is closed so that it can be
    # reopened by name (temp_file.name) afterwards. The context manager ensures
    # the handle is closed even if writing the PDF raises.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
        writer.write(temp_file)

In [77]:
# Partition the trimmed PDF into elements. The temporary file is removed in a
# `finally` clause so it does not linger on disk when partitioning fails.
# After this cell has run (successfully or not) the temporary PDF is gone, so
# the cell that creates it must be re-run before partitioning again.
try:
    raw_pdf_elements = partition_pdf(
        temp_file.name,
        strategy='hi_res',
        extract_images_in_pdf=True,
        infer_table_structure=True,
        # Restrict image extraction to blocks classified as "Image".
        extract_image_block_types=["Image"],
        # NOTE(review): presumably this is what populates
        # `metadata.image_base64` on image elements -- confirm against the
        # unstructured documentation.
        extract_image_block_to_payload=True,
        # NOTE(review): these look like chunking limits, but no
        # `chunking_strategy` is passed here -- confirm they have any effect.
        max_characters=10000,
        new_after_n_chars=6000,
        languages=['eng'],
    )
finally:
    os.unlink(temp_file.name)

In [None]:
import re
from tqdm import tqdm
from unstructured.documents.elements import Table, Image, NarrativeText, FigureCaption, Formula

# Captions are recognised by a numbered prefix such as "Table 3.1: " or "Figure 2.4: ".
table_caption_pattern = re.compile(r'^Table \d+\.\d+: ')
image_caption_pattern = re.compile(r'^Figure \d+\.\d+: ')

# One bucket per kind of content pulled out of the partitioned elements.
texts = []
tables = []
images = []
formulas = []
table_captions = []
image_captions = []

for element in tqdm(raw_pdf_elements):
    content = getattr(element, "text", "")

    if isinstance(element, (FigureCaption, NarrativeText)):
        # Caption patterns take priority over the element's own type; a
        # FigureCaption that matches neither pattern is not stored anywhere.
        if table_caption_pattern.match(content):
            table_captions.append(content)
        elif image_caption_pattern.match(content):
            image_captions.append(content)
        elif isinstance(element, NarrativeText):
            texts.append(content)
        continue

    if isinstance(element, Table):
        # Keep the HTML rendering of the table rather than its plain text.
        tables.append(element.metadata.text_as_html)
        continue

    if isinstance(element, Image):
        # The whole element is kept so its metadata stays available.
        images.append(element)
        continue

    if isinstance(element, Formula):
        formulas.append(content)

100%|██████████| 741/741 [00:00<00:00, 293177.93it/s]


In [114]:
# Report how many text chunks, tables and images were collected.
for label, collected in (
    ("No of Textual Chunks:", texts),
    ("No of Table Elements:", tables),
    ("No of Images:", images),
):
    print(label, len(collected))

No of Textual Chunks: 168
No of Table Elements: 11
No of Images: 45


In [116]:
import random

# Show a few randomly chosen tables next to their captions.
# Sampling is unseeded, so the tables displayed differ between runs.
sample = 5

# zip() stops at the shorter of the two lists, so the sample size must be
# capped by the number of pairs rather than by len(tables); otherwise
# random.sample raises ValueError when there are fewer captions than tables.
table_caption_pairs = list(zip(tables, table_captions))
sampled = random.sample(table_caption_pairs, min(sample, len(table_caption_pairs)))

for i, (table, caption) in enumerate(sampled):
    print('-' * 20 + f' Table {i+1}: {caption} ' + '-' * 20)
    # Call pprint through the module imported at the top of the notebook
    # instead of importing the function under the same name, which would
    # rebind `pprint` and break `pprint.pp(...)` calls elsewhere.
    pprint.pprint(str(table))
    print('\n\n')


-------------------- Table 1: Table 3.1: Questionnaire subsections: Factors affecting software --------------------
('<table><tbody><tr><td>Organization environment</td><td>Income policies, '
 'development environment, impact of public policy and economic '
 'instability.</td><td></td></tr><tr><td></td><td>Requirements stability and '
 'flexibility,</td><td></td></tr><tr><td>Users</td><td>top management support, '
 'user availability and '
 'resistance.</td><td>13</td></tr><tr><td></td><td>experience, cohesion, '
 'continuity, and capability Scheduling, outsourcing, '
 'reuse,</td><td>18</td></tr><tr><td>Project Management</td><td>technical '
 'stability, risk management, use of '
 'standards.</td><td>20</td></tr><tr><td>Product</td><td>Reusability and '
 'documentation.</td><td>a</td></tr><tr><td>Product complexity</td><td>straint '
 'Technical and quality c</td><td>a</td></tr></tbody></table>')



-------------------- Table 2: Table 3.6: New dataset features --------------------
('<t

In [None]:
# Drop every chunk whose text contains the substring 'Page'; the intent is to
# remove chunks that hold nothing but a page number.
# NOTE(review): this also discards any genuine paragraph that mentions
# "Page" -- confirm that is acceptable for this document.

texts = [chunk for chunk in texts if 'Page' not in chunk]
len(texts)

In [None]:
# Preview the first ten text chunks.
# `pp` is imported by name so this cell works whether `pprint` is currently
# bound to the module or to the `pprint.pprint` function.
from pprint import pp

for i, text in enumerate(texts[:10]):
    pp('-'*20 + f'Text {i+1}' + '-'*20)
    pp(text)
    print('\n\n')

In [None]:
import base64
# NOTE: this rebinds `Image`, which the element-classification cell imports
# from unstructured.documents.elements; from here on `Image` is the IPython
# display class.
from IPython.display import Image

# Base64 payloads of the extracted image elements, in extraction order.
base64_images = [el.metadata.image_base64 for el in images]

# Render the first few images inline as a visual sanity check.
preview_count = 10
for encoded in base64_images[:preview_count]:
    display(Image(data=base64.b64decode(encoded)))

In [None]:
# Print the elements whose type name contains the generic
# `unstructured.documents.elements.Text` path, i.e. those that were not
# handled by the more specific type checks.
text_type_marker = "unstructured.documents.elements.Text"

for element in raw_pdf_elements:
    if text_type_marker not in str(type(element)):
        continue
    print(element)

In [None]:
# CharacterTextSplitter stays imported so the name remains available to any
# other cell that uses it.
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# CharacterTextSplitter only cuts on its separator ("\n\n" by default) and
# merely measures the pieces in tokens, so text joined with single spaces may
# offer no cut points and produce chunks far above chunk_size. The recursive
# splitter falls back to finer separators so the token limit is respected.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=100
)

# Join with blank lines so the original element boundaries remain the
# preferred places to split.
joined_texts = "\n\n".join(texts)
texts_token = text_splitter.split_text(joined_texts)

print("No of Text Chunks after Tokenization:", len(texts_token))