In [None]:
import os
import sys
import pprint
import pretty_errors
import PyPDF2
from unstructured.partition.pdf import partition_pdf

The PDF contains several pages with irrelevant or uninformative content. To optimize processing, we will first identify and select only the relevant pages before extracting elements from them.

In [76]:
import tempfile
from io import BytesIO

# Zero-based indices of the pages to leave out: indices 0-7 and 51-53.
irrelevant_content_pages = list(range(0, 8)) + list(range(51, 54))

with open('../see_report.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    total_pages = len(reader.pages)
    writer = PyPDF2.PdfWriter()

    # Keep every page index that is not in the exclusion list.
    remaining_pages = [
        page_num
        for page_num in range(total_pages)
        if page_num not in irrelevant_content_pages
    ]

    for page_num in remaining_pages:
        writer.add_page(reader.pages[page_num])

    # delete=False keeps the file on disk once the handle is closed, so it can
    # still be opened through `temp_file.name`. The `with` block closes the
    # handle even when writing fails, instead of relying on a manual close().
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
        writer.write(temp_file)

In [77]:
# Partition the trimmed PDF into elements. The temporary file is removed in
# `finally` so it does not linger on disk when partitioning raises.
try:
    raw_pdf_elements = partition_pdf(
        temp_file.name,
        strategy='hi_res',
        extract_images_in_pdf=True,
        infer_table_structure=True,

        extract_image_block_types=["Image"],

        extract_image_block_to_payload=True,

        # NOTE(review): these look like chunking limits, but no chunking
        # strategy is passed in this call — confirm they have any effect.
        max_characters=10000,
        new_after_n_chars=6000,

        languages=['eng'],
    )
finally:
    os.unlink(temp_file.name)

In [112]:
# Build the dict view of the parsed elements in this cell so that it does not
# depend on a variable left over from an earlier kernel session.
_dict = [element.to_dict() for element in raw_pdf_elements]
_dict

[{'type': 'Header',
  'element_id': '1eabb484ae23e187e24749f400e3dbd5',
  'text': 'CHAPTER 1. INTRODUCTION:',
  'metadata': {'detection_class_prob': 0.8799676895141602,
   'coordinates': {'points': ((np.float64(156.4019012451172),
      np.float64(56.02817888888865)),
     (np.float64(156.4019012451172), np.float64(83.70206777777771)),
     (np.float64(584.2160034179688), np.float64(83.70206777777771)),
     (np.float64(584.2160034179688), np.float64(56.02817888888865))),
    'system': 'PixelSpace',
    'layout_width': 1654,
    'layout_height': 2339},
   'last_modified': '2025-07-19T02:45:22',
   'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'file_directory': '/tmp',
   'filename': 'tmp3_qjhu5n.pdf'}},
 {'type': 'Title',
  'element_id': '43aaeeba645f5f2ac7bd20c1adcc6773',
  'text': 'Chapter 1',
  'metadata': {'detection_class_prob': 0.716379702091217,
   'coordinates': {'points': ((np.float64(154.98431396484375),
      np.float64(346.4500694444442)),


In [None]:
{'type': 'Table',
  'element_id': '9117fa97857d73f850281543cb4fe9e6',
  'text': 'Model Parameter name Best value alpha 0.4321608040201005 max iter 1000 Lasso positive False precompute False selection cyclic tol 0.0001 n estimators 220 max depth 11 Random Forest min samples split 2 Regressor min samples leaf 1 max features None bootstrap True criterion absolute error momentum 0.1 Neural Network batch size 10 learning rate 0.01 dropout 0.01 epsilon 0 tol 0.03526763381690846 LinearSVR max iter 1000 C 1.250625312656328',
  'metadata': {'detection_class_prob': 0.8254163861274719,
   'coordinates': {'points': ((np.float64(427.7747497558594),
      np.float64(1185.3609619140625)),
     (np.float64(427.7747497558594), np.float64(2023.6842041015625)),
     (np.float64(1219.9080810546875), np.float64(2023.6842041015625)),
     (np.float64(1219.9080810546875), np.float64(1185.3609619140625))),
    'system': 'PixelSpace',
    'layout_width': 1654,
    'layout_height': 2339},
   'last_modified': '2025-07-19T02:45:22',
   'text_as_html': '<table><thead><tr><th>Model</th><th>Parameter name</th><th>Best value</th></tr></thead><tbody><tr><td rowspan="6">Lasso</td><td>alpha</td><td>0.4321608040201005</td></tr><tr><td>max_iter</td><td>1000</td></tr><tr><td>positive</td><td>False</td></tr><tr><td>precompute</td><td>False</td></tr><tr><td>selection</td><td>cyclic</td></tr><tr><td>tol</td><td>0.0001</td></tr><tr><td></td><td>n_estimators max-_depth</td><td>220 ll</td></tr><tr><td>Random Forest</td><td></td><td>2</td></tr><tr><td></td><td>min_samples-split</td><td></td></tr><tr><td>Regressor</td><td>min_samples_leaf</td><td>1</td></tr><tr><td></td><td>max_features</td><td>None</td></tr><tr><td></td><td>bootstrap</td><td>True</td></tr><tr><td></td><td>criterion</td><td>absolute_error</td></tr><tr><td rowspan="6">Neural Network</td><td>momentum</td><td>0.1</td></tr><tr><td>batch_size</td><td>10</td></tr><tr><td>learning _rate</td><td>0.01</td></tr><tr><td>dropout</td><td>0.01</td></tr><tr><td>epsilon</td><td>0</td></tr><tr><td>tol</td><td>0.03526763381690846</td></tr><tr><td rowspan="2">LinearSVR</td><td>max_iter</td><td>1000</td></tr><tr><td>Cc</td><td>1.250625312656328</td></tr></tbody></table>',
   'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 34,
   'file_directory': '/tmp',
   'filename': 'tmp3_qjhu5n.pdf',
   'parent_id': '869e8c1d972d85d3866ec8a561b2d15a'}},

In [None]:
import re
from tqdm import tqdm
from unstructured.documents.elements import Table, Image, NarrativeText, FigureCaption, Formula

# Caption prefixes recognised below: "Table <n>.<m>: " and "Figure <n>.<m>: ".
table_caption_pattern = re.compile(r'^Table \d+\.\d+: ')
image_caption_pattern = re.compile(r'^Figure \d+\.\d+: ')

# One bucket per kind of content collected from the parsed elements.
texts, tables, images = [], [], []
formulas, table_captions, image_captions = [], [], []

for element in tqdm(raw_pdf_elements):
    text = getattr(element, "text", "")

    if isinstance(element, (FigureCaption, NarrativeText)):
        # Captions are recognised by their prefix; anything else is kept only
        # when it is narrative text.
        if table_caption_pattern.match(text) is not None:
            bucket = table_captions
        elif image_caption_pattern.match(text) is not None:
            bucket = image_captions
        elif isinstance(element, NarrativeText):
            bucket = texts
        else:
            continue
        bucket.append(text)
        continue

    if isinstance(element, Table):
        # Store the HTML rendering of the table for better readability.
        tables.append(element.metadata.text_as_html)
        continue

    if isinstance(element, Image):
        # Images are kept as whole elements, not as text.
        images.append(element)
        continue

    if isinstance(element, Formula):
        formulas.append(text)

100%|██████████| 741/741 [00:00<00:00, 293177.93it/s]


In [114]:
# Report how many items ended up in each collection.
for label, collection in (
    ("No of Textual Chunks:", texts),
    ("No of Table Elements:", tables),
    ("No of Images:", images),
):
    print(label, len(collection))

No of Textual Chunks: 168
No of Table Elements: 11
No of Images: 45


In [111]:
import pprint  # bind the module (not the function) so `pprint.pp(...)` keeps working
import random

sample = 5

# NOTE(review): pairing by position assumes tables and captions were collected
# in the same order and that every table has a caption — confirm.
table_caption_pairs = list(zip(tables, table_captions))

# zip() stops at the shorter list, so the sample size is capped by the number
# of pairs actually available rather than by the number of tables.
sampled = random.sample(table_caption_pairs, min(sample, len(table_caption_pairs)))

for i, (table, caption) in enumerate(sampled):
    print('-' * 20 + f' Table {i+1}: {caption} ' + '-' * 20)
    pprint.pprint(str(table))
    print('\n\n')


-------------------- Table 1: Table 3.6: New dataset features --------------------
('the software system re-programming the software system Requirement Accuracy '
 'level the re-analysis of the software requirements the re-design of the '
 'software system')



-------------------- Table 2: Table 2.2: The use of algorithms in each paper --------------------
('SVR [6], [8], [13], [14], [15], [19], [20] SVM [2], [3], [7], [11] '
 'Regression Trees [8], [9], [17], [18], [20] Random Forest [3], [11], [10], '
 '[11], [12], [18],[19] ,[20] Neural Networks Linear Regression [3], [13], '
 '[16], [4], [7], [12], [19] K-Star [7], [12] Na¨ıve Bayes [10], [5] Logistic '
 'Regression [10] KNN [2], [20] M5P [12], [12], [14] Additive-Regression [12] '
 'Decision Trees [16] ADABOOST [20], [19] RepTree [12] Ridge Regression [16] '
 'Lasso Regression [16] [16] [8], [15], [17]')



-------------------- Table 3: Table 3.5: Anova test results --------------------
('Column Anova score p-value Role in organi

In [None]:
# Drop retrieved chunks that carry page numbers.
# NOTE(review): this removes every chunk containing the substring 'Page', not
# only chunks that consist solely of a page number — confirm this is intended.
texts = [chunk for chunk in texts if 'Page' not in chunk]
len(texts)

In [None]:
# Preview the first ten text chunks, each under a numbered banner.
rule = '-'*20
for position, chunk in enumerate(texts[:10], start=1):
    pprint.pp(rule + f'Text {position}' + rule)
    pprint.pp(chunk)
    print('\n\n')

In [None]:
import base64
from IPython.display import Image

# Pull the base64 payload out of each collected image element's metadata.
base64_images = list(map(lambda el: el.metadata.image_base64, images))

# Decode and render the first ten images inline.
# NOTE(review): `Image` here is IPython.display.Image, which rebinds the name
# also imported from unstructured.documents.elements elsewhere in the notebook.
for encoded in base64_images[:10]:
    display(Image(data=base64.b64decode(encoded)))

In [None]:
from unstructured.documents.elements import Text

# Print the elements whose class is exactly `Text`. An identity check on the
# type (rather than isinstance) keeps instances of any subclasses out, and
# avoids matching on the string representation of the class.
for element in raw_pdf_elements:
    if type(element) is Text:
        print(element)

In [None]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=100
)

# CharacterTextSplitter only cuts at its separator ("\n\n" by default), so the
# chunks are joined with that separator. Joined with a single space, the text
# would have no element boundaries left for the splitter to cut at.
joined_texts = "\n\n".join(texts)
texts_token = text_splitter.split_text(joined_texts)

print("No of Text Chunks after Tokenization:", len(texts_token))