In [2]:
from ultralytics import YOLO
from PIL import Image
from pdf2image import convert_from_path
from io import BytesIO
import pathlib
import os

# Input document, resolved relative to the current working directory.
pdf_path = 'STANDARD GLASS_Price Band Ad_Material.pdf'

# Rasterise every page of the PDF at 150 dpi (one image object per page).
print("Converting PDF to images...")
pdf_images = convert_from_path(pdf_path, dpi=150)


def _encode_png(page_image):
    """Serialise one rendered page to PNG and return the raw bytes."""
    buffer = BytesIO()
    page_image.save(buffer, format='PNG')
    return buffer.getvalue()


# Keep each page as in-memory PNG bytes, in page order.
temp_images = [_encode_png(page_image) for page_image in pdf_images]

# Instantiate the layout-detection model from its checkpoint file and place
# it on the CPU.
print("Loading YOLO model...")
weights_file = 'yolov8x-doclaynet-epoch64-imgsz640-initiallr1e-4-finallr1e-5.pt'
docseg_model = YOLO(weights_file).to('cpu')

# Accumulates detections, keyed by the path string of each processed image.
mydict = dict()

# Run detection one batch at a time so only `batch_size` pages are handed to
# the model (and present on disk as temporary files) at any moment.
batch_size = 1
print("Processing images with YOLO...")
for i in range(0, len(temp_images), batch_size):
    batch = temp_images[i:i + batch_size]

    # Each page is written to a named PNG so that the result's `entry.path`
    # (used as the dictionary key below) identifies the page it came from.
    batch_paths = []
    try:
        for idx, img_data in enumerate(batch):
            # Pages are numbered from 1 across the whole document.
            img_path = f'temp_page_{i + idx + 1}.png'
            # Register the path before writing so a partially written file is
            # still cleaned up if the write itself fails.
            batch_paths.append(img_path)
            with open(img_path, 'wb') as f:
                f.write(img_data)

        # Run inference; `save=True` also makes the model write its own
        # annotated output (see the "Results saved to ..." log lines).
        results = docseg_model(source=batch_paths, save=True, show_labels=True, show_conf=True, boxes=True)

        # Store the xyxy box coordinates per page; pages with no boxes get
        # an empty list.
        for entry in results:
            thepath = pathlib.Path(entry.path)
            thecoords = entry.boxes.xyxy.numpy() if entry.boxes else []
            mydict[str(thepath)] = thecoords
    finally:
        # Always delete the temporary PNGs, even when writing, inference or
        # result extraction raised, so failed runs do not litter the
        # working directory.
        for img_path in batch_paths:
            if os.path.exists(img_path):
                os.remove(img_path)

# Dump every stored entry: the image path followed by its box coordinates.
print("Processed results:")
for page_path in mydict:
    print(f"Path: {page_path}")
    print("Coordinates:", mydict[page_path])


Converting PDF to images...
Loading YOLO model...
Processing images with YOLO...


  return torch.load(file, map_location='cpu'), file  # load

0: 640x448 1 Page-footer, 3 Pictures, 8 Section-headers, 2 Tables, 15 Texts, 575.1ms
Speed: 3.6ms preprocess, 575.1ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 448)
Results saved to [1mruns/detect/predict3[0m

0: 640x448 7 List-items, 1 Section-header, 6 Tables, 10 Texts, 498.8ms
Speed: 2.3ms preprocess, 498.8ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 448)
Results saved to [1mruns/detect/predict3[0m

0: 640x448 37 List-items, 8 Section-headers, 6 Tables, 14 Texts, 497.0ms
Speed: 3.0ms preprocess, 497.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 448)
Results saved to [1mruns/detect/predict3[0m

0: 640x448 13 List-items, 5 Section-headers, 4 Tables, 24 Texts, 606.2ms
Speed: 2.6ms preprocess, 606.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 448)
Results saved to [1mruns/detect/predict3[0m

0: 352x640 15 Texts, 409.3ms
Speed: 1.6ms preprocess, 409

Processed results:
Path: /home/yash/Desktop/GithubDesktop/hdfc-securities/DocumentParsing/temp_page_1.png
Coordinates: [[      28.55      537.69      1918.1      699.59]
 [     22.458      422.03      1918.8      538.67]
 [     39.496      1605.3      1904.4      1703.1]
 [     41.393      1183.7      1901.8      1282.4]
 [     24.956      298.84      1921.4      353.13]
 [     29.924      722.83      1921.3      950.86]
 [     107.24      224.68      1833.7       298.1]
 [     571.65      1729.3      1372.4      1799.1]
 [     401.53      1013.5      1508.4      1055.7]
 [     25.743      1132.9      1898.2        1183]
 [     39.025      10.421      1897.5      49.402]
 [     29.718      982.93      1508.7      1003.6]
 [     51.913      389.18      1911.6      420.51]
 [     42.503        1544      1861.6      1591.4]
 [     1781.9      70.496      1905.1      217.43]
 [     197.82      349.69      1504.8      386.59]
 [     32.143      1815.3      1918.1      2915.7]
 [     560.19 