In [1]:
import cv2
import pytesseract
import pathlib
from pathlib import Path
from pytesseract import Output
from pdf2image import convert_from_path

#### To install Tesseract on Windows, here is a good reference: https://www.youtube.com/watch?v=HNCypVfeTdw

## First step: Convert the PDF file into an image

In [3]:
def pdf_to_image(pdf_path, output_folder: str = ".", poppler_path=None):
    """
    Convert a PDF file into PNG images written to `output_folder`.

    Args:
        - pdf_path : path of the PDF file to convert.
        - output_folder : folder that receives the generated PNG files. It is
          created, together with any missing parent folders, if needed.
        - poppler_path : folder containing the poppler binaries. When None,
          the Windows install location this notebook was written against is
          used if it exists on the current machine; otherwise None is
          forwarded to pdf2image (which is then expected to find poppler on
          the system PATH -- see the pdf2image documentation).

    Returns:
        The value returned by `convert_from_path` (the converted pages).
    """
    # parents=True allows nested folders such as "out/pages"; exist_ok=True
    # makes the call safe to repeat when the notebook is re-run.
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    if poppler_path is None:
        # Keep the original behaviour on the machine where this install
        # exists, without breaking every other machine.
        default_poppler = Path(r'C:\Program Files\Release-24.08.0-0\poppler-24.08.0\Library\bin')
        if default_poppler.is_dir():
            poppler_path = str(default_poppler)

    pages = convert_from_path(
        pdf_path,
        output_folder=output_folder,
        poppler_path=poppler_path,
        fmt="png",
    )

    return pages


In [4]:
# PDF to convert; the generated PNG pages are written into the "data" folder.
pdf_path = "./data/camel.pdf"

# Left as the last expression so the notebook displays the returned pages.
pdf_to_image(pdf_path=pdf_path, output_folder="data")


[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1700x2200>]

#### An image is added to the "data" folder.

In [6]:
# The generated PNG gets a new UUID-based name each time the PDF is converted
# (see pdf2image's `output_file` default), so a name copied from one run goes
# stale. Pick the most recently written first-page image instead.
_first_pages = sorted(Path("data").glob("*-1.png"), key=lambda p: p.stat().st_mtime)

# Fall back to the file name from the original run when no image is found.
scanned_img = (
    _first_pages[-1].as_posix()
    if _first_pages
    else "data/ced4c354-856c-435e-93d8-36064ccabc6a-1.png"
)

## Second step: Extract text from the image

In [7]:
def image_to_text(input_path):
    """
    Read the text contained in an image using Tesseract OCR.

    Args:
        - input_path : path of the image file to read.

    Returns:
        The text returned by `pytesseract.image_to_string` for the image.

    Raises:
        OSError: if OpenCV could not load the image at `input_path`.
    """
    img = cv2.imread(input_path)

    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than deep inside pytesseract.
    if img is None:
        raise OSError(f"Could not read image: {input_path!r}")

    return pytesseract.image_to_string(img)

In [8]:
# Preview only the first 600 characters of the OCR result to keep the output short.
print(image_to_text(scanned_img)[:600])

CAMEL: Communicative Agents for “Mind”
Exploration of Large Language Model Society

https: //www.camel-ai.org

Guohao Li* Hasan Abed Al Kader Hammoud* = HaniItani’ — Dmitrii Khizbullin

Bernard Ghanem

King Abdullah University of Science and Technology (KAUST)

Abstract

The rapid advancement of chat-based language models has led to remarkable
progress in complex task-solving. However, their success heavily relies on human
input to guide the conversation, which can be challenging and time-consuming.
This paper explores the potential of building scalable techniques to facilitate au-
tonomous co


### Convert the image into an interactive PDF

In [9]:
def image_to_interactif_pdf(input_path, output_path: str = "data/camel_interactive.pdf"):
    """
    Convert an image into a PDF with pytesseract and save it to disk.

    Args:
        - input_path : path of the image to convert.
        - output_path : destination of the generated PDF. Defaults to the
          location this function always wrote to before the parameter
          existed. Missing parent folders are created.
    """
    raw_pdf = pytesseract.image_to_pdf_or_hocr(input_path)

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # raw_pdf is written as-is; no intermediate bytearray copy is needed.
    with open(destination, "wb") as f:
        f.write(raw_pdf)
    

In [10]:
# Writes the PDF version of the scanned page to data/camel_interactive.pdf.
image_to_interactif_pdf(scanned_img)

In [None]:
# Run OCR on the scanned image and get the recognized data as a dict
# (Output.DICT); the cells below index it by keys such as "text" and "conf".
data = pytesseract.image_to_data(scanned_img, output_type=Output.DICT)

In [14]:
# Number of entries in the "text" list of the OCR result.
len(data["text"])

516

In [15]:
def draw_bounding_boxes(input_img_path, output_path):
    """
    Draw a green rectangle around every OCR entry that has a confidence
    score and save the annotated image.

    Args:
        - input_img_path : path of the image to analyse.
        - output_path : path where the annotated image is written.

    Raises:
        OSError: if the input image cannot be read or the annotated image
            cannot be written.
    """
    img = cv2.imread(input_img_path)
    # cv2.imread returns None instead of raising when loading fails.
    if img is None:
        raise OSError(f"Could not read image: {input_img_path!r}")

    # Extract data
    data = pytesseract.image_to_data(img, output_type=Output.DICT)

    # Box params (loop-invariant, so defined once)
    green = (0, 255, 0)
    thickness = 1

    boxes = zip(data["conf"], data["left"], data["top"], data["width"], data["height"])
    for conf, x, y, w, h in boxes:
        # Entries without a confidence score are marked with -1. The value may
        # arrive as an int, a float or a string (presumably depending on the
        # pytesseract version), so normalise it before comparing.
        if float(conf) < 0:
            continue

        top_left = (x, y)
        bottom_right = (x + w, y + h)
        cv2.rectangle(img, top_left, bottom_right, green, thickness)

    # Save the image with boxes; cv2.imwrite reports failure through its
    # return value rather than an exception.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write image: {output_path!r}")


In [17]:
# Destination of the copy of the scanned page annotated with OCR boxes.
output_path = "data/caac3ece-5a4a-4585-88c4-71e814801ced-1_with_boxes.png"

draw_bounding_boxes(input_img_path=scanned_img, output_path=output_path)

In [None]:
# Build a PDF from the scanned page with pytesseract and store it as
# camel_img.pdf in the current working directory.
raw_pdf = pytesseract.image_to_pdf_or_hocr(scanned_img)

with open("camel_img.pdf", mode="wb") as pdf_file:
    pdf_file.write(raw_pdf)
