In [None]:
import pytesseract
import os
from PIL import Image
from paddleocr import PaddleOCR
import easyocr
from google.cloud import vision

def ocr_with_google_vision(image_path):
    """
    Perform OCR using Google Cloud Vision text detection.

    Args:
        image_path: Path of the image file; its raw bytes are sent to the API.

    Returns:
        The description of the first text annotation in the response, or an
        empty string when the response contains no text annotations.

    Raises:
        Exception: If the response carries a non-empty error message.
    """
    client = vision.ImageAnnotatorClient()
    with open(image_path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # Surface API failures instead of reporting them as "no text found".
    if response.error.message:
        raise Exception(response.error.message)

    texts = response.text_annotations
    return texts[0].description if texts else ""

def ocr_with_easyocr(image_path, lang_list=['en']):
    """
    Perform OCR using EasyOCR.

    A new ``easyocr.Reader`` is built for ``lang_list`` on every call. The
    second element of each entry returned by ``readtext`` is taken as the
    recognized text; the pieces are joined with newlines and the joined
    string is stripped.
    """
    reader = easyocr.Reader(lang_list)
    recognized = []
    for detection in reader.readtext(image_path):
        # detection is (bbox, text, confidence); only the text is kept.
        recognized.append(detection[1])
    joined = "\n".join(recognized)
    return joined.strip()

def ocr_with_pytesseract(image_path):
    """
    Perform OCR using Tesseract.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The text produced by ``pytesseract.image_to_string`` (run with
        ``--psm 6``), stripped of leading and trailing whitespace.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as Tesseract has consumed it, rather than lingering until GC.
    with Image.open(image_path) as image:
        raw_text = pytesseract.image_to_string(image, config="--psm 6")
    return raw_text.strip()

def ocr_with_paddleocr(image_path, lang="en"):
    """
    Perform OCR using PaddleOCR.

    Args:
        image_path: Path of the image to run OCR on.
        lang: Language code passed to the ``PaddleOCR`` constructor.

    Returns:
        The recognized text lines of the first result page joined with
        newlines and stripped, or an empty string when that page holds no
        detections.
    """
    # Initialize PaddleOCR (disable angle classification for speed)
    ocr = PaddleOCR(lang=lang, use_gpu=True, use_angle_cls=False)
    pages = ocr.ocr(image_path, cls=False)

    # The first page may be missing, None, or empty when nothing was detected
    # (presumably PaddleOCR reports such pages as None; verify for the
    # installed version). Treat all of these as "no text".
    results = pages[0] if pages else None
    if not results:
        return ""

    # Extract text from results: each result is [box, (text, confidence)]
    text_lines = [line[1][0] for line in results]
    return "\n".join(text_lines).strip()

def compare_ocr_engines(image_path, output_dir="ocr_results"):
    """
    Run Tesseract, PaddleOCR and EasyOCR on one image and collect their text.

    Each engine's text is written to its own UTF-8 file inside ``output_dir``
    (created if missing), in the order Tesseract, PaddleOCR, EasyOCR. The
    same texts are returned in a dict keyed by engine label.
    """
    def _persist(filename, text):
        # Write one engine's text to output_dir/filename and hand it back.
        destination = os.path.join(output_dir, filename)
        with open(destination, "w", encoding="utf-8") as handle:
            handle.write(text)
        return text

    os.makedirs(output_dir, exist_ok=True)

    # Tesseract
    tesseract_text = _persist("tesseract_results.txt", ocr_with_pytesseract(image_path))

    # PaddleOCR
    paddle_text = _persist("paddleocr_results.txt", ocr_with_paddleocr(image_path))

    # EasyOCR
    easyocr_text = _persist("easyocr_results.txt", ocr_with_easyocr(image_path))

    comparison = {}
    comparison["Tesseract Results"] = tesseract_text
    comparison["PaddleOCR Results"] = paddle_text
    comparison["Easyocr Results"] = easyocr_text
    return comparison


image_path = r"debug_imagesOLD\1040_p1_2024_12_12_002416\page_1\2_denoised.png"  # Replace with your image path
ocr_results = compare_ocr_engines(image_path)

# Print every engine's text. The returned dict is iterated so that the
# EasyOCR entry is shown too, not only the Tesseract and PaddleOCR ones.
print("OCR Comparison:")
for position, (engine_label, engine_text) in enumerate(ocr_results.items()):
    heading = f"{engine_label}:"
    # A blank line separates each engine's section from the previous one.
    print(heading if position == 0 else "\n" + heading)
    print(engine_text)


In [93]:
# Sample images to process; replace these with your own image paths.
image_paths = [
    r"debug_imagesOLD\25-GL\page_1\preprocessed.png",
    r"debug_imagesOLD\1120S_p1_2024_12_27_085137_940\page_1\2_denoised.png",
]

In [None]:
import io
import json
import os

from google.cloud import vision
from nltk.corpus import stopwords
from pdf2image import convert_from_path

# Directory that receives the Cloud Vision text dump.
output_dir = "ocr_results"

# English stop words, held in a set for fast membership checks.
stop_words = {*stopwords.words("english")}

def detect_document_local(image_paths):
    """
    Run Google Cloud Vision document text detection on a sequence of images.

    Note: Synchronous detection supports images (e.g., JPEG, PNG) but not PDFs.

    Args:
        image_paths: Iterable of images. Each item is either encoded image
            content (bytes), which is sent as-is, or a filesystem path
            (``str`` / ``os.PathLike``), whose file is read as bytes first.

    Returns:
        A list with one response per input item, in input order.

    Raises:
        Exception: If any response carries a non-empty error message.

    Side effects:
        Creates the module-level ``output_dir`` if needed and writes the full
        text of every processed item to ``cloud_vision_results.txt`` inside
        it, one item after another separated by a newline. The file is
        truncated when the call starts.
    """
    os.makedirs(output_dir, exist_ok=True)
    results_file = os.path.join(output_dir, "cloud_vision_results.txt")

    # One client serves every request; there is no need to rebuild it per image.
    client = vision.ImageAnnotatorClient()

    resps = []
    # Open the results file once: reopening it in "w" mode for each image
    # would leave only the last image's text on disk.
    with open(results_file, "w", encoding="utf-8") as f:
        for index, item in enumerate(image_paths):
            if isinstance(item, (str, os.PathLike)):
                with open(item, "rb") as image_file:
                    content = image_file.read()
            else:
                content = item

            image = vision.Image(content=content)

            # Perform document text detection (synchronous request)
            response = client.document_text_detection(image=image)

            if response.error.message:
                raise Exception(response.error.message)

            annotation = response.full_text_annotation

            if index:
                f.write("\n")
            f.write(annotation.text)

            # Keep the full response for further processing by the caller.
            resps.append(response)
    return resps

# Get list of images -- passing in whole PDF, expecting
# response to be list of elements corresponding to each page.
# NOTE: despite its name, `image_paths` holds page images from here on,
# not file paths.
image_paths = convert_from_path(r"uploaded_samples\ilovepdf_merged.pdf")


def _page_to_png_bytes(page_image):
    """Encode one page image as PNG bytes through its public ``save`` method."""
    buffer = io.BytesIO()
    page_image.save(buffer, format="PNG")
    return buffer.getvalue()


# Convert each image in list to bytes -- each element should be uploaded to S3
# Make sure s3_object_key works as it does in original code
image_bytes = [_page_to_png_bytes(page_image) for page_image in image_paths]

# Pass in list of image bytes to API and get one response per page.
r = detect_document_local(image_bytes)

# `words` and the coordinates start from the second annotation,
# as the first element of each page is the full text annotation.
word_annotations = [page.text_annotations[1:] for page in r]

# List of words per page
words = [[ann.description for ann in page] for page in word_annotations]

# Base coordinates: vertex 0 supplies (x1, y1) and vertex 2 supplies (x2, y2)
x1_coords = [[ann.bounding_poly.vertices[0].x for ann in page] for page in word_annotations]
y1_coords = [[ann.bounding_poly.vertices[0].y for ann in page] for page in word_annotations]
x2_coords = [[ann.bounding_poly.vertices[2].x for ann in page] for page in word_annotations]
y2_coords = [[ann.bounding_poly.vertices[2].y for ann in page] for page in word_annotations]

# List of bounding boxes encompassing each word, per page
bboxes = [
    list(zip(page_x1, page_y1, page_x2, page_y2))
    for page_x1, page_y1, page_x2, page_y2
    in zip(x1_coords, y1_coords, x2_coords, y2_coords)
]

# Width of each page -- single value per page
widths = [page.full_text_annotation.pages[0].width for page in r]
# Height of each page -- single value per page
heights = [page.full_text_annotation.pages[0].height for page in r]

# List of normalized bounding boxes per page
normalized_bboxes = [
    [(x1 / width, y1 / height, x2 / width, y2 / height) for x1, y1, x2, y2 in page_bboxes]
    for page_bboxes, width, height in zip(bboxes, widths, heights)
]

# `words` is nested per page, so flatten it before lower-casing; calling
# .lower() on the per-page lists themselves raises AttributeError.
tokens = [word.lower() for page_words in words for word in page_words]

words_for_clf = {token for token in tokens if token not in stop_words}

# Will pass in None, as this column may not be needed
lines = None

In [None]:

# Normalized bounding boxes per page: every coordinate is divided by the
# width/height of the page it belongs to. The coordinate lists are nested per
# page, so the outer loop pairs each page's lists with that page's size and
# the inner loop walks the individual words; dividing a per-page list by a
# number directly would raise TypeError.
normalized_bboxes = [
    [
        (x1 / width, y1 / height, x2 / width, y2 / height)
        for x1, y1, x2, y2 in zip(page_x1, page_y1, page_x2, page_y2)
    ]
    for page_x1, page_y1, page_x2, page_y2, width, height
    in zip(x1_coords, y1_coords, x2_coords, y2_coords, widths, heights)
]

1

In [116]:
# Sanity check: number of top-level entries currently held in `words`.
len(words)

98

In [115]:
# Sanity check: number of entries in `word_coords`.
# NOTE(review): `word_coords` is not assigned in any visible cell --
# presumably left over from an earlier kernel state; confirm before re-running.
len(word_coords)

601

In [90]:
# Sanity check: number of top-level entries currently held in `words`.
# NOTE(review): this cell carries an out-of-order execution count, so its
# recorded output may reflect a different kernel state than nearby cells.
len(words)

601

In [89]:
# Displays the whole `word_coords` value as the cell output.
# NOTE(review): `word_coords` is not assigned in any visible cell --
# presumably left over from an earlier kernel state; confirm before re-running.
word_coords

[(204, 263, 437, 292),
 (628, 264, 1048, 312),
 (1068, 264, 1154, 312),
 (1173, 264, 1474, 312),
 (1496, 264, 1868, 312),
 (2094, 255, 2153, 278),
 (2159, 256, 2167, 277),
 (2167, 256, 2202, 278),
 (2204, 257, 2211, 278),
 (2212, 257, 2241, 278),
 (2242, 257, 2250, 278),
 (2249, 257, 2308, 280),
 (2307, 258, 2318, 279),
 (2151, 292, 2283, 310),
 (228, 347, 295, 371),
 (308, 347, 508, 371),
 (524, 347, 550, 371),
 (565, 347, 674, 371),
 (688, 347, 730, 371),
 (744, 347, 767, 371),
 (782, 347, 906, 371),
 (917, 347, 959, 371),
 (973, 347, 1184, 371),
 (1200, 347, 1283, 371),
 (1296, 347, 1361, 371),
 (1375, 347, 1522, 371),
 (1536, 347, 1580, 371),
 (1595, 347, 1708, 371),
 (1721, 347, 1809, 371),
 (1823, 347, 1880, 371),
 (1896, 347, 2094, 371),
 (2110, 347, 2236, 371),
 (2237, 347, 2245, 371),
 (2258, 347, 2327, 371),
 (227, 384, 426, 409),
 (443, 384, 528, 409),
 (545, 384, 608, 409),
 (623, 384, 867, 409),
 (882, 384, 927, 409),
 (942, 384, 1136, 409),
 (1151, 384, 1263, 409),
 (1265

In [36]:
from google.cloud.vision_v1.types.image_annotator import AnnotateImageResponse

In [38]:
# Exploratory probe of the response type.
# NOTE(review): per the recorded output, this class-level attribute access
# raises AttributeError ("type object 'AnnotateImageResponse' has no attribute
# 'label_annotations'"), so the cell fails on a fresh run.
AnnotateImageResponse.label_annotations

AttributeError: type object 'AnnotateImageResponse' has no attribute 'label_annotations'

In [None]:
import layoutparser as lp
import cv2

In [None]:
image_file = r"debug_images\page_0\cropped\depreciation_value_cropped.png"
image = cv2.imread(image_file)

# cv2.imread signals an unreadable or missing file by returning None instead
# of raising; fail here with a clear message rather than on the slice below.
if image is None:
    raise FileNotFoundError(f"cv2.imread could not load image: {image_file}")

# Convert the image from BGR (cv2 default loading style)
# to RGB by reversing the last (channel) axis.
image = image[..., ::-1]

In [None]:
# Load the deep layout model from the layoutparser API.
# For all the supported models, please check the Model
# Zoo Page: https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html
model = lp.Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={
        0: "Text",
        1: "Title",
        2: "List",
        3: "Table",
        4: "Figure",
    },
)