In [4]:
from datetime import datetime
start_time = datetime.now()
files = range(1000)
i = 0
for i in files:
    i += 1
print(i)
end_time = datetime.now()
print(f'''Total Processed: {str(len(files))}
Start: {start_time}
End: {end_time}''')

1000
Total Processed: 1000
Start: 2024-12-21 22:05:10.044903
End: 2024-12-21 22:05:10.045356


: 

In [8]:
files[0].split('.')[0]

'1040_p1_2024_12_12_002347'

: 

In [7]:
print(files)

['1040_p1_2024_12_12_002347.pdf', '1040_p1_2024_12_12_002348.pdf', '1040_p1_2024_12_12_002416.pdf', '1040_p1_2024_12_12_193405.pdf', '1040_p1_2024_12_12_193407.pdf', '1040_p1_2024_12_12_193408.pdf', '1040_p1_2024_12_12_193409.pdf', '1040_p1_2024_12_12_193410.pdf', '1040_p1_2024_12_12_193411.pdf', '1040_p1_2024_12_12_193412.pdf', '1040_p1_2024_12_12_193414.pdf', '1040_p1_2024_12_12_193415.pdf', '1040_p1_2024_12_18_231709_614.pdf', '1040_p1_2024_12_18_231709_669.pdf', '1040_p1_2024_12_18_231709_707.pdf', '1040_p1_2024_12_18_231709_838.pdf', '1040_p1_2024_12_18_231709_985.pdf', '1040_p1_2024_12_18_231710_127.pdf', '1040_p1_2024_12_18_231710_193.pdf', '1040_p1_2024_12_18_231710_232.pdf', '1040_p1_2024_12_18_231710_361.pdf', '1040_p1_2024_12_18_231710_513.pdf', '1040_p1_2024_12_18_231710_685.pdf', '1040_p1_2024_12_18_231710_819.pdf', '1040_p1_2024_12_18_231710_866.pdf', '1040_p1_2024_12_18_231710_924.pdf', '1040_p1_2024_12_18_231710_978.pdf', '1040_p1_2024_12_18_231711_102.pdf', '1040_p1_20

In [13]:
from paddleocr import PaddleOCR
from transformers import AutoTokenizer
from difflib import SequenceMatcher
from PIL import Image, ImageDraw

# Initialize PaddleOCR and tokenizer
ocr = PaddleOCR()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def extract_ocr_data(image_path):
    """
    Extract OCR words and bounding boxes from an image using PaddleOCR.
    
    Args:
        image_path (str): Path to the image file.
    
    Returns:
        list of dict: List containing OCR words and their bounding boxes.
    """
    # Perform OCR on the image
    ocr_results = ocr.ocr(image_path, cls=True)[0]

    # Extract words and bounding boxes
    words_bboxes = []
    for line in ocr_results:
        bbox = line[0]  # Coordinates of the bounding box
        text = line[1][0]  # Detected text
        words_bboxes.append({"text": text, "bbox": bbox})

    return words_bboxes


def tokenize_and_map(ocr_data):
    """
    Tokenize OCR words and map tokens back to their bounding boxes.
    
    Args:
        ocr_data (list of dict): OCR words and their bounding boxes.
    
    Returns:
        list of dict: List of tokenized words and their mapped bounding boxes.
    """
    tokenized_data = []
    
    for item in ocr_data:
        text = item["text"]
        bbox = item["bbox"]

        # Tokenize the word
        tokens = tokenizer.tokenize(text)

        # Map each token to the same bbox
        for token in tokens:
            tokenized_data.append({"token": token, "bbox": bbox})
    
    return tokenized_data


def find_best_match(question, tokenized_data):
    """
    Find the best match between the question and tokenized OCR data.
    
    Args:
        question (str): The extraction question.
        tokenized_data (list of dict): Tokenized OCR words and their bounding boxes.
    
    Returns:
        dict: The best-matching token and its bounding box.
    """
    best_match = None
    best_score = 0

    for item in tokenized_data:
        token = item["token"]
        score = SequenceMatcher(None, question.lower(), token.lower()).ratio()

        if score > best_score:
            best_match = item
            best_score = score

    return best_match


def draw_bounding_box(image_path, bbox):
    """
    Draws the bounding box on the image and displays it.
    
    Args:
        image_path (str): Path to the input image.
        bbox (list): Bounding box coordinates [[x1, y1], [x2, y2], [x3, y3], [x4, y4]].
    
    Returns:
        None
    """
    # Open the image
    image = Image.open(image_path)

    # Draw the bounding box
    draw = ImageDraw.Draw(image)
    polygon = [(point[0], point[1]) for point in bbox]
    draw.polygon(polygon, outline="red", width=3)

    # Display the image
    image.show()


def main(image_path, question):
    """
    Main function to process the image and question, and find the matching token and bbox.
    
    Args:
        image_path (str): Path to the input image.
        question (str): Question for extracting the relevant OCR word.
    
    Returns:
        dict: The best-matching token and its bounding box.
    """
    # Step 1: Extract OCR data
    ocr_data = extract_ocr_data(image_path)
    print(f"Extracted {len(ocr_data)} OCR words and bboxes.")

    # Step 2: Tokenize OCR words and map tokens to bboxes
    tokenized_data = tokenize_and_map(ocr_data)
    print(f"Tokenized into {len(tokenized_data)} tokens.")

    # Step 3: Find the best match for the question
    best_match = find_best_match(question, tokenized_data)

    # Step 4: Draw the bounding box on the image
    if best_match:
        print("Best Match:")
        print(f"Token: {best_match['token']}")
        print(f"Bounding Box: {best_match['bbox']}")
        draw_bounding_box(image_path, best_match["bbox"])
    else:
        print("No match found.")

    return best_match


[2024/12/15 22:28:15] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\wband/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\wband/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6

In [19]:
from custom_pipeline import layoutlm_paddleocr_pipeline

In [20]:
image_path = r"clf_images\1040_sch_c\base\1040_sch_c.png"  # Replace with your image path

layoutlm_paddleocr_pipeline(image_path, 'Locate the Gross Revenue label in the Income section')

[2024/12/15 22:33:59] ppocr DEBUG: dt_boxes num : 191, elapsed : 0.09328770637512207
[2024/12/15 22:34:00] ppocr DEBUG: cls num  : 191, elapsed : 0.4401552677154541
[2024/12/15 22:34:00] ppocr DEBUG: rec_res num  : 191, elapsed : 0.2651042938232422


{'answer': '5',
 'score': 0.014386151917278767,
 'final_answer': None,
 'final_score': None,
 'words': ['SCHEDULE C',
  'Profit or Loss From Business',
  'OMB No. 1545-0074',
  '(Form 1040)',
  '(Sole Proprietorship)',
  '2024',
  'Department of the Treasury',
  'Attach to Form 1040, 1040-SR, 1040-SS, 1040-NR, or 1041; partnerships must generally file Form 1065.',
  'Go to www.irs.gov/ScheduleC for instructions and the latest information.',
  'Attachment',
  'Internal Revenue Service',
  'Sequence No.09',
  'Name of proprietor',
  'Social security number (SSN)',
  'A',
  'Principal business or profession, including product or service (see instructions)',
  'B Enter code from instructions',
  'C',
  'Business name. If no separate business name, leave blank.',
  'D Employer ID number (EIN) (see instr.)',
  'E',
  'Business address (including suite or room no.)',
  'City, town or post office, state, and ZIP code',
  'F',
  'Accounting method:',
  '(1) Cash',
  '(2',
  'Accrual',
  '(3)Oth

In [18]:
image_path = r"clf_images\1040_sch_c\base\1040_sch_c.png"  # Replace with your image path
question = "Gross profit"  # Replace with your question

result = main(image_path, question)
if result:
    print(f"Final Match: Token: {result['token']}, BBox: {result['bbox']}")
else:
    print("No match found.")


[2024/12/15 22:29:51] ppocr DEBUG: dt_boxes num : 201, elapsed : 0.1045372486114502
[2024/12/15 22:29:51] ppocr DEBUG: rec_res num  : 201, elapsed : 0.4250507354736328
Extracted 187 OCR words and bboxes.
Tokenized into 995 tokens.
Best Match:
Token: profit
Bounding Box: [[261.0, 30.0], [504.0, 31.0], [504.0, 50.0], [261.0, 49.0]]
Final Match: Token: profit, BBox: [[261.0, 30.0], [504.0, 31.0], [504.0, 50.0], [261.0, 49.0]]


In [None]:

# Example usage
image_path = "sample_image.png"  # Replace with your image path
question = "Locate 'Name' field"  # Replace with your question

result = main(image_path, question)
print("Best match:")
print(f"Word: {result['text']}")
print(f"Bounding Box: {result['bbox']}")
