In [1]:
import pickle
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pandas as pd

In [2]:
pdf_images = convert_from_path("Input/set-1/SP_MIS02824100914340 1.pdf")
image = pdf_images[0].convert("RGB")

In [3]:

# Manually set the full path to tesseract.exe
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

In [4]:
# Extract words and their bounding boxes

words = []
boxes = []

for i in range(len(ocr_data['text'])):
    if ocr_data['text'][i].strip():
        words.append(ocr_data['text'][i])
        x, y, w, h = (ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i])
        boxes.append([x, y, x+w, y+h])


print('Extracted Words: ', words)
print('Bounding Boxes: ', boxes)

Extracted Words:  ['VEHICLE', 'LOADING', 'REPORT', '-', 'BILL', 'OF', 'LADING', 'Page:', '2', '530', 'RAPPORT', 'DE', 'CHARGEMENT', '-', 'VEHICULE', '-', 'CONNAISSEMENT', 'DOGUMENTNUMBER', '6363', '|', '—', 'RATE', 'SYEPER', 'oy', '|', 'Ne', 'bUDOCUMENT', 'mia/ad/', 'yy', '24', 'HOUR', 'NUMBER', '(613]996-6666', '-', 'NUMERO', '24', 'HEURES', '0362418', '10/006', '10/07/24', '‘COLLECT', '-', 'FRAIS', 'VIRES', '|', 'CONSIGNEE;CUSTOMER', '-', 'DESTINATAIRE/CLIENT', 'Roo', 'eyaaeeNent', '|', 'CONSIGNOR', '-', 'EXPEDITEUR', '|', 'nora', '|', 'PETRO', 'CANADA', 'LUBRICANTS', 'INC', '0005064508', '|SUNCOR', 'ENERGY', 'MONTREAL', '0082041', 'PCLI', '-', 'QUEBEC', '302', '0009406423', '11675', 'rue', 'Sherbrooke', 'est', 'oO', '385', 'SOUTHDOWN', 'ROAD', 'Montreal,', 'QC', 'H1B', '103', '(613)', '996-6666', 'MISSISSAUGA,', 'ON', 'LSJ', '2Y¥3', '__', 'ee', 'Yannick', 'Mercier', 'RELEASE', '(C-of-A)', 'NUMBER', 'CUSTOMER', 'PURCHASE', 'ORDER', 'NUMBER', 'No', 'OE', 'DEMANCE', 'DU', 'CLIENT', 'TC

In [5]:
df = pd.DataFrame(words, columns=["words"])
df.to_excel("words.xlsx", index = False)

In [6]:
input_text = "OCR mistakes: " + " ".join(words) + " → Fix spelling."
print(input_text)

OCR mistakes: VEHICLE LOADING REPORT - BILL OF LADING Page: 2 530 RAPPORT DE CHARGEMENT - VEHICULE - CONNAISSEMENT DOGUMENTNUMBER 6363 | — RATE SYEPER oy | Ne bUDOCUMENT mia/ad/ yy 24 HOUR NUMBER (613]996-6666 - NUMERO 24 HEURES 0362418 10/006 10/07/24 ‘COLLECT - FRAIS VIRES | CONSIGNEE;CUSTOMER - DESTINATAIRE/CLIENT Roo eyaaeeNent | CONSIGNOR - EXPEDITEUR | nora | PETRO CANADA LUBRICANTS INC 0005064508 |SUNCOR ENERGY MONTREAL 0082041 PCLI - QUEBEC 302 0009406423 11675 rue Sherbrooke est oO 385 SOUTHDOWN ROAD Montreal, QC H1B 103 (613) 996-6666 MISSISSAUGA, ON LSJ 2Y¥3 __ ee Yannick Mercier RELEASE (C-of-A) NUMBER CUSTOMER PURCHASE ORDER NUMBER No OE DEMANCE DU CLIENT TCARRIER-TRANSPORTEUR OO _ GEO A HALL 9401535 Vesicte NUMBER LICENSE NUMBER ADDRESS OF METER AND TRADER - _ 8800 - 6E CROISSANT No DU VEHICULE NOMBRE OE PERMIS | L'ADRESSE DE METRE ET DE COMMERCANT ANJOU 7 ee 1136 11675 rue Sherbrooke est HlJ 1Al1 | GERI eb FéanseOmTE se 1921 Montreal, OC H1B 1C3 | (514) 352-5550 (613) 99

In [7]:
# Get image dimensions (width, height)
image_width, image_height = image.size  # Assuming `image` is a PIL Image

# Normalize bounding boxes to fit in [0, 1000] range
normalized_boxes = [
    [
        int((x / image_width) * 1000),   # Normalize x1
        int((y / image_height) * 1000),  # Normalize y1
        int((x_w / image_width) * 1000), # Normalize x2
        int((y_h / image_height) * 1000) # Normalize y2
    ]
    for (x, y, x_w, y_h) in boxes
]

# Debugging: Print some normalized values
print("Original Bounding Boxes:", boxes[:5])
print("Normalized Bounding Boxes:", normalized_boxes[:5])

Original Bounding Boxes: [[896, 90, 1012, 112], [1023, 91, 1144, 113], [1156, 92, 1269, 113], [1278, 104, 1285, 106], [1296, 92, 1350, 113]]
Normalized Bounding Boxes: [[527, 40, 595, 50], [601, 41, 672, 51], [680, 41, 746, 51], [751, 47, 755, 48], [762, 41, 794, 51]]


In [9]:
labels_df = pd.read_excel('words.xlsx')
labels = labels_df['labels'].to_list()
print("Labels: ", labels)

Labels:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'QUESTION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'QUESTION', 'QUESTION', 'QUESTION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ANSWER', 'O', 'ANSWER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [10]:
# Create ner_tags
ner_tags = []
for i, label in enumerate(labels): 
    if label == 'QUESTION':
        ner_tags.append(1)
    elif label == 'ANSWER':
        ner_tags.append(2)
    else:
        ner_tags.append(0)

print(ner_tags)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [22]:
data = {
    "id": ["doc1"],
    "words": [words],
    "bboxes": [boxes],
    "ner_tags": [ner_tags]  # 1 = QUESTION, 2 = ANSWER
}

In [23]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch

In [24]:
from datasets import Dataset

dataset = Dataset.from_dict(data)
print(dataset)


Dataset({
    features: ['id', 'words', 'bboxes', 'ner_tags'],
    num_rows: 1
})


In [28]:
# Load pre-trained LayoutLMv3 processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=3)


def preprocess_data(examples):
    encoding = processor(
        images=image,
        text=examples["words"],
        boxes=examples["bboxes"],
        return_tensors="pt",
        padding="max_length",
        truncation=True
    )
    encoding["labels"] = examples["ner_tags"]  # Add labels
    

# Apply preprocessing
processed_dataset = dataset.map(preprocess_data)
print(processed_dataset)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:00<00:00, 20.00 examples/s]

Dataset({
    features: ['id', 'words', 'bboxes', 'ner_tags'],
    num_rows: 1
})





In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./layoutlmv3-finetuned",
    evaluation_strategy="no",
    save_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    logging_steps=10,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01
)


In [20]:
from transformers import Trainer


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=processor.tokenizer
)

trainer.train()

  trainer = Trainer(


ValueError: too many values to unpack (expected 2)

In [29]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch

In [None]:
# Load pre-trained LayoutLMv3 processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=4)

# Tokenization
encoding = processor(images=image, text=words, boxes=normalized_boxes, return_tensors="pt", padding="max_length", truncation=True)

# Run model inference
with torch.no_grad():
    outputs = model(**encoding)

# Get predicted labels
predicted_labels = torch.argmax(outputs.logits, dim=1)
print(predicted_labels)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[ 16, 269, 474, 506]])
