<a href="https://colab.research.google.com/github/zfriedman0/LayoutLMv2-for-PO/blob/main/LayoutLM_for_PurchaseOrders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning LayoutLMv2 on Purchase Order PDFs

## Environment Setup

In [1]:
!pip install datasets
!pip install torch
!pip install transformers[torch]
!pip install accelerate -U
!pip install pyyaml
!pip install "paddleocr>=2.0.1"
!pip install paddlepaddle-gpu
!pip install pdf2image
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.1-cp310-cp310-manylinux1_x86_64.whl (758.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.9/758.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from paddlepaddle-gpu)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting httpcore==1.* (from httpx->paddlepaddle-gpu)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx->paddlepaddle-gpu)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.7 MB/s[0m

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import json
import torch
from torchvision.transforms import ToTensor
from transformers import AdamW, LayoutLMv2ForTokenClassification, LayoutLMv2Processor, Trainer, TrainingArguments, default_data_collator
from datasets import Dataset, Features, Sequence, ClassLabel, Value, Array3D, Array2D, load_dataset
from PIL import Image
from functools import partial
import os
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from paddleocr import PaddleOCR
from pdf2image import convert_from_path

Mounted at /content/drive




In [31]:
checkpoint_path = '/content/drive/MyDrive/LayoutLM-for-PO/saved_model-06-28-2024'

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
model = LayoutLMv2ForTokenClassification.from_pretrained('microsoft/layoutlmv2-base-uncased', num_labels=7)
saved_model = LayoutLMv2ForTokenClassification.from_pretrained(checkpoint_path)

Some weights of LayoutLMv2ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
with open('/content/drive/MyDrive/Annotations/layoutlm-train-annotations.json', 'r') as f:
  train = json.load(f)

with open('/content/drive/MyDrive/Annotations/layoutlm-val-annotations.json', 'r') as f:
  val = json.load(f)

In [34]:
numeric_labels = set(train[200]['word_labels'])
word_labels = ["Delivery Address", "Customer Name",
               "Contact Name", "Item Reference Number",
               "Item Ordered Quantity", "Item Delivery Date",
               "Purchase Order Number"]

id2label = {v: k for v, k in enumerate(word_labels)}
label2id = {k: v for v, k in enumerate(word_labels)}

print(label2id)
print(id2label)

{'Delivery Address': 0, 'Customer Name': 1, 'Contact Name': 2, 'Item Reference Number': 3, 'Item Ordered Quantity': 4, 'Item Delivery Date': 5, 'Purchase Order Number': 6}
{0: 'Delivery Address', 1: 'Customer Name', 2: 'Contact Name', 3: 'Item Reference Number', 4: 'Item Ordered Quantity', 5: 'Item Delivery Date', 6: 'Purchase Order Number'}


In [37]:
for i, item in enumerate(val):
  print(f"{i+1}: len(bbox): {len(item['bbox'])}, len(words): {len(item['words'])}, {item['image']}, labels: {len(item['word_labels'])}")

1: len(bbox): 66, len(words): 66, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P55195_page_1.png, labels: 12
2: len(bbox): 57, len(words): 57, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P55310_page_2.png, labels: 12
3: len(bbox): 53, len(words): 53, /content/drive/MyDrive/Data/LayoutLMv2-VAL/Neighbors 0002732_page_1.png, labels: 8
4: len(bbox): 181, len(words): 181, /content/drive/MyDrive/Data/LayoutLMv2-VAL/Omni P49767_page_1.png, labels: 48
5: len(bbox): 66, len(words): 66, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P53083_page_1.png, labels: 12
6: len(bbox): 114, len(words): 114, /content/drive/MyDrive/Data/LayoutLMv2-VAL/p62828_page_6.png, labels: 30
7: len(bbox): 61, len(words): 61, /content/drive/MyDrive/Data/LayoutLMv2-VAL/PO_2-57-378765_20200520_page_1.png, labels: 10
8: len(bbox): 62, len(words): 62, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P60883_page_1.png, labels: 12
9: len(bbox): 84, len(words): 84, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P56559_page_1.png, labels: 18
1

## Preprocessing

In [47]:
max_length = 512
im_size = (224, 224)

def preprocess_data(example):
  words = example['words']
  bboxes = example['bbox']
  labels = example['word_labels']
  image_path = example['image']

  bboxes_converted = []
  for box in bboxes:
    x, y, width, height = box[0]
    bboxes_converted.append([x, y, x + width, y + height])

  image = Image.open(image_path).convert('RGB')
  image = image.resize(im_size)
  image_tensor = ToTensor()(image)

  encoding = processor(
      images=image_tensor,
      text=words,
      boxes=bboxes_converted,
      padding='max_length',
      truncation=True,
      max_length=max_length,
      return_tensors='pt'
  )

  token_labels = []
  for word, label in zip(words, labels):
    word_tokens = processor.tokenizer.tokenize(word)
    token_labels.extend([label] * len(word_tokens))

  token_labels = token_labels[:max_length] + [-100] * (max_length - len(token_labels))

  return {
      'input_ids': encoding['input_ids'].squeeze().tolist(),
      'attention_mask': encoding['attention_mask'].squeeze().tolist(),
      'bbox': encoding['bbox'].squeeze().tolist(),
      'labels': token_labels,
      'image': image_tensor.squeeze()
  }

In [10]:
processed_train = [preprocess_data(example) for example in train]
processed_val = [preprocess_data(example) for example in val]

train_set = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in processed_train],
    'attention_mask': [example['attention_mask'] for example in processed_train],
    'bbox': [example['bbox'] for example in processed_train],
    'labels': [example['labels'] for example in processed_train],
    'image': [example['image'] for example in processed_train]
})

val_set = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in processed_val],
    'attention_mask': [example['attention_mask'] for example in processed_val],
    'bbox': [example['bbox'] for example in processed_val],
    'labels': [example['labels'] for example in processed_val],
    'image': [example['image'] for example in processed_val]
})

In [11]:
train_set.set_format(type="torch")
val_set.set_format(type="torch")

In [12]:
print(val_set[205]['bbox'])

tensor([[  0,   0,   0,   0],
        [483,  46, 541,  64],
        [144,  72, 285,  94],
        ...,
        [  0,   0,   0,   0],
        [  0,   0,   0,   0],
        [  0,   0,   0,   0]])


In [13]:
for i, item in enumerate(val_set):
  print(f"{i} \
          input_ids: {item['input_ids'].size()}, \
          attn_mask: {item['attention_mask'].size()}, \
          bbox: {item['bbox'].size()}, \
          labels: {item['labels'].size()}, \
          image: {item['image'].size()}")

0           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
1           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
2           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
3           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
4           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])


In [50]:
# Iterate over each item in the validation set with its index
for idx, item in enumerate(val_set):
    # Iterate over each bounding box in the item
    for bbox in item['bbox']:
        x1, y1, x2, y2 = bbox  # Assuming bbox is in the format [x1, y1, x2, y2]
        # Check the conditions
        if (x2 - x1 < 0) or (y2 - y1 < 0):
            print(item['image'])
            print("Item with invalid bbox dimensions:", item)
            print("BBox:", bbox)

In [15]:
print(val_set[204]['bbox'])

tensor([[   0,    0,    0,    0],
        [ 483,   46,  541,   64],
        [ 146,   72,  286,   94],
        ...,
        [ 877,  569,  951,  580],
        [ 877,  569,  951,  580],
        [1000, 1000, 1000, 1000]])


## Training

In [None]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LayoutLM-for-PO/results',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    logging_dir='/content/drive/MyDrive/LayoutLM-for-PO/logs',
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set
)

trainer.train()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
500,0.3787
1000,0.2217


Step,Training Loss
500,0.3787
1000,0.2217
1500,0.1538


TrainOutput(global_step=1765, training_loss=0.2319747168349139, metrics={'train_runtime': 929.1364, 'train_samples_per_second': 3.799, 'train_steps_per_second': 1.9, 'total_flos': 1905616378152960.0, 'train_loss': 0.2319747168349139, 'epoch': 5.0})

## Validation

In [None]:
torch.cuda.empty_cache()

In [16]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LayoutLM-for-PO/eval-results',
    per_device_train_batch_size=2,
    logging_dir='/content/drive/MyDrive/LayoutLM-for-PO/eval-logs',
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args = training_args,
    eval_dataset=val_set
)

eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.991010069847107, 'eval_runtime': 404.1666, 'eval_samples_per_second': 1.185, 'eval_steps_per_second': 0.148}


In [None]:
save_directory = '/content/drive/MyDrive/LayoutLM-for-PO/saved_model-trash'

# Save the model
model.save_pretrained(save_directory)

# Save the processor (tokenizer and feature extractor)
processor.save_pretrained(save_directory)

[]

## Inference

In [55]:
def run_inference(example, processor, model):
  encoding = processor(
      images=example['image'],
      text=example.get(words, []),
      boxes=example.get(bbox, []),
      padding='max_length',
      truncation=True,
      max_length=512,
      return_tensors='pt'
  )

  with torch.no_grad():
      outputs = model(**encoding)

  predictions = outputs.logits.argmax(-1).squeeze().tolist()

  return predictions

In [53]:
# Load your saved model and processor
#processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", apply_ocr=False)
#saved_model = LayoutLMv2ForTokenClassification.from_pretrained('/content/drive/MyDrive/LayoutLM-for-PO/results-BEST/checkpoint-1500')

# Load the single annotation
with open('/content/drive/MyDrive/Annotations/layoutlm-single-annotation.json', 'r') as f:
    single = json.load(f)

# Assume single contains a list of examples and we process the first one
example = single[0]

# Use the processor to preprocess the data
processed_single = preprocess_data(example)

single_set = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in processed_single],
    'attention_mask': [example['attention_mask'] for example in processed_single],
    'bbox': [example['bbox'] for example in processed_single],
    'labels': [example['labels'] for example in processed_single],
    'image': [example['image'] for example in processed_single]
})

single_set.set_format(type="torch")

# Run inference
with torch.no_grad():
    outputs = saved_model(input_ids=single_set['input_ids'], attention_mask=single_set['attention_mask'], bbox=single_set['bbox'])

# Get predictions
predictions = outputs.logits.argmax(-1).squeeze().tolist()
print(f"Predictions: {predictions}")

TypeError: string indices must be integers

In [56]:
run_inference(processed_single['image'], processor, saved_model)

IndexError: too many indices for tensor of dimension 3

# DEBUG

In [None]:
with open('/content/drive/MyDrive/Annotations/layoutlm-train-annotations-NEWNEW.json', 'r') as f:
  train = json.load(f)

with open('/content/drive/MyDrive/Annotations/layoutlm-val-annotations-NEW.json', 'r') as f:
  val = json.load(f)

In [None]:
train_small = processed_train[0:9]
val_small = processed_val[0:4]

train_set_small = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in train_small],
    'attention_mask': [example['attention_mask'] for example in train_small],
    'bbox': [example['bbox'] for example in train_small],
    'labels': [example['labels'] for example in train_small],
    'image': [example['image'] for example in train_small]
})

val_set_small = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in val_small],
    'attention_mask': [example['attention_mask'] for example in val_small],
    'bbox': [example['bbox'] for example in val_small],
    'labels': [example['labels'] for example in val_small],
    'image': [example['image'] for example in val_small]
})

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LayoutLM-for-PO/results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_dir='/content/drive/MyDrive/LayoutLM-for-PO/logs',
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set_small,
    eval_dataset=val_set_small
)

In [None]:
eval_results = trainer.evaluate(eval_dataset=val_set_small)

In [None]:
print(eval_results)

{'eval_loss': 0.01273418590426445, 'eval_runtime': 1.8208, 'eval_samples_per_second': 2.197, 'eval_steps_per_second': 0.549}


In [None]:
!pip install paddlepaddle paddleocr pdf2image

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.1-cp310-cp310-manylinux1_x86_64.whl (125.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.9/125.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting paddleocr
  Downloading paddleocr-2.7.3-py3-none-any.whl (780 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.0/780.0 kB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_angle_cls=True, lang='en')

def get_ocr_data(image):
  result = ocr.ocr(image, cls=True)
  words = []
  bounding_boxes = []

  for line in result:
    for word_info in line:
      word = word_info[1][0]
      bbox = word_info[0]

      x_min = min([point[0] for point in bbox])
      y_min = min([point[1] for point in bbox])
      x_max = max([point[0] for point in bbox])
      y_max = max([point[1] for point in bbox])

      width = x_max - x_min
      height = y_max - y_min
      bounding_box = [x_min, y_min, width, height]

      words.append(word)
      bounding_boxes.append(bounding_box)

  return words, bounding_boxes

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:00<00:00, 4.88MiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:00<00:00, 10.5MiB/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:01<00:00, 1.52MiB/s]

[2024/06/19 21:28:42] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_char_dict_path='


