<a href="https://colab.research.google.com/github/zfriedman0/LayoutLMv2-for-PO/blob/main/LayoutLM_for_PurchaseOrders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning LayoutLMv2 on Purchase Order PDFs

## Environment Setup

In [1]:
!pip install datasets
!pip install torch
!pip install transformers[torch]
!pip install accelerate -U
!pip install pyyaml
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m348.2/547.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-an

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import json
import torch
from torchvision.transforms import ToTensor
from transformers import AdamW, LayoutLMv2ForTokenClassification, LayoutLMv2Processor, Trainer, TrainingArguments, default_data_collator
from datasets import Dataset, Features, Sequence, ClassLabel, Value, Array3D, Array2D, load_dataset
from PIL import Image
from functools import partial
import os
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

Mounted at /content/drive


In [3]:
checkpoint_path = '/content/drive/MyDrive/LayoutLM-for-PO/results-BEST/checkpoint-1500'

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
model = LayoutLMv2ForTokenClassification.from_pretrained('microsoft/layoutlmv2-base-uncased', num_labels=7)
saved_model = LayoutLMv2ForTokenClassification.from_pretrained(checkpoint_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/802M [00:00<?, ?B/s]

Some weights of LayoutLMv2ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
with open('/content/drive/MyDrive/Annotations/layoutlm-train-annotations.json', 'r') as f:
  train = json.load(f)

with open('/content/drive/MyDrive/Annotations/layoutlm-val-annotations.json', 'r') as f:
  val = json.load(f)

In [5]:
numeric_labels = set(train[200]['word_labels'])
word_labels = ["Delivery Address", "Customer Name",
               "Contact Name", "Item Reference Number",
               "Item Ordered Quantity", "Item Delivery Date",
               "Purchase Order Number"]

id2label = {v: k for v, k in enumerate(word_labels)}
label2id = {k: v for v, k in enumerate(word_labels)}

print(label2id)
print(id2label)

{'Delivery Address': 0, 'Customer Name': 1, 'Contact Name': 2, 'Item Reference Number': 3, 'Item Ordered Quantity': 4, 'Item Delivery Date': 5, 'Purchase Order Number': 6}
{0: 'Delivery Address', 1: 'Customer Name', 2: 'Contact Name', 3: 'Item Reference Number', 4: 'Item Ordered Quantity', 5: 'Item Delivery Date', 6: 'Purchase Order Number'}


In [6]:
print(train[0])

{'image': '/content/drive/MyDrive/Data/LayoutLMv2-TRAIN/P54624_page_2.png', 'bbox': [[[483, 46, 58, 18]], [[136, 72, 150, 22]], [[756, 71, 104, 12]], [[892, 71, 61, 11]], [[759, 90, 99, 11]], [[882, 87, 70, 17]], [[153, 95, 130, 10]], [[779, 105, 80, 17]], [[912, 106, 42, 14]], [[69, 111, 150, 15]], [[70, 129, 221, 13]], [[307, 134, 575, 13]], [[69, 160, 37, 13]], [[105, 161, 91, 11]], [[240, 160, 101, 12]], [[419, 158, 62, 17]], [[490, 161, 33, 12]], [[585, 161, 36, 11]], [[634, 161, 85, 11]], [[735, 158, 58, 17]], [[859, 158, 56, 17]], [[105, 181, 247, 13]], [[69, 195, 19, 13]], [[105, 194, 93, 12]], [[241, 195, 91, 11]], [[415, 195, 62, 11]], [[501, 195, 22, 13]], [[575, 193, 47, 14]], [[650, 195, 68, 11]], [[763, 195, 31, 12]], [[872, 195, 81, 11]], [[105, 209, 246, 13]], [[69, 222, 19, 12]], [[105, 222, 94, 12]], [[240, 222, 93, 12]], [[415, 224, 62, 11]], [[502, 222, 21, 13]], [[574, 221, 49, 14]], [[650, 224, 68, 11]], [[763, 222, 30, 12]], [[871, 224, 82, 11]], [[105, 237, 247,

In [7]:
print(val[0])

{'image': '/content/drive/MyDrive/Data/LayoutLMv2-VAL/P55195_page_1.png', 'bbox': [[[483, 46, 58, 18]], [[145, 72, 141, 22]], [[756, 71, 104, 12]], [[890, 68, 64, 16]], [[759, 90, 99, 11]], [[882, 87, 72, 15]], [[153, 95, 130, 10]], [[781, 108, 77, 11]], [[912, 106, 42, 14]], [[69, 111, 150, 15]], [[70, 129, 220, 13]], [[307, 134, 575, 13]], [[69, 172, 51, 14]], [[478, 171, 139, 15]], [[69, 188, 180, 13]], [[478, 189, 160, 13]], [[70, 206, 130, 13]], [[478, 206, 108, 13]], [[70, 222, 169, 13]], [[478, 222, 203, 13]], [[67, 248, 59, 17]], [[262, 248, 114, 15]], [[478, 250, 114, 13]], [[676, 252, 101, 11]], [[70, 265, 110, 10]], [[262, 263, 110, 13]], [[697, 261, 79, 16]], [[69, 311, 94, 13]], [[285, 313, 64, 11]], [[517, 313, 43, 11]], [[69, 325, 88, 13]], [[283, 325, 85, 11]], [[513, 324, 172, 15]], [[66, 379, 130, 15]], [[240, 379, 101, 15]], [[419, 379, 61, 16]], [[490, 381, 33, 12]], [[585, 381, 38, 13]], [[634, 382, 85, 11]], [[738, 382, 55, 11]], [[858, 380, 58, 15]], [[108, 400, 

In [8]:
for i, item in enumerate(val):
  print(f"{i+1}: len(bbox): {len(item['bbox'])}, len(words): {len(item['words'])}, {item['image']}, labels: {len(item['word_labels'])}")

1: len(bbox): 66, len(words): 66, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P55195_page_1.png, labels: 12
2: len(bbox): 57, len(words): 57, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P55310_page_2.png, labels: 12
3: len(bbox): 53, len(words): 53, /content/drive/MyDrive/Data/LayoutLMv2-VAL/Neighbors 0002732_page_1.png, labels: 8
4: len(bbox): 181, len(words): 181, /content/drive/MyDrive/Data/LayoutLMv2-VAL/Omni P49767_page_1.png, labels: 48
5: len(bbox): 66, len(words): 66, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P53083_page_1.png, labels: 12
6: len(bbox): 114, len(words): 114, /content/drive/MyDrive/Data/LayoutLMv2-VAL/p62828_page_6.png, labels: 30
7: len(bbox): 61, len(words): 61, /content/drive/MyDrive/Data/LayoutLMv2-VAL/PO_2-57-378765_20200520_page_1.png, labels: 10
8: len(bbox): 62, len(words): 62, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P60883_page_1.png, labels: 12
9: len(bbox): 84, len(words): 84, /content/drive/MyDrive/Data/LayoutLMv2-VAL/P56559_page_1.png, labels: 18
1

## Preprocessing

In [9]:
max_length = 512
im_size = (224, 224)

def preprocess_data(example):
  words = example['words']
  bboxes = example['bbox']
  labels = example['word_labels']
  image_path = example['image']

  bboxes_converted = []
  for box in bboxes:
    x, y, width, height = box[0]
    bboxes_converted.append([x, y, x + width, y + height])

  image = Image.open(image_path).convert('RGB')
  image = image.resize(im_size)
  image_tensor = ToTensor()(image)

  encoding = processor(
      images=image_tensor,
      text=words,
      boxes=bboxes_converted,
      padding='max_length',
      truncation=True,
      max_length=max_length,
      return_tensors='pt'
  )

  token_labels = []
  for word, label in zip(words, labels):
    word_tokens = processor.tokenizer.tokenize(word)
    token_labels.extend([label] * len(word_tokens))

  token_labels = token_labels[:max_length] + [-100] * (max_length - len(token_labels))

  return {
      'input_ids': encoding['input_ids'].squeeze().tolist(),
      'attention_mask': encoding['attention_mask'].squeeze().tolist(),
      'bbox': encoding['bbox'].squeeze().tolist(),
      'labels': token_labels,
      'image': image_tensor
  }

In [10]:
processed_train = [preprocess_data(example) for example in train]
processed_val = [preprocess_data(example) for example in val]

train_set = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in processed_train],
    'attention_mask': [example['attention_mask'] for example in processed_train],
    'bbox': [example['bbox'] for example in processed_train],
    'labels': [example['labels'] for example in processed_train],
    'image': [example['image'] for example in processed_train]
})

val_set = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in processed_val],
    'attention_mask': [example['attention_mask'] for example in processed_val],
    'bbox': [example['bbox'] for example in processed_val],
    'labels': [example['labels'] for example in processed_val],
    'image': [example['image'] for example in processed_val]
})

In [11]:
train_set.set_format(type="torch")
val_set.set_format(type="torch")

In [12]:
print(val_set[205]['bbox'])

tensor([[  0,   0,   0,   0],
        [483,  46, 541,  64],
        [144,  72, 285,  94],
        ...,
        [  0,   0,   0,   0],
        [  0,   0,   0,   0],
        [  0,   0,   0,   0]])


In [13]:
for i, item in enumerate(val_set):
  print(f"{i} \
          input_ids: {item['input_ids'].size()}, \
          attn_mask: {item['attention_mask'].size()}, \
          bbox: {item['bbox'].size()}, \
          labels: {item['labels'].size()}, \
          image: {item['image'].size()}")

0           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
1           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
2           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
3           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])
4           input_ids: torch.Size([512]),           attn_mask: torch.Size([512]),           bbox: torch.Size([512, 4]),           labels: torch.Size([512]),           image: torch.Size([3, 224, 224])


In [14]:
# Iterate over each item in the validation set with its index
for idx, item in enumerate(val_set):
    # Iterate over each bounding box in the item
    for bbox in item['bbox']:
        x1, y1, x2, y2 = bbox  # Assuming bbox is in the format [x1, y1, x2, y2]
        # Check the conditions
        if (x2 - x1 < 0) or (y2 - y1 < 0):
            print(item['image'])
            print("Item with invalid bbox dimensions:", item)
            print("BBox:", bbox)

In [15]:
print(val_set[204]['bbox'])

tensor([[   0,    0,    0,    0],
        [ 483,   46,  541,   64],
        [ 146,   72,  286,   94],
        ...,
        [ 877,  569,  951,  580],
        [ 877,  569,  951,  580],
        [1000, 1000, 1000, 1000]])


## Training

In [16]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LayoutLM-for-PO/results',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    logging_dir='/content/drive/MyDrive/LayoutLM-for-PO/logs',
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set
)

trainer.train()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
500,0.3787
1000,0.2217


Step,Training Loss
500,0.3787
1000,0.2217
1500,0.1538


TrainOutput(global_step=1765, training_loss=0.2319747168349139, metrics={'train_runtime': 929.1364, 'train_samples_per_second': 3.799, 'train_steps_per_second': 1.9, 'total_flos': 1905616378152960.0, 'train_loss': 0.2319747168349139, 'epoch': 5.0})

## Validation

In [17]:
torch.cuda.empty_cache()

In [18]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LayoutLM-for-PO/eval-results',
    per_device_train_batch_size=2,
    logging_dir='/content/drive/MyDrive/LayoutLM-for-PO/eval-logs',
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args = training_args,
    eval_dataset=val_set
)

eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.061071813106536865, 'eval_runtime': 22.8909, 'eval_samples_per_second': 20.925, 'eval_steps_per_second': 2.621}


In [19]:
save_directory = '/content/drive/MyDrive/LayoutLM-for-PO/saved_model-06-28-2024'

# Save the model
model.save_pretrained(save_directory)

# Save the processor (tokenizer and feature extractor)
processor.save_pretrained(save_directory)

[]

# DEBUG

In [None]:
checkpoint_path = '/content/drive/MyDrive/LayoutLM-for-PO/results-BEST/checkpoint-1500'

model = LayoutLMv2ForTokenClassification.from_pretrained(checkpoint_path)
processor = LayoutLMv2Processor.from_pretrained(checkpoint_path)

OSError: /content/drive/MyDrive/LayoutLM-for-PO/results-BEST/checkpoint-1500 does not appear to have a file named preprocessor_config.json. Checkout 'https://huggingface.co//content/drive/MyDrive/LayoutLM-for-PO/results-BEST/checkpoint-1500/tree/main' for available files.

In [None]:
with open('/content/drive/MyDrive/Annotations/layoutlm-train-annotations-NEWNEW.json', 'r') as f:
  train = json.load(f)

with open('/content/drive/MyDrive/Annotations/layoutlm-val-annotations-NEW.json', 'r') as f:
  val = json.load(f)

In [None]:
train_small = processed_train[0:9]
val_small = processed_val[0:4]

train_set_small = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in train_small],
    'attention_mask': [example['attention_mask'] for example in train_small],
    'bbox': [example['bbox'] for example in train_small],
    'labels': [example['labels'] for example in train_small],
    'image': [example['image'] for example in train_small]
})

val_set_small = Dataset.from_dict({
    'input_ids': [example['input_ids'] for example in val_small],
    'attention_mask': [example['attention_mask'] for example in val_small],
    'bbox': [example['bbox'] for example in val_small],
    'labels': [example['labels'] for example in val_small],
    'image': [example['image'] for example in val_small]
})

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LayoutLM-for-PO/results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_dir='/content/drive/MyDrive/LayoutLM-for-PO/logs',
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set_small,
    eval_dataset=val_set_small
)

In [None]:
eval_results = trainer.evaluate(eval_dataset=val_set_small)

In [None]:
print(eval_results)

{'eval_loss': 0.01273418590426445, 'eval_runtime': 1.8208, 'eval_samples_per_second': 2.197, 'eval_steps_per_second': 0.549}


In [None]:
!pip install paddlepaddle paddleocr pdf2image

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.1-cp310-cp310-manylinux1_x86_64.whl (125.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.9/125.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting paddleocr
  Downloading paddleocr-2.7.3-py3-none-any.whl (780 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.0/780.0 kB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_angle_cls=True, lang='en')

def get_ocr_data(image):
  result = ocr.ocr(image, cls=True)
  words = []
  bounding_boxes = []

  for line in result:
    for word_info in line:
      word = word_info[1][0]
      bbox = word_info[0]

      x_min = min([point[0] for point in bbox])
      y_min = min([point[1] for point in bbox])
      x_max = max([point[0] for point in bbox])
      y_max = max([point[1] for point in bbox])

      width = x_max - x_min
      height = y_max - y_min
      bounding_box = [x_min, y_min, width, height]

      words.append(word)
      bounding_boxes.append(bounding_box)

  return words, bounding_boxes

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:00<00:00, 4.88MiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:00<00:00, 10.5MiB/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:01<00:00, 1.52MiB/s]

[2024/06/19 21:28:42] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_char_dict_path='




In [None]:
import numpy as np

def run_inference(path):

  image_np = np.array(image)
  image = Image.open(path).convert('RGB')
  image = image.resize((224, 224))

  words, boxes = get_ocr_data(image)

  encoded_inputs = processor(
      images=image,
      text=words,
      boxes=boxes,
      padding='max_length',
      truncation=True,
      max_length=512,
      return_tensors='pt'
  )

  if torch.cuda.is_available():
    model.to('cuda')
    encoded_inputs = {k: v.to('cuda') for k, v in encoded_inputs.items()}

  with torch.no_grad():
      outputs = model(**encoded_inputs)

  predictions = outputs.logits.argmax(-1).squeeze().tolist()

  return predictions

In [None]:
single = '/content/drive/MyDrive/Data/LayoutLMv2-VAL/Omni P46966_page_1.png'

predictions = run_inference(single)

print(f"Predictions: {predictions}")

UnboundLocalError: local variable 'image' referenced before assignment