## Installing requirements

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
! pip install -q git+https://github.com/huggingface/datasets.git "dill<0.3.5" seqeval
!pip install accelerate -U

## Pulling preprocessing file

In [None]:
! rm -r layoutlmv3FineTuning
! git clone -b master https://github.com/ubiai-incorporated/layoutlm-preprocess.git

## Loading UBIAI dataset from Drive

Export your data from UbiAI in "OCR Processed Format". Make sure to place the exported zip in /content/, or point `IOB_DATA_PATH` in the cell below at the location of the zip on Google Drive.

In [None]:
! rm -r data

In [None]:
#!/bin/bash
IOB_DATA_PATH = "/content/drive/MyDrive/Invoice Dataset/exported_data.zip"
! cd /content/
! rm -r data
! mkdir data
! cp "$IOB_DATA_PATH" data/dataset.zip
! cd data && unzip -q dataset && rm dataset.zip
! cd ..

rm: cannot remove 'data': No such file or directory


In [None]:
ls

[0m[01;34mdata[0m/  [01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
import os
import re
import shutil

# Folder the UbiAI export was unpacked into.
data_folder = '/content/data'
files = os.listdir(data_folder)
# Captures the leading run of hex digits and dashes that is followed by
# "_<word characters>.txt" in a file name.
pattern = r'([a-f0-9\-]+)(?=_\w+\.txt)'
common_string = None

# The first file name that matches decides the identifier.
for file_name in files:
    found = re.search(pattern, file_name)
    if found:
        common_string = found.group(1)
        break

if common_string:
    # Use the same archive that was unpacked above. The previous hard-coded
    # path lacked the /content prefix and was spelled "exported_date.zip",
    # so it never pointed at the exported file.
    original_zip_path = IOB_DATA_PATH  # Original ZIP file path
    new_zip_path = f'/content/{common_string}.zip'  # New ZIP file path
    # Copy rather than os.rename: os.rename can fail when source and target
    # are on different filesystems (the Drive mount is presumably separate from
    # /content), and leaving the original in place keeps the notebook re-runnable.
    shutil.copyfile(original_zip_path, new_zip_path)
    print(f'Copied zip file to {new_zip_path}')
else:
    print('Could not find the common string pattern in the text files')



Then run the next cell to create the output folder where the processed dataset will be saved.

In [None]:
import os

# Root folder that will receive the processed dataset.
output_folder = '/content/output'

# Sub-folders created underneath output_folder.
subfolders = ['train_split', 'test_split', 'raw_data']

# exist_ok=True makes the cell safe to re-run without a separate existence
# check; a path that exists but is not a directory now raises instead of being
# silently skipped.
os.makedirs(output_folder, exist_ok=True)
for subfolder in subfolders:
    os.makedirs(os.path.join(output_folder, subfolder), exist_ok=True)

## defining preprocessing params and running the script

In [None]:
# Preprocessing arguments (plain Python variables; they are substituted into
# the shell command that runs the preprocessing script).
DATA_OUTPUT_PATH = "/content/"  # forwarded as --output_path
TEST_SIZE = 0.33  # forwarded as --valid_size

## Run the preprocessing script

In [None]:
! python3 layoutlm-preprocess/preprocess.py --valid_size $TEST_SIZE --output_path $DATA_OUTPUT_PATH

# training

In [None]:
from datasets import load_metric
from transformers import TrainingArguments, Trainer
from transformers import LayoutLMv3ForTokenClassification,AutoProcessor
from transformers.data.data_collator import default_data_collator
import torch

In [None]:
# Read the two dataset splits back from disk.
from datasets import load_from_disk

eval_dataset = load_from_disk('/content/output/test_split')
train_dataset = load_from_disk('/content/output/train_split')

In [None]:
# Label names read from the dataset's "labels" feature; a label's position in
# this list is used as its integer id.
label_list = train_dataset.features["labels"].feature.names
num_labels = len(label_list)

# Two-way lookup between ids and names, handed to the model when it is loaded.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Number of examples in the training split.
dataset_size = len(train_dataset)
print(f"Dataset size: {dataset_size}")


Dataset size: 322


In [None]:
# Show the label names; the index of each name is its label id.
print(label_list)

## defining metric

In [None]:
import numpy as np

# Metric object used by compute_metrics.
# NOTE(review): `load_metric` has been deprecated in the datasets library in
# favour of the separate `evaluate` package — confirm it is still importable
# from the datasets build installed at the top of this notebook.
metric = load_metric("seqeval")

# When True, compute_metrics returns every entry of the metric result
# (flattened); when False, only the four overall scores.
return_entity_level_metrics = False

def compute_metrics(p):
    """Score token-classification predictions with the module-level ``metric``.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to get one class id per position; ``labels`` holds the
            reference class ids, with -100 marking positions to ignore.

    Returns:
        dict: If ``return_entity_level_metrics`` is true, every entry of the
        metric result with nested dictionaries flattened to ``"<key>_<name>"``
        keys; otherwise only ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (label id -100, e.g. special tokens) and map the
    # remaining ids to label names, one list per sequence.
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(prediction, label)
            if label_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    # zero_division must be the integer 0: seqeval documents "warn", 0 and 1 as
    # accepted values, and the string '0' is none of them, so undefined scores
    # were not reliably reported as 0.
    results = metric.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )

    if return_entity_level_metrics:
        # Unpack nested dictionaries into a single flat dictionary.
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Loading the model and processor (the processor is also required by the Hugging Face Trainer)

In [None]:
# Pretrained checkpoint shared by the model and its processor.
_checkpoint = "microsoft/layoutlmv3-base"

# Token-classification model configured with the label maps built from the dataset.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    _checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# apply_ocr=False: the processor is told not to run OCR itself — presumably
# because words and boxes already come from the exported data; confirm.
processor = AutoProcessor.from_pretrained(_checkpoint, apply_ocr=False)

## let's train the model

In [None]:
# Training hyperparameters, passed to TrainingArguments.
LEARNING_RATE = 4e-5
NUM_TRAIN_EPOCHS = 120  # increase this to your liking
PER_DEVICE_TRAIN_BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH_SIZE = 4

In [None]:
# Evaluate and checkpoint once per epoch so that `load_best_model_at_end` and
# `metric_for_best_model` actually take effect. With both strategies set to
# "no" (the previous values) no evaluation ran during training, so no "f1" was
# ever computed and there was no checkpoint to restore a best model from.
# Trade-off: every epoch now pays for one evaluation pass and one checkpoint
# write; `save_total_limit` bounds how many checkpoints are kept on disk.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm which spelling the installed version expects.
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    dataloader_pin_memory=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# Collect everything the Trainer needs, then build it in one call.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
train_output = trainer.train()
train_output

Save the model to an output folder that you create (the cell below writes to `/content/model_output`).

In [None]:
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing (safe to re-run).
model_output_dir = '/content/model_output'
os.makedirs(model_output_dir, exist_ok=True)

# This pickles the whole model object rather than only its weights, so loading
# it back with torch.load needs the same model class to be importable.
torch.save(model, os.path.join(model_output_dir, 'layoutlmv3.pth'))