In [3]:
# LayoutLM Training Notebook
import os
import torch
from pathlib import Path
import json
from PIL import Image
import logging
from transformers import LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
import numpy as np
from datasets import Dataset
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [5]:
# 1. Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else 
                     "mps" if torch.backends.mps.is_available() else 
                     "cpu")
print(f"Using device: {device}")

Using device: mps


In [6]:
# 2. Load vendor mapping
def load_vendor_mapping(dataset_dir="invoice_dataset_processed"):
    """Load vendor mapping from the processed dataset directory."""
    try:
        dataset_dir = Path(dataset_dir)
        with open(dataset_dir / 'vendor_map.json', 'r', encoding='utf-8') as f:
            vendor_info = json.load(f)
            vendor_map = vendor_info['vendor_map']
            vendor_names = vendor_info['vendors']
            
        print(f"Loaded {len(vendor_names)} vendors:")
        for vendor in vendor_names:
            print(f"  - {vendor}")
            
        return vendor_map, vendor_names
    except Exception as e:
        raise ValueError(f"Error loading vendor mapping: {str(e)}")

In [7]:
# 3. Initialize model and processor
def init_model_and_processor(num_labels, model_name="microsoft/layoutlmv3-base"):
    """Initialize LayoutLM model and processor."""
    try:
        processor = LayoutLMv3Processor.from_pretrained(
            model_name,
            apply_ocr=True
        )
        
        model = LayoutLMv3ForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        ).to(device)
        
        print(f"Model initialized with {num_labels} classes")
        return processor, model
    except Exception as e:
        raise RuntimeError(f"Error initializing model: {str(e)}")

In [8]:
# 4. Process single image
def process_image(processor, image_path):
    """Process a single image for the model."""
    try:
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")
        
        # Load and process image
        image = Image.open(image_path).convert("RGB")
        encoding = processor(
            image,
            truncation=True,
            max_length=512,
            padding="max_length",
            return_tensors="pt"
        )
        
        # Remove batch dimension
        processed = {k: v.squeeze(0) for k, v in encoding.items()}
        
        # Validate and adjust tensor shapes
        for key, value in processed.items():
            if isinstance(value, torch.Tensor):
                if key in ['input_ids', 'attention_mask', 'bbox']:
                    if value.dim() == 1:
                        value = value.unsqueeze(0)
                    if value.shape[0] != 512:
                        if value.shape[0] < 512:
                            pad_size = 512 - value.shape[0]
                            padding = torch.zeros(pad_size, *value.shape[1:], dtype=value.dtype)
                            value = torch.cat([value, padding], dim=0)
                        else:
                            value = value[:512]
                    processed[key] = value
        
        return processed
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        raise e

In [9]:
# 5. Load and prepare dataset
def load_dataset(dataset_dir, vendor_map, split="train"):
    """Load and prepare dataset for training or validation."""
    dataset_dir = Path(dataset_dir)
    data_dir = dataset_dir / split
    
    if not data_dir.exists():
        raise ValueError(f"Directory not found: {data_dir}")
    
    data = []
    for file in data_dir.iterdir():
        if file.suffix.lower() in ['.png', '.jpg', '.jpeg']:
            # Extract vendor name
            filename = file.stem
            if filename.startswith(('train_', 'val_')):
                filename = filename[filename.find('_') + 1:]
            
            parts = filename.split('_')
            if parts[0].isdigit() or parts[0].startswith(('1', '5')):
                vendor_parts = []
                for part in parts[1:]:
                    if part.startswith(('1', '2', '3', '4', '5', '6', '7', '8', '9', '0')):
                        break
                    vendor_parts.append(part)
                vendor_name = '_'.join(vendor_parts)
            else:
                vendor_name = parts[0]
            
            if vendor_name in vendor_map:
                data.append((str(file), vendor_map[vendor_name]))
            else:
                print(f"Warning: Unknown vendor '{vendor_name}' from file: {file.name}")
    
    return data

In [10]:
# 6. Prepare dataset for training
def prepare_dataset(processor, image_paths, labels):
    """Prepare dataset with processed images and labels."""
    print(f"Preparing dataset with {len(image_paths)} images")
    
    def process_example(example):
        try:
            processed = process_image(processor, example['image_path'])
            processed['labels'] = torch.tensor(example['label'], dtype=torch.long)
            return processed
        except Exception as e:
            print(f"Error processing example: {str(e)}")
            raise e

    # Create initial dataset
    dataset = Dataset.from_dict({
        'image_path': image_paths,
        'label': labels
    })
    
    # Process all examples
    processed_dataset = dataset.map(
        process_example,
        remove_columns=dataset.column_names,
        num_proc=1,
        desc="Processing images"
    )
    
    return processed_dataset

In [11]:
# 7. Training function
def train_model(model, train_dataset, eval_dataset=None, 
                output_dir="invoice_model", num_epochs=3, 
                batch_size=2, gradient_accumulation_steps=4):
    """Train the model with the prepared datasets."""
    
    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        eval_steps=100,
        logging_steps=10,
        save_steps=100,
        evaluation_strategy="steps" if eval_dataset else "no",
        save_strategy="steps",
        load_best_model_at_end=True if eval_dataset else False,
        save_total_limit=2,
        logging_dir=str(output_dir / "logs"),
        dataloader_num_workers=0,
        report_to="none",
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    
    # Train
    print("Starting training...")
    trainer.train()
    
    # Save final model
    print("Saving final model...")
    trainer.save_model(str(output_dir / "final_model"))
    
    return trainer

In [12]:
# 1. Load vendor mapping
vendor_map, vendor_names = load_vendor_mapping()

Loaded 12 vendors:
  - Brother
  - Coople
  - Eidg._STVA
  - KSU_A-Technik
  - K_Müller
  - Saviva_AG
  - Schaefer_AG
  - Shiva_Siegen
  - Topmech
  - Wei_Grueber
  - asa
  - train


In [13]:
# 2. Initialize model and processor
processor, model = init_model_and_processor(len(vendor_names))

Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with 12 classes


In [14]:
# 3. Load datasets
train_data = load_dataset("invoice_dataset_processed", vendor_map, "train")
val_data = load_dataset("invoice_dataset_processed", vendor_map, "validation")



In [15]:
# 4. Prepare datasets
train_paths, train_labels = zip(*train_data)
train_dataset = prepare_dataset(processor, train_paths, train_labels)

Preparing dataset with 357 images


Processing images:   0%|          | 1/357 [00:00<05:21,  1.11 examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing images:   1%|          | 2/357 [00:01<05:06,  1.16 examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing images:   1%|          | 3/357 [00:02<05:00,  1.18 examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly

In [16]:
val_paths, val_labels = zip(*val_data)
val_dataset = prepare_dataset(processor, val_paths, val_labels)

Preparing dataset with 122 images


Processing images:   0%|          | 0/122 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing images:   1%|          | 1/122 [00:00<01:52,  1.07 examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing images:   2%|▏         | 2/122 [00:01<01:49,  1.10 examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the

In [17]:
trainer = train_model(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    num_epochs=3,
    batch_size=2,
    gradient_accumulation_steps=4
)



Starting training...


  0%|          | 0/132 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)