# Model Training Script
This notebook is a transformation of the provided Python script into a Jupyter notebook. The script is designed for training a VisionEncoderDecoderModel using a custom dataset.

In [1]:
!pip install transformers datasets torch torchvision streamlit fastapi uvicorn accelerate sentencepiece
!pip install tensorflow tf-keras

Collecting tensorflow
  Downloading tensorflow-2.17.0-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Collecting tf-keras
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow-intel==2.17.0 (from tensorflow)
  Downloading tensorflow_intel-2.17.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading google_pasta-

In [2]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

In [3]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
import gc

# Checking if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm



cuda


## Dataset Loading and Model Preparation

In [4]:
# Loading the dataset
dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1")

# Model name
model_name = "microsoft/trocr-small-stage1"

# Processor & Model Loading
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)

# Set the decoder start token ID
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id

# Ensure pad_token_id is set
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Set the pad_token_id in the model configuration
model.config.pad_token_id = processor.tokenizer.pad_token_id

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Image Preprocessing

In [5]:
# Function to resize images
def resize_image(image, max_width=1024, max_height=1024):
    image.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)
    return image

In [6]:
# Preprocessing the dataset with resized images
def preprocess_batch(batch):
    images = [resize_image(img.convert("RGB")) if isinstance(img, Image.Image) else resize_image(Image.open(img).convert("RGB")) for img in batch['image']]

    # Get pixel values from the processor
    pixel_values = processor(images, return_tensors="pt").pixel_values.to(device)

    # Tokenizing the text data
    labels = processor.tokenizer(
        batch['parsed_data'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).input_ids.to(device)

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

## Custom Data Collator

In [7]:
# Custom Data Collator for VisionEncoderDecoderModel
def collate_fn(batch):
    # Ensure pixel_values are stacked as tensors
    pixel_values = torch.stack([torch.tensor(item["pixel_values"]) for item in batch])
    labels = torch.stack([torch.tensor(item["labels"]) for item in batch])
    
    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

In [8]:
# Apply preprocessing
preprocessed_dataset = dataset.map(preprocess_batch, batched=True, remove_columns=['id', 'parsed_data', 'raw_data'])

# Dataset split
train_dataset = preprocessed_dataset['train']
eval_dataset = preprocessed_dataset['valid']
test_dataset = preprocessed_dataset['test']

Map:   0%|          | 0/2043 [03:57<?, ? examples/s]


ArrowMemoryError: realloc of size 3221225472 failed

In [None]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./trocr_finetuned_model",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    eval_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_fn,
)

In [None]:
# Train the model
trainer.train()

# Save the model locally
model.save_pretrained("./trocr_finetuned_model")
processor.save_pretrained("./trocr_finetuned_model")