# Model Training Script
This notebook is a Jupyter conversion of a Python training script. It fine-tunes a TrOCR `VisionEncoderDecoderModel` (`microsoft/trocr-small-stage1`) on the `mychen76/invoices-and-receipts_ocr_v1` dataset and saves the resulting model and processor locally.

In [None]:
!pip install transformers datasets torch torchvision streamlit fastapi uvicorn accelerate sentencepiece
!pip install tensorflow tf-keras

In [None]:
import os
# Set before the cell that imports transformers runs
# (presumably so transformers skips its TensorFlow integration — confirm).
os.environ.update(TRANSFORMERS_NO_TF="1")

In [None]:
import torch
from PIL import Image
from datasets import load_dataset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
import gc

# Select the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

## Dataset Loading and Model Preparation

In [None]:
# Load the dataset by its repository id
dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1")

# Checkpoint used for both the processor and the model
model_name = "microsoft/trocr-small-stage1"

# Processor & model loading; the model is moved to the selected device
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Special token ids stored on the model configuration
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# generate() takes its defaults from model.generation_config, so mirror the same
# ids there; otherwise evaluation with predict_with_generate=True may not pick up
# the values assigned to model.config above.
model.generation_config.decoder_start_token_id = model.config.decoder_start_token_id
model.generation_config.pad_token_id = model.config.pad_token_id

## Image Preprocessing

In [None]:
# Function to resize images
def resize_image(image, max_width=1024, max_height=1024):
    """Return a downscaled copy of ``image`` bounded by ``max_width`` x ``max_height``.

    ``Image.thumbnail`` modifies the image it is called on, so the work is done
    on a copy and the caller's image is left untouched.

    Args:
        image: PIL image to shrink.
        max_width: Upper bound for the width, in pixels.
        max_height: Upper bound for the height, in pixels.

    Returns:
        A new PIL image produced by ``thumbnail`` with LANCZOS resampling.
    """
    resized = image.copy()
    resized.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)
    return resized

In [None]:
# Preprocessing the dataset with resized images
def preprocess_batch(batch):
    """Turn a batch of dataset rows into ``pixel_values`` and ``labels``.

    Args:
        batch: Mapping with an ``'image'`` list (PIL images, or values accepted
            by ``Image.open``) and a ``'parsed_data'`` list of texts that is
            passed to the tokenizer.

    Returns:
        dict with:
            ``"pixel_values"``: processor output for the resized RGB images.
            ``"labels"``: token ids padded/truncated to 512 positions, with
            every padding position set to -100.
    """
    images = []
    for img in batch['image']:
        if isinstance(img, Image.Image):
            rgb = img.convert("RGB")
        else:
            # Close the underlying file as soon as the pixels have been converted
            with Image.open(img) as opened:
                rgb = opened.convert("RGB")
        images.append(resize_image(rgb))

    # Tensors stay on the CPU: this function runs inside dataset.map, which
    # stores the results, so moving them to the accelerator here is wasted work.
    pixel_values = processor(images, return_tensors="pt").pixel_values

    # Pad to a fixed length so rows produced by different map batches all have
    # the same label length and can be stacked later.
    encoded = processor.tokenizer(
        batch['parsed_data'],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Mask padding via the attention mask (not by pad id, which may equal the
    # EOS id) so padded positions use the -100 ignore label.
    labels = encoded.input_ids
    labels[encoded.attention_mask == 0] = -100

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

## Custom Data Collator

In [None]:
# Custom Data Collator for VisionEncoderDecoderModel
def collate_fn(batch):
    """Combine individual examples into one batch of tensors.

    Args:
        batch: Sequence of examples, each a mapping with ``"pixel_values"`` and
            ``"labels"`` entries (lists, arrays or tensors).

    Returns:
        dict with:
            ``"pixel_values"``: the per-example values stacked along a new
            leading dimension.
            ``"labels"``: int64 tensor with one row per example; rows shorter
            than the longest one are right-padded with -100.

    Raises:
        RuntimeError: if ``batch`` is empty or the ``pixel_values`` entries do
            not share a shape (raised by ``torch.stack``).
    """
    # as_tensor avoids an extra copy (and a warning) when items are already tensors
    pixel_values = torch.stack([torch.as_tensor(item["pixel_values"]) for item in batch])

    # Label rows may differ in length, so pad instead of stacking directly
    label_rows = [torch.as_tensor(item["labels"], dtype=torch.long) for item in batch]
    labels = torch.nn.utils.rnn.pad_sequence(label_rows, batch_first=True, padding_value=-100)

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

In [None]:
# Run preprocess_batch over the dataset in batched mode, dropping the listed source columns
columns_to_drop = ['id', 'parsed_data', 'raw_data']
preprocessed_dataset = dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=columns_to_drop,
)

# Pull the three splits out of the mapped dataset
train_dataset, eval_dataset, test_dataset = (
    preprocessed_dataset[split_name] for split_name in ('train', 'valid', 'test')
)

In [None]:
# Training configuration, collected in one place before being handed to the arguments class
training_kwargs = dict(
    output_dir="./trocr_finetuned_model",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    predict_with_generate=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)
training_args = Seq2SeqTrainingArguments(**training_kwargs)

# Wire the model, data splits, tokenizer and collator into the trainer.
# NOTE(review): newer transformers releases name the `tokenizer` argument
# `processing_class` — confirm against the installed version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_fn,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Run fine-tuning
trainer.train()

# Write the model and then the processor into the same local directory
save_dir = "./trocr_finetuned_model"
for artifact in (model, processor):
    artifact.save_pretrained(save_dir)