In [None]:
# Mount Google Drive inside the Colab VM. The dataset paths used further down
# in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
# force_remount=True: presumably re-mounts even when the mountpoint is already
# mounted (per the flag's name) - handy when re-running this cell.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
IOB_DATA_PATH = "/content/drive/MyDrive/visual-doc-data/w9/dataset/50/"

! cd /content/

In [None]:
!pip install transformers torch pillow huggingface_hub



In [None]:
import json
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import DonutProcessor, VisionEncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer

class DonutDataset(Dataset):
    """Image / target-sequence pairs for Donut fine-tuning, read from a JSONL manifest.

    Every non-blank line of the manifest is parsed as one JSON object. Per
    sample, the keys ``image_path`` (joined onto ``image_dir``) and ``text``
    (the string that is tokenized into labels) are read.
    """

    def __init__(self, jsonl_file, processor, image_dir="./", max_length=768):
        """Load all manifest records into memory.

        Args:
            jsonl_file: Path to a UTF-8 JSON Lines file, one record per line.
            processor: Object that is callable on an image (returning something
                with ``.pixel_values``) and exposes a ``.tokenizer``.
            image_dir: Directory that each record's ``image_path`` is relative to.
            max_length: Fixed token length the targets are padded/truncated to.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        self.processor = processor
        self.image_dir = image_dir
        self.max_length = max_length

        self.data = []
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Blank lines (e.g. a trailing newline at end of file) are not
                # records; json.loads("") would raise on them.
                if not line:
                    continue
                self.data.append(json.loads(line))

    def __len__(self):
        """Return the number of records loaded from the manifest."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for record ``idx``."""
        item = self.data[idx]

        image_path = os.path.join(self.image_dir, item['image_path'])
        # The context manager closes the underlying file as soon as the pixels
        # are decoded, instead of leaving the handle open until garbage collection.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # squeeze(0) drops only the leading batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also drop any other
        # size-1 dimension.
        pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)
        ground_truth = item['text']

        # NOTE(review): add_special_tokens=False means no EOS token is appended
        # here. Confirm that item['text'] already ends with the tokenizer's EOS
        # token; otherwise the labels contain no end-of-sequence target.
        target_sequence = self.processor.tokenizer(
            ground_truth,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        labels = target_sequence.input_ids.squeeze(0)
        # Replace padding positions with -100, the default ignore_index of
        # PyTorch's cross-entropy loss, so padding does not contribute to the loss.
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {"pixel_values": pixel_values, "labels": labels}

def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Args:
        batch: Sequence of dicts, each holding a ``"pixel_values"`` tensor and
            a ``"labels"`` tensor.

    Returns:
        Dict with the same two keys, each mapped to the samples' tensors
        stacked along a new leading dimension.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("pixel_values", "labels")
    }

# Configuration
model_name = "naver-clova-ix/donut-base"
image_dir = "/content/drive/MyDrive/visual-doc-data/w9/dataset/50/"
# The manifest lives next to the images; derive its path instead of repeating the folder.
jsonl_file = os.path.join(image_dir, "donut_dataset.jsonl")

# Encoder input resolution as (height, width). The previous run of this cell
# never reduced the resolution and ended in a CUDA out-of-memory error on a
# ~15 GiB GPU. NOTE(review): the checkpoint's default preprocessing size is
# believed to be 2560x1920 - confirm against its preprocessor config.
image_size = [1280, 960]
# Target sequence length, shared by the dataset (label padding) and generation.
max_length = 768
# Seed for the train/eval split and the Trainer, so runs are repeatable.
seed = 42

# Load processor and model
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

# Shrink the images the processor produces and record the same size on the
# encoder config so the saved model and processor agree.
processor.image_processor.size = {"height": image_size[0], "width": image_size[1]}
model.config.encoder.image_size = image_size

# Add special tokens
new_special_tokens = [
    "<s_name>", "</s_name>",
    "<s_city_state_zip_code>", "</s_city_state_zip_code>",
    "<s_list_account_number>", "</s_list_account_number>",
    "<s_address>", "</s_address>",
    "<s_ssn>", "</s_ssn>",
    "<s_sign-date>", "</s_sign-date>"
]

processor.tokenizer.add_special_tokens({"additional_special_tokens": new_special_tokens})
# The decoder's embedding matrix must grow to cover the newly added token ids.
model.decoder.resize_token_embeddings(len(processor.tokenizer))

model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(['<s>'])[0]

# Create dataset
dataset = DonutDataset(jsonl_file, processor, image_dir, max_length=max_length)
train_size = int(0.9 * len(dataset))
eval_size = len(dataset) - train_size
# A seeded generator makes the 90/10 split identical across kernel restarts.
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, eval_size],
    generator=torch.Generator().manual_seed(seed),
)

# hyperparameters used for multiple args
hf_repository_id = "dof-passport-1"

# Arguments for training
training_args = Seq2SeqTrainingArguments(
    output_dir=hf_repository_id,
    num_train_epochs=30,
    learning_rate=3e-5,
    # Batch of 1 with 2 accumulation steps keeps the effective batch size at 2
    # while roughly halving peak activation memory per forward/backward pass.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    # Mixed precision is only requested when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    save_total_limit=2,
    # No evaluation runs during training; eval_dataset is only handed to the
    # Trainer so it is available for a manual trainer.evaluate() afterwards.
    eval_strategy="no",
    save_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=max_length,
    generation_num_beams=1,
    # Do not ask for more loader workers than the machine has CPU cores.
    dataloader_num_workers=min(4, os.cpu_count() or 1),
    # The dataset yields plain dicts; keep every key for collate_fn.
    remove_unused_columns=False,
    report_to="tensorboard",
    seed=seed,
)

# Create Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset if eval_size > 0 else None,
    # `processing_class` replaces the deprecated `tokenizer` keyword.
    # NOTE(review): requires a transformers release that has `processing_class`.
    processing_class=processor.tokenizer,
    data_collator=collate_fn,
)

# Start training
trainer.train()

# Save final model
trainer.save_model()
# Saved alongside the model so the added tokens and the reduced image size
# are restored together at inference time.
processor.save_pretrained(hf_repository_id)

  trainer = Seq2SeqTrainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 150.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 92.12 MiB is free. Process 40163 has 14.65 GiB memory in use. Of the allocated memory 14.08 GiB is allocated by PyTorch, and 451.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)