## import thu vien

In [116]:
import os
import json
import torch
from PIL import Image
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from transformers import (
    LayoutLMv3Processor,
    LayoutLMv3ForTokenClassification,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset

## cau hinh path

In [117]:
# 💡 Đổi path phù hợp với máy bạn
JSONL_PATH = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/LayoutLMV3/outputJSONL/layoutLMV3.jsonl"
LABEL_FOLDER = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/pipeline_LayoutLMV3/label2id_Folder"
IMAGE_FOLDER = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/converted_pngs"
OUTPUT_DIR = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/Finetuned_Model/LayoutLMV3_model"
MODEL_NAME = "microsoft/layoutlmv3-base"


## LOAD processor + label maps ===


In [118]:
# === Load model ===
model = LayoutLMv3ForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label
)


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [119]:
# === Load label maps ===
label2id = json.load(open(os.path.join(LABEL_FOLDER, "label2id.json"), encoding="utf-8"))
id2label = {int(k): v for k, v in json.load(open(os.path.join(LABEL_FOLDER, "id2label.json"), encoding="utf-8")).items()}

# === Load processor ===
processor = LayoutLMv3Processor.from_pretrained(MODEL_NAME, apply_ocr=False)

In [120]:
print(f"📋 Số nhãn được model hỗ trợ (num_labels): {len(label2id)}")
print(f"🔎 Giá trị label_id cao nhất: {max(label2id.values())}")


📋 Số nhãn được model hỗ trợ (num_labels): 20
🔎 Giá trị label_id cao nhất: 19


## dinh nghia dataset class

In [121]:
# === Dataset class ===
class InvoiceDataset(Dataset):
    def __init__(self, jsonl_path, label2id, processor, image_folder):
        self.processor = processor
        self.label2id = label2id
        self.image_folder = image_folder

        with open(jsonl_path, "r", encoding="utf-8") as f:
            self.samples = []
            for i, line in enumerate(f):
                if line.strip():
                    data = json.loads(line.strip())
                    if not all(k in data for k in ["words", "bboxes", "labels", "image_path"]):
                        print(f"⚠️ Sample {i} thiếu key — bỏ qua.")
                        continue
                    if not (len(data["words"]) == len(data["bboxes"]) == len(data["labels"])):
                        print(f"⚠️ Sample {i} độ dài không khớp — bỏ qua.")
                        continue
                    self.samples.append(data)

        print(f"📄 Loaded {len(self.samples)} hóa đơn hợp lệ từ {jsonl_path}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        image_path = os.path.join(self.image_folder, os.path.basename(item["image_path"]))
        image = Image.open(image_path).convert("RGB")

        # Chuyển nhãn và kiểm tra kỹ
        word_labels = []
        for i, lbl in enumerate(item["labels"]):
            if lbl not in self.label2id:
                raise ValueError(f"❌ Nhãn không hợp lệ: '{lbl}' tại vị trí {i}")
            label_id = self.label2id[lbl]
            if not (0 <= label_id < len(self.label2id)):
                raise ValueError(f"❌ label_id {label_id} vượt giới hạn tại vị trí {i}")
            word_labels.append(label_id)

        encoding = self.processor(
            images=image,
            text=item["words"],
            boxes=item["bboxes"],
            word_labels=word_labels,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        return {k: v.squeeze(0) for k, v in encoding.items()}

In [122]:
for i in range(len(dataset)):
    try:
        _ = dataset[i]
    except Exception as e:
        print(f"❌ Lỗi ở sample {i}: {e}")
        break


❌ Lỗi ở sample 0: list index out of range


In [123]:
# Kiểm tra xem tất cả các nhãn trong file JSONL có hợp lệ không
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    invalid_labels = set()
    for line in f:
        data = json.loads(line)
        for lbl in data.get("labels", []):
            if lbl not in label2id:
                invalid_labels.add(lbl)

if invalid_labels:
    print(f"❌ Các nhãn KHÔNG tồn tại trong label2id: {invalid_labels}")
else:
    print("✅ Tất cả nhãn đều hợp lệ trong label2id.")


✅ Tất cả nhãn đều hợp lệ trong label2id.


In [124]:
# === Tạo dataset ===
dataset = InvoiceDataset(JSONL_PATH, label2id, processor, IMAGE_FOLDER)
train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=0.1, random_state=42)
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)

📄 Loaded 25 hóa đơn hợp lệ từ /mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/LayoutLMV3/outputJSONL/layoutLMV3.jsonl


In [125]:
# === Load model ===
model = LayoutLMv3ForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label
)


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [126]:
# === Evaluation metrics ===
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(-1)
    true_preds, true_labels = [], []

    for pred, label in zip(preds, labels):
        for p_i, l_i in zip(pred, label):
            if l_i != -100:
                true_preds.append(p_i)
                true_labels.append(l_i)

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average="macro", zero_division=0
    )

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

## train model

In [127]:

# === Training args ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available()
)

# === Data collator ===
data_collator = DataCollatorForTokenClassification(
    tokenizer=processor.tokenizer,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
)

# === Trainer ===

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Train and save ===
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

trainer.train()
trainer.save_model()
processor.tokenizer.save_pretrained(OUTPUT_DIR)


  trainer = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
