In [1]:
! pip install numpy==1.26.4



In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ritvik1909/document-classification-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/document-classification-dataset


In [5]:
import os
import shutil

# Create 'data' directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Move all items from 'path' to 'data'
for item in os.listdir(path):
    src = os.path.join(path, item)
    dst = os.path.join('data', item)
    shutil.move(src, dst)

In [25]:
path = "data"

In [26]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset, Features, Sequence, ClassLabel, Value, Array2D

In [8]:
labels = [label for label in os.listdir(path)]
idx2label = {v: k for v, k in enumerate(labels)}
label2idx = {k: v for v, k in enumerate(labels)}
label2idx

{'scientific_publication': 0, 'email': 1, 'resume': 2}

In [27]:
images = []
labels = []

for label in os.listdir(path):
    images.extend([
        f"{path}/{label}/{img_name}" for img_name in os.listdir(f"{path}/{label}")
    ])
    labels.extend([
        label for _ in range(len(os.listdir(f"{path}/{label}")))
    ])
data = pd.DataFrame({'image_path': images, 'label': labels})

train_data, valid_data = train_test_split(data, test_size=0.09, random_state=0, stratify=data.label)
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
print(f"{len(train_data)} training examples, {len(valid_data)} validation examples")
data.head()

150 training examples, 15 validation examples


Unnamed: 0,image_path,label
0,data/scientific_publication/doc_000334.png,scientific_publication
1,data/scientific_publication/doc_000513.png,scientific_publication
2,data/scientific_publication/doc_000285.png,scientific_publication
3,data/scientific_publication/doc_000480.png,scientific_publication
4,data/scientific_publication/doc_000821.png,scientific_publication


In [10]:
! python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/

Looking in indexes: https://www.paddlepaddle.org.cn/packages/stable/cu118/


In [11]:
! pip install -qU paddleocr

In [12]:
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    use_doc_orientation_classify=True,
    use_doc_unwarping=True,
    use_textline_orientation=True
)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in /root/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [28]:
def normalize_box(box, width, height):
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]

def get_box_from_quad(quad):
    """
    Converts a 4-point quadrilateral into a rectangular bounding box.
    Input: quad - list of 4 points: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
    Output: [x_min, y_min, x_max, y_max]
    """
    x_coords = [point[0] for point in quad]
    y_coords = [point[1] for point in quad]
    return [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]

In [29]:
from PIL import Image, ImageDraw, ImageFont

In [30]:
def apply_ocr(example):
    # Get the image
    image = Image.open(example['image_path'])
    width, height = image.size

    # Run PaddleOCR
    result = ocr.predict(example['image_path'])

    words = []
    boxes = []

    ocr_data = result[0]

    # Get texts and boxes from the OCR result
    rec_texts = ocr_data['rec_texts']
    rec_boxes = ocr_data['rec_boxes']

    for i, text in enumerate(rec_texts):
        if not text.strip():
            continue

        box = rec_boxes[i]

        # Ensure box has 4 coordinates
        if len(box) == 4:
            box = [int(coord) for coord in box]
            norm_box = normalize_box(box, width, height)
            words.append(text)
            boxes.append(norm_box)

    # Add to the example dict
    example['words'] = words
    example['bbox'] = boxes
    return example

In [31]:
from transformers import LayoutLMv3TokenizerFast, LayoutLMv3ForSequenceClassification

In [32]:
tokenizer= LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", cache_dir="models/layoutlmv3")

In [33]:
def encode_training_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    words = [str(w).strip() for w in example['words'] if str(w).strip()]
    boxes = example['bbox'][:len(words)]

    # Ensure alignment of words and boxes
    assert len(words) == len(boxes), "Words and boxes must be aligned."

    encoding = tokenizer(
        words,
        boxes=boxes,
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
        return_tensors='pt'
    )

    # Add label
    encoding['labels'] = torch.tensor(label2idx[example['label']], dtype=torch.long)

    return {k: v.squeeze(0) for k, v in encoding.items()}

# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
training_features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': ClassLabel(names=list(idx2label.keys())),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})

In [34]:
def training_dataloader_from_df(data):
    dataset = Dataset.from_pandas(data)

    # Apply OCR if needed
    dataset = dataset.map(apply_ocr)

    # Apply encoding
    encoded_dataset = dataset.map(
        encode_training_example,
        remove_columns=dataset.column_names
    )

    encoded_dataset.set_format(
        type='torch',
        columns=['bbox', 'input_ids', 'attention_mask', 'labels']
    )

    dataloader = torch.utils.data.DataLoader(
        encoded_dataset,
        batch_size=1,
        shuffle=True
    )

    return dataloader

In [35]:
# test_loader = training_dataloader_from_df(valid_data[:2])

In [36]:
train_dataloader = training_dataloader_from_df(train_data)
valid_dataloader = training_dataloader_from_df(valid_data)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LayoutLMv3ForSequenceClassification.from_pretrained(
    "microsoft/layoutlmv3-base", num_labels=len(label2idx)
)
model.to(device);

Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from torch.optim import AdamW
import numpy as np
from tqdm.auto import tqdm

In [43]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [44]:
# optimizer = AdamW(model.parameters(), lr=4e-5)
# num_epochs = 10


# for epoch in range(num_epochs):
#     print("Epoch:", epoch)
#     training_loss = 0.0
#     training_correct = 0
#     #put the model in training mode
#     model.train()
#     for batch in tqdm(train_dataloader):
#         labels = batch["labels"].to(device)
#         outputs = model(
#             input_ids=batch["input_ids"].to(device), bbox=batch["bbox"].to(device),
#             attention_mask=batch["attention_mask"].to(device),
#             labels=labels
#         )
#         loss = outputs.loss

#         training_loss += loss.item()
#         predictions = outputs.logits.argmax(-1)
#         training_correct += (predictions == labels).float().sum()

#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()

#     print("Training Loss:", training_loss / batch["input_ids"].shape[0])
#     training_accuracy = 100 * training_correct / len(train_data)
#     print("Training accuracy:", training_accuracy.item())

#     validation_loss = 0.0
#     validation_correct = 0
#     for batch in tqdm(valid_dataloader):
#         labels = batch["labels"].to(device)
#         outputs = model(
#             input_ids=batch["input_ids"].to(device), bbox=batch["bbox"].to(device),
#             attention_mask=batch["attention_mask"].to(device),
#             labels=labels
#         )
#         loss = outputs.loss

#         validation_loss += loss.item()
#         predictions = outputs.logits.argmax(-1)
#         validation_correct += (predictions == labels).float().sum()

#     print("Validation Loss:", validation_loss / batch["input_ids"].shape[0])
#     validation_accuracy = 100 * validation_correct / len(valid_data)
#     print("Validation accuracy:", validation_accuracy.item())

Epoch: 0


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 167.6161406636238
Training accuracy: 26.666667938232422


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.469866394996643
Validation accuracy: 26.666667938232422
Epoch: 1


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 169.76784467697144
Training accuracy: 30.0


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 18.54185926914215
Validation accuracy: 33.333335876464844
Epoch: 2


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 170.73923927545547
Training accuracy: 32.0


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.895142436027527
Validation accuracy: 33.333335876464844
Epoch: 3


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 167.39754444360733
Training accuracy: 32.66666793823242


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.530977845191956
Validation accuracy: 33.333335876464844
Epoch: 4


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 166.51250952482224
Training accuracy: 30.0


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.55224698781967
Validation accuracy: 33.333335876464844
Epoch: 5


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 169.260668694973
Training accuracy: 24.666667938232422


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.513352513313293
Validation accuracy: 40.000003814697266
Epoch: 6


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 167.566446185112
Training accuracy: 24.666667938232422


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 17.697901129722595
Validation accuracy: 33.333335876464844
Epoch: 7


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 168.80988144874573
Training accuracy: 26.0


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.725744128227234
Validation accuracy: 20.000001907348633
Epoch: 8


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 167.06641352176666
Training accuracy: 34.66666793823242


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.44680082798004
Validation accuracy: 40.000003814697266
Epoch: 9


  0%|          | 0/150 [00:00<?, ?it/s]

Training Loss: 167.9170190691948
Training accuracy: 28.666667938232422


  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 16.532378911972046
Validation accuracy: 33.333335876464844


In [None]:
num_epochs = 20

In [46]:
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=4e-5, weight_decay=0.01)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*num_epochs)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in tqdm(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        bbox = batch["bbox"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        predictions = logits.argmax(dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct / total * 100
    print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")


Epoch 1/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1185, Train Accuracy: 30.67%

Epoch 2/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1168, Train Accuracy: 31.33%

Epoch 3/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1129, Train Accuracy: 26.00%

Epoch 4/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1221, Train Accuracy: 28.00%

Epoch 5/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1130, Train Accuracy: 25.33%

Epoch 6/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1061, Train Accuracy: 30.00%

Epoch 7/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1016, Train Accuracy: 33.33%

Epoch 8/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1107, Train Accuracy: 26.00%

Epoch 9/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1033, Train Accuracy: 34.00%

Epoch 10/10


  0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 1.1027, Train Accuracy: 30.00%


In [49]:
model.eval()
val_loss = 0.0
val_correct = 0
val_total = 0

with torch.no_grad():
    for batch in tqdm(valid_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        bbox = batch["bbox"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()
        predictions = logits.argmax(dim=-1)
        val_correct += (predictions == labels).sum().item()
        val_total += labels.size(0)

val_accuracy = val_correct / val_total * 100
print(f"Validation Loss: {val_loss / len(valid_dataloader):.4f}, Accuracy: {val_accuracy:.2f}%")

  0%|          | 0/15 [00:00<?, ?it/s]

Validation Loss: 1.0987, Accuracy: 33.33%
