In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ritvik1909/document-classification-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/visithkumarapperuma/.cache/kagglehub/datasets/ritvik1909/document-classification-dataset/versions/1


In [2]:
import os
import shutil

# Create 'data' directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Move all items from 'path' to 'data'
for item in os.listdir(path):
    src = os.path.join(path, item)
    dst = os.path.join('data', item)
    shutil.move(src, dst)

In [3]:
path = "data"

In [None]:
import os 

import numpy as np 
import pandas as pd
import tqdm.auto as tqdm 
from sklearn.model_selection import train_test_split

from PIL import Image, ImageDraw, ImageFont

import torch 
from datasets import Dataset, Features, Sequence, ClassLabel, Value, Array2D

In [5]:
labels = [label for label in os.listdir(path)]
idx2label = {v: k for v, k in enumerate(labels)}
label2idx = {k: v for v, k in enumerate(labels)}
label2idx

{'resume': 0, 'scientific_publication': 1, 'email': 2}

In [6]:
images = []
labels = []

for label in os.listdir(path):
    images.extend([
        f"{path}/{label}/{img_name}" for img_name in os.listdir(f"{path}/{label}")
    ])
    labels.extend([
        label for _ in range(len(os.listdir(f"{path}/{label}")))
    ])
data = pd.DataFrame({'image_path': images, 'label': labels})

train_data, valid_data = train_test_split(data, test_size=0.09, random_state=0, stratify=data.label)
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
print(f"{len(train_data)} training examples, {len(valid_data)} validation examples")
data.head()

150 training examples, 15 validation examples


Unnamed: 0,image_path,label
0,data/resume/doc_000501.png,resume
1,data/resume/doc_000070.png,resume
2,data/resume/doc_000460.png,resume
3,data/resume/doc_000476.png,resume
4,data/resume/doc_000674.png,resume


In [7]:
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    use_doc_orientation_classify=True, 
    use_doc_unwarping=True, 
    use_textline_orientation=True
)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /Users/visithkumarapperuma/.paddlex/official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2462.17it/s]
[32mCreating model: ('UVDoc', None)[0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /Users/visithkumarapperuma/.paddlex/official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 1050.28it/s]
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /Users/visithkumarapperuma/.paddlex/official_models.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /Users/visithkumarapperuma/.paddlex/officia

In [8]:
result = ocr.predict("data/resume/doc_000674.png")
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")

[32m{'res': {'input_path': 'data/resume/doc_000674.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 0}, 'dt_polys': array([[[238,   0],
        ...,
        [238,   9]],

       ...,

       [[695, 958],
        ...,
        [695, 977]]], shape=(91, 4, 2), dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([0, ..., 0], shape=(91,)), 'text_rec_score_thresh': 0.0, 'rec_texts': ['iveSiigator/iPrograifDifecior(Last，first，ffhGdie)：', 'BIOGRAPHICAL SKETCH', 'Giveelloiioiotekelncoultncollartecal', 'investigatorprogramdirectorPhotocopythispageforeachperson.', 'NAME', 'POSITION TITLE', 'VERMA,INDER M.', 'PROFESSOR

In [59]:
len(result)

1

In [23]:
def normalize_box(box, width, height):
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]

def get_box_from_quad(quad):
    """
    Converts a 4-point quadrilateral into a rectangular bounding box.
    Input: quad - list of 4 points: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
    Output: [x_min, y_min, x_max, y_max]
    """
    x_coords = [point[0] for point in quad]
    y_coords = [point[1] for point in quad]
    return [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]

In [60]:
def apply_ocr(example):
    # Get the image
    image = Image.open(example['image_path'])
    width, height = image.size
    
    # Run PaddleOCR
    result = ocr.predict(example['image_path'])
    
    words = []
    boxes = []

    ocr_data = result[0]
    
    # Get texts and boxes from the OCR result
    rec_texts = ocr_data['rec_texts']
    rec_boxes = ocr_data['rec_boxes'] 
    
    for i, text in enumerate(rec_texts):
        if not text.strip():
            continue
            
        box = rec_boxes[i]
        
        # Ensure box has 4 coordinates
        if len(box) == 4:
            box = [int(coord) for coord in box]
            norm_box = normalize_box(box, width, height)
            words.append(text)
            boxes.append(norm_box)
    
    # Add to the example dict
    example['words'] = words
    example['bbox'] = boxes
    return example

In [61]:
from transformers import LayoutLMv3TokenizerFast, LayoutLMv3ForSequenceClassification

In [62]:
tokenizer= LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", cache_dir="models/layoutlmv3")

In [110]:
def encode_training_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    words = [str(w).strip() for w in example['words'] if str(w).strip()]
    boxes = example['bbox'][:len(words)]

    # Ensure alignment of words and boxes
    assert len(words) == len(boxes), "Words and boxes must be aligned."

    encoding = tokenizer(
        words,
        boxes=boxes,
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
        return_tensors='pt'
    )

    # Add label
    encoding['labels'] = torch.tensor(label2idx[example['label']], dtype=torch.long)

    return {k: v.squeeze(0) for k, v in encoding.items()}

# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
training_features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': ClassLabel(names=list(idx2label.keys())),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})

In [None]:
def training_dataloader_from_df(data):
    dataset = Dataset.from_pandas(data)

    # Apply OCR if needed
    dataset = dataset.map(apply_ocr)

    # Apply encoding
    encoded_dataset = dataset.map(
        encode_training_example,
        remove_columns=dataset.column_names
    )

    encoded_dataset.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'bbox', 'labels']
    )

    dataloader = torch.utils.data.DataLoader(
        encoded_dataset,
        batch_size=1,
        shuffle=True
    )

    return dataloader


In [114]:
# train_dataloader = training_dataloader_from_df(train_data)
valid_dataloader = training_dataloader_from_df(valid_data[:2])

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map: 100%|██████████| 2/2 [00:56<00:00, 28.34s/ examples]
Map: 100%|██████████| 2/2 [00:00<00:00, 36.76 examples/s]
