# Custom dataset for InvoiceXpert
This dataset is built from a subset of [rvl_cdip](https://huggingface.co/datasets/jinhybr/rvl_cdip_400_train_val_test) and the [FATURA2 invoice dataset](https://huggingface.co/datasets/mathieu1256/FATURA2-invoices).
It contains 2 classes: `invoice` and `non-invoice`.

In [1]:
import os
import random
from collections import Counter

import dotenv

from datasets import load_dataset, Features, Image, ClassLabel, DatasetDict, concatenate_datasets

from src.config import init_config

## Config and settings

In [2]:
# Build the project configuration object (dataset ids, output paths, seed, ...)
# using the project's own `src.config.init_config` helper.
config = init_config()

In [3]:
# Print the resolved configuration so the values used for this run are visible in the notebook.
print(config)

data:
  fatura: mathieu1256/FATURA2-invoices
  rvl_cdip_subset: jinhybr/rvl_cdip_400_train_val_test
  invoiceXpert:
    hf_path: wiF0n/InvoiceXpert
    local_path: data/invoiceXpert
    num_labels: 2
    label_names:
    - invoice
    - not-invoice
models:
  classification:
    oob:
      pt: microsoft/dit-base
      ft: microsoft/dit-base-finetuned-rvlcdip
mlflow:
  experiments:
    classification: invoice-classification
utils:
  seed: 420
  run_date: 2024-03-02/24/24



In [4]:
# Seed Python's global `random` module from the config; the train/validation
# split of the FATURA data further down relies on `random.shuffle`.
random.seed(config.utils.seed)

In [5]:
# Load variables from a `.env` file into the process environment
# (e.g. the `HF_TOKEN` read by the publish cell at the end of the notebook).
dotenv.load_dotenv()

True

## Load the data

In [6]:
# Load the FATURA2 invoice dataset; the dataset id comes from `config.data.fatura`.
fatura = load_dataset(config.data.fatura)

In [7]:
# Load the RVL-CDIP subset; the dataset id comes from `config.data.rvl_cdip_subset`.
rvl_cdip = load_dataset(config.data.rvl_cdip_subset)

In [8]:
# Target schema that both source datasets are cast to: an image column plus a
# binary class label (0 = "invoice", 1 = "non-invoice").
# NOTE(review): the printed config lists the second label as "not-invoice" while
# this schema uses "non-invoice" — confirm which spelling is intended.
invoiceXpert_feats = Features(
    dict(
        image=Image(),
        label=ClassLabel(names=["invoice", "non-invoice"], num_classes=2),
    )
)

In [9]:
def process_rvl_cdip(dataset, invoice_label=11):
    """Convert the `rvl_cdip` subset into the binary InvoiceXpert schema.

    Every image is converted to RGB, the original label is collapsed to
    `invoice` (0) or `non-invoice` (1), and only the `image` and `label`
    columns are retained before casting to `invoiceXpert_feats`.

    Args:
        dataset: Object exposing `map`, `select_columns` and `cast`
            (in this notebook, the result of `load_dataset`).
        invoice_label: Value of the source `label` column that is treated as
            an invoice. Defaults to 11, the value this notebook has always
            used for `rvl_cdip`.

    Returns:
        The processed dataset containing only `image` and `label`.
    """

    def to_binary_example(example):
        # Anything that is not the invoice class becomes `non-invoice` (1).
        return {
            "image": example["image"].convert("RGB"),
            "label": 0 if example["label"] == invoice_label else 1,
        }

    return (
        dataset.map(to_binary_example)
        .select_columns(["image", "label"])
        .cast(invoiceXpert_feats)
    )

In [10]:
# Apply the binary relabelling / RGB conversion to every split of the RVL-CDIP subset.
mine_rvl_cdip = process_rvl_cdip(rvl_cdip)

Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6400 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [11]:
def process_fatura(dataset, valid_size=1600, seed=None):
    """Convert the `fatura` dataset into the InvoiceXpert schema and add a validation split.

    Every image is converted to RGB and labelled `invoice` (0), only the
    `image` and `label` columns are retained, and the result is cast to
    `invoiceXpert_feats`. Because the function only reads the `train` and
    `test` splits, a `validation` split is carved out of `train` by taking
    `valid_size` randomly chosen rows.

    Args:
        dataset: Mapping-like dataset with `train` and `test` splits that
            exposes `map`, `select_columns` and `cast`.
        valid_size: Number of `train` rows moved to the new `validation`
            split. Defaults to 1600, the value this notebook has always used.
        seed: Optional seed for a private random generator. When `None`
            (default) the global `random` module is used, so the split
            depends on the earlier `random.seed(...)` call and on how many
            random draws have happened since.

    Returns:
        A `DatasetDict` with `train`, `validation` and `test` splits.

    Raises:
        ValueError: If `valid_size` is negative or larger than the `train` split.
    """

    def to_invoice_example(example):
        # Every FATURA sample is an invoice, hence the constant label 0.
        return {
            "image": example["image"].convert("RGB"),
            "label": 0,
        }

    dataset = (
        dataset.map(to_invoice_example)
        .select_columns(["image", "label"])
        .cast(invoiceXpert_feats)
    )

    n_train = len(dataset["train"])
    if not 0 <= valid_size <= n_train:
        raise ValueError(
            f"valid_size must be between 0 and {n_train} (size of the train split), got {valid_size}"
        )

    # Shuffle all row indices of `train`, then cut the permutation in two.
    inds = list(range(n_train))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(inds)
    valid_inds, train_inds = inds[:valid_size], inds[valid_size:]

    return DatasetDict({
        "train": dataset["train"].select(train_inds),
        "validation": dataset["train"].select(valid_inds),
        "test": dataset["test"],
    })

In [12]:
# Label all FATURA samples as invoices and split off a validation set from `train`.
mine_fatura = process_fatura(fatura)

Map:   0%|          | 0/8600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8600 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [13]:
# Merge the two processed sources split by split: for each of train /
# validation / test, the FATURA rows come first, followed by the RVL-CDIP rows.
dd_to_concat = [mine_fatura, mine_rvl_cdip]
invoiceXpert = DatasetDict(
    {
        key: concatenate_datasets([ddd[key] for ddd in dd_to_concat])
        for key in ("train", "validation", "test")
    }
)

In [14]:
# NOTE(review): redundant — `Counter` is already imported in the first code cell; this cell can be dropped.
from collections import Counter

In [15]:
# Class balance per split: count how often each `label` id occurs.
for split_name in ("train", "validation", "test"):
    label_counts = Counter(invoiceXpert[split_name]["label"])
    print(split_name)
    print(label_counts)

train
Counter({0: 7400, 1: 6000})
validation
Counter({0: 1700, 1: 1500})
test
Counter({0: 1500, 1: 1500})


In [17]:
# Write the combined dataset to the local path given by `config.data.invoiceXpert.local_path`.
invoiceXpert.save_to_disk(config.data.invoiceXpert.local_path)

Saving the dataset (0/5 shards):   0%|          | 0/13400 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/3200 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
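# publish dataset to the Hugging Face Hub (disabled; uncomment the line below to upload — it reads `HF_TOKEN` from the environment)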
#invoiceXpert.push_to_hub(repo_id="wiF0n/invoiceXpert", token=os.environ["HF_TOKEN"])