This notebook inspects the input shape for LayoutLM's masked language modeling.

For this task, LayoutLM expects these inputs:

1. a sequence of strings where each string is a word (or token)
2. the corresponding bounding box for each string, i.e., 4 numbers `(x0, y0, x1, y1)` giving two
   diagonally opposite corners of the box (top-left and bottom-right). The coordinates are
   normalized to \[0, 1000\].

In [20]:
%matplotlib inline
%load_ext autoreload
%autoreload 3
%config IPythonBackend.figure_format = 'retina'

import os
from dataclasses import asdict

from project_config import DataPaths, S3DataPaths
from transformers import AutoTokenizer

from my_code.data import mlm
from my_code.train import get_model

# Optional pretty output via `rich`. When it is importable:
#   - `print` is rebound to the rich console's `out()` method, and
#   - `rprint` is bound to the rich console's `print()` method (used for markup/emoji).
# When `rich` is not installed, `rprint` falls back to the builtin `print` so that the
# later cells calling `rprint(...)` still run instead of failing with a NameError.
try:
    import rich

    rich.reconfigure(force_terminal=True, force_jupyter=False)
    rich.pretty.install()
    print = rich.get_console().out
    rprint = rich.get_console().print
except ImportError:
    # Only a missing `rich` package is tolerated; other errors are not hidden.
    rprint = print

def mydir(o):
    """Return the names listed by ``dir(o)`` that do not start with an underscore."""
    public_names = []
    for attr_name in dir(o):
        if attr_name.startswith('_'):
            continue
        public_names.append(attr_name)
    return public_names

# Build the local and the S3 path configurations, then show both as plain dicts.
datapaths_local = DataPaths()
datapaths_s3 = S3DataPaths()
display(*(asdict(paths) for paths in (datapaths_local, datapaths_s3)))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

[1m{[0m
    [32m'data_prefix'[0m: [1;35mPosixPath[0m[1m([0m[32m'data'[0m[1m)[0m,
    [32m'imgs'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/imgs-clean'[0m[1m)[0m,
    [32m'textracted'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/textracted'[0m[1m)[0m,
    [32m'annotations'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/annotations'[0m[1m)[0m
[1m}[0m

[1m{[0m
    [32m'data_prefix'[0m: [1;35mS3Path[0m[1m([0m[32m's3://sagemaker-ap-southeast-1-111122223333/textract-transformers/data'[0m[1m)[0m,
    [32m'imgs'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/imgs-clean'[0m[1m)[0m,
    [32m'textracted'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/textracted'[0m[1m)[0m,
    [32m'annotations'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/annotations'[0m[1m)[0m,
    [32m'bucket_name'[0m: [1;35mS3Path[0m[1m([0m[32m's3://sagemaker-ap-southeast-1-111122223333'

In [2]:
# Let's figure out the tokenizer to use, by looking at the mlm-pretraining job log.
# The two argument dumps below are pasted verbatim from that log; the literals are only
# wrapped across lines (adjacent string literals are concatenated by Python).
rprint(
    "ModelArguments(cache_dir='/tmp/transformers/cache', config_name=None, "
    "model_name_or_path='microsoft/layoutlm-base-uncased', model_revision='main', "
    "tokenizer_name=None, use_auth_token=False)"
)
print()
rprint(
    "DataTrainingArguments(annotation_attr='labels', max_seq_length=512, "
    "max_train_samples=None, task_name='mlm', textract='/opt/ml/input/data/textract', "
    "textract_prefix='textract-transformers/data/textracted', "
    "train='/opt/ml/input/data/train', validation='/opt/ml/input/data/validation', "
    "num_labels=2, mlm_probability=0.15)"
)

[1;35mModelArguments[0m[1m([0m[33mcache_dir[0m=[32m'/tmp/transformers/cache'[0m, [33mconfig_name[0m=[3;35mNone[0m, 
[33mmodel_name_or_path[0m=[32m'microsoft/layoutlm-base-uncased'[0m, [33mmodel_revision[0m=[32m'main'[0m, 
[33mtokenizer_name[0m=[3;35mNone[0m, [33muse_auth_token[0m=[3;91mFalse[0m[1m)[0m

[1;35mDataTrainingArguments[0m[1m([0m[33mannotation_attr[0m=[32m'labels'[0m, [33mmax_seq_length[0m=[1;36m512[0m, 
[33mmax_train_samples[0m=[3;35mNone[0m, [33mtask_name[0m=[32m'mlm'[0m, [33mtextract[0m=[32m'/opt/ml/input/data/textract'[0m,
[33mtextract_prefix[0m=[32m'textract-transformers/data/textracted'[0m, 
[33mtrain[0m=[32m'/opt/ml/input/data/train'[0m, [33mvalidation[0m=[32m'/opt/ml/input/data/validation'[0m, 
[33mnum_labels[0m=[1;36m2[0m, [33mmlm_probability[0m=[1;36m0[0m[1;36m.15[0m[1m)[0m


In [3]:
# For notebook only: switch off the tokenizers' parallelism before the tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Same checkpoint and revision as reported by the job log in the previous cell; downloaded
# files are cached under the local data prefix.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    revision="main",
    use_fast=True,
    use_auth_token=False,
    cache_dir=datapaths_local.data_prefix / "transformers/cache",
)

# Build the masked-LM dataset from the local Textract results.
ds_mlm = mlm.TextractLayoutLMDatasetForLM(
    tokenizer=tokenizer,
    textract_path=str(datapaths_local.textracted),
)



In [21]:
# List the public (non-underscore) attribute names of the dataset object.
mydir(ds_mlm)


[1m[[0m
    [32m'dataset_inputs'[0m,
    [32m'example_index'[0m,
    [32m'functions'[0m,
    [32m'manifest_file_path'[0m,
    [32m'max_content_seq_len'[0m,
    [32m'max_seq_len'[0m,
    [32m'parse_textract_file'[0m,
    [32m'register_datapipe_as_function'[0m,
    [32m'register_function'[0m,
    [32m'splitter'[0m,
    [32m'textract_path'[0m,
    [32m'textract_prefix'[0m,
    [32m'textract_s3uri_to_file_path'[0m,
    [32m'tokenizer'[0m
[1m][0m


In [13]:
# Show where the dataset reads from and how many examples it currently holds.
# A literal "<NONE>" placeholder is displayed when no manifest file path is set.
display(
    ds_mlm.textract_path,
    ds_mlm.manifest_file_path if ds_mlm.manifest_file_path is not None else "<NONE>",
    ds_mlm.textract_prefix,
    len(ds_mlm.example_index),
)

[32m'data/textracted/'[0m
[32m'[0m[32m<[0m[32mNONE[0m[32m>[0m[32m'[0m
[32m''[0m
[1;36m2544[0m


In [15]:
# Peek at the first example held in the dataset's example index.
first_doc = ds_mlm.example_index[0]
print(first_doc)

[1;35mTextractLayoutLMExampleForLM[0m[1m([0m[33mword_boxes_normalized[0m=[1;35marray[0m[1m([0m[1m[[0m[1m[[0m[1;36m568.13204288[0m,  [1;36m49.72891137[0m, [1;36m701.71193779[0m,  [1;36m63.92122339[0m[1m][0m,
       [1m[[0m[1;36m544.09110546[0m,  [1;36m69.7896257[0m , [1;36m630.71605563[0m,  [1;36m84.08804704[0m[1m][0m,
       [1m[[0m[1;36m635.99562645[0m,  [1;36m70.1417774[0m , [1;36m700.49893856[0m,  [1;36m84.08466354[0m[1m][0m,
       [33m...[0m,
       [1m[[0m[1;36m604.19762135[0m, [1;36m617.5339222[0m , [1;36m613.47612366[0m, [1;36m625.69778133[0m[1m][0m,
       [1m[[0m[1;36m617.8407073[0m , [1;36m615.1227355[0m , [1;36m679.96006832[0m, [1;36m625.22857077[0m[1m][0m,
       [1m[[0m[1;36m685.43183804[0m, [1;36m615.04977942[0m, [1;36m759.9587664[0m , [1;36m625.48963912[0m[1m][0m[1m][0m[1m)[0m, [33mword_texts[0m=[1m[[0m[32m'CONSUMER'[0m, [32m'CREDIT'[0m, [32m'CARD'[0m, [32m'AGREEMENT'[0

In [30]:
# Based on GITROOT/notebooks/src/my_code/data/mlm.py::TextractLayoutLMDatasetForLM.__init__()
#
# Replays the example-building loop on the first input record only, so that the
# intermediate values (word_texts, word_boxes, split ranges) can be inspected afterwards.
# NOTE(review): the loop appends to `ds_mlm.example_index`, so every run of this cell adds
# the examples of that first record to the dataset again (the cell is not idempotent).
self = ds_mlm  # Alias, to minimize changes to the copy-pasted code :P

from my_code.data.geometry import layoutlm_boxes_from_trp_blocks
from my_code.data.mlm import TextractLayoutLMExampleForLM

# Only the first record produced by the dataset's input iterator is processed.
record = next(self.dataset_inputs())
textract_file_path = record["textract-ref"]
page_num = record.get("page-num")  # None when the record has no "page-num" key
doc = self.parse_textract_file(textract_file_path)
for page in doc.pages[
    # Filter to target page if provided, else process all pages:
    # (`page-num` is treated as 1-based, hence the `- 1` for the slice start.)
    slice(None)
    if page_num is None
    else slice(page_num - 1, page_num)
]:
    # Flatten the page's lines into one list of words, in line order.
    words = [word for line in page.lines for word in line.words]
    word_boxes = layoutlm_boxes_from_trp_blocks(words)
    word_texts = [word.text for word in words]
    # Each (start_word, end_word) pair delimits the slice of words that forms one example.
    # NOTE(review): this passes the notebook-level `tokenizer` rather than `self.tokenizer`;
    # presumably they are the same object — confirm.
    for start_word, end_word in self.splitter.split(
        word_texts,
        tokenizer,
        self.max_content_seq_len,
    ):
        self.example_index.append(
            TextractLayoutLMExampleForLM(
                word_boxes_normalized=word_boxes[start_word:end_word, :],
                word_texts=word_texts[start_word:end_word],
            )
        )

# Drop the alias; the loop variables above stay defined and are inspected in the next cell.
del self
rprint("Amazing, it worked :joy:")

Amazing, it worked 😂


In [51]:
rprint("Let's start inspecting the temporary variables :eyes:")
# `word_texts` and `word_boxes` are left over from the loop in the previous cell, so they
# hold the values of the last page that loop processed. The third item is the boxes of the
# most recently appended example.
print(word_texts, word_boxes, ds_mlm.example_index[-1].word_boxes_normalized, sep='\n\n')
# Compare the per-example content length limit with the size of the page's word arrays.
display(
    ds_mlm.max_content_seq_len,
    word_boxes.shape,
    len(word_texts),
)

Let's start inspecting the temporary variables 👀
[1m[[0m[32m'Your'[0m, [32m'Billing'[0m, [32m'Rights:'[0m, [32m'Keep'[0m, [32m'this'[0m, [32m'Document'[0m, [32m'for'[0m, [32m'Future'[0m, [32m'Use'[0m, [32m'This'[0m, [32m'notice'[0m, [32m'tells'[0m, [32m'You'[0m, [32m'about'[0m, [32m'Your'[0m, [32m'rights'[0m, [32m'and'[0m, [32m'Our'[0m, [32m'responsibilities'[0m, [32m'under'[0m, [32m'the'[0m, [32m'Fair'[0m, [32m'Credit'[0m, [32m'Billing'[0m, [32m'Act.'[0m, [32m'What'[0m, [32m'To'[0m, [32m'Do'[0m, [32m'If'[0m, [32m'You'[0m, [32m'Find'[0m, [32m'A'[0m, [32m'Mistake'[0m, [32m'On'[0m, [32m'Your'[0m, [32m'Statement'[0m, [32m'If'[0m, [32m'You'[0m, [32m'think'[0m, [32m'there'[0m, [32m'is'[0m, [32m'an'[0m, [32m'error'[0m, [32m'on'[0m, [32m'Your'[0m, [32m'statement,'[0m, [32m'write'[0m, [32m'to'[0m, [32m'Us'[0m, [32m'at'[0m, [32m'the'[0m, [32m'address'[0m, [32m'listed'[0m, [32m'on'[0m, 