In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 3
%config IPythonBackend.figure_format = 'retina'

import os
from dataclasses import asdict

from project_config import DataPaths, S3DataPaths
from transformers import AutoTokenizer

from my_code.data import mlm
from my_code.train import get_model

# Optional pretty-printing via `rich`. When it is available:
#   * `print` is rebound to the rich console's plain `out` method, and
#   * `rprint` is bound to the rich console's `print` method.
# When `rich` is not installed the notebook must still run, so `rprint` falls back to
# the builtin `print` (later cells call `rprint` unconditionally).
try:
    import rich
    import rich.pretty  # imported explicitly so `rich.pretty.install()` never relies on a side-effect import

    rich.reconfigure(force_terminal=True, force_jupyter=False)
    rich.pretty.install()
    print = rich.get_console().out
    rprint = rich.get_console().print
except ImportError:
    # Only a missing `rich` is tolerated; any other failure is a real error and is
    # allowed to surface instead of being silently swallowed.
    rprint = print

# Build the local and S3 path configurations, then show both as plain dicts.
datapaths_local, datapaths_s3 = DataPaths(), S3DataPaths()
display(*(asdict(paths) for paths in (datapaths_local, datapaths_s3)))


[1m{[0m
    [32m'data_prefix'[0m: [1;35mPosixPath[0m[1m([0m[32m'data'[0m[1m)[0m,
    [32m'imgs'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/imgs-clean'[0m[1m)[0m,
    [32m'textracted'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/textracted'[0m[1m)[0m,
    [32m'annotations'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/annotations'[0m[1m)[0m
[1m}[0m

[1m{[0m
    [32m'data_prefix'[0m: [1;35mS3Path[0m[1m([0m[32m's3://sagemaker-ap-southeast-1-111122223333/textract-transformers/data'[0m[1m)[0m,
    [32m'imgs'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/imgs-clean'[0m[1m)[0m,
    [32m'textracted'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/textracted'[0m[1m)[0m,
    [32m'annotations'[0m: [1;35mPosixPath[0m[1m([0m[32m'data/annotations'[0m[1m)[0m,
    [32m'bucket_name'[0m: [1;35mS3Path[0m[1m([0m[32m's3://sagemaker-ap-southeast-1-111122223333'[0m[1m)[0m,
    [32m'bucket_prefix'[0m: [32m'textract-transformers'[0m
[1m}[0m


In [2]:
# To decide which tokenizer to use, reproduce the argument dumps copied from the
# mlm-pretraining job log and pretty-print them, separated by a blank line.
logged_model_args = "ModelArguments(cache_dir='/tmp/transformers/cache', config_name=None, model_name_or_path='microsoft/layoutlm-base-uncased', model_revision='main', tokenizer_name=None, use_auth_token=False)"
logged_data_args = "DataTrainingArguments(annotation_attr='labels', max_seq_length=512, max_train_samples=None, task_name='mlm', textract='/opt/ml/input/data/textract', textract_prefix='textract-transformers/data/textracted', train='/opt/ml/input/data/train', validation='/opt/ml/input/data/validation', num_labels=2, mlm_probability=0.15)"

rprint(logged_model_args)
print()
rprint(logged_data_args)

[1;35mModelArguments[0m[1m([0m[33mcache_dir[0m=[32m'/tmp/transformers/cache'[0m, [33mconfig_name[0m=[3;35mNone[0m, 
[33mmodel_name_or_path[0m=[32m'microsoft/layoutlm-base-uncased'[0m, [33mmodel_revision[0m=[32m'main'[0m, 
[33mtokenizer_name[0m=[3;35mNone[0m, [33muse_auth_token[0m=[3;91mFalse[0m[1m)[0m

[1;35mDataTrainingArguments[0m[1m([0m[33mannotation_attr[0m=[32m'labels'[0m, [33mmax_seq_length[0m=[1;36m512[0m, 
[33mmax_train_samples[0m=[3;35mNone[0m, [33mtask_name[0m=[32m'mlm'[0m, [33mtextract[0m=[32m'/opt/ml/input/data/textract'[0m,
[33mtextract_prefix[0m=[32m'textract-transformers/data/textracted'[0m, 
[33mtrain[0m=[32m'/opt/ml/input/data/train'[0m, [33mvalidation[0m=[32m'/opt/ml/input/data/validation'[0m, 
[33mnum_labels[0m=[1;36m2[0m, [33mmlm_probability[0m=[1;36m0[0m[1;36m.15[0m[1m)[0m


In [3]:
# Only needed when running inside the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the tokenizer for the same checkpoint and revision named in the pre-training
# job arguments, caching downloads under the local data prefix.
tokenizer_cache_dir = datapaths_local.data_prefix / "transformers/cache"
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/layoutlm-base-uncased",
    revision="main",
    use_fast=True,
    use_auth_token=False,
    cache_dir=tokenizer_cache_dir,
)

# Build the masked-language-modelling dataset from the local Textract results.
ds_mlm = mlm.TextractLayoutLMDatasetForLM(
    tokenizer=tokenizer,
    textract_path=str(datapaths_local.textracted),
)

Downloading:   0%|          | 0.00/170 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [5]:
# TODO: GITROOT/notebooks/my_code/data/mlm.py:TextractLayoutLMDatasetForLM
# NOTE(review): this cell's recorded output is the repr of a TextractLayoutLMDatasetForLM
# object, but the cell contained no expression. Presumably it displayed `ds_mlm`, which is
# built in the tokenizer/dataset cell — restored here so a fresh Restart & Run All
# reproduces that output. Confirm this was the intended expression.
ds_mlm

[1m<[0m[1;95mmy_code.data.mlm.TextractLayoutLMDatasetForLM[0m[39m object at [0m[1;36m0x7f88ca5c9eb0[0m[1m>[0m
