<a href="https://www.kaggle.com/code/akarshu121/2-docformer-pre-train-modeling?scriptVersionId=136945317" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install -qqq datasets
!pip install -qqq transformers
!pip install -qqq einops
!pip install -qqq tqdm
!pip install -qqq PyPDF2

In [2]:
## Cloning the DocFormer repository; later cells add ./docformer/src/docformer/ to
## sys.path and import its `dataset` and `modeling` modules from there.
!git clone https://github.com/uakarsh/docformer.git

Cloning into 'docformer'...
remote: Enumerating objects: 1392, done.[K
remote: Counting objects: 100% (326/326), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 1392 (delta 210), reused 244 (delta 159), pack-reused 1066[K
Receiving objects: 100% (1392/1392), 4.94 MiB | 15.20 MiB/s, done.
Resolving deltas: 100% (741/741), done.


In [3]:
## The input folder is read-only, so functions cannot be applied/mapped over a
## dataset stored there. Copy the dataset into the writable working directory first.

!cp -r /kaggle/input/sample-idl-dataset/idl-pretrain-dataset /kaggle/working/

In [4]:
from PIL import Image, ImageDraw

## A bit of code taken from here : https://www.kaggle.com/code/akarshu121/docformer-for-token-classification-on-funsd/notebook
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## PyTorch Libraries
import torch
from torchvision.transforms import ToTensor
from torch.utils.data import  DataLoader
import torch.nn.functional as F
import torch.nn as nn

## Adding the path of docformer to system path
import sys
sys.path.append('./docformer/src/docformer/')

## Importing the functions from the DocFormer Repo
from dataset import resize_align_bbox, get_centroid, get_pad_token_id_start_index, get_relative_distance
from modeling import DocFormerEncoder,ResNetFeatureExtractor,DocFormerEmbeddings,LanguageFeatureExtractor

## Transformers library
from transformers import AutoTokenizer

In [5]:
def get_tokens_with_boxes(unnormalized_word_boxes, word_ids,max_seq_len = 512, pad_token_box = [0, 0, 0, 0]):
    """Expand word-level boxes to token level and pad the result.

    Args:
        unnormalized_word_boxes: Sequence of boxes, indexable by word index.
        word_ids: One entry per token giving the index of the word the token
            came from, with ``None`` marking padding. Everything from the
            first ``None`` onwards is treated as padding.
        max_seq_len: Minimum length of the returned list.
        pad_token_box: Box used for every padding position. The default list
            is shared between calls and is never mutated here.

    Returns:
        A list of boxes with ``max(len(word_ids), max_seq_len)`` entries: the
        word box of every real token followed by ``pad_token_box`` entries.

    Note:
        No ``len(unnormalized_word_boxes) == len(word_ids)`` check is made on
        purpose: ``word_ids`` counts tokens (and padding), not words.
    """
    token_boxes = []
    for word_idx in word_ids:
        if word_idx is None:
            # First padding token reached: all remaining entries are padding.
            break
        token_boxes.append(unnormalized_word_boxes[word_idx])

    # Pad in one step so that there is exactly one box per entry of `word_ids`
    # and at least `max_seq_len` boxes overall. The previous
    # `len(word_ids) - i - 1` count dropped one padding box, which only went
    # unnoticed when the result was afterwards topped up to `max_seq_len`.
    target_len = max(len(word_ids), max_seq_len)
    token_boxes.extend([pad_token_box] * (target_len - len(token_boxes)))

    return token_boxes

def create_features_for_cls(image,
        tokenizer = None,
        target_size=(500,384),  # This was the resolution used by the authors
        max_seq_length=512,
        bounding_box = None,
        words = None,
        resize_scale = (1000, 1000)):
    """Turn one document (image, words, word boxes) into DocFormer input features.

    Args:
        image: Object exposing a ``resize(size)`` method whose result
            ``ToTensor`` accepts (presumably a PIL image; verify against caller).
        tokenizer: Callable tokenizer returning an encoding with ``word_ids()``
            and exposing ``cls_token_id``. Required despite the ``None`` default.
        target_size: Size passed to ``image.resize`` and, unpacked, to
            ``resize_align_bbox``.
        max_seq_length: Length every token sequence is padded/truncated to.
            It is also used for the token boxes so both stay the same length.
        bounding_box: Per-word boxes, indexed by the tokenizer's word ids.
        words: Pre-split words handed to the tokenizer.
        resize_scale: Extent used for the [CLS] box and passed, unpacked, to
            ``resize_align_bbox`` (presumably the coordinate grid the incoming
            boxes are expressed in; TODO confirm).

    Returns:
        dict with the keys ``'resized_scaled_img'``, ``'x_features'``,
        ``'y_features'`` and ``'input_ids'``.
    """
    # As per the paper, the [CLS] token box covers the whole image.
    CLS_TOKEN_BOX = [0, 0, *resize_scale]

    # Resize the image to the model's input resolution.
    resized_image = image.resize(target_size)

    # Tokenize the words (one word may split into multiple tokens). Special
    # tokens are added manually below, so the tokenizer must not add them.
    encoding = tokenizer(words,
                         padding="max_length",
                         max_length=max_seq_length,
                         is_split_into_words=True,
                         truncation=True,
                         add_special_tokens=False)

    # One box per token. `max_seq_length` is forwarded so that the boxes are
    # padded to the same length as `input_ids` even for non-default lengths.
    unnormalized_token_boxes = get_tokens_with_boxes(unnormalized_word_boxes = bounding_box,
                                                     word_ids = encoding.word_ids(),
                                                     max_seq_len = max_seq_length)

    # Prepend [CLS] and drop the last position so the length is unchanged.
    # [CLS] is added manually to avoid the automatic addition of [SEP] too
    # (as in the paper).
    unnormalized_token_boxes = [CLS_TOKEN_BOX] + unnormalized_token_boxes[:-1]
    encoding["input_ids"] = [tokenizer.cls_token_id] + encoding["input_ids"][:-1]

    # Keep the raw token boxes on the encoding.
    encoding["unnormalized_token_boxes"] = unnormalized_token_boxes

    # Convert the resized image to a nested list via ToTensor.
    encoding["resized_scaled_img"] = ToTensor()(resized_image).tolist()

    # Rescale and align every token box; the helper receives the
    # `resize_scale` and `target_size` values as its size arguments.
    resized_and_aligned_bboxes = [
        resize_align_bbox(tuple(bbox), *resize_scale, *target_size)
        for bbox in unnormalized_token_boxes
    ]
    encoding["resized_and_aligned_bounding_boxes"] = resized_and_aligned_bboxes

    # Relative-distance features computed from the aligned boxes and centroids.
    bboxes_centroids = get_centroid(resized_and_aligned_bboxes)
    pad_token_start_index = get_pad_token_id_start_index(words, encoding, tokenizer)
    a_rel_x, a_rel_y = get_relative_distance(resized_and_aligned_bboxes, bboxes_centroids, pad_token_start_index)

    encoding.update({
        "x_features": a_rel_x,
        "y_features": a_rel_y,
        })

    # Only these keys are returned; the aligned boxes stored above are useful
    # for checking that the boxes are drawn correctly but are not returned.
    keys = ['resized_scaled_img', 'x_features','y_features','input_ids']
    final_encoding = {k: encoding[k] for k in keys}

    del encoding
    return final_encoding

# Tokenizer shared by every preprocessing call (and by the MLM collator later).
tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

def preprocess_fn(entry):
    """Build model features for one dataset row.

    Reads the ``'img'``, ``'bbox'`` and ``'words'`` fields of ``entry`` and
    forwards them, with the module-level ``tokenizer``, to
    ``create_features_for_cls``; all other arguments keep their defaults.
    """
    return create_features_for_cls(
        image = entry['img'],
        tokenizer = tokenizer,
        bounding_box = entry['bbox'],
        words = entry['words'],
    )

Downloading (…)okenizer_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
from datasets import load_from_disk, Features, Sequence, ClassLabel, Value, Array2D, Array3D
# Load the copy of the dataset made in the writable working directory.
ds = load_from_disk("/kaggle/working/idl-pretrain-dataset")

# we need to define custom features
# The shapes below are tied to the defaults of `create_features_for_cls`:
# target_size=(500, 384) gives a (3, 384, 500) image and max_seq_length=512
# gives 512 rows of relative features. Changing those defaults requires
# changing these shapes as well.
features = Features({
    'resized_scaled_img': Array3D(dtype="float32", shape=(3, 384, 500)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'x_features': Array2D(dtype="int64", shape=(512, 8)),
    'y_features': Array2D(dtype="int64", shape=(512, 8)),
})

# Preprocess row by row, dropping the original columns so that only the four
# feature columns declared above remain; then have indexing return torch tensors.
ds = ds.map(preprocess_fn, batched = False,remove_columns=ds['train'].column_names,features=features)
ds.set_format('torch')

  0%|          | 0/2 [00:00<?, ?ex/s]

In [7]:
# Take the first preprocessed training example and list the shape of each feature.
sample = ds["train"][0]
for key in sample:
    print(key, sample[key].shape)

resized_scaled_img torch.Size([3, 384, 500])
input_ids torch.Size([512])
x_features torch.Size([512, 8])
y_features torch.Size([512, 8])


## Writing the Masked Language Modeling (MLM) pre-training task

In [8]:
from transformers import DataCollatorForLanguageModeling
# Collator built on the shared tokenizer; its `torch_mask_tokens` method is used
# in the next cell to produce masked input ids and the matching labels.
# Masking settings are left at the library defaults.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [9]:
# Mask the token ids of the single sample (with a leading batch dimension added)
# and get the matching MLM labels.
input_ids, labels = data_collator.torch_mask_tokens(sample['input_ids'].unsqueeze(0))

# Turn `sample` into a batch of one. NOTE: this cell modifies `sample` in place,
# so running it twice adds a second batch dimension; re-create `sample` first.
sample.update({
    'resized_scaled_img': sample['resized_scaled_img'].unsqueeze(0),
    'x_features': sample['x_features'].unsqueeze(0),
    'y_features': sample['y_features'].unsqueeze(0),
    'input_ids': input_ids,
    'labels': labels,
})

In [10]:
# Hyper-parameters handed to the DocFormer building blocks. The DocFormer class
# below reads 'hidden_dropout_prob', 'hidden_size' and 'vocab_size' directly; the
# remaining keys are presumably consumed by DocFormerEmbeddings / DocFormerEncoder
# from the cloned repository (verify there before changing them).
config = {
  "coordinate_size": 96,              ## (768/8), 8 for each of the 8 coordinates of x, y
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "image_feature_pool_shape": [7, 7, 256],
  "intermediate_ff_size_factor": 4,
  "max_2d_position_embeddings": 1024,
  "max_position_embeddings": 512,
  "max_relative_positions": 8,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "shape_size": 96,
  "vocab_size": 30522,
  "layer_norm_eps": 1e-12,
  "classes": tokenizer.vocab_size
}

## Defining the DocFormer model for MLM pre-training (a plain torch.nn.Module, not a PyTorch Lightning module)

class DocFormer(nn.Module):
    """DocFormer encoder with a linear vocabulary head on top.

    ``config`` is a dict; this class reads ``'hidden_dropout_prob'``,
    ``'hidden_size'`` and ``'vocab_size'`` from it and passes the whole dict to
    ``DocFormerEmbeddings`` and ``DocFormerEncoder``.
    """

    def __init__(self, config):
        super().__init__()

        # Sub-modules are created in the same order as before so that
        # parameter ordering and random initialisation are unaffected.
        self.resnet = ResNetFeatureExtractor()
        self.embeddings = DocFormerEmbeddings(config)
        self.lang_emb = LanguageFeatureExtractor()
        self.config = config
        # Registered but not applied anywhere in `forward`.
        self.dropout = nn.Dropout(config['hidden_dropout_prob'])
        self.encoder = DocFormerEncoder(config)
        # Projects each encoder output vector to vocabulary-sized logits.
        self.linear_layer = nn.Linear(config['hidden_size'], config['vocab_size'])

    def forward(self, batch_dict):
        """Return the vocabulary logits for a batch.

        ``batch_dict`` must provide ``'x_features'``, ``'y_features'``,
        ``'input_ids'`` and ``'resized_scaled_img'``.
        """
        # Read every input up front, in a fixed order.
        x_feat, y_feat, token, img = (
            batch_dict[name]
            for name in ('x_features', 'y_features', 'input_ids', 'resized_scaled_img')
        )

        # Spatial embeddings for the visual and the text branch.
        v_bar_s, t_bar_s = self.embeddings(x_feat, y_feat)
        # Visual and language features.
        v_bar = self.resnet(img)
        t_bar = self.lang_emb(token)

        encoded = self.encoder(t_bar, v_bar, t_bar_s, v_bar_s)
        return self.linear_layer(encoded)

In [11]:
# Instantiate the model from `config`. As the cell output shows, construction
# downloads a pretrained microsoft/layoutlm-base-uncased checkpoint.
docformer = DocFormer(config)



Downloading pytorch_model.bin:   0%|          | 0.00/453M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

In [12]:
import torch.nn.functional as F

# Forward pass on the single-example batch prepared above.
logits = docformer(sample)

# Swap the last two axes of the logits so the class axis comes second, which is
# the layout cross_entropy expects for per-position targets.
# NOTE(review): relies on the collator marking unmasked positions with
# cross_entropy's default ignore index; confirm against the library docs.
loss = F.cross_entropy(input=logits.transpose(1, 2), target=sample['labels'])

In [13]:
# Sanity check: the last dimension should equal config['vocab_size'], the
# out_features of the model's final linear layer.
logits.shape

torch.Size([1, 512, 30522])

In [14]:
# Display the loss for this single example; the output head is freshly
# initialised, so no training has taken place yet.
loss

tensor(72.4023, grad_fn=<NllLoss2DBackward0>)