<a href="https://colab.research.google.com/github/uakarsh/TiLT-Implementation/blob/main/how_did_i_prepare_the_stuffs/tilt_part_1_preparing_the_dataset_funsd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qqq transformers
!pip install -qqq datasets
!pip install -qqq sentencepiece==0.1.91

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from torchvision.ops import RoIAlign

#### References:

1. [Hugging Face T5 Implementation](https://github.com/huggingface/transformers/blob/v4.26.1/src/transformers/models/t5/modeling_t5.py)
2. [UNet model implementation](https://github.com/facebookresearch/fastMRI/blob/main/fastmri/models/unet.py)

## Part 1.1 UNet for Feature Extraction (we focus only on the encoder part)

In [None]:
class ConvBlock(nn.Module):
    """
    Stack of three identical convolution stages that preserves spatial size.

    Every stage is: bias-free 3x3 convolution (padding 1) -> instance
    normalization -> LeakyReLU (negative slope 0.2) -> 2D dropout. Only the
    first stage changes the channel count (`in_chans` -> `out_chans`).
    """

    def __init__(self, in_chans: int, out_chans: int, drop_prob: float):
        """
        Args:
            in_chans: Number of channels in the input.
            out_chans: Number of channels in the output.
            drop_prob: Dropout probability.
        """
        super().__init__()

        self.in_chans = in_chans
        self.out_chans = out_chans
        self.drop_prob = drop_prob

        # Build the stages in order so the Sequential indices (and therefore
        # the state_dict keys layers.0 / layers.4 / layers.8) stay stable.
        stages = []
        stage_in = in_chans
        for _ in range(3):
            stages.extend(
                [
                    nn.Conv2d(stage_in, out_chans, kernel_size=3, padding=1, bias=False),
                    nn.InstanceNorm2d(out_chans),
                    nn.LeakyReLU(negative_slope=0.2, inplace=True),
                    nn.Dropout2d(drop_prob),
                ]
            )
            stage_in = out_chans
        self.layers = nn.Sequential(*stages)

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """
        Run the input through all stages.

        Args:
            image: Input 4D tensor of shape `(N, in_chans, H, W)`.
        Returns:
            Output tensor of shape `(N, out_chans, H, W)`.
        """
        return self.layers(image)


class Unet_encoder(nn.Module):
  """
  Down-sampling (encoder) half of a U-Net, used as an image feature extractor.

  The input goes through a list of `ConvBlock` stages, each followed by a
  2x2 max-pool, and then through one final `ConvBlock` without pooling.
  """

  def __init__(self, 
               in_channels: int = 3,
               channels: int = 32,
               num_pool_layers: int = 4,
               drop_prob: float = 0.0
               ):
    """
        Args:
            in_channels: Number of channels in the input image.
            channels: Number of output channels of the first convolution block.
            num_pool_layers: Number of down-sampling stages (ConvBlock + 2x2 max-pool).
                One stage is always created, so values below 1 behave like 1.
            drop_prob: Dropout probability passed to every ConvBlock.
    """
    super().__init__()

    self.in_channels = in_channels
    self.channels = channels

    self.num_pool_layers = num_pool_layers
    self.drop_prob = drop_prob

    self.down_sample_layers = nn.ModuleList([
        ConvBlock(in_channels, channels, drop_prob)
    ])
    ch = channels

    # Every further stage doubles the channel count.
    for _ in range(num_pool_layers - 1):
      self.down_sample_layers.append(ConvBlock(ch, ch * 2, drop_prob))
      ch *= 2

    # Final block (no pooling afterwards) doubles the channels once more.
    self.conv = ConvBlock(ch, ch * 2, drop_prob)

    # Channel count of the tensor returned by forward(); exposed so that
    # downstream layers do not have to hard-code it.
    self.out_channels = ch * 2

  def forward(self, image: torch.Tensor) -> torch.Tensor:
    '''
    Args:
      image: Input 4D tensor of shape (Batch Size, in_channels, H, W)
    Returns:
      Output tensor of shape (Batch Size, out_channels, H', W'), where each
      down-sampling stage halves H and W (rounding down) and
      out_channels = channels * 2 ** num_pool_layers for num_pool_layers >= 1.
    '''
    output = image

    ## Applying the down-sample stages: conv block, then 2x2 max-pool
    for layer in self.down_sample_layers:
      output = layer(output)
      output = F.max_pool2d(output, kernel_size = 2, stride = 2, padding = 0)

    return self.conv(output)

* Each token’s bounding box is used to extract features from U-Net’s feature map with ROI pooling [Reference](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Add_image_embeddings_to_LayoutLM.ipynb)
* Let's try this on the FUNSD dataset, since it helps sketch out the end-to-end pipeline

In [None]:
from datasets import load_dataset
# Load the FUNSD dataset in the "nielsr/funsd-layoutlmv3" packaging from the
# Hugging Face hub (downloaded on first use, cached afterwards). The cell
# output below shows a train and a test split being generated.
dataset = load_dataset("nielsr/funsd-layoutlmv3")

Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading and preparing dataset funsd-layoutlmv3/funsd to /root/.cache/huggingface/datasets/nielsr___funsd-layoutlmv3/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9...


Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset funsd-layoutlmv3 downloaded and prepared to /root/.cache/huggingface/datasets/nielsr___funsd-layoutlmv3/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Take the first training example to prototype the pipeline on one document.
sample_encoding = dataset['train'][0]

In [None]:
# from PIL import ImageDraw
# image_with_bboxes = sample_encoding['image'].copy().resize((1000, 1000))

# draw = ImageDraw.Draw(image_with_bboxes, "RGBA")
# for bbox in sample_encoding['bboxes']:
#     draw.rectangle(bbox, outline='red', width=1) ## The boxes are normalized

# image_with_bboxes

In [None]:
# Work on a resized copy of the page image; the size tuple is (width, height),
# which yields the (384, 512) height x width tensor shown further below.
resized_image = sample_encoding['image'].copy().resize((512, 384))
# Word-level annotations of the sample: the words, one box per word and one tag per word.
words = sample_encoding['tokens']
# NOTE(review): the boxes appear to be normalized (the drawing cell above uses a
# 1000x1000 canvas), i.e. they are not in resized-image pixels — confirm.
bboxes = sample_encoding['bboxes']
labels = sample_encoding['ner_tags']

In [None]:
# Convert the PIL image to a channel-first tensor and add a leading batch dimension.
image = ToTensor()(resized_image).unsqueeze(0) # batch size of 1
image.shape

torch.Size([1, 3, 384, 512])

In [None]:
# Small encoder configuration for the demo: RGB input, 16 channels after the
# first block, three down-sampling stages.
in_channels = 3
num_pool_layers = 3
channels = 16
unet_encoder = Unet_encoder(in_channels = in_channels, channels = channels, num_pool_layers = num_pool_layers)

# Inference only: no gradients are needed to inspect the feature map.
with torch.no_grad():
  image_embedding = unet_encoder(image)
# For the (1, 3, 384, 512) input this is (1, 128, 48, 64), see the output below.
image_embedding.shape

torch.Size([1, 128, 48, 64])

## Part 1.2 ROI pooling

In [None]:
output_size = (3,3)
target_size = (512, 384)  # (width, height) the page image was resized to
# Ratio between the feature map and the resized image (same for both axes here).
spatial_scale = image_embedding.shape[2] / target_size[1] # 48/384
sampling_ratio = 2  

roi_align = RoIAlign(output_size, spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)

# The dataset boxes are normalized to a 0-1000 grid, while `spatial_scale` assumes
# coordinates in resized-image pixels. Convert them first; otherwise most RoIs
# land outside the feature map and pool only zeros.
box_scale = torch.tensor([target_size[0], target_size[1], target_size[0], target_size[1]]).float() / 1000.0
rois_xyxy = torch.tensor(bboxes).float().reshape(-1, 4) * box_scale

# RoIAlign accepts a single (K, 5) tensor whose first column is the batch index
# of each box; every box belongs to image 0 here.
rois = torch.cat([torch.zeros(rois_xyxy.shape[0], 1), rois_xyxy], dim=1)

feature_maps_bboxes = roi_align(input=image_embedding, rois=rois)
print(feature_maps_bboxes.shape)

torch.Size([145, 128, 3, 3])


In [None]:
# Flatten everything after the box dimension, turning each pooled
# (channels, 3, 3) feature map into a single vector per box.
visual_embeddings = torch.flatten(feature_maps_bboxes, 1)
visual_embeddings.shape

torch.Size([145, 1152])

In [None]:
# Project every flattened RoI feature vector to a 768-wide embedding
# (presumably the hidden size of the text model — confirm). The input width is
# read from the tensor so it keeps matching if `output_size` or the encoder
# channel count changes.
projection = nn.Linear(in_features=visual_embeddings.shape[-1], out_features=768)
output = projection(visual_embeddings)
print(output.shape)

torch.Size([145, 768])


## Part 2.1 Performing the tokenization part

In [None]:
from transformers import AutoTokenizer
# Load the (fast) tokenizer of the chosen checkpoint.
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)
# The input is already split into words; special tokens are left out here and
# added by hand in the next cell, together with the matching boxes and labels.
encoding = tokenizer(words, is_split_into_words = True, add_special_tokens = False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
pad_token_box = [0, 0, 0, 0]
max_seq_length = 512

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# Spread the word-level boxes and labels over the sub-word tokens: word_ids()
# gives, for every token, the index of the word it came from.
word_ids = encoding.word_ids()
bbox_according_to_tokenizer = [bboxes[i] for i in word_ids]
labels_according_to_tokenizer = [labels[i] for i in word_ids]

# Truncation of token ids + boxes + labels.
# The T5 tokenizer defines no CLS/SEP tokens (`cls_token_id` and `sep_token_id`
# are None), so the only special token to reserve room for is the trailing EOS.
special_tokens_count = 1
max_content_length = max_seq_length - special_tokens_count
input_ids = input_ids[:max_content_length]
bbox_according_to_tokenizer = bbox_according_to_tokenizer[:max_content_length]
labels_according_to_tokenizer = labels_according_to_tokenizer[:max_content_length]
attention_mask = attention_mask[:max_content_length]


## Appending the EOS token (attended to, ignored by the loss through label -100)
input_ids = input_ids + [tokenizer.eos_token_id]
bbox_according_to_tokenizer = bbox_according_to_tokenizer + [[1000, 1000, 1000, 1000]]
labels_according_to_tokenizer = labels_according_to_tokenizer + [-100]
attention_mask = attention_mask + [1]

## Padding every sequence up to max_seq_length
pad_length = max_seq_length - len(input_ids)

input_ids = input_ids + [tokenizer.pad_token_id] * (pad_length)
bbox_according_to_tokenizer = bbox_according_to_tokenizer + [pad_token_box] * (pad_length)
labels_according_to_tokenizer = labels_according_to_tokenizer + [-100] * (pad_length)
attention_mask = attention_mask + [0] * (pad_length)

In [None]:
# The token-level boxes are on the dataset's normalized 0-1000 grid, whereas
# `roi_align` was built with a spatial scale that assumes resized-image pixels
# (`target_size` is (width, height)). Convert before pooling.
box_scale = torch.tensor([target_size[0], target_size[1], target_size[0], target_size[1]]).float() / 1000.0
token_rois = torch.tensor(bbox_according_to_tokenizer).float().reshape(-1, 4) * box_scale

# RoIAlign accepts a single (K, 5) tensor whose first column is the batch index
# of each box; every box belongs to image 0 here.
token_rois = torch.cat([torch.zeros(token_rois.shape[0], 1), token_rois], dim=1)

feature_maps_bboxes = roi_align(input=image_embedding, rois=token_rois)
# Next steps: flatten and project these features; the final embedding is the
# sum of this image embedding and the semantic (token) embedding.
print(feature_maps_bboxes.shape)

torch.Size([512, 128, 3, 3])


## Part 2.2 Writing the dataset

In [None]:
from torch.utils.data import Dataset

class FUNSDDs(Dataset):
  """
  Torch dataset that turns FUNSD-style samples into fixed-length model inputs.

  Every item of `ds` must provide the keys 'image' (an object supporting
  `.copy().resize(size)`), 'tokens' (list of words) and 'ner_tags' (one label
  per word). Each returned item is a dict with:
    * "input_ids": token ids + EOS, padded to `max_seq_length`
    * "labels": word labels spread over the tokens, -100 for EOS and padding
    * "attention_mask": 1 for tokens and EOS, 0 for padding
    * "pixel_values": the resized image after `transform`
  """

  def __init__(self, ds, tokenizer, max_seq_length:int = 512, pad_token_box = None, resize_scale = (512, 384), transform = None):
    """
    Args:
      ds: Indexable collection of samples (see class docstring).
      tokenizer: Tokenizer called as `tokenizer(words, is_split_into_words=True,
        add_special_tokens=False)`; its result must offer 'input_ids',
        'attention_mask' and `word_ids()`. `eos_token_id` and `pad_token_id` are read from it.
      max_seq_length: Length of every returned sequence, EOS included.
      pad_token_box: Box used for padding positions; defaults to [0, 0, 0, 0].
        Stored only, since boxes are currently not part of the returned item.
      resize_scale: Size passed to the image's `resize`.
      transform: Callable applied to the resized image; defaults to `ToTensor()`.

    Raises:
      ValueError: If `max_seq_length` is smaller than 1 (no room for EOS).
    """
    if max_seq_length < 1:
      raise ValueError("max_seq_length must be at least 1 to hold the EOS token")

    self.ds = ds
    self.tokenizer = tokenizer
    self.max_seq_length = max_seq_length
    # None stands in for the default so that no list is shared between instances.
    self.pad_token_box = [0, 0, 0, 0] if pad_token_box is None else list(pad_token_box)
    self.resize_scale = resize_scale
    self.transform = transform if transform is not None else ToTensor()

  def __len__(self):
    return len(self.ds)
  
  def __getitem__(self, idx):
    sample = self.ds[idx]

    resized_image = sample['image'].copy().resize(self.resize_scale)
    words = sample['tokens']
    labels = sample['ner_tags']

    ## 1. Performing the image pre-processing
    img_tensor = self.transform(resized_image)  ## (3, 384, 512) with the default resize_scale and transform

    ## 2. Performing the semantic pre-processing
    encoding = self.tokenizer(words, is_split_into_words = True, add_special_tokens = False)

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    ## Bounding boxes are intentionally not processed or returned here.
    ## Every token inherits the label of the word it was split from.
    labels_according_to_tokenizer = [labels[i] for i in encoding.word_ids()]

    ## Truncation, keeping one position free for the EOS token
    special_tokens_count = 1
    max_content_length = self.max_seq_length - special_tokens_count
    input_ids = input_ids[:max_content_length]
    labels_according_to_tokenizer = labels_according_to_tokenizer[:max_content_length]
    attention_mask = attention_mask[:max_content_length]

    ## Appending EOS: attended to, but ignored by the loss (-100)
    input_ids = input_ids + [self.tokenizer.eos_token_id]
    labels_according_to_tokenizer = labels_according_to_tokenizer + [-100]
    attention_mask = attention_mask + [1]

    ## Padding up to the configured sequence length, using this dataset's own tokenizer
    pad_length = self.max_seq_length - len(input_ids)

    input_ids = input_ids + [self.tokenizer.pad_token_id] * (pad_length)
    labels_according_to_tokenizer = labels_according_to_tokenizer + [-100] * (pad_length)
    attention_mask = attention_mask + [0] * (pad_length)

    ## Converting stuffs to tensor
    return {"input_ids" : torch.tensor(input_ids),
            "labels" : torch.tensor(labels_according_to_tokenizer),
            "attention_mask" : torch.tensor(attention_mask),
            "pixel_values" : img_tensor}


In [None]:
from torchvision import transforms

# Image pipeline: convert to a tensor, then apply 2 * x - 1. With ToTensor
# producing values in [0, 1] (its behavior for 8-bit images), the result lies in [-1, 1].
transform = transforms.Compose([transforms.ToTensor(), 
                                transforms.Lambda(lambda x : 2 * x - 1)])

In [None]:
# Wrap the training split; the remaining arguments keep their defaults.
ds = FUNSDDs(dataset['train'],tokenizer = tokenizer, transform = transform)

## Combining the embeddings