In [1]:
# Clone the project repository; later cells install its requirements.txt and add its src/ folder to sys.path.
!git clone https://github.com/uakarsh/SSM-s-on-Document-AI-Task.git

Cloning into 'SSM-s-on-Document-AI-Task'...
remote: Enumerating objects: 212, done.[K
remote: Total 212 (delta 0), reused 0 (delta 0), pack-reused 212[K
Receiving objects: 100% (212/212), 396.75 KiB | 7.08 MiB/s, done.
Resolving deltas: 100% (117/117), done.


In [2]:
!pip -qqq install -r /kaggle/input/docvqa-hf-dataset/LayoutLMv3-DocVQA/requirements.txt
!pip -qqq install -r /kaggle/working/SSM-s-on-Document-AI-Task/requirements.txt
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.20.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.5/227.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.20.1
[0m

In [3]:
# Copy the cached, pre-processed DocVQA dataset into the current directory.
# NOTE(review): `dataset_file` below expects the copy under /kaggle/working — assumes that is the working directory.
!cp -r /kaggle/input/docvqa-hf-dataset/LayoutLMv3-DocVQA/docvqa_cached_extractive_all_lowercase_True_msr_False_extraction_v3_enumeration ./

In [4]:
## Logging into wandb
# The API key is fetched from Kaggle's secret store (secret name "wandb_api") instead of being hard-coded here.

import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api")  # the W&B API key
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
import os
import sys

# Environment switches, set before torch / transformers are imported in the next cell.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
})

# Output folder for the prediction files written by the evaluation step.
os.makedirs("results", exist_ok=True)

# Make both code bases importable.
sys.path.extend([
    "/kaggle/working/SSM-s-on-Document-AI-Task/src",
    "/kaggle/input/docvqa-hf-dataset/LayoutLMv3-DocVQA",
])

In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForQuestionAnswering, AutoConfig

from datasets import load_from_disk, DatasetDict

from src.utils import get_optimizers, create_and_fill_np_array, write_data, anls_metric_str, postprocess_qa_predictions
from src.data.tokenization import tokenize_docvqa, DocVQACollator

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [7]:
# Seed the random number generators through Lightning for reproducibility.
# The call returns the seed it used, which is what this cell displays.
pl.seed_everything(42)

42

## 1. Hyperparameter settings

In [8]:
# Global hyperparameters and paths read by the cells below.
device = "cuda:0" if torch.cuda.is_available() else "cpu"  # only referenced by commented-out code in the cells shown here
dataset_file = "/kaggle/working/docvqa_cached_extractive_all_lowercase_True_msr_False_extraction_v3_enumeration"  # loaded with load_from_disk; its name also decides `use_msr`
batch_size = 4  # stored in `config` and used for the evaluation DataLoader
learning_rate = 2e-5  # stored in `config`; read by S4Model as its learning rate
num_epochs = 5  # used to compute t_total
seed = 42  # NOTE(review): not read in the cells shown here; pl.seed_everything is called with a literal 42
pretrained_model_name = 'microsoft/layoutlmv3-base'  # checkpoint for tokenizer, feature extractor, model and config
use_generation = False  # cast to bool and forwarded to tokenize_docvqa
stride = 0  # forwarded to tokenize_docvqa as doc_stride
ignore_unmatched_span = 1  # cast to bool and forwarded as ignore_unmatched_answer_span_during_train
extraction_nbest = 20  # n_best_size for postprocess_qa_predictions
max_answer_length = 100  # max_answer_length for postprocess_qa_predictions
fp16 = True  # enables autocast inside evaluate()
model_folder = "layoutlmv3-extractive-uncased"  # stem of the result file names under results/

# Image folder per split, forwarded to tokenize_docvqa as img_dir.
image_dir = {"train": "/kaggle/input/docvqa-dataset/train/train", "val": "/kaggle/input/docvqa-dataset/val/val", "test": "/kaggle/input/docvqa-dataset/test/test"}

In [9]:
# Load the tokenizer, image feature extractor, QA model and config for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, use_fast=True)
# OCR is switched off in the feature extractor.
# NOTE(review): presumably words/boxes come from the cached dataset — confirm.
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name, apply_ocr=False)
# This model instance is handed to DocVQACollator; the name `model` is rebound later by accelerator.prepare.
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
config = AutoConfig.from_pretrained(pretrained_model_name)
# Attach the training hyperparameters so S4Model / main can read config.learning_rate and config.batch_size.
config.update({"learning_rate" : learning_rate, "batch_size" : batch_size})

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForQuestionAnswering were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['qa_outputs.out_proj.bias', 'qa_outputs.dense.bias', 'qa_outputs.out_proj.weight', 'qa_outputs.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 2. Pre-processing steps

In [10]:
# Build the batch collator and load the cached dataset from disk.
collator = DocVQACollator(tokenizer, feature_extractor, pretrained_model_name=pretrained_model_name, model=model)
dataset = load_from_disk(dataset_file)
# Uncomment to keep only the first 10 examples of every split (quick smoke test).
# dataset = DatasetDict({"train": dataset["train"].select(range(10)), "val": dataset['val'].select(range(10)), "test": dataset['test'].select(range(10))})

In [11]:
# The dataset cache name encodes whether MSR OCR output was used.
use_msr = "msr_True" in dataset_file

# Keyword arguments handed to tokenize_docvqa for every batch.
tokenize_kwargs = dict(
    tokenizer=tokenizer,
    img_dir=image_dir,
    use_msr_ocr=use_msr,
    use_generation=bool(use_generation),
    doc_stride=stride,
    ignore_unmatched_answer_span_during_train=bool(ignore_unmatched_span),
)

# Tokenize every split in parallel; the original columns (taken from the "val" split) are dropped.
tokenized = dataset.map(
    tokenize_docvqa,
    fn_kwargs=tokenize_kwargs,
    batched=True,
    batch_size=4,
    num_proc=8,
    load_from_cache_file=True,
    remove_columns=dataset["val"].column_names,
)

         

#0:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1234 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1233 [00:00<?, ?ba/s]

         

#0:   0%|          | 0/168 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/168 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/168 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/168 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/168 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/167 [00:00<?, ?ba/s]

#6:   0%|          | 0/167 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/167 [00:00<?, ?ba/s]

         

#0:   0%|          | 0/163 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/163 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/163 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/163 [00:00<?, ?ba/s]

#4:   0%|          | 0/162 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/162 [00:00<?, ?ba/s]

#6:   0%|          | 0/162 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/162 [00:00<?, ?ba/s]

## 3. Writing some post-processing steps and defining DataLoader

In [12]:
class DataModule(pl.LightningDataModule):
  """Lightning data module serving the tokenized train and val splits.

  Reads the module-level `tokenized` dataset and `collator`; the "metadata"
  column is removed from a split before it is wrapped in a DataLoader.
  """

  def __init__(self, batch_size:int = 8):
    super().__init__()
    self.batch_size = batch_size

  def train_dataloader(self):
    """Return a shuffled, pinned-memory loader over the training split."""
    train_split = tokenized["train"].remove_columns("metadata")
    return DataLoader(
        train_split,
        batch_size=self.batch_size,
        shuffle=True,
        pin_memory=True,
        collate_fn=collator,
    )

  def val_dataloader(self):
    """Return an unshuffled loader over the validation split."""
    val_split = tokenized["val"].remove_columns("metadata")
    return DataLoader(
        val_split,
        batch_size=self.batch_size,
        shuffle=False,
        collate_fn=collator,
    )

In [13]:
# Build the data module with the configured batch size. The class default (8) differs from
# `batch_size` (4) used by main() via config.batch_size, so the step count derived from this
# module (t_total) would be roughly half the real number of training batches.
pl_dl = DataModule(batch_size=batch_size)
# sample = next(iter(pl_dl.train_dataloader()))

# for key in sample:
#     sample[key] = sample[key].to(device)
# model = model.to(device)
# output = model(**sample)

In [14]:
# Total number of training batches = batches per epoch * epochs. S4Model.configure_optimizers
# reads this module-level value as `num_training_steps`.
# NOTE(review): the batch count depends on the batch size `pl_dl` was built with — make sure it
# matches the batch size actually used for training.
t_total = int(len(pl_dl.train_dataloader()) * num_epochs)
# optimizer, scheduler = get_optimizers(model=model, learning_rate=learning_rate, num_training_steps=t_total,
#                                           warmup_step=0, eps=1e-8)

## 4. Defining the modeling approach

In [15]:
import torch.nn as nn
class S4Model(pl.LightningModule):
  """Lightning wrapper around a question-answering model built from `config`.

  The QA model is instantiated with `AutoModelForQuestionAnswering.from_config`, i.e. from
  the configuration only. Optionally the word-embedding table is copied from the pretrained
  checkpoint named by `config._name_or_path`.

  Args:
    config: model configuration; must also carry a `learning_rate` attribute.
    use_pretrained_word_embedding: when True, replace the model's word embeddings with the
      pretrained ones (currently wired up for LayoutLMv3 only).
  """

  def __init__(self, config, use_pretrained_word_embedding = True):

    super(S4Model, self).__init__()
    self.save_hyperparameters()
    
    self.model = AutoModelForQuestionAnswering.from_config(config)
    if use_pretrained_word_embedding:
        
        ## Currently working only for LayoutLMv3
        from transformers import AutoModel
        layoutlm_dummy = AutoModel.from_pretrained(config._name_or_path)
        # NOTE(review): nn.Embedding.from_pretrained defaults to freeze=True, so the copied
        # embedding table is not updated during training — confirm this is intended.
        self.model.layoutlmv3.embeddings.word_embeddings = nn.Embedding.from_pretrained(layoutlm_dummy.embeddings.word_embeddings.weight)
        print(f"The word embedding has been initialized from : {config._name_or_path}")
    
    self.learning_rate = config.learning_rate
  
  def forward(self, batch):
    """Run the wrapped model on `batch`, unpacked as keyword arguments."""
    return self.model(**batch)

  def setup_optimizer(self, weight_decay = 0.01):
    """
    S4 requires a specific optimizer setup.

    The S4 layer (A, B, C, dt) parameters typically
    require a smaller learning rate (typically 0.001), with no weight decay.

    The rest of the model can be trained with a higher learning rate (e.g. 0.004, 0.01)
    and weight decay (if desired).

    Parameters that carry an `_optim` attribute (a dict of optimizer hyperparameters) are
    placed in their own parameter groups; all others share `self.learning_rate` and
    `weight_decay`. Not called by `configure_optimizers` in this class.

    Returns:
      The configured AdamW optimizer.
    """

    # All parameters in the model
    all_parameters = list(self.model.parameters())

    # General parameters don't contain the special _optim key
    params = [p for p in all_parameters if not hasattr(p, "_optim")]

    # Create an optimizer with the general parameters.
    # Uses torch.optim.AdamW (a bare `AdamW` is never imported in this notebook) and
    # self.learning_rate (the attribute set in __init__; `self.lr` does not exist).
    optimizer = torch.optim.AdamW(params, lr=self.learning_rate, weight_decay=weight_decay)

    # Add parameters with special hyperparameters
    hps = [getattr(p, "_optim") for p in all_parameters if hasattr(p, "_optim")]
    hps = [
        dict(s) for s in sorted(list(dict.fromkeys(frozenset(hp.items()) for hp in hps)))
    ]  # Unique dicts
    for hp in hps:
        params = [p for p in all_parameters if getattr(p, "_optim", None) == hp]
        optimizer.add_param_group(
            {"params": params, **hp}
        )

    # Create a lr scheduler
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience, factor=0.2)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    # Print optimizer info
    keys = sorted(set([k for hp in hps for k in hp.keys()]))
    for i, g in enumerate(optimizer.param_groups):
        group_hps = {k: g.get(k, None) for k in keys}
        print(' | '.join([
            f"Optimizer group {i}",
            f"{len(g['params'])} tensors",
        ] + [f"{k} {v}" for k, v in group_hps.items()]))

    return optimizer# , scheduler

  def configure_optimizers(self):
    """Build the optimizer through `get_optimizers` and return it.

    Relies on the module-level `t_total` for the number of training steps.
    NOTE(review): the scheduler returned by `get_optimizers` is discarded, so no learning-rate
    schedule is applied by Lightning — confirm this is intended.
    """
    optimizer, scheduler = get_optimizers(model=self.model, learning_rate=self.learning_rate, num_training_steps=t_total,
                                          warmup_step=0, eps=1e-8)
    return optimizer
    

  def training_step(self, batch, batch_idx):
    """Forward one training batch, log its loss and return the loss for backpropagation."""

    ## Forward propagation
    outputs = self(batch)
    self.log("train_loss", outputs.loss.item(), prog_bar = True, on_epoch = True, on_step = True)

    ## Backpropagation
    loss = outputs.loss
    return loss

  def validation_step(self, batch, batch_idx):
    """Forward one validation batch, log its loss as "val_loss" and return the loss."""
    outputs = self(batch)
    self.log("val_loss", outputs.loss.item(), prog_bar = True, on_epoch = True, on_step = True)

    loss = outputs.loss
    
    return loss

## 5. Train and Go!!

In [16]:
def main(config):
    """Set up checkpointing, W&B logging, the Trainer, the model and the data module.

    Training is currently disabled (the `trainer.fit` call is commented out), so this only
    starts a W&B run and builds the objects.

    Args:
        config: model configuration carrying `learning_rate` and `batch_size`.

    Returns:
        Tuple `(pl_model, pl_dl)`: the freshly built S4Model and DataModule.
    """
    project = "Benchmarking LayoutLMv3 on DocVQA"

    # Keep the checkpoint with the lowest epoch-level validation loss.
    checkpoint_callback = ModelCheckpoint(
        dirpath="./s4d/models", monitor="val_loss_epoch", mode="min", filename = 's4d_best_ckpt'
    )

    wandb.init(project=project)
    # WandbLogger has no `run` parameter; unknown keywords are forwarded to wandb.init(),
    # which does not accept `run` either. The run label belongs in `name`.
    wandb_logger = WandbLogger(project=project, entity="iakarshu", name="first_run")

    # Use the notebook-level epoch count so the Trainer agrees with t_total, which is
    # derived from the same `num_epochs`.
    max_epochs = num_epochs
    trainer = pl.Trainer(
        max_epochs = max_epochs,
        default_root_dir="./s4d/logs",
        accelerator="auto", 
        devices="auto",
        logger=wandb_logger,
        callbacks=[checkpoint_callback],
        # deterministic=True
    )

    pl_model = S4Model(config)
    pl_dl = DataModule(batch_size = config.batch_size)

    # trainer.fit(pl_model, pl_dl)

    return pl_model, pl_dl

In [17]:
# Build the model and data module. main() does not train: its trainer.fit call is commented out.
if __name__ == "__main__":
  pl_model, pl_dl = main(config)

[34m[1mwandb[0m: Currently logged in as: [33miakarshu[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230608_130740-mpxfrlwa[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mrestful-plant-7[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/iakarshu/Benchmarking%20LayoutLMv3%20on%20DocVQA[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/iakarshu/Benchmarking%20LayoutLMv3%20on%20DocVQA/runs/mpxfrlwa[0m
  rank_zero_warn(


The word embedding has been initialized from : microsoft/layoutlmv3-base


In [18]:
# Checkpoint to evaluate. NOTE(review): presumably produced by a separate training run — confirm.
ckpt_path = "/kaggle/input/git-repo-ssm-p3-docvqa-layoutlmv3-train/s4d/models/s4d_best_ckpt.ckpt"
# load_from_checkpoint is a classmethod that builds and returns a new module, so it is called
# on the class; recent Lightning releases raise an error when it is invoked through an instance.
pl_model = S4Model.load_from_checkpoint(ckpt_path, config = config)

The word embedding has been initialized from : microsoft/layoutlmv3-base


In [19]:
# Evaluation loader. Despite the name it iterates over the *validation* split, in order
# (shuffle=False), with the "metadata" column removed.
test_loader = DataLoader(tokenized["val"].remove_columns("metadata"), batch_size=batch_size,
                                                    collate_fn=collator, shuffle=False)

In [20]:
from accelerate import Accelerator
from functools import partial
from tqdm import tqdm

# Accelerator used by the evaluation cells to prepare the model/loader and to gather logits.
accelerator = Accelerator(kwargs_handlers=[])
# Rebind `tqdm` to a preconfigured variant: fixed bar format, and disabled on processes that
# are not the local main process.
tqdm = partial(tqdm, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', disable=not accelerator.is_local_main_process)

In [21]:
# Hand the restored QA model and the evaluation loader to accelerate.
# Note: this rebinds `model` (previously the model loaded in the setup cell) and `test_loader`,
# so re-running this cell passes the already-prepared loader to prepare() again.
model, test_loader = accelerator.prepare(pl_model.model, test_loader)

In [22]:
def evaluate(tokenizer, valid_dataloader, model,
             valid_dataset_before_tokenized, metadata,
             res_file=None, err_file=None):
    """Run extractive-QA inference over `valid_dataloader` and report the average ANLS.

    Relies on the module-level `accelerator`, `tqdm`, `fp16`, `extraction_nbest` and
    `max_answer_length`.

    Args:
        tokenizer: accepted for interface compatibility; not used in the body.
        valid_dataloader: loader yielding batches that can be unpacked into `model(**batch)`.
        model: QA model whose outputs expose `start_logits` and `end_logits`.
        valid_dataset_before_tokenized: untokenized examples; each provides "original_answer".
        metadata: forwarded unchanged to `postprocess_qa_predictions`.
        res_file: when given, the prediction list is written there (main process only).
        err_file: only mentioned in the log message; nothing is written to it here.

    Returns:
        The average ANLS returned by `anls_metric_str`.
    """
    model.eval()
    start_chunks, end_chunks = [], []
    amp_enabled = bool(fp16)

    with torch.no_grad(), torch.cuda.amp.autocast(enabled=amp_enabled):
        progress = tqdm(enumerate(valid_dataloader), desc="--validation", total=len(valid_dataloader))
        for _, batch in progress:
            batch.start_positions = None
            batch.end_positions = None
            outputs = model(**batch)
            # Pad along the sequence axis first, then gather; the call order is kept as-is.
            padded_start = accelerator.pad_across_processes(outputs.start_logits, dim=1, pad_index=-100)
            padded_end = accelerator.pad_across_processes(outputs.end_logits, dim=1, pad_index=-100)
            start_chunks.append(accelerator.gather_for_metrics(padded_start).cpu().numpy())
            end_chunks.append(accelerator.gather_for_metrics(padded_end).cpu().numpy())

    # Longest second dimension seen in any batch of logits.
    max_len = max(chunk.shape[1] for chunk in start_chunks)
    eval_dataset = valid_dataloader.dataset

    # Merge the per-batch arrays into one array each for start and end logits.
    start_logits_concat = create_and_fill_np_array(start_chunks, eval_dataset, max_len)
    end_logits_concat = create_and_fill_np_array(end_chunks, eval_dataset, max_len)
    # The per-batch lists are no longer needed.
    del start_chunks, end_chunks

    _, prediction_list = postprocess_qa_predictions(
        dataset_before_tokenized=valid_dataset_before_tokenized,
        metadata=metadata,
        predictions=(start_logits_concat, end_logits_concat),
        n_best_size=extraction_nbest,
        max_answer_length=max_answer_length,
    )

    all_pred_texts = [entry['answer'] for entry in prediction_list]
    truth = [example["original_answer"] for example in valid_dataset_before_tokenized]
    accelerator.print(f"prediction: {all_pred_texts[:10]}")
    accelerator.print(f"gold_answers: {truth[:10]}")

    _, anls = anls_metric_str(predictions=all_pred_texts, gold_labels=truth)
    accelerator.print(f"[Info] Average Normalized Lev.S : {anls} ", flush=True)

    should_write = res_file is not None and accelerator.is_main_process
    if should_write:
        accelerator.print(f"Writing results to {res_file} and {err_file}")
        write_data(data=prediction_list, file=res_file)
    return anls

In [23]:
# Score the restored model on the validation split and write the predictions to
# results/<model_folder>.res.json. The returned average ANLS is this cell's output.
evaluate(tokenizer=tokenizer, valid_dataloader=test_loader, model=model,
                 valid_dataset_before_tokenized=dataset["val"], metadata=tokenized["val"]["metadata"],
             res_file=f"results/{model_folder}.res.json", err_file=f"results/{model_folder}.err.json")

--validation:   0%|          | 0/1338 [00:00<?, ?it/s]You're using a LayoutLMv3TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
--validation: 100%|██████████| 1338/1338 [14:38<00:00,  1.52it/s]


prediction: ['8.28', 'san diego', 'itc limited report', 'san diego', 'san diego', 'the best thing between two', 'flama', 'robert a. welch foundation 2010 bank of the southwest building houston, texas 77002', '11:14 to coffee break 11:39 a.m. coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. (exhibits open) 11:39', '$11, 228.00']
gold_answers: [['0.28'], ['university of california, san diego', 'university of california'], ['itc limited'], ['san diego'], ['paul'], ['1128 sixteenth st., n. w., washington, d. c. 20036'], ['aashirvaad'], ['the robert a. welch foundation'], ['11.14 to 11.39 a.m.', '11:14 to 11:39 a.m.'], ['975.00', '$975.00']]
[Info] Average Normalized Lev.S : 0.05767679221230046 
Writing results to results/layoutlmv3-extractive-uncased.res.json and results/layoutlmv3-extractive-uncased.err.json


0.05767679221230046