In [1]:
# Fetch the project sources; the `src` folder of this checkout is appended to sys.path in a later cell.
!git clone https://github.com/uakarsh/SSM-s-on-Document-AI-Task.git

Cloning into 'SSM-s-on-Document-AI-Task'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 65 (delta 26), reused 42 (delta 9), pack-reused 0[K
Unpacking objects: 100% (65/65), 90.49 KiB | 2.21 MiB/s, done.


In [2]:
!pip -qqq install -r /kaggle/working/SSM-s-on-Document-AI-Task/requirements.txt
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.19.0
[0m

In [3]:
import os
# Environment variables are set here, before torch / transformers are imported in the next cell.
# NOTE(review): presumably silences the Hugging Face tokenizers fork-parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): looks like the cuBLAS workspace setting needed for deterministic CUDA runs — confirm.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import sys
# Make the `src` folder of the cloned repository importable so `models.s4d` resolves.
sys.path.append("/kaggle/working/SSM-s-on-Document-AI-Task/src")
from models.s4d import S4ModelForTokenClassification

In [4]:
## Importing Libraries
import torch
import torch.nn as nn
from torch.optim import AdamW
import pytorch_lightning as pl
from torch.utils.data import DataLoader

from datasets import load_dataset
from datasets.features import ClassLabel
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

from transformers import AutoTokenizer, AutoConfig, AutoProcessor

from tqdm.auto import tqdm
import wandb
import evaluate

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger



In [5]:
## Logging into wandb

import wandb
from kaggle_secrets import UserSecretsClient
# The API key is read from the Kaggle secret named "wandb_api", so it never appears in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api")
# Last expression of the cell: the return value of wandb.login is echoed as the cell output.
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
# Seed the random number generators through Lightning (seed 42, workers=True); the call's return value is echoed.
pl.seed_everything(42, workers=True)

42

## 1. Setting all hyperparameters

In [7]:
## Objects required for pre-processing the dataset

model_name = "microsoft/layoutlmv3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, apply_ocr = False)

## Overrides layered on top of the LayoutLMv3 config: classes, learning rate and usage of pre-norm.
## `lr` and `weight_decay` are read by S4Model, `batch_size` by main(); the remaining keys are
## presumably consumed by S4ModelForTokenClassification — TODO confirm against models/s4d.
config.update(dict(
    num_labels = 7,
    lr = 0.01,
    prenorm = False,
    num_hidden_layers = 4,
    hidden_dropout_prob = 0.2,
    intermediate_size = 64,
    weight_decay = 0.01,
    batch_size = 16,
))

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

## 2. Pre-processing steps

In [8]:
## Dataset loading part
dataset = load_dataset("nielsr/funsd-layoutlmv3")
# Names of the `ner_tags` classes; `get_labels` indexes this list to turn label ids into tag strings.
labels = dataset["train"].features['ner_tags'].feature.names

Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading and preparing dataset funsd/funsd to /root/.cache/huggingface/datasets/nielsr___funsd/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9...


Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset funsd downloaded and prepared to /root/.cache/huggingface/datasets/nielsr___funsd/funsd/1.0.0/0e3f4efdfd59aa1c3b4952c517894f7b1fc4d75c12ef01bcc8626a69e41c1bb9. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
## Pre-processing steps

# Schema and column names of the raw training split; `column_names` is later passed as
# `remove_columns` when the splits are mapped through `prepare_examples`.
features = dataset["train"].features
column_names = dataset["train"].column_names
# Names of the raw columns that `prepare_examples` reads from each batch.
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

def get_label_list(labels):
    """Return the sorted list of distinct labels found in `labels`.

    Args:
        labels: iterable of label sequences (one sequence per example).

    Returns:
        A list holding every distinct label exactly once, in ascending order.
    """
    # Union of all per-example label sets; an empty input yields an empty list.
    return sorted(set().union(*labels))

# Prefer the names stored on a ClassLabel feature; otherwise derive the label set from the
# training data. Both cases build the id <-> label lookups the same way.
label_feature = features[label_column_name].feature
label_list = (
    label_feature.names
    if isinstance(label_feature, ClassLabel)
    else get_label_list(dataset["train"][label_column_name])
)
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

num_labels = len(label_list)

In [10]:
def prepare_examples(examples):
  """Encode one batch of raw examples with the LayoutLMv3 processor.

  Args:
    examples: batch read through the notebook-level column names
      (`image_column_name`, `text_column_name`, `boxes_column_name`, `label_column_name`).

  Returns:
    The processor's encoding with the "pixel_values" entry removed.
  """
  encoding = processor(
      examples[image_column_name],
      examples[text_column_name],
      boxes=examples[boxes_column_name],
      word_labels=examples[label_column_name],
      truncation=True,
      padding="max_length",
  )

  ## The image modality is not used for now, so the pixel values are dropped from the encoding.
  encoding.pop("pixel_values")
  return encoding

## 3. Creating the dataset

In [11]:
## Schema of the encoded examples (the image feature stays disabled, see prepare_examples).
features = Features({
    #'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

## Both splits are encoded with identical settings: batched mapping, raw columns dropped,
## output cast to the schema above.
map_kwargs = dict(batched=True, remove_columns=column_names, features=features)

train_dataset = dataset["train"].map(prepare_examples, **map_kwargs)

eval_dataset = dataset["test"].map(prepare_examples, **map_kwargs)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
## Expose the encoded columns of both splits as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'bbox', 'labels']
for split_dataset in (train_dataset, eval_dataset):
    split_dataset.set_format(type='torch', columns=list(tensor_columns))

## 4. Creating the S4D model and validating it on a single data sample

In [13]:
# # Works!!
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# model = S4ModelForTokenClassification(config).to(device)
# output = model(train_dataset[0]['input_ids'].unsqueeze(0).to(device), train_dataset[0]['labels'].unsqueeze(0).to(device))

## 5. Writing some post-processing steps and defining DataLoader

In [14]:
class DataModule(pl.LightningDataModule):
  """Lightning data module serving the notebook-level `train_dataset` and `eval_dataset`.

  Args:
    batch_size: batch size used by both the training and the validation loader.
  """

  def __init__(self, batch_size:int = 16):
    super().__init__()
    self.batch_size = batch_size

  def _make_loader(self, split, shuffle):
    """Build a DataLoader over `split` with this module's batch size."""
    return DataLoader(split, batch_size = self.batch_size, shuffle = shuffle)

  def train_dataloader(self):
    """Shuffled loader over the training split."""
    return self._make_loader(train_dataset, shuffle = True)

  def val_dataloader(self):
    """Loader over the evaluation split, in dataset order."""
    return self._make_loader(eval_dataset, shuffle = False)

In [15]:
def get_labels(predictions, references, label_names=None):
    """Convert predicted / reference label ids into lists of label strings.

    Positions whose reference id is -100 (the ignored index) are dropped from both
    outputs, so the two returned structures always line up element by element.

    Args:
        predictions: tensor of predicted label ids, one row per example.
        references: tensor of gold label ids, laid out like `predictions`.
        label_names: sequence mapping a label id to its name. Defaults to the
            notebook-level `labels` list, which keeps existing two-argument calls working.

    Returns:
        A `(true_predictions, true_labels)` tuple; each element is a list with one
        list of label strings per example.
    """
    if label_names is None:
        # Fall back to the global defined in the dataset-loading cell.
        label_names = labels

    # `.cpu()` is a no-op for CPU tensors, so one code path serves every device.
    y_pred = predictions.detach().cpu().numpy()
    y_true = references.detach().cpu().numpy()

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(y_pred, y_true):
        # Filter once per example and reuse the surviving pairs for both outputs.
        kept = [(p, l) for p, l in zip(pred_row, gold_row) if l != -100]
        true_predictions.append([label_names[p] for p, _ in kept])
        true_labels.append([label_names[l] for _, l in kept])
    return true_predictions, true_labels

## 6. Defining the Modeling approach

In [16]:
class S4Model(pl.LightningModule):
  """LightningModule that trains `S4ModelForTokenClassification` on token-classification batches.

  Args:
    config: configuration object handed to the wrapped model. This class itself reads
      `config.lr`, `config.weight_decay` and, when the pretrained embedding is requested,
      `config._name_or_path`.
    use_pretrained_word_embedding: when True, `self.model.emb` is replaced by an embedding
      built from the word-embedding weights of the checkpoint named by `config._name_or_path`.
  """

  def __init__(self, config, use_pretrained_word_embedding = True):
    super().__init__()
    self.save_hyperparameters()

    self.model = S4ModelForTokenClassification(config)

    if use_pretrained_word_embedding:
      ## Currently working only for LayoutLMv3
      from transformers import AutoModel
      pretrained = AutoModel.from_pretrained(config._name_or_path)
      # NOTE(review): nn.Embedding.from_pretrained keeps the weights frozen unless
      # freeze=False is passed — confirm that a frozen embedding is intended.
      self.model.emb = nn.Embedding.from_pretrained(pretrained.embeddings.word_embeddings.weight)
      print(f"The word embedding has been initialized from : {config._name_or_path}")

    ## Optimizer hyperparameters
    self.lr = config.lr
    self.weight_decay = config.weight_decay

    ## One seqeval metric object per stage
    self.train_metric = evaluate.load("seqeval")
    self.val_metric = evaluate.load("seqeval")

  def forward(self, batch):
    """Run the wrapped model on the batch's `input_ids` and `labels`."""
    return self.model(input_ids = batch['input_ids'], labels = batch['labels'])

  def setup_optimizer(self, weight_decay = 0.01):
    """
    Build the AdamW optimizer with the parameter grouping S4 needs.

    Parameters carrying an `_optim` attribute (the S4 layer's A, B, C, dt, which
    typically want a smaller learning rate such as 0.001 and no weight decay) are put
    into extra parameter groups that use the hyperparameters stored in that attribute.
    Every other parameter is trained with `self.lr` and the given `weight_decay`.

    Returns the optimizer; no learning-rate scheduler is created.
    """
    every_param = list(self.model.parameters())

    # Group 0: parameters without the special `_optim` marker.
    plain_params = [p for p in every_param if not hasattr(p, "_optim")]
    optimizer = AdamW(plain_params, lr=self.lr, weight_decay=weight_decay)

    # Collect the distinct `_optim` dicts and add one parameter group per distinct dict.
    marked_hps = [getattr(p, "_optim") for p in every_param if hasattr(p, "_optim")]
    distinct_items = dict.fromkeys(frozenset(hp.items()) for hp in marked_hps)
    distinct_hps = [dict(items) for items in sorted(list(distinct_items))]
    for hp in distinct_hps:
      members = [p for p in every_param if getattr(p, "_optim", None) == hp]
      optimizer.add_param_group({"params": members, **hp})

    # Schedulers that were tried and left disabled:
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience, factor=0.2)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    # Report, for every group, its size and the value of each special hyperparameter.
    hp_names = sorted({name for hp in distinct_hps for name in hp.keys()})
    for i, group in enumerate(optimizer.param_groups):
      shown = {name: group.get(name, None) for name in hp_names}
      summary = [f"Optimizer group {i}", f"{len(group['params'])} tensors"]
      summary += [f"{k} {v}" for k, v in shown.items()]
      print(' | '.join(summary))

    return optimizer

  def configure_optimizers(self):
    """Lightning hook: the grouped AdamW optimizer built with this module's weight decay."""
    return self.setup_optimizer(self.weight_decay)

  def _score_batch(self, batch, metric):
    """Forward `batch` and compute seqeval results for it with `metric`.

    Returns the model outputs together with the metric's result dict.
    """
    outputs = self(batch)
    predictions = outputs.logits.argmax(-1)
    true_predictions, true_labels = get_labels(predictions, batch["labels"])
    results = metric.compute(references=true_labels, predictions=true_predictions)
    return outputs, results

  def training_step(self, batch, batch_idx):
    """Forward pass, per-batch seqeval logging, and the loss for backpropagation."""
    outputs, results = self._score_batch(batch, self.train_metric)

    ## Logging purpose. The key is spelled "fl" (letter l); main() monitors the matching
    ## "val_overall_fl_epoch", so rename both together if this is ever corrected.
    for log_name, value in (
        ("train_loss", outputs.loss.item()),
        ("train_overall_fl", results["overall_f1"]),
        ("train_overall_recall", results["overall_recall"]),
        ("train_overall_precision", results["overall_precision"]),
    ):
      self.log(log_name, value, prog_bar = True, on_epoch = True, on_step = True)

    return outputs.loss

  def validation_step(self, batch, batch_idx):
    """Forward pass and per-batch seqeval logging on a validation batch; returns the loss."""
    outputs, results = self._score_batch(batch, self.val_metric)

    ## Logging purpose (same key spelling as in training_step).
    for log_name, value in (
        ("val_loss", outputs.loss.item()),
        ("val_overall_fl", results["overall_f1"]),
        ("val_overall_recall", results["overall_recall"]),
        ("val_overall_precision", results["overall_precision"]),
    ):
      self.log(log_name, value, prog_bar = True, on_epoch = True, on_step = True)

    return outputs.loss

## 7. Train and Go!!

In [17]:
def main(config):
    """Train an `S4Model` on the notebook's data module.

    Args:
        config: configuration passed to `S4Model`; `config.batch_size` sizes the loaders.

    Returns:
        The `(pl_model, pl_dl)` pair: the fitted Lightning module and its data module.
    """
    wandb_project = "Benchmarking S4D on FUNSD"

    # Write checkpoints to ./s4d/models, selected by the maximum epoch-level "val_overall_fl".
    best_ckpt = ModelCheckpoint(
        dirpath="./s4d/models",
        filename = 's4d_best_ckpt',
        monitor="val_overall_fl_epoch",
        mode="max",
    )

    wandb.init(project=wandb_project)
    # The logger is constructed but not handed to the Trainer (see the disabled `logger=` line).
    wandb_logger = WandbLogger(project=wandb_project, entity="iakarshu", run = "first_run")

    trainer = pl.Trainer(
        max_epochs = 50,
        default_root_dir="./s4d/logs",
        accelerator="auto",
        devices="auto",
        #logger=wandb_logger,
        callbacks=[best_ckpt],
        # deterministic=True
    )

    pl_model = S4Model(config)
    pl_dl = DataModule(batch_size = config.batch_size)
    trainer.fit(pl_model, pl_dl)

    return pl_model, pl_dl

In [18]:
# Run training; the returned model and data module are reused by the evaluation cells below.
if __name__ == "__main__":
  pl_model, pl_dl = main(config)

[34m[1mwandb[0m: Currently logged in as: [33miakarshu[0m. Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_warn(


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

The word embedding has been initialized from : microsoft/layoutlmv3-base


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Optimizer group 0 | 27 tensors | lr 0.01 | weight_decay 0.01
Optimizer group 1 | 12 tensors | lr 0.001 | weight_decay 0.0


## 8. Performing the evaluations

In [None]:
# Reload the checkpoint that ModelCheckpoint wrote to ./s4d/models during training.
# Only *.ckpt files are considered and the most recently written one is used, so the
# choice does not depend on directory listing order when earlier runs left files behind.
ckpt_dir = './s4d/models'
ckpt_files = [
    os.path.join(ckpt_dir, name)
    for name in os.listdir(ckpt_dir)
    if name.endswith('.ckpt')
]
if not ckpt_files:
    raise FileNotFoundError(f"No checkpoint (*.ckpt) found in {ckpt_dir}")
model_path = max(ckpt_files, key=os.path.getmtime)

# load_from_checkpoint is a classmethod; call it on the class, because recent Lightning
# releases refuse to run it on an instance.
pl_model = S4Model.load_from_checkpoint(model_path)

In [None]:
# Score the reloaded model on the validation loader, accumulating seqeval statistics batch by batch.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
eval_metric = evaluate.load("seqeval")
pl_model.eval()

model = pl_model.model.to(device)

# No gradients are needed anywhere in the evaluation loop.
with torch.no_grad():
    for idx, batch in enumerate(tqdm(pl_dl.val_dataloader())):
        # move batch to device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(batch['input_ids'])

        predictions = outputs.logits.argmax(-1)
        true_predictions, true_labels = get_labels(predictions, batch["labels"])
        eval_metric.add_batch(references=true_labels, predictions=true_predictions)

In [None]:
# Compute the final scores from the batches added via add_batch in the evaluation loop.
results = eval_metric.compute()

In [None]:
# Print the overall scores, with the label column left-aligned and padded to 30 characters.
for key in ('overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'):
  print_statement = (str(key) + " has value:").ljust(30)
  print(print_statement, results[key])

In [None]:
# Parameter count of the whole Lightning module, expressed in millions (hence the "M" suffix,
# matching the report printed for the transformer baseline).
pytorch_total_params = sum(p.numel() for p in pl_model.parameters()) / 1e6
print(f"Number of parameters in the model is: {pytorch_total_params:.4f}M")

In [None]:
from transformers import AutoModelForTokenClassification
# Load the token-classification model for the same checkpoint name and number of labels,
# to report its parameter count (in millions) next to the S4D figure.
transformer_model = AutoModelForTokenClassification.from_pretrained(config._name_or_path, num_labels = config.num_labels)
# Note: this reuses (and overwrites) `pytorch_total_params` from the previous cell.
pytorch_total_params = sum(p.numel() for p in transformer_model.parameters()) / 1e6
print(f"Number of parameters in the model is: {pytorch_total_params:.4f}M")