In [1]:
!git clone https://github.com/uakarsh/SSM-s-on-Document-AI-Task.git

Cloning into 'SSM-s-on-Document-AI-Task'...
remote: Enumerating objects: 158, done.[K
remote: Counting objects: 100% (158/158), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 158 (delta 85), reused 70 (delta 24), pack-reused 0[K
Receiving objects: 100% (158/158), 328.27 KiB | 2.38 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [2]:
!pip -qqq install -r /kaggle/working/SSM-s-on-Document-AI-Task/requirements.txt
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.19.0
[0m

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import sys
sys.path.append("/kaggle/working/SSM-s-on-Document-AI-Task/src")
from models.s4d import S4ModelForSequenceClassification

In [4]:
## Importing Libraries
import torch
import torch.nn as nn
from torch.optim import AdamW
import pytorch_lightning as pl
from torch.utils.data import DataLoader

from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoConfig, AutoProcessor, AutoModelForSequenceClassification

from tqdm.auto import tqdm
import wandb
import evaluate

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

import json
import pandas as pd
from PIL import Image

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [5]:
## Logging into wandb

import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api")
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
pl.seed_everything(42, workers=True)

42

## 1. All hyperparameters setting

In [7]:
## Objects required for pre-processing the dataset

model_name = "microsoft/layoutlmv3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, apply_ocr = False)

# config.update({"num_labels": 10, "lr" : 1e-5, "weight_decay" : 0.0, "batch_size" : 4})
config.update({"num_labels" : 10, "lr" : 0.001, "prenorm" : False, "num_hidden_layers" : 6, "hidden_dropout_prob" : 0.1,
              "intermediate_size" : 64, "weight_decay" : 0.01, "batch_size" : 4}) ## For classes, learning rate and usage of pre-norm

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

## 2. Pre-processing steps

In [8]:
id2label = ['Resume','Memo','News','Report','Scientific','Letter','Form','Email','ADVE','Note']
label2id = {v : k for k, v in enumerate(id2label)}

def convert_json_to_pd(json_file):
    df = pd.DataFrame(json_file)
    return df

class Tobacoo3482Dataset(Dataset):
    
    def __init__(self, json_file, base_img_path : str = ".", label2id : dict = None):
        
        assert label2id is not None, "Make sure to provide label2id"
        
        print(f"Creating data for {model_name}")
        self.json_file = json_file
        self.base_img_path = base_img_path
        self.processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        
    def load_img(self, img_path : str):
        img = Image.open(img_path).convert("RGB")
        return img
    
    def layoutlmv3_data(self, img_path, words, boxes):
        img = self.load_img(img_path)
        encoding = self.processor(img, words, boxes=boxes,
                             truncation=True, padding="max_length",
                            return_tensors = "pt")
        
        for key in list(encoding.keys()):
            encoding[key].squeeze_(0)

        return encoding
    
    def __len__(self):
        return len(self.json_file)
    
    
    def __getitem__(self, idx):
        entry = self.json_file.iloc[idx]
        img_path = os.path.join(self.base_img_path, entry['image_id'])
        words = entry['words']
        bboxes = entry['bbox']
        encoding = self.layoutlmv3_data(img_path, words, bboxes)   
        label = label2id[entry['label']]
        encoding['labels'] = torch.as_tensor(label)
        
        return encoding

In [9]:
def get_splits(dataframe, seed : int = 42, test_size : float = 0.2):
    from sklearn.model_selection import train_test_split
    train_df, val_df = train_test_split(dataframe, random_state = seed, test_size = test_size,stratify = dataframe['label'])
    return train_df, val_df

def get_dataset(dataframe, label2id, base_path : str = ".", seed : int = 42, test_size : float = 0.2):

    train_df, val_df = get_splits(dataframe, seed, test_size)
    train_ds = Tobacoo3482Dataset(train_df, base_path, label2id)
    val_ds = Tobacoo3482Dataset(val_df, base_path, label2id)
    return train_ds, val_ds

In [10]:
## Dataset part

json_file = json.load(open("/kaggle/input/tobacco3482-tesseract-ocr/data.json"))
pd_file = convert_json_to_pd(json_file)
base_img_path = "/kaggle/input/tobacco3482jpg/Tobacco3482-jpg"
test_size = 0.2
seed = 42
train_ds, val_ds = get_dataset(pd_file, label2id, base_path = base_img_path,
                                  seed = seed, test_size = test_size)

Creating data for microsoft/layoutlmv3-base
Creating data for microsoft/layoutlmv3-base


## 5. Writing some post-processing steps and defining DataLoader

In [11]:
class DataModule(pl.LightningDataModule):
  def __init__(self, batch_size:int = 4):
    super(DataModule, self).__init__()
    self.batch_size = batch_size

  def train_dataloader(self):
    return DataLoader(train_ds, batch_size = self.batch_size,
                      shuffle = True)
    
  def val_dataloader(self):
    return DataLoader(val_ds, batch_size = self.batch_size,
                      shuffle = False)

In [12]:
def get_labels(predictions, references, id2label):

    # Transform predictions and references tensors to numpy arrays
    if predictions.device.type == "cpu":
        y_pred = predictions.detach().clone().numpy()
        y_true = references.detach().clone().numpy()

    else:
        y_pred = predictions.detach().cpu().clone().numpy()
        y_true = references.detach().cpu().clone().numpy()
    
    # Remove ignored index (special tokens)
    true_predictions = [[id2label[pred] for pred, gold_label in zip(y_pred, y_true)]]
    true_labels = [[id2label[gold_label] for pred, gold_label in zip(y_pred, y_true)]]
    return true_predictions, true_labels

## 6. Defining the Modeling approach

In [13]:
class S4Model(pl.LightningModule):

  def __init__(self, config, use_pretrained_word_embedding = True):

    super(S4Model, self).__init__()
    self.save_hyperparameters()
    
    global id2label
    self.id2label = id2label
    
    self.model = S4ModelForSequenceClassification(config)
    if use_pretrained_word_embedding:
        ## Currently working only for LayoutLMv3
        from transformers import AutoModel
        layoutlm_dummy = AutoModel.from_pretrained(config._name_or_path)
        self.model.emb = nn.Embedding.from_pretrained(layoutlm_dummy.embeddings.word_embeddings.weight)
        print(f"The word embedding has been initialized from : {config._name_or_path}")
        
    ## Parameters
    self.lr = config.lr
    self.weight_decay = config.weight_decay
    
    self.train_metric = evaluate.load("poseval")
    self.val_metric = evaluate.load("poseval")
  
  def forward(self, batch):
    return self.model(input_ids = batch['input_ids'], labels = batch['labels'])

  def setup_optimizer(self, weight_decay = 0.01):
    """
    S4 requires a specific optimizer setup.

    The S4 layer (A, B, C, dt) parameters typically
    require a smaller learning rate (typically 0.001), with no weight decay.

    The rest of the model can be trained with a higher learning rate (e.g. 0.004, 0.01)
    and weight decay (if desired).
    """

    # All parameters in the model
    all_parameters = list(self.model.parameters())

    # General parameters don't contain the special _optim key
    params = [p for p in all_parameters if not hasattr(p, "_optim")]

    # Create an optimizer with the general parameters
    optimizer = AdamW(params, lr=self.lr, weight_decay=weight_decay)

    # Add parameters with special hyperparameters
    hps = [getattr(p, "_optim") for p in all_parameters if hasattr(p, "_optim")]
    hps = [
        dict(s) for s in sorted(list(dict.fromkeys(frozenset(hp.items()) for hp in hps)))
    ]  # Unique dicts
    for hp in hps:
        params = [p for p in all_parameters if getattr(p, "_optim", None) == hp]
        optimizer.add_param_group(
            {"params": params, **hp}
        )

    # Create a lr scheduler
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience, factor=0.2)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    # Print optimizer info
    keys = sorted(set([k for hp in hps for k in hp.keys()]))
    for i, g in enumerate(optimizer.param_groups):
        group_hps = {k: g.get(k, None) for k in keys}
        print(' | '.join([
            f"Optimizer group {i}",
            f"{len(g['params'])} tensors",
        ] + [f"{k} {v}" for k, v in group_hps.items()]))

    return optimizer# , scheduler

  def configure_optimizers(self):
    return AdamW(self.parameters(), lr = self.lr)

  def training_step(self, batch, batch_idx):

    ## Forward Propagatipn
    outputs = self(batch)
    ## Predictions and adding the metrics
    predictions = outputs.logits.argmax(-1)
    true_predictions, true_labels = get_labels(predictions, batch["labels"], id2label = self.id2label)
    ## Logging Purpose
    results = self.train_metric.compute(references=true_labels, predictions=true_predictions, zero_division = 0)
    self.log("train_loss", outputs.loss.item(), prog_bar = True, on_epoch = True, on_step = True)
    self.log("train_acc", results['accuracy'], prog_bar = True, on_epoch = True, on_step = True)
    self.log("train_overall_fl", results['macro avg']["f1-score"], prog_bar = True, on_epoch = True, on_step = True)
    self.log("train_overall_recall", results['macro avg']["recall"], prog_bar = True, on_epoch = True, on_step = True)
    self.log("train_overall_precision", results['macro avg']["precision"], prog_bar = True, on_epoch = True, on_step = True)

    ## Backpropagation
    loss = outputs.loss
    return loss

  def validation_step(self, batch, batch_idx):

    outputs = self(batch)
    predictions = outputs.logits.argmax(-1)
    true_predictions, true_labels = get_labels(predictions, batch["labels"], id2label = self.id2label)

    ## Logging Purpose
    results = self.train_metric.compute(references=true_labels, predictions=true_predictions,zero_division = 0)
    self.log("val_loss", outputs.loss.item(), prog_bar = True, on_epoch = True, on_step = True)
    self.log("val_acc", results['accuracy'], prog_bar = True, on_epoch = True, on_step = True)
    self.log("val_overall_fl", results['macro avg']["f1-score"], prog_bar = True, on_epoch = True, on_step = True)
    self.log("val_overall_recall", results['macro avg']["recall"], prog_bar = True, on_epoch = True, on_step = True)
    self.log("val_overall_precision", results['macro avg']["precision"], prog_bar = True, on_epoch = True, on_step = True)

    loss = outputs.loss
    
    return loss

## 7. Train and Go!!

In [14]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# pl_model = S4Model(config).to(device)
# pl_dl = DataModule(batch_size = config.batch_size)
# sample = next(iter(pl_dl.train_dataloader()))
# for key in sample:
#     sample[key] = sample[key].to(device)
# out = pl_model(sample)

In [15]:
def main(config):
    
    checkpoint_callback = ModelCheckpoint(
        dirpath="./s4d/models", monitor="val_overall_fl_epoch", mode="max", filename = 's4d_best_ckpt'
    )
    
    wandb.init(project="Benchmarking S4D on Tobacco3482")
    wandb_logger = WandbLogger(project="Benchmarking S4D on Tobacco3482", entity="iakarshu", run = "first_run")
    
    max_epochs = 5
    trainer = pl.Trainer(
        max_epochs = max_epochs,
        default_root_dir="./s4d/logs",
        accelerator="auto", 
        devices="auto",
        logger=wandb_logger,
        callbacks=[checkpoint_callback],
        # deterministic=True
    )
    
    pl_model = S4Model(config)
    pl_dl = DataModule(batch_size = config.batch_size)
    
    trainer.fit(pl_model, pl_dl)

    return pl_model, pl_dl

In [16]:
if __name__ == "__main__":
  pl_model, pl_dl = main(config)

[34m[1mwandb[0m: Currently logged in as: [33miakarshu[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230527_144106-d1pbqnb3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mwinter-thunder-10[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/iakarshu/Benchmarking%20S4D%20on%20Tobacco3482[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/iakarshu/Benchmarking%20S4D%20on%20Tobacco3482/runs/d1pbqnb3[0m
  rank_zero_warn(


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

The word embedding has been initialized from : microsoft/layoutlmv3-base


Downloading builder script:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

## 8. Performing the evaluations

In [17]:
model_path = os.path.join('./s4d/models', os.listdir('./s4d/models')[0])
pl_model = pl_model.load_from_checkpoint(model_path)

The word embedding has been initialized from : microsoft/layoutlmv3-base


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
eval_metric = evaluate.load("poseval")
pl_model.eval();

model = pl_model.model.to(device)

for idx, batch in enumerate(tqdm(pl_dl.val_dataloader())):
    # move batch to device
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
      outputs = model(input_ids = batch['input_ids'], labels = batch['labels'])

    predictions = outputs.logits.argmax(-1)
    true_predictions, true_labels = get_labels(predictions, batch["labels"], id2label = id2label)
    eval_metric.add_batch(references=true_labels, predictions=true_predictions)

  0%|          | 0/175 [00:00<?, ?it/s]

In [19]:
results = eval_metric.compute(zero_division = 0)

In [20]:
key = "accuracy"
print_statement = '{0: <30}'.format(str(key) + " has value:")
print(print_statement, results[key])

for key in ['precision','recall', 'f1-score']:
  print_statement = '{0: <30}'.format(str(key) + " has value:")
  print(print_statement, results['macro avg'][key])

accuracy has value:            0.17790530846484937
precision has value:           0.017790530846484937
recall has value:              0.1
f1-score has value:            0.030207064555420222
