# Fine-tuning LayoutLMv2 

### Task DocVQA 


In [1]:
#import libraries

from huggingface_hub import notebook_login
from tqdm import tqdm
from datasets import load_dataset, Dataset, Features, Sequence, Value, Array2D, Array3D, load_metric
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
import editdistance
import numpy as np
import transformers
from transformers import AutoProcessor, AutoTokenizer, LayoutLMv2Processor, AutoModelForDocumentQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator

from PIL import Image
import os
import torch
import evaluate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score




In [3]:
# Authenticate against the Hugging Face Hub (renders an interactive login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Identifier of the pretrained checkpoint used for the processor, tokenizer and model below.
model_checkpoint = "microsoft/layoutlmv2-base-uncased"

# NOTE(review): not referenced by any later cell shown in this notebook; the
# map / DataLoader / TrainingArguments calls pass their own literal batch sizes.
batch_size = 1

### Load the data

In [None]:
# Load the "svenjars/dataset" dataset, caching downloads under cache_dir.
# NOTE(review): cache_dir is an absolute, machine-specific path; adjust it before
# running on another machine.
dataset = load_dataset("svenjars/dataset", cache_dir="/Users/svenja")
dataset

In [6]:
# Show the column schema of the training split.
dataset["train"].features

{'id': Value(dtype='string', id=None),
 'image': Value(dtype='string', id=None),
 'query': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'bounding_boxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None)}

In [7]:
# Materialize both splits as pandas DataFrames.
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])

# Prefix each image file name with the image folder of its split.
df_train['image'] = ['training_data/images/' + file_name for file_name in df_train['image']]
df_test['image'] = ['testing_data/images/' + file_name for file_name in df_test['image']]

In [8]:
# Wrap each DataFrame in a `datasets.Dataset` via an intermediate Arrow table.
hg_dataset_train = Dataset(pa.Table.from_pandas(df_train))
hg_dataset_test = Dataset(pa.Table.from_pandas(df_test))

In [9]:
# Export the training frame (with prefixed image paths) to CSV.
# NOTE(review): df_train is already a DataFrame (built in an earlier cell), so the
# re-wrap looks redundant; .head() is not the last expression of the cell, so it
# displays nothing.
df_train = pd.DataFrame(df_train)
df_train.head()
df_train.to_csv('LayoutLMv2_training_data.csv')

In [10]:
# Export the test frame (with prefixed image paths) to CSV.
# NOTE(review): df_test is already a DataFrame (built in an earlier cell), so the
# re-wrap looks redundant; .head() is not the last expression of the cell, so it
# displays nothing.
df_test = pd.DataFrame(df_test)
df_test.head()
df_test.to_csv('LayoutLMv2_testing_data.csv')

In [14]:
# Copy each example's "query" value into a new "question" column and drop the
# original "query" column (train split).
updated_dataset_train = hg_dataset_train.map(
    lambda example: {"question": example["query"]},
    remove_columns=["query"]
)


# Same renaming for the test split.
updated_dataset_test = hg_dataset_test.map(
    lambda example: {"question": example["query"]},
    remove_columns=["query"]
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [15]:
# Keep only examples whose OCR word count plus whitespace-split question length
# stays below 512.
# The second statement originally filtered the train split a second time, so the
# test split was never filtered; it now targets the test split.
updated_dataset_train = updated_dataset_train.filter(lambda x: len(x["words"]) + len(x["question"].split()) < 512)
updated_dataset_test = updated_dataset_test.filter(lambda x: len(x["words"]) + len(x["question"].split()) < 512)

Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

In [16]:
# Drop the dataset-provided OCR columns from both splits; they are regenerated
# by the image processor in a later cell.
updated_dataset_train = updated_dataset_train.remove_columns(["words", "bounding_boxes"])

updated_dataset_test = updated_dataset_test.remove_columns(["words", "bounding_boxes"])

In [18]:
# Spot-check one example's image path (relative to the dataset root directory).
updated_dataset_train[11]["image"]

'training_data/images/data27.jpg'

In [19]:
# Load the processor for the checkpoint; its image_processor and tokenizer
# attributes are used separately in the cells below.
processor = AutoProcessor.from_pretrained(model_checkpoint)

### Preprocessing the document images

In [20]:
# Image half of the processor; called on batches of PIL images below.
image_processor = processor.image_processor

# Directory that the relative image paths in the dataset are resolved against.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
root_dir = '/Users/svenja/Downloads/dataset/'

#get words and boxes by using optical character recognition
def get_ocr_words_and_boxes(examples):
    """Run the image processor on a batch of document images.

    Relies on the notebook-level names ``root_dir`` and ``image_processor``.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``. Its
            ``'image'`` entry is a list of image paths relative to ``root_dir``.

    Returns:
        The same dict, updated in place:
            * ``'source'``         - the original relative image paths
            * ``'image'``          - ``pixel_values`` returned by the image processor
            * ``'words'``          - ``words`` returned by the image processor
            * ``'bounding_boxes'`` - ``boxes`` returned by the image processor
    """
    #get a batch of document images
    images = []
    for image_file in examples['image']:
        # Open inside a context manager so the file handle is released right
        # after the pixel data has been copied by convert().
        with Image.open(os.path.join(root_dir, image_file)) as raw_image:
            images.append(raw_image.convert("RGB"))

    # Keep the original paths before 'image' is overwritten with pixel values.
    examples['source'] = examples['image']

    #resize every image to 224x224 + apply tesseract to get words + normalized boxes
    encoded_inputs = image_processor(images)

    examples['image'] = encoded_inputs.pixel_values
    examples['words'] = encoded_inputs.words
    examples['bounding_boxes'] = encoded_inputs.boxes

    return examples

In [21]:
#applying the function to the dataset
# Each call processes the split in batches of two examples.

dataset_with_ocr_train = updated_dataset_train.map(get_ocr_words_and_boxes, batched=True, batch_size = 2)

dataset_with_ocr_test = updated_dataset_test.map(get_ocr_words_and_boxes, batched=True, batch_size = 2)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# Print the OCR words and bounding boxes of the first two training examples.
print(dataset_with_ocr_train[0]['words'])
print(dataset_with_ocr_train[0]['bounding_boxes'])
print("-----")
print(dataset_with_ocr_train[1]['words'])
print(dataset_with_ocr_train[1]['bounding_boxes'])

In [24]:
# List the columns available after the OCR step.
dataset_with_ocr_train[0].keys()

dict_keys(['id', 'image', 'answer', 'question', 'source', 'words', 'bounding_boxes'])

### Preprocessing the text data

In [25]:
# Text half of the processor.
# NOTE(review): `tokenizer` is reassigned from AutoTokenizer in a later cell.
tokenizer = processor.tokenizer

In [26]:
def subfinder(words_list, answer_list):
    """Find the first contiguous occurrence of ``answer_list`` inside ``words_list``.

    Args:
        words_list: List of words to search in.
        answer_list: List of words to look for.

    Returns:
        A tuple ``(match, start_index, end_index)``. On success ``match`` is
        ``answer_list`` and the indices are the inclusive positions of its first
        and last word in ``words_list``. If there is no occurrence, or
        ``answer_list`` is empty, the result is ``(None, 0, 0)``.
    """
    answer_len = len(answer_list)
    # An empty answer cannot be located; without this guard indexing
    # answer_list[0] would raise IndexError.
    if answer_len == 0:
        return None, 0, 0

    # Only windows that fit entirely inside words_list can match.
    for start in range(len(words_list) - answer_len + 1):
        if words_list[start : start + answer_len] == answer_list:
            return answer_list, start, start + answer_len - 1
    return None, 0, 0

In [27]:
#example
# Toy question / words / answer used by the following cells to walk through the
# answer-span lookup; every word gets the same placeholder bounding box.
question = "where is it located?"
words = ["this", "is", "located", "in", "the", "university", "of", "california", "in", "the", "US"]
boxes = [[1000,1000,1000,1000] for _ in range(len(words))]
answer = "university of california"

In [28]:
# Reload the tokenizer directly from the checkpoint.
# NOTE(review): this overwrites the `tokenizer` taken from processor.tokenizer in
# an earlier cell; confirm both are meant to be the same tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [29]:
# Later cells call encoding.word_ids() / sequence_ids(), so make sure a fast
# tokenizer was loaded.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [30]:
# Encode the toy question together with the toy words and their boxes.
encoding = tokenizer(question, words, boxes=boxes)

In [31]:
# Decode the ids back to text to see the special tokens and sequence layout.
tokenizer.decode(encoding.input_ids)

'[CLS] where is it located? [SEP] this is located in the university of california in the us [SEP]'

In [32]:
# Per token: index of the word it came from (None for special tokens, as the
# output below shows).
print(encoding.word_ids())

[None, 0, 1, 2, 3, 3, None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]


In [33]:
# Locate the answer words inside the toy word list.
match, word_idx_start, word_idx_end = subfinder(words, answer.split())

In [34]:
# Show the matched words and their inclusive word-level start/end indices.
print("Match:", match)
print("Word idx start:", word_idx_start)
print("Word idx end:", word_idx_end)

Match: ['university', 'of', 'california']
Word idx start: 5
Word idx end: 7


In [35]:
# Walk-through: map the word-level answer span (word_idx_start..word_idx_end)
# of the toy example to token-level start/end positions in the encoding.
sequence_ids = encoding.sequence_ids()

#start token index of the current span in the text
# (first token whose sequence id is 1)
token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1

#end token index of the current span in the text
# (last token whose sequence id is 1)
token_end_index = len(encoding.input_ids) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1

print("Token start index:", token_start_index)
print("Token end index:", token_end_index)
print(tokenizer.decode(encoding.input_ids[token_start_index:token_end_index+1]))

word_ids = encoding.word_ids()[token_start_index:token_end_index+1]
print("Word ids:", word_ids)

# Advance token_start_index past every token that does not belong to the first
# answer word; the index is frozen (and recorded) once that word is reached.
# The loop variable is named `word_id` so the builtin `id` is not shadowed at
# notebook scope.
for word_id in word_ids:
    if word_id == word_idx_start:
        start_position = token_start_index
    else:
        token_start_index += 1

# Same idea from the right-hand side for the last answer word.
for word_id in word_ids[::-1]:
    if word_id == word_idx_end:
        end_position = token_end_index
    else:
        token_end_index -= 1

print(start_position)
print(end_position)
print("Reconstructed answer:", tokenizer.decode(encoding.input_ids[start_position:end_position+1]))

Token start index: 7
Token end index: 17
this is located in the university of california in the us
Word ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
12
14
Reconstructed answer: university of california


In [38]:
#encode the data

def encode_dataset(examples, max_length=512):
    """Tokenize a batch of OCR'd examples and label the answer span of each one.

    Relies on the notebook-level names ``tokenizer`` and ``subfinder``. The
    tokenizer must be a fast tokenizer because ``encoding.word_ids`` is used.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``. The keys
            ``"question"``, ``"words"``, ``"bounding_boxes"``, ``"answer"``,
            ``"source"`` and ``"image"`` are read.
        max_length: Length every encoded example is padded / truncated to.

    Returns:
        The tokenizer's encoding for the batch, extended with ``"image"`` (copied
        from the input), ``"start_positions"`` and ``"end_positions"``. The two
        positions are token indices of the first and last answer token. Both are
        set to the index of the CLS token when the answer words are not found in
        the OCR words, or when the answer is not fully contained in the encoded
        (possibly truncated) sequence.

    Side effects:
        Prints the match result, the lower-cased OCR words, answer and question
        of every example, plus the true and reconstructed answer on a match.
    """
    questions = examples["question"]
    words = examples["words"]
    boxes = examples["bounding_boxes"]
    answers = examples["answer"]
    sources = examples["source"]

    #encode the batch of examples and initialize the start_positions and end_positions
    encoding = tokenizer(questions, words, boxes, max_length=max_length, padding="max_length", truncation=True)
    start_positions = []
    end_positions = []

    #loop through the examples in the batch
    for i in range(len(questions)):
        input_ids = encoding["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        #find position of the answer in example's words
        words_example = [word.lower() for word in words[i]]
        answer = answers[i]
        answer_words = answer.lower().split()
        # An empty answer has no span to look for; treat it like "no match".
        if answer_words:
            match, word_idx_start, word_idx_end = subfinder(words_example, answer_words)
        else:
            match, word_idx_start, word_idx_end = None, 0, 0

        print(f"{i}. match: {match} in {sources[i]}")
        print(f'\nwords_example: {" ".join(words_example)} \nanswer: {answer}\nquestion: {questions[i]}')

        # Default label: the CLS token.
        start_position = cls_index
        end_position = cls_index

        if match:
            # Tokens of the words sequence carry token_type_id 1; the word ids of
            # those tokens index into words[i].
            token_type_ids = encoding["token_type_ids"][i]
            word_ids = encoding.word_ids(i)

            start_tokens = [
                position
                for position, (segment, word_id) in enumerate(zip(token_type_ids, word_ids))
                if segment == 1 and word_id == word_idx_start
            ]
            end_tokens = [
                position
                for position, (segment, word_id) in enumerate(zip(token_type_ids, word_ids))
                if segment == 1 and word_id == word_idx_end
            ]

            # Label the span only if both ends survived truncation; otherwise
            # keep the CLS label for both so that end never precedes start.
            if start_tokens and end_tokens:
                start_position = start_tokens[0]   # first token of the first answer word
                end_position = end_tokens[-1]      # last token of the last answer word

            print("True answer:", answer)
            # Decode from this example's ids (index i), not from row cls_index.
            reconstructed_answer = tokenizer.decode(input_ids[start_position:end_position + 1])
            print("Reconstructed answer:", reconstructed_answer)

        start_positions.append(start_position)
        end_positions.append(end_position)

        print("\n\n\n")

    encoding["image"] = examples["image"]
    encoding["start_positions"] = start_positions
    encoding["end_positions"] = end_positions

    return encoding

In [39]:
#define custom features
# Explicit schema for the encoded datasets: bbox and image are stored as
# fixed-shape arrays, the remaining encoder outputs as int64 sequences/scalars.
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'image': Array3D(dtype="int64", shape=(3, 224, 224)),
    'start_positions': Value(dtype='int64'),
    'end_positions': Value(dtype='int64'),
})

In [None]:
#encode entire dataset
# All input columns are removed so only the columns returned by encode_dataset
# (matching `features`) remain.
encoded_train_dataset = dataset_with_ocr_train.map(
    encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr_train.column_names, features=features
)
encoded_test_dataset = dataset_with_ocr_test.map(
    encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr_test.column_names, features=features
)

In [46]:
# Confirm that the encoded training set has the declared schema.
encoded_train_dataset.features

{'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'bbox': Array2D(shape=(512, 4), dtype='int64', id=None),
 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'image': Array3D(shape=(3, 224, 224), dtype='int64', id=None),
 'start_positions': Value(dtype='int64', id=None),
 'end_positions': Value(dtype='int64', id=None)}

In [47]:
# Make the encoded training set return torch tensors, then pull one batch of
# four from a DataLoader and print each tensor's shape as a sanity check.
encoded_train_dataset.set_format(type="torch")
train_dataloader = torch.utils.data.DataLoader(encoded_train_dataset, batch_size=4)
batch = next(iter(train_dataloader))

for k,v in batch.items():
    print(k, v.shape)

input_ids torch.Size([4, 512])
bbox torch.Size([4, 512, 4])
attention_mask torch.Size([4, 512])
token_type_ids torch.Size([4, 512])
image torch.Size([4, 3, 224, 224])
start_positions torch.Size([4])
end_positions torch.Size([4])


In [48]:
# Same shape sanity check for the encoded test set.
# NOTE: `batch` from the training cell is overwritten here.
encoded_test_dataset.set_format(type="torch")
eval_dataloader = torch.utils.data.DataLoader(encoded_test_dataset, batch_size=4)
batch = next(iter(eval_dataloader))

for k,v in batch.items():
    print(k, v.shape)

input_ids torch.Size([4, 512])
bbox torch.Size([4, 512, 4])
attention_mask torch.Size([4, 512])
token_type_ids torch.Size([4, 512])
image torch.Size([4, 3, 224, 224])
start_positions torch.Size([4])
end_positions torch.Size([4])


### Evaluation

In [45]:
# `datasets.load_metric` is deprecated; `evaluate.load` (already imported at the
# top of the notebook) is its replacement.
# NOTE(review): neither object is used by the cells shown in this notebook.
metric = evaluate.load('accuracy')
metric1 = evaluate.load('accuracy')


# Directory the relative image paths are resolved against (same value as in the
# OCR cell). NOTE(review): absolute, machine-specific path.
root_dir = '/Users/svenja/Downloads/dataset/'
model_acc = 0
ANLS_final_score = 0
# Number of test examples.
model_acc_divider = len(dataset["test"])

# One entry is appended to each list per compute_metrics call.
anls_list = []
accuracy_list = []
recall_list = []
precision_list = []
f1_list = []

def ANLS(pred,ans):
    """Score one prediction against one reference answer.

    The score is ``1 - NL`` where ``NL`` is the edit distance between the
    lower-cased strings divided by the length of the longer string; scores with
    ``NL >= 0.5`` are clipped to 0.

    Args:
        pred: Predicted answer string.
        ans: Reference answer string, or ``None`` if there is no reference.

    Returns:
        ``[score]`` (a one-element list), or ``[]`` when ``ans`` is ``None``.
    """
    # No reference answer: nothing to score. (Indexing ans[0] here would raise.)
    if ans is None:
        return []

    longest = max(len(ans), len(pred))
    # Both strings empty: they are identical, and dividing by zero must be avoided.
    if longest == 0:
        return [1.0]

    ed = editdistance.eval(ans.lower(),pred.lower())
    NL = ed/longest
    return [1-NL if NL<0.5 else 0]

def compute_metrics(eval_preds):
    """Evaluate the current model on the raw test split and return averaged metrics.

    ``eval_preds`` is not used. Every example of ``dataset["test"]`` is read from
    disk again, run through a ``LayoutLMv2Processor`` and the notebook-level
    ``model``, and the decoded answer span is compared with the reference answer.

    Per example the function computes the ANLS score plus accuracy and weighted
    recall / precision / F1 over the zero-padded UTF-8 byte values of the
    reference and predicted strings. The per-example values are averaged over
    the test split.

    Relies on the notebook-level names ``dataset``, ``root_dir``,
    ``model_checkpoint``, ``model``, ``ANLS`` and the ``*_list`` accumulators.

    Side effects:
        * prints per-example and aggregate results;
        * appends the averages to ``anls_list``, ``accuracy_list``,
          ``recall_list``, ``precision_list`` and ``f1_list``;
        * writes ``state_latest.pth`` once per call. Its ``'epoch'`` entry holds
          the id of the last evaluated example and ``'ANLS'`` that example's
          score, which is what repeated per-example saving would leave behind.

    Args:
        eval_preds: Ignored; accepted for the ``Trainer`` callback signature.

    Returns:
        dict with the keys ``'ANLS'``, ``'Accuracy'``, ``'Recall'``,
        ``'Precision'`` and ``'F1'``.
    """
    test_examples = dataset["test"]
    num_examples = len(test_examples)

    # Loop-invariant: load the processor once instead of once per example.
    eval_processor = LayoutLMv2Processor.from_pretrained(model_checkpoint)

    anls_epoch = 0
    accuracy_epoch = 0
    recall_epoch = 0
    precision_epoch = 0
    f1_epoch = 0

    last_example_id = None
    last_anls = None

    for example in test_examples:
        example_id = example['id']
        question = example['query']
        answer = example['answer']

        image_path = os.path.join(root_dir, 'testing_data', 'images', example['image'])
        # Release the file handle as soon as the pixels have been copied.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        print("Question:", question,"\n")

        #prepare for the model
        # truncation=True keeps over-long documents within the model's maximum
        # sequence length instead of failing in the forward pass.
        encoding = eval_processor(image, question, return_tensors="pt", truncation=True)

        #forward pass
        for k,v in encoding.items():
            encoding[k] = v.to(model.device)

        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            outputs = model(**encoding)

        #get largest logit for both
        predicted_start_idx = outputs.start_logits.argmax(-1).item()
        predicted_end_idx = outputs.end_logits.argmax(-1).item()

        #decode the predicted answer
        # (an end index before the start index yields an empty string)
        predicted_answer = eval_processor.tokenizer.decode(
            encoding.input_ids.squeeze()[predicted_start_idx:predicted_end_idx+1]
        )

        print("True answer:", answer)
        print("Predicted answer:", predicted_answer, "\n")

        # Part for ANLS
        scores = ANLS(predicted_answer, answer)
        # ANLS returns an empty list when there is nothing to score; count that
        # as 0 rather than letting np.mean([]) produce NaN.
        final_score = np.mean(scores) if scores else 0.0
        print('ID: {}, ANLS: {}'.format(example_id,final_score))
        last_example_id = example_id
        last_anls = final_score

        # Byte-level comparison: pad the shorter byte sequence with zeros so both
        # label sequences have equal length (integer labels on both sides).
        answer_bytes = list(bytes(answer, 'utf8'))
        pred_bytes = list(bytes(predicted_answer, 'utf8'))
        max_length = max(len(answer_bytes), len(pred_bytes))
        answer_labels = answer_bytes + [0] * (max_length - len(answer_bytes))
        pred_labels = pred_bytes + [0] * (max_length - len(pred_bytes))

        if max_length == 0:
            # Both strings are empty, i.e. identical; the sklearn scorers cannot
            # be called on empty label sequences.
            accuracy = recall = precision = f1 = 1.0
        else:
            accuracy = accuracy_score(answer_labels, pred_labels, normalize=True, sample_weight=None)
            recall = recall_score(answer_labels, pred_labels, average='weighted', zero_division=0)
            precision = precision_score(answer_labels, pred_labels, average='weighted', zero_division=0)
            # zero_division=0 matches the sibling scorers (same value, no warning).
            f1 = f1_score(answer_labels, pred_labels, average='weighted', zero_division=0)

        print("accuracy:", accuracy)
        print("recall:", recall)
        print("precision:", precision)
        print("f1:", f1)

        anls_epoch += final_score
        accuracy_epoch += accuracy
        recall_epoch += recall
        precision_epoch += precision
        f1_epoch += f1

        print('')
        print("--------------------------------------")

    # The weights do not change during evaluation, so one save after the loop
    # leaves the same file as saving after every example.
    if last_example_id is not None:
        torch.save({'state_dict':model.state_dict(), 'epoch':last_example_id, 'ANLS':last_anls},'state_latest.pth')

    # Average over the test split (guard against an empty split).
    divider = num_examples if num_examples else 1
    anls_epoch = anls_epoch/divider
    accuracy_epoch = accuracy_epoch/divider
    recall_epoch = recall_epoch/divider
    precision_epoch = precision_epoch/divider
    f1_epoch = f1_epoch/divider

    epoch_values = dict()
    epoch_values['ANLS'] = anls_epoch
    epoch_values['Accuracy'] = accuracy_epoch
    epoch_values['Recall'] = recall_epoch
    epoch_values['Precision'] = precision_epoch
    epoch_values['F1'] = f1_epoch

    # Keep a history across calls in the notebook-level lists.
    anls_list.append(anls_epoch)
    accuracy_list.append(accuracy_epoch)
    recall_list.append(recall_epoch)
    precision_list.append(precision_epoch)
    f1_list.append(f1_epoch)

    print("ANLS", anls_list)
    print("Accuracy", accuracy_list)
    print("Recall", recall_list)
    print("Precision", precision_list)
    print("F1", f1_list)

    print("Epoch values:", epoch_values)

    print('+++++++++++++++++++++++++++++++++++++++++++')
    return epoch_values


  metric = load_metric('accuracy')


### Training the model

In [51]:
# Load the pretrained checkpoint with a document-question-answering head.
# The log below lists qa_outputs.* as newly initialized, so the QA head starts
# untrained and is learned during fine-tuning.
model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at microsoft/layoutlmv2-base-uncased were not used when initializing LayoutLMv2ForQuestionAnswering: ['layoutlmv2.visual.backbone.bottom_up.res4.13.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.9.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res5.2.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.15.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.5.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.6.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.15.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.20.conv2.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res2.0.conv3.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.stem.conv1.norm.num_batches_tracked', 'layoutlmv2.visual.backbone.bottom_up.res4.20.conv3.norm.num_batches_tracked', 

Some weights of LayoutLMv2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight', 'layoutlmv2.visual_segment_embedding']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Log in to the Hugging Face Hub before pushing the model.
# NOTE(review): notebook_login() was already called near the top of the
# notebook; this second call looks redundant within one session.
notebook_login()

In [54]:
# Used as the Trainer's output directory; the Trainer output below shows the Hub
# repository of the same name being cloned into it.
repo_id = "svenjars/layoutlmv2"

#define training arguments
# NOTE(review): eval_steps presumably has no effect while
# evaluation_strategy="epoch"; confirm against the TrainingArguments docs.
# remove_unused_columns=False keeps every column of the encoded datasets.
training_args = TrainingArguments(
    output_dir=repo_id,
    per_device_train_batch_size=1,
    num_train_epochs=20,
    save_steps=20,
    logging_steps=20, 
    eval_steps=20, 
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=True,
)





In [55]:
#batch examples together
# Every encoded example is already padded to the same length, so the default
# collator is used as is.
data_collator = DefaultDataCollator()

In [56]:
# Set explicitly, as suggested by the huggingface/tokenizers fork warning shown
# in the model-loading cell's output.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#define trainer
# The full processor is passed as `tokenizer`; compute_metrics is the custom
# evaluation function defined in the Evaluation section.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=processor,
    compute_metrics = compute_metrics,
)



Cloning https://huggingface.co/svenjars/layoutlmv2 into local empty directory.


In [None]:
#call trainer
# Runs the fine-tuning configured by training_args.
trainer.train()

In [None]:
#create model card and push to huggingface
# Requires a successful Hub login earlier in the session.
trainer.create_model_card()
trainer.push_to_hub()

In [None]:
#evaluating the training
# Runs a final evaluation pass with the Trainer configured above.
trainer.evaluate()