**Mounting Google-Drive:**

First, mount Google Drive so that the notebook can access the dataset files.

In [None]:
# Colab-only: expose Google Drive under /content/drive so the Drive paths used
# by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the project folder on the mounted Drive; this path only exists
# after the drive.mount(...) cell has run.
os.chdir("/content/drive/My Drive/aws_annotation/")

**Install libraries:**


In [None]:
# ! rm -r unilm
# ! git clone -b remove_torch_save https://github.com/NielsRogge/unilm.git
# ! cd unilm/layoutlm
# ! pip install unilm/layoutlm

In [None]:
# ! rm -r transformers
# ! git clone https://github.com/huggingface/transformers.git
# ! cd transformers
# ! pip install ./transformers

### Load labels

In [None]:
# Installs transformers without a version pin; the saved output of this cell
# shows that 4.28.1 was resolved on the last run.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
from torch.nn import CrossEntropyLoss

def get_labels(path):
    """Return the labels listed in ``path``, one per line.

    Duplicate lines are dropped; the first occurrence of each label keeps its
    position, so the index of a label in the returned list is stable.
    """
    with open(path, "r") as handle:
        raw_lines = handle.read().splitlines()
    # dict keys preserve insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(raw_lines))

# Ordered, de-duplicated label list; a label's position in this list is its class id.
labels = get_labels("/content/drive/My Drive/aws_annotation/data/labels.txt")
# labels = get_labels("/content/drive/MyDrive/combined_data/labels.txt"
# print(labels)
num_labels = len(labels)
# id -> label string, used by do_eval to decode ids back to tags.
# (convert_single_example_to_feature builds the inverse label -> id mapping itself.)
label_map = {i: label for i, label in enumerate(labels)}
# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
pad_token_label_id = CrossEntropyLoss().ignore_index
# print(pad_token_label_id)

In [None]:
len(labels)

205

In [None]:
labels

['O-OTHERS',
 'B-ALTERNATEMOBILE',
 'I-ALTERNATEMOBILE',
 'E-ALTERNATEMOBILE',
 'S-ALTERNATEMOBILE',
 'B-BASICTPPREMIUM',
 'I-BASICTPPREMIUM',
 'E-BASICTPPREMIUM',
 'S-BASICTPPREMIUM',
 'B-CC',
 'I-CC',
 'E-CC',
 'S-CC',
 'B-CGST',
 'I-CGST',
 'E-CGST',
 'S-CGST',
 'B-CHASSISNO',
 'I-CHASSISNO',
 'E-CHASSISNO',
 'S-CHASSISNO',
 'B-COMMUNICATIONADDRESS',
 'I-COMMUNICATIONADDRESS',
 'E-COMMUNICATIONADDRESS',
 'S-COMMUNICATIONADDRESS',
 'B-COMMUNICATIONCITY',
 'I-COMMUNICATIONCITY',
 'E-COMMUNICATIONCITY',
 'S-COMMUNICATIONCITY',
 'B-COMMUNICATIONPINCODE',
 'I-COMMUNICATIONPINCODE',
 'E-COMMUNICATIONPINCODE',
 'S-COMMUNICATIONPINCODE',
 'B-COMMUNICATIONSTATE',
 'I-COMMUNICATIONSTATE',
 'E-COMMUNICATIONSTATE',
 'S-COMMUNICATIONSTATE',
 'B-ENGINENO',
 'I-ENGINENO',
 'E-ENGINENO',
 'S-ENGINENO',
 'B-FUEL',
 'I-FUEL',
 'E-FUEL',
 'S-FUEL',
 'B-GROSSPREMIUM',
 'I-GROSSPREMIUM',
 'E-GROSSPREMIUM',
 'S-GROSSPREMIUM',
 'B-GVW',
 'I-GVW',
 'E-GVW',
 'S-GVW',
 'B-INSURER',
 'I-INSURER',
 'E-INSURER

In [None]:
# label_map

### Feature extraction

In [None]:
import logging
import os

import torch
from torch.utils.data import Dataset

logger = logging.getLogger(__name__)
import random

# image_path_dict = {}
# count = 1
class FunsdDataset(Dataset):
    """Dataset of fixed-length token-classification features for one split.

    Features are either loaded from a cache file inside ``args.data_dir`` or
    rebuilt from the ``mode`` split files via ``read_examples_from_file`` and
    ``convert_examples_to_features`` (and then cached). Each item is the tuple
    ``(input_ids, input_mask, segment_ids, label_ids, bboxes, file_name)``.
    """

    def __init__(self, args, tokenizer, labels, pad_token_label_id, mode):
        # Distributed training: every rank except the first waits here so that
        # only one process builds the features; the others then read the cache.
        if args.local_rank not in [-1, 0] and mode == "train":
            torch.distributed.barrier()

        # Cache name = split + last non-empty component of the model path + sequence length.
        model_tag = [part for part in args.model_name_or_path.split("/") if part][-1]
        cache_name = "cached_{}_{}_{}".format(mode, model_tag, str(args.max_seq_length))
        cached_features_file = os.path.join(args.data_dir, cache_name)

        cache_usable = os.path.exists(cached_features_file) and not args.overwrite_cache
        if not cache_usable:
            logger.info("Creating features from dataset file at %s", args.data_dir)
            examples = read_examples_from_file(args.data_dir, mode)

            # Model-family specific layout switches.
            xlnet_style = args.model_type in ["xlnet"]
            roberta_style = args.model_type in ["roberta"]

            (
                features,
                _tokens,
                _labels,
                _index,
                _token_boxes,
                _actual_boxes,
            ) = convert_examples_to_features(
                examples,
                labels,
                args.max_seq_length,
                tokenizer,
                # xlnet has a cls token at the end
                cls_token_at_end=bool(xlnet_style),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if xlnet_style else 0,
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences
                sep_token_extra=bool(roberta_style),
                # pad on the left for xlnet
                pad_on_left=bool(xlnet_style),
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=4 if xlnet_style else 0,
                pad_token_label_id=pad_token_label_id
            )
            if args.local_rank in [-1, 0]:
                logger.info("Saving features into cached file %s", cached_features_file)
                torch.save(features, cached_features_file)
        else:
            logger.info("Loading features from cached file %s", cached_features_file)
            features = torch.load(cached_features_file)

        # The first rank releases the waiting ranks once the cache is in place.
        if args.local_rank == 0 and mode == "train":
            torch.distributed.barrier()

        self.features = features

        def _long_tensor(field):
            # Stack one per-feature list attribute into a single int64 tensor.
            return torch.tensor(
                [getattr(feature, field) for feature in features], dtype=torch.long
            )

        self.all_input_ids = _long_tensor("input_ids")
        self.all_input_mask = _long_tensor("input_mask")
        self.all_segment_ids = _long_tensor("segment_ids")
        self.all_label_ids = _long_tensor("label_ids")
        self.all_bboxes = _long_tensor("boxes")
        # File names stay a plain Python list (they are not tensors).
        self.all_filename = [feature.file_name for feature in features]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        columns = (
            self.all_input_ids,
            self.all_input_mask,
            self.all_segment_ids,
            self.all_label_ids,
            self.all_bboxes,
            self.all_filename,
        )
        return tuple(column[index] for column in columns)


class InputExample(object):
    """One raw (untokenized) example for token classification."""

    def __init__(self, guid, words, labels, boxes, actual_bboxes, file_name, page_size):
        """Store the fields of a single example.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. One label per word; expected for train and
                dev examples, but not necessarily for test examples.
            boxes: list. One bounding box per word, as read from the box file.
            actual_bboxes: list. One bounding box per word, as read from the image file.
            file_name: Name of the source file the words came from.
            page_size: Page dimensions; unpacked elsewhere as ``(width, height)``.
        """
        # Example-level metadata.
        self.guid = guid
        self.file_name = file_name
        self.page_size = page_size
        # Per-word parallel lists.
        self.words, self.labels = words, labels
        self.boxes, self.actual_bboxes = boxes, actual_bboxes


class InputFeatures(object):
    """A single set of model-ready features for one fixed-length sequence.

    Args:
        input_ids: Token ids of the sequence.
        input_mask: Attention mask aligned with ``input_ids``.
        segment_ids: Segment (token type) ids aligned with ``input_ids``.
        label_ids: Label ids aligned with ``input_ids``.
        boxes: One box per token, each an iterable of coordinates that must
            all lie in ``[0, 1000]``.
        actual_bboxes: Per-token boxes as read from the image file.
        file_name: Name of the source file.
        page_size: Page dimensions of the source page.

    Raises:
        AssertionError: If any coordinate in ``boxes`` is outside ``[0, 1000]``.
    """

    def __init__(
        self,
        input_ids,
        input_mask,
        segment_ids,
        label_ids,
        boxes,
        actual_bboxes,
        file_name,
        page_size,
    ):
        # Check every coordinate individually: `0 <= all(boxes) <= 1000` would
        # compare a bool against the bounds and therefore could never fail.
        assert all(
            0 <= coord <= 1000 for box in boxes for coord in box
        ), "Error with input bbox ({}): the coordinate value is not between 0 and 1000".format(
            boxes
        )
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size


def read_examples_from_file(data_dir, mode):
    """Read the three parallel split files and group their lines into examples.

    The files ``{mode}.txt``, ``{mode}_box.txt`` and ``{mode}_image.txt`` in
    ``data_dir`` are read line by line in lock-step. Lines are tab separated:

    * ``{mode}.txt``: word, label
    * ``{mode}_box.txt``: word, space-separated integer box
    * ``{mode}_image.txt``: four fields; field 1 is the space-separated
      integer box, field 2 the space-separated page size, field 3 the file name

    A blank line or a line starting with ``-DOCSTART-`` in ``{mode}.txt`` ends
    the current example.

    Args:
        data_dir: Directory containing the three files.
        mode: Split name used as file-name prefix and as guid prefix.

    Returns:
        list of InputExample, with guids ``"{mode}-{n}"`` numbered from 1.
    """
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    box_file_path = os.path.join(data_dir, "{}_box.txt".format(mode))
    image_file_path = os.path.join(data_dir, "{}_image.txt".format(mode))
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f, open(
        box_file_path, encoding="utf-8"
    ) as fb, open(image_file_path, encoding="utf-8") as fi:
        words = []
        boxes = []
        actual_bboxes = []
        file_name = None
        page_size = None
        labels = []
        for idx, (line, bline, iline) in enumerate(zip(f, fb, fi)):
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                # Example boundary: flush what has been collected so far.
                if words:
                    examples.append(
                        InputExample(
                            guid="{}-{}".format(mode, guid_index),
                            words=words,
                            labels=labels,
                            boxes=boxes,
                            actual_bboxes=actual_bboxes,
                            file_name=file_name,
                            page_size=page_size,
                        )
                    )
                    guid_index += 1
                    words = []
                    boxes = []
                    actual_bboxes = []
                    file_name = None
                    page_size = None
                    labels = []
            else:
                splits = line.split("\t")
                bsplits = bline.split("\t")
                isplits = iline.split("\t")
                # Best-effort diagnostics: report lines whose three files are out
                # of sync or malformed, but keep going as before.
                well_formed = (
                    splits[0] == bsplits[0]
                    and len(splits) == 2
                    and len(bsplits) == 2
                    and len(isplits) == 4
                )
                if not well_formed:
                    print(idx, splits[0], "-----", bsplits[0], isplits[-1])

                words.append(splits[0])
                if len(splits) > 1:
                    label = splits[-1].replace("\n", "")
                    if label == "":
                        print("wrong labels")
                    labels.append(label)
                    box = bsplits[-1].replace("\n", "")
                    box = [int(b) for b in box.split()]
                    boxes.append(box)
                    actual_bbox = [int(b) for b in isplits[1].split()]
                    actual_bboxes.append(actual_bbox)
                    page_size = [int(i) for i in isplits[2].split()]
                    file_name = isplits[3].strip()
                else:
                    # Examples could have no label for mode = "test"
                    # NOTE(review): no box / page size is recorded on this path,
                    # so words and boxes stop being parallel, and "O" must exist
                    # in the label list used downstream - confirm this is intended.
                    labels.append("O")
        # Flush the trailing example when the file does not end with a separator.
        if words:
            examples.append(
                InputExample(
                    # str.format needs {} placeholders; "%s-%d".format(...) would
                    # yield the literal text "%s-%d" as guid.
                    guid="{}-{}".format(mode, guid_index),
                    words=words,
                    labels=labels,
                    boxes=boxes,
                    actual_bboxes=actual_bboxes,
                    file_name=file_name,
                    page_size=page_size,
                )
            )
    return examples


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index

import collections
# NOTE(review): module-level list that is not referenced anywhere in the visible
# part of this notebook - confirm whether it is still needed.
original_token = []
def convert_single_example_to_feature(
    ex_index,
    example,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    cls_token_box=[0, 0, 0, 0],
    sep_token_box=[1000, 1000, 1000, 1000],
    pad_token_box=[0, 0, 0, 0],
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
    doc_stride=384
):
    """Tokenize one example and cut it into ``max_seq_length``-sized features.

    The words are sub-tokenized; the first sub-token of a word carries the
    word's label id and the remaining ones ``pad_token_label_id``. The token
    sequence is then covered by windows of at most
    ``max_seq_length - <number of special tokens>`` tokens that advance by
    ``doc_stride`` tokens, and each window becomes one padded ``InputFeatures``.

    `cls_token_at_end` defines the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated to the CLS token
    (0 for BERT, 2 for XLNet).
    `mask_padding_with_zero`: when True, real tokens get mask 1 and padding 0;
    when False the two values are swapped.

    Returns:
        Five parallel lists with one entry per window: the ``InputFeatures``,
        the (unpadded) token strings, the padded label ids, the (unpadded)
        actual boxes and the padded token boxes.

    Raises:
        KeyError: If an example label is not in ``label_list``.
        ValueError: If the window size or ``doc_stride`` would not let the
            sliding window advance.
    """
    file_name = example.file_name
    page_size = example.page_size
    width, height = page_size

    label_map = {label: i for i, label in enumerate(label_list)}

    tokens = []
    token_boxes = []
    actual_bboxes = []
    label_ids = []
    for word, label, box, actual_bbox in zip(
        example.words, example.labels, example.boxes, example.actual_bboxes
    ):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            # A word that tokenizes to nothing must not contribute a label,
            # otherwise label_ids would become longer than tokens.
            continue
        tokens.extend(word_tokens)
        token_boxes.extend([box] * len(word_tokens))
        actual_bboxes.extend([actual_bbox] * len(word_tokens))
        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
        label_ids.extend(
            [label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)
        )

    # Reserve room for [CLS] and [SEP], plus the extra [SEP] used by RoBERTa.
    special_tokens_count = 3 if sep_token_extra else 2
    max_tokens_for_doc = max_seq_length - special_tokens_count

    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(tokens):
        length = min(len(tokens) - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(tokens):
            break
        step = min(length, doc_stride)
        if step < 1:
            # Without this guard the window would never advance (endless loop).
            raise ValueError(
                "max_seq_length ({}) and doc_stride ({}) must allow the sliding "
                "window to advance".format(max_seq_length, doc_stride)
            )
        start_offset += step

    # Mask values for real and padding positions; fixed for the whole example so
    # that every window (not only the first one) is masked consistently.
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    feature_list, ntokens_list, label_ids_list, actual_boxes_list, token_boxes_list = [], [], [], [], []

    for doc_span in doc_spans:
        window = slice(doc_span.start, doc_span.start + doc_span.length)

        token = tokens[window] + [sep_token]
        token_box = token_boxes[window] + [sep_token_box]
        # Special tokens get the whole page as their "actual" box.
        actual_bbox = actual_bboxes[window] + [[0, 0, width, height]]
        label_id = label_ids[window] + [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            token += [sep_token]
            token_box += [sep_token_box]
            actual_bbox += [[0, 0, width, height]]
            label_id += [pad_token_label_id]
        segment_id = [sequence_a_segment_id] * len(token)

        if cls_token_at_end:
            token += [cls_token]
            token_box += [cls_token_box]
            actual_bbox += [[0, 0, width, height]]
            label_id += [pad_token_label_id]
            segment_id += [cls_token_segment_id]
        else:
            token = [cls_token] + token
            token_box = [cls_token_box] + token_box
            actual_bbox = [[0, 0, width, height]] + actual_bbox
            label_id = [pad_token_label_id] + label_id
            segment_id = [cls_token_segment_id] + segment_id

        input_id = tokenizer.convert_tokens_to_ids(token)

        # Only real tokens are attended to.
        input_mask = [real_mask_value] * len(input_id)

        # Pad up to the sequence length (token and actual_bbox stay unpadded).
        padding_length = max_seq_length - len(input_id)
        if pad_on_left:
            input_id = ([pad_token] * padding_length) + input_id
            input_mask = ([pad_mask_value] * padding_length) + input_mask
            segment_id = ([pad_token_segment_id] * padding_length) + segment_id
            label_id = ([pad_token_label_id] * padding_length) + label_id
            token_box = ([pad_token_box] * padding_length) + token_box
        else:
            input_id += [pad_token] * padding_length
            input_mask += [pad_mask_value] * padding_length
            segment_id += [pad_token_segment_id] * padding_length
            label_id += [pad_token_label_id] * padding_length
            token_box += [pad_token_box] * padding_length

        assert len(input_id) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_id) == max_seq_length
        assert len(label_id) == max_seq_length
        assert len(token_box) == max_seq_length

        feature = InputFeatures(
            input_ids=input_id,
            input_mask=input_mask,
            segment_ids=segment_id,
            label_ids=label_id,
            boxes=token_box,
            actual_bboxes=actual_bbox,
            file_name=file_name,
            page_size=page_size,
        )
        feature_list.append(feature)
        ntokens_list.append(token)
        label_ids_list.append(label_id)
        actual_boxes_list.append(actual_bbox)
        token_boxes_list.append(token_box)

    # Log the last window of the first few examples; skipped when the example
    # produced no window at all (the per-window variables do not exist then).
    if ex_index < 5 and feature_list:
        logger.info("*** Example ***")
        logger.info("guid: %s", example.guid)
        logger.info("tokens: %s", " ".join([str(x) for x in token]))
        logger.info("input_ids: %s", " ".join([str(x) for x in input_id]))
        logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
        logger.info("segment_ids: %s", " ".join([str(x) for x in segment_id]))
        logger.info("label_ids: %s", " ".join([str(x) for x in label_id]))
        logger.info("boxes: %s", " ".join([str(x) for x in token_box]))
        logger.info("actual_bboxes: %s", " ".join([str(x) for x in actual_bbox]))
    return feature_list, ntokens_list, label_ids_list, actual_boxes_list, token_boxes_list



def convert_examples_to_features(
    examples,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    cls_token_box=[0, 0, 0, 0],
    sep_token_box=[1000, 1000, 1000, 1000],
    pad_token_box=[0, 0, 0, 0],
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
    doc_stride=384
):
    """Convert every example into features via ``convert_single_example_to_feature``.

    All keyword arguments are forwarded unchanged to
    ``convert_single_example_to_feature`` (see there for their meaning), so the
    values chosen by the caller - e.g. the tokenizer's own special tokens or a
    model-specific ``cls_token_segment_id`` - actually take effect.

    Returns:
        A 6-tuple:
        ``feature_list_total`` - all features of all examples, in order;
        ``batch_tokens``, ``batch_labels``, ``batch_index``,
        ``batch_tokens_boxes``, ``batch_actual_bboxes`` - flat concatenations
        over all windows of the token strings, label ids, originating example
        index (one entry per token string), token boxes and actual boxes.
        Note that label ids and token boxes are padded to ``max_seq_length``
        per window while token strings and actual boxes are not, so the flat
        lists are not position-aligned with each other.
    """
    batch_tokens = []
    batch_labels = []
    batch_tokens_boxes = []
    batch_actual_bboxes = []
    batch_index = []
    feature_list_total = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))
        (
            feature_list,
            ntokens_list,
            label_ids_list,
            actual_boxes_list,
            token_boxes_list,
        ) = convert_single_example_to_feature(
            ex_index,
            example,
            label_list,
            max_seq_length,
            tokenizer,
            cls_token_at_end=cls_token_at_end,
            cls_token=cls_token,
            cls_token_segment_id=cls_token_segment_id,
            sep_token=sep_token,
            sep_token_extra=sep_token_extra,
            pad_on_left=pad_on_left,
            pad_token=pad_token,
            cls_token_box=cls_token_box,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_segment_id=pad_token_segment_id,
            pad_token_label_id=pad_token_label_id,
            sequence_a_segment_id=sequence_a_segment_id,
            mask_padding_with_zero=mask_padding_with_zero,
            doc_stride=doc_stride,
        )
        feature_list_total.extend(feature_list)
        for ntokens, label_ids, actual_boxes, token_boxes in zip(
            ntokens_list, label_ids_list, actual_boxes_list, token_boxes_list
        ):
            batch_tokens.extend(ntokens)
            batch_labels.extend(label_ids)
            batch_index.extend([ex_index] * len(ntokens))
            batch_tokens_boxes.extend(token_boxes)
            batch_actual_bboxes.extend(actual_boxes)

    return feature_list_total, batch_tokens, batch_labels, batch_index, batch_tokens_boxes, batch_actual_bboxes

### Load train and test data in batches

In [None]:
from transformers import LayoutLMTokenizer
# from layoutlm.data.funsd import FunsdDataset, InputFeatures
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Settings read by FunsdDataset:
#   local_rank        -1 takes the single-process path (no distributed barrier; this process writes the cache)
#   overwrite_cache   True means features are rebuilt on every run even if a cache file exists
#   data_dir          folder with the {mode}.txt / {mode}_box.txt / {mode}_image.txt files
#                     (NOTE: spelled 'MyDrive' here but 'My Drive' in the earlier cells - confirm both resolve)
#   model_name_or_path  its last path component becomes part of the cache file name
#   max_seq_length    length every feature is padded to
#   model_type        neither "xlnet" nor "roberta", so the xlnet/roberta-specific options stay off
args = {'local_rank': -1,
        'overwrite_cache': True,
        'data_dir': '/content/drive/MyDrive/aws_annotation/data',
        'model_name_or_path':'microsoft/layoutlm-base-uncased',
        'max_seq_length': 512,
        'model_type': 'layoutlm',}

# class to turn the keys of a dict into attributes (thanks Stackoverflow)
class AttrDict(dict):
    """A dict whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Point the instance namespace at the mapping itself, so attribute
        # access and item access share the same storage.
        self.__dict__ = self

# FunsdDataset reads its settings as attributes (args.data_dir, ...), hence the wrapper.
args = AttrDict(args)

tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

# FunsdDataset is the class defined earlier in this notebook (the packaged
# `layoutlm.data.funsd` import above is commented out).
train_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode="train")
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
# train_sampler = SequentialSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=32, num_workers=2)

# The "test" split serves as the evaluation set and is iterated in file order.
eval_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode="test")
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset,
                             sampler=eval_sampler,
                            batch_size=32, num_workers=2)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

In [None]:
# Sanity check: take one training batch and decode the input ids of its first
# sequence back to text (batch[0] holds the input ids).
# NOTE(review): the decoded text is real document content and the saved output
# shows personal details - clear this cell's output before sharing the notebook.
batch = next(iter(train_dataloader))
input_ids = batch[0][0]
tokenizer.decode(input_ids)

"[CLS] magma hdi - general insurance company ltd. development house, 24 park street, kolkata - 700016 in case of any query, assistance or claims, please contact us at 1800 266 3202 uin : irdan149rp0002v02201213 two wheeler package policy certificate of insurance cum schedule / tax invoice policy servicing office g - 2, ground floor, mandhana towers, 7 - 1 - 59 / 2 & 59 / 6,, ameerpet,, hyderabad - 500016 telangana, ph : ( 1800 ) 2663202 policy no p0323300028 / 4102 / 100942 insured mr chinthakindhi ramesh address # 2 - 12 - 14, vidyaranyapuri hanamkonda period of insurance warangal 00 : 00 hrs of 18 / 10 / 2022 telangana 506009 to midnight of 17 / 10 / 2023 agent no. : brc0000157 mobile : 9885707695 9885707695 agent contact no. : 7551196989 contact number email id : gst number unregistered insured motor vehicle details and premium computation registration mark & no. & rta year of location manufacture engine no. chassis no. make / model / type of body cubic capacity seating capacity ap 

### Load pretrained LayoutLMv1 model

In [None]:
from transformers import LayoutLMForTokenClassification
import torch

# Use the GPU when one is available; do_eval reads this global `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# num_labels (the size of the label list) is passed to the model as its number of classes.
# The model is not moved to `device` here; train_layoutLM calls model.to(device) itself.
base_model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=num_labels)
# model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/453M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

### Load tokenizer

In [None]:
from transformers import LayoutLMTokenizer
# from layoutlm.data.funsd import FunsdDataset, InputFeatures
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Same checkpoint as the tokenizer created in the data-loading cell; when the
# notebook is run top to bottom this re-creation is redundant.
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

**Start Training**

In [None]:
len(labels)

205

In [None]:
# seqeval provides classification_report / IOBES used by do_eval; datasets is
# imported for load_metric. Both are installed without a version pin - the saved
# output shows seqeval 1.2.2 and datasets 2.12.0 on the last run.
!pip install seqeval
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=ac6d8902f080c986266f58cd6580d3b4c1d3dd6ca846d067b0e2e7dce872f73a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m

In [None]:
import pandas as pd
from seqeval.metrics import classification_report
from seqeval.scheme import IOBES
from datasets import load_metric
from transformers import get_linear_schedule_with_warmup
import random

def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed`` for reproducibility."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

def do_eval(model, dataloader_eval):
    """Evaluate ``model`` on ``dataloader_eval`` and report seqeval metrics.

    Each batch is expected in the order produced by ``FunsdDataset``:
    ``(input_ids, input_mask, segment_ids, label_ids, bboxes, file_name)``.
    Positions whose gold label equals ``pad_token_label_id`` are ignored; the
    remaining gold and predicted ids are decoded with the global ``label_map``
    and scored with seqeval's strict IOBES ``classification_report`` (which is
    also printed).

    Args:
        model: Token-classification model returning ``loss`` and ``logits``.
            It is switched to evaluation mode and left in that mode.
        dataloader_eval: Iterable of evaluation batches.

    Returns:
        dict: the seqeval report (``output_dict=True``) with an additional
        ``'eval_loss'`` key holding the mean batch loss.

    Raises:
        ValueError: If ``dataloader_eval`` yields no batch.
    """
    eval_loss = 0.0
    nb_eval_steps = 0
    # Collected per batch and concatenated once at the end; only the arg-max
    # class ids are kept, not the full logits.
    pred_batches = []
    gold_batches = []

    # put model in evaluation mode
    model.eval()
    for batch in tqdm(dataloader_eval, desc="Evaluating"):
        with torch.no_grad():
            input_ids = batch[0].to(device)
            bbox = batch[4].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            gold_ids = batch[3].to(device)

            # forward pass
            outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
                            labels=gold_ids)

            eval_loss += outputs.loss.item()
            nb_eval_steps += 1

            pred_batches.append(np.argmax(outputs.logits.detach().cpu().numpy(), axis=2))
            gold_batches.append(gold_ids.detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("dataloader_eval yielded no batches; nothing to evaluate")

    preds = np.concatenate(pred_batches, axis=0)
    out_label_ids = np.concatenate(gold_batches, axis=0)

    out_label_list = []
    preds_list = []
    for gold_row, pred_row in zip(out_label_ids, preds):
        # Keep only positions that carry a real label (padding label ids are skipped).
        keep = gold_row != pad_token_label_id
        out_label_list.append([label_map[i] for i in gold_row[keep].tolist()])
        preds_list.append([label_map[i] for i in pred_row[keep].tolist()])

    strict = classification_report(out_label_list, preds_list,  output_dict=True, mode='strict', scheme=IOBES)
    print('strict: ', classification_report(out_label_list, preds_list, output_dict=False, mode='strict', scheme=IOBES))

    # Average evaluation loss over the batches.
    strict['eval_loss'] = eval_loss / nb_eval_steps
    return strict

In [None]:
from tqdm import tqdm
import numpy as np
from transformers import AdamW

# ---- Optimizer / schedule hyper-parameters ----
learning_rate = 5e-5                 # alternative value: 1e-4
adam_epsilon = 1e-8
weight_decay = 0.0
max_grad_norm = 1.0                  # threshold passed to clip_grad_norm_ in the training loop
warmup_steps = 0
gradient_accumulation_steps = 1

# ---- Training length ----
num_train_epochs = 100               # still to be tuned (e.g. together with dropout, to reduce overfitting)
max_steps = 0                        # when > 0, fixes the number of optimizer steps and overrides num_train_epochs

# ---- Run bookkeeping / early stopping ----
seed = 42
runs = 1
early_stop = {'patience': 5}
test_mode_metric = 'f1_score'        # alternative: 'val_loss'

# Number of optimizer updates performed per pass over the training data.
_updates_per_epoch = len(train_dataloader) // gradient_accumulation_steps
if max_steps > 0:
    t_total = max_steps
    num_train_epochs = max_steps // _updates_per_epoch + 1
else:
    t_total = _updates_per_epoch * num_train_epochs

# Parameters whose name contains one of these markers get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            weight
            for name, weight in base_model.named_parameters()
            if all(marker not in name for marker in no_decay)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            weight
            for name, weight in base_model.named_parameters()
            if any(marker in name for marker in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)


def _save_best_checkpoint(model, epoch, ckpt_dir='models', prefix='checkpointLM1_epoch'):
    """Pickle ``model`` as the current best checkpoint and prune older ones.

    Returns the path of the checkpoint that was written.
    """
    os.makedirs(ckpt_dir, exist_ok=True)
    ckpt_name = f'{prefix}{epoch}.pt'
    ckpt_path = f'{ckpt_dir}/{ckpt_name}'
    # Save first, prune afterwards: a failed save must not leave us without any checkpoint.
    torch.save(model, ckpt_path)
    for ckpt in os.listdir(ckpt_dir):
        if prefix in ckpt and ckpt != ckpt_name:
            os.remove(f'{ckpt_dir}/{ckpt}')  # avoid too many checkpoints
    return ckpt_path


def train_layoutLM(model, epochs, dataloader_train, dataloader_eval, optimizer, early_stop_arg, run, test_mode, seed):
    """Fine-tune ``model`` with per-epoch evaluation, checkpointing and early stopping.

    After every training epoch the model is evaluated with ``do_eval``. Whenever the
    monitored metric did not get worse, the whole model is pickled to
    ``models/checkpointLM1_epoch{epoch}.pt`` and older checkpoints are removed.
    Training stops once the metric got worse ``early_stop_arg['patience']`` times in a
    row, or after ``epochs`` epochs. The per-epoch metrics are written to
    ``results/v1/log_v1_FUNSD_{test_mode}_run{run}.csv``.

    Besides its arguments, the function reads these notebook-level names:
    ``scheduler``, ``gradient_accumulation_steps``, ``max_grad_norm``, ``set_seed``,
    ``do_eval``, ``tqdm`` and ``pd``.

    Args:
        model: network called as ``model(input_ids=..., bbox=..., attention_mask=...,
            token_type_ids=..., labels=...)`` whose output exposes ``.loss``.
        epochs: number of training epochs to run (epochs are numbered ``1..epochs``).
        dataloader_train: iterable of batches indexed as ``[0]`` input ids,
            ``[1]`` attention mask, ``[2]`` token type ids, ``[3]`` labels, ``[4]`` bounding boxes.
        dataloader_eval: evaluation batches, forwarded to ``do_eval``.
        optimizer: optimizer stepped once every ``gradient_accumulation_steps`` batches.
        early_stop_arg: dict with a ``'patience'`` entry.
        run: run identifier, only used in the CSV file name.
        test_mode: ``'val_loss'`` (lower is better) or ``'f1_score'`` (higher is better).
        seed: value passed to ``set_seed`` before training starts.

    Returns:
        The model object loaded from the best checkpoint, or ``model`` itself when no
        epoch was run (``epochs < 1``) and therefore no checkpoint exists.

    Raises:
        ValueError: if ``test_mode`` is not one of the two supported values.
    """
    # Validate up front: a typo here used to surface only after the full training run.
    if test_mode == 'val_loss':
        best_metric = float('inf')  # lower is better; inf guarantees the first epoch is saved
        stalled_msg = 'Validation loss did not decrease from {}.'
        stop_msg = 'Early stopping because validation loss did not decrease after {} epochs.'
        improved_msg = 'Validation loss decresed from {}. Saving checkpoint...'
    elif test_mode == 'f1_score':
        best_metric = 0  # higher is better
        stalled_msg = 'f1 score did not increase from {}.'
        stop_msg = 'Early stopping because f1_score did not increase after {} epochs.'
        improved_msg = 'F1 score incresead from {}. Saving checkpoint...'
    else:
        raise ValueError(f"test_mode must be 'val_loss' or 'f1_score', got {test_mode!r}")

    patience = early_stop_arg['patience']
    trigger_times = 0
    best_model = None  # path of the best checkpoint written so far

    # Create the output folder now so a missing directory fails fast, not after training.
    results_dir = 'results/v1'
    os.makedirs(results_dir, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.zero_grad()

    final_results = []
    set_seed(seed)
    # Inclusive upper bound: `epochs` epochs are trained, matching the progress-bar label.
    for epoch in range(1, epochs + 1):
        nb_train_steps = 0
        tr_loss = 0.0
        model.train()  # do_eval() leaves the model in eval mode, so switch back every epoch
        for steps, batch in enumerate(tqdm(dataloader_train, desc=f'training {epoch} / {epochs}')):
            # get the inputs;
            input_ids = batch[0].to(device)
            bbox = batch[4].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            labels = batch[3].to(device)

            # forward pass
            outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
                            labels=labels)

            loss = outputs.loss
            tr_loss += loss.item()
            loss.backward()

            if (steps + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
            nb_train_steps += 1

        total_trn_loss = tr_loss / nb_train_steps
        print(f"Total Average Train Loss after {epoch} epochs: {total_trn_loss}")
        eval_results = do_eval(model, dataloader_eval)
        current_loss = eval_results['eval_loss']
        current_f1 = eval_results['micro avg']['f1-score']
        print(f'Validaiton loss: {current_loss}')
        print(f'F1 score: {current_f1}')

        # Log the epoch before any early-stop decision so the CSV also contains the last epoch.
        eval_results['epoch'] = epoch
        eval_results['train_loss'] = total_trn_loss
        final_results.append(eval_results)

        # Ties count as "not worse", so an equal score still refreshes the checkpoint.
        if test_mode == 'val_loss':
            current_metric = current_loss
            got_worse = current_metric > best_metric
        else:
            current_metric = current_f1
            got_worse = current_metric < best_metric

        if got_worse:
            trigger_times += 1
            print(stalled_msg.format(best_metric))
            print('Trigger Times:', trigger_times)
            if trigger_times >= patience:
                print(stop_msg.format(trigger_times))
                break
        else:
            print(improved_msg.format(best_metric))
            best_model = _save_best_checkpoint(model, epoch)
            trigger_times = 0
            best_metric = current_metric

    df = pd.DataFrame(final_results)
    df.to_csv(f'{results_dir}/log_v1_FUNSD_{test_mode}_run{run}.csv', index = False)

    if best_model is None:
        # No epoch ran, hence no checkpoint: hand back the model as it was given.
        return model

    print(f'Returning best model named: {best_model}')
    # NOTE(review): the checkpoint is a full-model pickle; newer torch releases default
    # torch.load to weights_only=True, which rejects such files — confirm the runtime version.
    return torch.load(best_model)



In [None]:
# Fine-tune the base model with the settings from the configuration cell, then
# store the returned (best) model under a fixed name.
model = train_layoutLM(
    base_model,
    epochs=num_train_epochs,
    dataloader_train=train_dataloader,
    dataloader_eval=eval_dataloader,
    optimizer=optimizer,
    early_stop_arg=early_stop,
    run=runs,
    test_mode=test_mode_metric,
    seed=seed,
)
torch.save(model, 'models/LayoutLMv1_FUNSD.pt')

training 1 / 100: 100%|██████████| 556/556 [06:28<00:00,  1.43it/s]


Total Average Train Loss after 1 epochs: 0.17682548887062843


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.62it/s]
  _warn_prf(average, modifier, msg_start, len(result))


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.95      0.97      0.96        76
                  CC       0.96      0.98      0.97       119
                CGST       0.82      0.82      0.82        76
           CHASSISNO       0.89      0.86      0.87       144
COMMUNICATIONADDRESS       0.80      0.69      0.74       158
   COMMUNICATIONCITY       0.77      0.83      0.80        94
COMMUNICATIONPINCODE       0.95      0.96      0.95        94
  COMMUNICATIONSTATE       0.88      0.88      0.88        88
            ENGINENO       0.83      0.90      0.87       121
                FUEL       0.98      0.96      0.97        57
        GROSSPREMIUM       0.99      1.00      1.00       116
                 GVW       0.00      0.00      0.00         3
             INSURER       0.93      0.94      0.93       120
        ISSUANCEDATE       0.89      0.94      0.91        77
                MAKE       0.88      0.89      0.88       15

training 2 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.44it/s]


Total Average Train Loss after 2 epochs: 0.026302283510847166


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.62it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.96      0.99      0.97        76
                  CC       1.00      1.00      1.00       119
                CGST       0.93      0.93      0.93        76
           CHASSISNO       0.94      0.94      0.94       144
COMMUNICATIONADDRESS       0.79      0.75      0.77       158
   COMMUNICATIONCITY       0.86      0.95      0.90        94
COMMUNICATIONPINCODE       0.92      0.99      0.95        94
  COMMUNICATIONSTATE       0.94      0.99      0.96        88
            ENGINENO       0.90      0.93      0.91       121
                FUEL       0.98      1.00      0.99        57
        GROSSPREMIUM       0.97      1.00      0.99       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.93      0.94      0.94       120
        ISSUANCEDATE       0.89      1.00      0.94        77
                MAKE       0.93      0.92      0.92       15

training 3 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.45it/s]


Total Average Train Loss after 3 epochs: 0.01762741289717956


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.63it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.96      1.00      0.98        76
                  CC       1.00      1.00      1.00       119
                CGST       0.94      0.97      0.95        76
           CHASSISNO       0.97      0.97      0.97       144
COMMUNICATIONADDRESS       0.91      0.84      0.88       158
   COMMUNICATIONCITY       0.90      0.94      0.92        94
COMMUNICATIONPINCODE       0.92      0.99      0.95        94
  COMMUNICATIONSTATE       0.96      0.99      0.97        88
            ENGINENO       0.97      0.94      0.96       121
                FUEL       0.95      1.00      0.97        57
        GROSSPREMIUM       0.97      1.00      0.99       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.89      0.97      0.93       120
        ISSUANCEDATE       0.91      1.00      0.95        77
                MAKE       0.92      0.93      0.92       15

training 4 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.44it/s]


Total Average Train Loss after 4 epochs: 0.01384815808114254


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.63it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.96      1.00      0.98        76
                  CC       1.00      1.00      1.00       119
                CGST       0.96      0.95      0.95        76
           CHASSISNO       0.97      0.97      0.97       144
COMMUNICATIONADDRESS       0.89      0.83      0.86       158
   COMMUNICATIONCITY       0.91      0.91      0.91        94
COMMUNICATIONPINCODE       0.91      0.96      0.93        94
  COMMUNICATIONSTATE       0.95      0.98      0.96        88
            ENGINENO       0.98      0.96      0.97       121
                FUEL       1.00      1.00      1.00        57
        GROSSPREMIUM       0.99      1.00      1.00       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.93      0.95      0.94       120
        ISSUANCEDATE       0.91      1.00      0.95        77
                MAKE       0.94      0.91      0.93       15

training 5 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.45it/s]


Total Average Train Loss after 5 epochs: 0.011225380679331183


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.63it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.96      1.00      0.98        76
                  CC       1.00      1.00      1.00       119
                CGST       0.96      0.95      0.95        76
           CHASSISNO       0.98      0.99      0.98       144
COMMUNICATIONADDRESS       0.87      0.81      0.84       158
   COMMUNICATIONCITY       0.92      0.95      0.93        94
COMMUNICATIONPINCODE       0.94      0.94      0.94        94
  COMMUNICATIONSTATE       0.95      1.00      0.97        88
            ENGINENO       0.98      0.97      0.97       121
                FUEL       1.00      0.96      0.98        57
        GROSSPREMIUM       0.97      1.00      0.98       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.93      0.93      0.93       120
        ISSUANCEDATE       0.91      1.00      0.95        77
                MAKE       0.95      0.93      0.94       15

training 6 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.45it/s]


Total Average Train Loss after 6 epochs: 0.009494122528561504


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.62it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.96      1.00      0.98        76
                  CC       1.00      1.00      1.00       119
                CGST       0.97      0.95      0.96        76
           CHASSISNO       0.97      0.97      0.97       144
COMMUNICATIONADDRESS       0.89      0.85      0.87       158
   COMMUNICATIONCITY       0.91      0.91      0.91        94
COMMUNICATIONPINCODE       0.92      0.96      0.94        94
  COMMUNICATIONSTATE       0.94      0.97      0.96        88
            ENGINENO       0.98      0.94      0.96       121
                FUEL       1.00      1.00      1.00        57
        GROSSPREMIUM       1.00      1.00      1.00       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.94      0.94      0.94       120
        ISSUANCEDATE       0.93      0.99      0.96        77
                MAKE       0.94      0.91      0.93       15

training 7 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.44it/s]


Total Average Train Loss after 7 epochs: 0.007813625005006843


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.61it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.97      1.00      0.99        76
                  CC       1.00      1.00      1.00       119
                CGST       0.94      0.97      0.95        76
           CHASSISNO       0.96      0.96      0.96       144
COMMUNICATIONADDRESS       0.87      0.78      0.82       158
   COMMUNICATIONCITY       0.91      0.94      0.92        94
COMMUNICATIONPINCODE       0.92      0.96      0.94        94
  COMMUNICATIONSTATE       0.94      0.97      0.96        88
            ENGINENO       0.97      0.93      0.95       121
                FUEL       1.00      0.96      0.98        57
        GROSSPREMIUM       0.99      0.98      0.99       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.94      0.94      0.94       120
        ISSUANCEDATE       0.90      0.99      0.94        77
                MAKE       0.94      0.93      0.93       15

training 8 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.44it/s]


Total Average Train Loss after 8 epochs: 0.006698014204584652


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.61it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.96      1.00      0.98        76
                  CC       1.00      1.00      1.00       119
                CGST       0.99      0.96      0.97        76
           CHASSISNO       0.98      0.97      0.98       144
COMMUNICATIONADDRESS       0.85      0.82      0.83       158
   COMMUNICATIONCITY       0.89      0.93      0.91        94
COMMUNICATIONPINCODE       0.92      0.96      0.94        94
  COMMUNICATIONSTATE       0.94      0.97      0.96        88
            ENGINENO       0.97      0.93      0.95       121
                FUEL       1.00      0.95      0.97        57
        GROSSPREMIUM       0.98      1.00      0.99       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.93      0.94      0.94       120
        ISSUANCEDATE       0.91      1.00      0.95        77
                MAKE       0.94      0.91      0.93       15

training 9 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.44it/s]


Total Average Train Loss after 9 epochs: 0.005855442772882408


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.60it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.97      1.00      0.99        76
                  CC       1.00      1.00      1.00       119
                CGST       0.95      0.97      0.96        76
           CHASSISNO       0.97      0.97      0.97       144
COMMUNICATIONADDRESS       0.85      0.81      0.83       158
   COMMUNICATIONCITY       0.91      0.94      0.92        94
COMMUNICATIONPINCODE       0.92      0.96      0.94        94
  COMMUNICATIONSTATE       0.95      0.99      0.97        88
            ENGINENO       0.97      0.93      0.95       121
                FUEL       1.00      0.98      0.99        57
        GROSSPREMIUM       0.98      0.98      0.98       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.93      0.93      0.93       120
        ISSUANCEDATE       0.90      1.00      0.94        77
                MAKE       0.95      0.93      0.94       15

training 10 / 100: 100%|██████████| 556/556 [06:24<00:00,  1.44it/s]


Total Average Train Loss after 10 epochs: 0.0054534170616429845


Evaluating: 100%|██████████| 14/14 [00:03<00:00,  3.63it/s]


strict:                        precision    recall  f1-score   support

      BASICTPPREMIUM       0.97      1.00      0.99        76
                  CC       1.00      1.00      1.00       119
                CGST       0.94      0.96      0.95        76
           CHASSISNO       0.98      0.97      0.97       144
COMMUNICATIONADDRESS       0.86      0.81      0.83       158
   COMMUNICATIONCITY       0.90      0.93      0.91        94
COMMUNICATIONPINCODE       0.92      0.96      0.94        94
  COMMUNICATIONSTATE       0.93      0.98      0.96        88
            ENGINENO       1.00      0.93      0.97       121
                FUEL       1.00      0.96      0.98        57
        GROSSPREMIUM       0.99      1.00      1.00       116
                 GVW       1.00      1.00      1.00         3
             INSURER       0.93      0.94      0.94       120
        ISSUANCEDATE       0.90      1.00      0.94        77
                MAKE       0.92      0.88      0.90       15

### Checkpoint Saving

In [None]:
# model.save_pretrained("./model")

In [None]:
# from datetime import date
# torch.save({
#             'epoch': epoch,
#             "step": global_step,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             #'loss': loss,
#             }, "llm-"+str(date.today())+".pt")

# Save model

In [None]:
# import torch
# PATH='/content/drive/MyDrive/aws_annotation/qc_stride_model_v4.pt'
# # Save
# torch.save(model, PATH)

# Load
# model = torch.load(PATH, map_location=torch.device('cpu'))
# model.eval()