In [1]:
import pandas as pd 
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from model.model import ResumeParser

In [3]:
# Action vocabulary of the parser. `label_idx` is the single source of truth;
# the inverse mapping and the classifier width are derived from it so the
# three can never disagree.
label_idx = {'discard': 0, 'merge': 1, 'pop': 2, 'subordinate': 3}
idx_label = {idx: label for label, idx in label_idx.items()}

# Hyper-parameters handed to ResumeParser when the checkpoint is restored.
args = {
    'positional_dim': 32,
    'hidden_dim': 256,
    'classifier_dropout': 0.3,
    'num_classes': len(label_idx),  # one output per action label
    'n_hidden': 1, # total layers: n_hidden + 2
}

# Keyword arguments forwarded to every tokenizer call in this notebook.
tokenizer_args = {
    'padding': 'max_length',
    'return_tensors': 'pt',
}

In [4]:
from transformers import BertTokenizerFast, BertModel

In [5]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint
# name, so it is spelled out once.
bert_checkpoint = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Restore the trained parser from its checkpoint, passing the BERT model loaded
# above as `backend` and the hyper-parameter dict as `args`, then switch it to
# evaluation mode.
parser = ResumeParser.load_from_checkpoint(
    "model/epoch72.ckpt",
    backend=model,
    args=args,
)
parser = parser.eval()

Device:  cpu


In [7]:
# Input PDF that is handed to AnnotationObject in the next cell.
in_file = './data/pdf/BhattCV 221.pdf'

In [8]:
from annotation_object import AnnotationObject, serialize
# Build the annotation state for the input PDF. The parsing loop further down
# reads its `json_format`, `stack`, `current_idx`, `is_done` and `n_lines`
# attributes and drives it through its action methods.
anno = AnnotationObject(in_file)

Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


18it [00:01, 10.42it/s]


Length of dep: 800


In [9]:
# Greedy parsing loop: at every step the model scores the four actions for the
# (buffer line, stack-top line) pair and the highest-ranked action that the
# annotation object accepts is applied. The result is serialized at the end.

ROOT_TEXT = "$ROOT"
# Horizontal extent assigned to the virtual root element.
# NOTE(review): 0..100 presumably matches the coordinate range of 'x'/'width'
# in json_format -- confirm against AnnotationObject.
ROOT_SPAN = (0, 100)


def _line_features(lines, idx):
    """Return ``(text, left, right)`` for the element at ``idx``.

    ``idx == -1`` denotes the virtual root, which has the placeholder text
    ``ROOT_TEXT`` and spans ``ROOT_SPAN``. Any other index is looked up in
    ``lines``; ``right`` is computed as ``x + width``.
    """
    if idx == -1:
        return ROOT_TEXT, ROOT_SPAN[0], ROOT_SPAN[1]
    entry = lines[idx]
    left = entry['x']
    return entry['text'], left, left + entry['width']


json_format = anno.json_format

# action name -> (method applying it, whether a success advances the progress bar).
# "pop" never advanced the bar in the original loop; presumably it does not
# consume a buffer line -- TODO confirm.
action_handlers = {
    "discard": (anno.discard, True),
    "merge": (anno.merge_action, True),
    "pop": (anno.pop_action, False),
    "subordinate": (anno.subordinate_action, True),
}

# no_grad: this is pure inference, so autograd bookkeeping is unnecessary.
with tqdm(total=anno.n_lines) as pbar, torch.no_grad():
    while not anno.is_done:
        stk_idx = anno.stack[-1]
        buf_idx = anno.current_idx
        buf_string, lbuf, rbuf = _line_features(json_format, buf_idx)
        stk_string, lstk, rstk = _line_features(json_format, stk_idx)

        buf_tok = tokenizer(buf_string, **tokenizer_args)
        # Fix: the stack element's own text is tokenized here. The previous
        # code tokenized buf_string a second time and never used stk_string.
        # NOTE(review): if the checkpoint was trained with the same slip,
        # predictions will shift -- verify against the training pipeline.
        stk_tok = tokenizer(stk_string, **tokenizer_args)
        pos = torch.floor(torch.Tensor([lbuf, rbuf, lstk, rstk])).long()
        # The last tuple slot is passed as None (presumably the label slot,
        # unused at inference time -- TODO confirm).
        batch = (buf_tok, stk_tok, pos, None)
        logits = parser.get_logits(batch)

        # Try actions from highest to lowest score.
        action_order = (-logits).argsort().squeeze()
        for action in action_order:
            predicted_action = idx_label[action.item()]
            apply_action, advances = action_handlers[predicted_action]
            # This loop treats a return value of 0 as "action applied".
            if apply_action() == 0:
                if advances:
                    pbar.update(1)
                break
        else:
            # Every action was rejected: fail loudly instead of spinning in
            # the surrounding while-loop forever.
            raise RuntimeError(
                f"no applicable action for buffer index {buf_idx} "
                f"and stack index {stk_idx}"
            )

serialize(anno)

100%|██████████| 800/800 [06:45<00:00,  1.97it/s]

Serializing... <annotation_object.AnnotationObject object at 0x17f5bdeb0>
Dumped to ./data/pkl/BhattCV 221.pkl.



