# `frontend.ipynb` 
This notebook runs the trained ML model (imported from `model/`) on a PDF file and serializes the resulting annotation.

In [3]:
import pandas as pd 
import torch
from torchmetrics import Accuracy
import transformers
import lightning.pytorch as pl
from tqdm import tqdm

In [4]:
from model.model import ResumeParser

In [5]:
from model.utils import label_idx, idx_label

# Hyperparameters handed to ResumeParser, plus keyword options for the tokenizer.

args = dict(
    positional_dim=32,
    hidden_dim=256,
    classifier_dropout=0.3,
    num_classes=4,
    use_llm=False,
    n_hidden=1,  # total layers: n_hidden + 2
)

tokenizer_args = dict(
    padding='max_length',
    return_tensors='pt',
)

In [6]:
from transformers import BertTokenizerFast, BertModel

In [None]:
# Pretrained BERT tokenizer and encoder, both loaded from the same checkpoint name.
# The encoder (`model`) is later passed to ResumeParser as its backend.
BERT_CHECKPOINT = "bert-base-cased"

tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [11]:
# Restore the trained ResumeParser (the model defined under model/) from its
# checkpoint, then put it into evaluation mode.

parser = ResumeParser.load_from_checkpoint(
    "model/epoch_style.ckpt",
    backend=model,
    args=args,
)
parser = parser.eval()

Device:  cpu


In [12]:
# Path of the PDF to parse -- replace the placeholder with the path to your own file.
in_file: str = "PATH_TO_YOUR_PDF"

In [13]:
from annotation_object import AnnotationObject, serialize
# Build the annotation state for the PDF at `in_file`; the parsing loop further
# down reads and mutates this object, and it is what finally gets serialized.
anno = AnnotationObject(in_file)

Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


4it [00:00,  7.27it/s]


Document has  4 pages
Length of dep: 246


In [14]:
# The bulk of the code: simulate parsing. For every (buffer line, stack top)
# pair, build the positional and style features, let the model rank the
# actions, and apply the highest-ranked action that the annotation state accepts.

ROOT_IDX = -1  # index that stands for the synthetic "$ROOT" element

json_format = anno.json_format
wrapper = anno.wrapper
lines = wrapper.lines


def _line_features(idx):
    """Return ``(left, right, bold, italic)`` for the line at index ``idx``.

    For the synthetic root (``idx == ROOT_IDX``) the span is fixed to 0..100
    and both style flags are 0. Otherwise ``left``/``right`` come from the
    ``x`` and ``width`` fields of ``json_format[idx]``, and the style flags are
    1 when "bold" / "italic" occurs in the lower-cased font name of the first
    object of the matching layout line.

    The line text and height are deliberately not extracted: the batch built
    below passes ``None`` for the text slots and carries no height feature.

    Raises:
        KeyError: if the entry's page / line position cannot be found in
            ``lines``; the original lookup error is chained as the cause.
    """
    if idx == ROOT_IDX:
        return 0, 100, 0, 0

    entry = json_format[idx]
    page = entry['page']
    line_no = entry['idx_in_page']
    try:
        line = lines[page][line_no]
    except LookupError as err:
        # Report the size of the *page* that was indexed; guard the lookup
        # because the page itself may be the thing that is missing.
        try:
            n_on_page = len(lines[page])
        except (LookupError, TypeError):
            n_on_page = "an unknown number of"
        raise KeyError(
            f"Tried to get line #{line_no} of page {page}; document has "
            f"{len(wrapper.elements)}/{len(wrapper.lines)} pages, "
            f"and that page has {n_on_page} lines"
        ) from err

    # NOTE(review): relies on the private `_objs` list and assumes its first
    # entry exposes `fontname` -- confirm for lines that start with a
    # non-character object.
    fontname = line._objs[0].fontname.lower()
    left = entry['x']
    right = left + entry['width']
    return left, right, int("bold" in fontname), int("italic" in fontname)


# action label -> (method that attempts it, whether success advances the progress bar)
# Each method is only called when the model actually proposes that action;
# a return value of 0 is treated as "action applied".
actions = {
    "discard": (anno.discard, True),
    "merge": (anno.merge_action, True),
    "pop": (anno.pop_action, False),
    "subordinate": (anno.subordinate_action, True),
}

# Pure inference: no gradients are needed, so skip autograd bookkeeping.
with tqdm(total=anno.n_lines) as pbar, torch.no_grad():
    while not anno.is_done:
        stk_idx = anno.stack[-1]
        buf_idx = anno.current_idx

        lbuf, rbuf, boldbuf, italbuf = _line_features(buf_idx)
        lstk, rstk, boldstk, italstk = _line_features(stk_idx)

        sty = torch.Tensor([[italbuf, boldbuf, italstk, boldstk]]).long()
        # NOTE(review): `pos` is 1-D while `sty` is 2-D; kept exactly as before --
        # confirm this is what `get_logits` expects.
        pos = torch.floor(torch.Tensor([lbuf, rbuf, lstk, rstk])).long()
        batch = (None, None, pos, sty, None)
        logits = parser.get_logits(batch)
        action_order = (-logits).argsort().squeeze()  # largest logits first

        # Try the actions from most to least likely until one is accepted.
        for action in action_order:
            predicted_action = idx_label[action.item()]
            attempt = actions.get(predicted_action)
            if attempt is None:
                continue  # unknown label: fall through to the next candidate
            apply_action, advances_progress = attempt
            if apply_action() == 0:
                if advances_progress:
                    pbar.update(1)
                break
        else:
            # Every candidate was rejected; stopping here avoids an endless loop.
            raise RuntimeError(
                f"No predicted action could be applied "
                f"(stack index {stk_idx}, buffer index {buf_idx})"
            )


serialize(anno)

100%|██████████| 246/246 [00:00<00:00, 4006.25it/s]


Serializing... <annotation_object.AnnotationObject object at 0x16db73040>
Dumped to ./data/pkl/mbeckman.pkl.
