# pkl to json
This file takes the `.pkl` annotation record files found under `data/pkl` and converts them into `.json` files (under `data/json`). 

In [6]:
import pickle as pkl
import numpy as np
from tqdm import tqdm
import os
import json

In [7]:
in_dir = "data/pkl_fine"
in_file = "cv.pkl"
out_dir = "data/json/"
out_file = "data_fine.json"

In [8]:
in_path = f'{in_dir}/{in_file}'
out_path = f'{out_dir}/{out_file}'

In [None]:
from pdf_wrapper import PDFWrapper

# This function aggregates data from a .pkl file into an iterable python dictionary, 
# to be later used to be written into a CSV file or otherwise; just modifiy this function
# if you want other fields to be added into the model


def get_record(in_path):
    with open(in_path, "rb") as f_in, open(out_path, "w") as f_out:
        data = pkl.load(f_in)
        json_format = data.json_format
        pdf_path = in_path.replace("pkl_fine", "pdf")
        pdf_path = pdf_path.replace(".pkl", ".pdf")
        wrapper = PDFWrapper(fname = pdf_path)
        lines = wrapper.lines
        for record in tqdm(data.record):
            # There must be a more elegant way of writing this 
            # but isn't this clear? 
            buf_idx = record['from']
            stk_idx = record['to']
            type_str = record['type']
            buf_string = None
            stk_string = None
            lbuf = None
            rbuf = None 
            lstk = None 
            rstk = None 
            hstk = None
            boldbuf = None
            italbuf = None
            boldstk = None
            italstk = None
            hbuf = None
            if(buf_idx == -1):
                # Special treatment for the ROOT (-1) element. 
                buf_string = "$ROOT"
                lbuf = 0
                rbuf = 100
                hbuf = 30
                boldbuf = 0
                italbuf = 0
            else:
                buf_string = "$ROOT" if buf_idx == -1 else json_format[buf_idx]['text']
                lbuf = json_format[buf_idx]['x']
                rbuf = json_format[buf_idx]['x'] + json_format[buf_idx]['width']
                hbuf = int(json_format[buf_idx]['height'])
                try:
                    linebuf = lines[json_format[buf_idx]['page']][json_format[buf_idx]['idx_in_page']]
                except:
                    print(f"Tried to get line #{json_format[buf_idx]['idx_in_page']} of page {json_format[buf_idx]['page']}; document has {len(wrapper.elements)}/{len(wrapper.lines)} pages, and that page has {len(lines[json_format[buf_idx]['idx_in_page']])} lines")
                    raise KeyError
                fontname = linebuf._objs[0].fontname.lower()
                boldbuf = 1 if "bold" in fontname else 0
                italbuf = 1 if "italic" in fontname else 0

            if(stk_idx == -1):
                stk_string = "$ROOT"
                lstk = 0
                rstk = 100
                hstk = 30
                boldstk = 0
                italstk = 0
            else:
                stk_string = "$ROOT" if stk_idx == -1 else json_format[stk_idx]['text']
                lstk = json_format[stk_idx]['x']
                rstk = json_format[stk_idx]['x'] + json_format[stk_idx]['width']
                hstk = int(json_format[stk_idx]['height'])
                linebuf = lines[json_format[stk_idx]['page']][json_format[stk_idx]['idx_in_page']]
                fontname = linebuf._objs[0].fontname.lower()
                boldstk = 1 if "bold" in fontname else 0
                italstk = 1 if "italic" in fontname else 0
            yield {
                'buf_str': buf_string,
                'lbuf': lbuf,
                'rbuf': rbuf,
                'hbuf': hbuf,
                'boldbuf': boldbuf,
                'italbuf': italbuf,
                'stk_str': stk_string,
                'lstk': lstk,
                'rstk': rstk,
                'boldstk': boldstk,
                'italstk': italstk,
                'type': type_str,
                'hstk': hstk
            }

In [15]:
# This function takes in an input directory of .pkl files, an output directory and name
# and aggregates all .pkl files in that folder to produce a JSON file at the desired output location.

def pkl_to_json(in_dir = "data/pkl_fine", out_dir = "data/json/", out_name = "data_fine.json"):
    to_write = {
        'buf_str': [],
        'lbuf': [],
        'rbuf': [],
        'hbuf': [],
        'boldbuf': [],
        'italbuf': [],
        'stk_str': [],
        'lstk': [],
        'rstk': [],
        'hstk': [],
        'boldstk': [],
        'italstk': [],
        'type': []
    }

    for file in os.listdir(in_dir):
        full_path = f'{in_dir}/{file}'
        print(full_path)
        if(os.path.isfile(full_path)):
            for entry in get_record(full_path):
                for key, val in entry.items():
                    to_write[key].append(val)

    print(f'Found {len(to_write["buf_str"])} elements')
    out_path = f'{out_dir}/{out_name}'  

    with open(out_path, "w") as f_out:
        json.dump(to_write, f_out)

In [16]:
pkl_to_json()


data/pkl_fine/Yoon-current-cv-web.pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


36it [00:01, 23.62it/s]
100%|██████████| 1996/1996 [00:00<00:00, 230007.99it/s]


data/pkl_fine/cv (1).pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


17it [00:03,  5.33it/s]
100%|██████████| 1163/1163 [00:00<00:00, 274429.00it/s]


data/pkl_fine/BhattCV 221.pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


18it [00:01, 13.58it/s]
100%|██████████| 1310/1310 [00:00<00:00, 277305.86it/s]


data/pkl_fine/cv-amato.pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


41it [00:01, 24.18it/s]
100%|██████████| 2500/2500 [00:00<00:00, 218494.30it/s]


data/pkl_fine/cv-Qizheng2022.pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


4it [00:00,  6.20it/s]
100%|██████████| 206/206 [00:00<00:00, 174128.70it/s]


data/pkl_fine/vita_web.pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


15it [00:00, 23.62it/s]
100%|██████████| 753/753 [00:00<00:00, 237324.23it/s]


data/pkl_fine/cv.pkl
Using laparams =  <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>


22it [00:04,  5.20it/s]
100%|██████████| 1607/1607 [00:00<00:00, 269308.24it/s]

Found 9535 elements



