In [1]:
import argparse
import os
import sys
# Extend the module search path with user-specific directories.
# NOTE(review): presumably these hold the project packages imported below
# (config, data, pipeline, utils) and their dependencies - confirm.
sys.path.extend([
    '/users/vesalaia/pipelinv2',
    "/users/vesalaia/.local/lib/python3.9/site-packages/bin",
    "/users/vesalaia/.local/lib/python3.9/site-packages/lib/python3.9/site-packages",
])

In [2]:
import logging

import datetime
# Timestamp taken once when this cell runs; it is embedded in the log file
# name so that each run of the cell gets its own file.
now = datetime.datetime.now()

timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
log_file_name = f"table_recognition_{timestamp}.log"

# force=True (Python 3.8+) replaces handlers already attached to the root
# logger.  Without it basicConfig() is a silent no-op when this cell is
# re-run or when an imported library configured logging first, and messages
# would not be written to log_file_name.
logging.basicConfig(
    filename=log_file_name,
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    force=True,
)


def print_time():
    """Print and log the current wall-clock time.

    Later cells call ``print_time()`` around pipeline stages, but no cell in
    this notebook defines it, so those cells raise NameError on a fresh
    kernel.  This minimal implementation makes them runnable.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(stamp)
    logging.info(f"Time: {stamp}")


In [3]:
from config.options import Options

from data.dataset import OCRDatasetInstanceSeg
from pipeline.engine import initFolder, extractText, pipelineTask
from utils.config_check import configuration_ok

In [4]:
def str2bool(v):
    """Convert a command-line style truth value to ``bool``.

    :param v: a ``bool`` (returned unchanged) or a string.  Strings are
        compared case-insensitively, ignoring surrounding whitespace:
        'yes', 'true', 't', 'y', '1' give True and
        'no', 'false', 'f', 'n', '0' give False.

    Returns:
        The corresponding boolean.

    Raises:
        argparse.ArgumentTypeError: for any other value, including
        non-string inputs such as None or numbers (these previously failed
        with AttributeError on ``.lower()``).
    """
    if isinstance(v, bool):
        return v
    if isinstance(v, str):
        token = v.strip().lower()
        if token in ('yes', 'true', 't', 'y', '1'):
            return True
        if token in ('no', 'false', 'f', 'n', '0'):
            return False
    logging.error('A serious error occurred. Boolean value expected.')
    raise argparse.ArgumentTypeError('Boolean value expected.')

In [5]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

def crop_polygon_from_image(image, polygon):
    """Cut a polygonal region out of an image.

    Pixels outside the polygon are set to zero and the result is cropped to
    the polygon's axis-aligned bounding box (clamped to the image origin).

    :param image: image array whose first two axes are rows and columns
        (e.g. the result of ``cv2.imread``).
    :param polygon: polygon vertices as (x, y) pairs; a list or any array
        that can be converted to an int32 array of shape (N, 2).

    Returns:
        The masked image restricted to the bounding box of the polygon.
    """
    # cv2.fillPoly / cv2.boundingRect require an int32 point array; plain
    # Python lists or float coordinates are converted here.
    points = np.asarray(polygon, dtype=np.int32).reshape(-1, 2)

    # A single-channel mask works for any number of channels, whereas a
    # three-component fill colour leaves a fourth channel at zero.
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [points], 255)
    masked = cv2.bitwise_and(image, image, mask=mask)

    x, y, w, h = cv2.boundingRect(points)
    # Clamp to zero: a negative slice bound would be interpreted as an index
    # from the end of the axis and yield a wrong (usually empty) crop.
    top, left = max(y, 0), max(x, 0)
    bottom, right = max(y + h, 0), max(x + w, 0)
    return masked[top:bottom, left:right]


In [6]:
"""
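Tasks accepted by processTask (aliases in parentheses). The page names are
the defaults used when inpage / outpage are passed as "":
    init (i):                       initFolder;   output "page"
    region (detectregion, dr):      pipelineTask; input "page",     output "pageRD"
    line (detectlines, dl):         pipelineTask; input "pageRD",   output "pageLD"
    table (t):                      pipelineTask; input "page",     output "pageTbl"
    cell (c):                       pipelineTask; input "pageTbl",  output "pageCell"
    recognize (recognizetext, rt):  pipelineTask; input "pageLD",   output "pageText"
    cellrecognize (cr):             pipelineTask; input "pageCell", output "pageText"
    update (u):                     pipelineTask; input "pageLD",   output "pageU"
    text / json / csv:              extractText;  input "pageText", output "text"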
"""

'\nTasks:\n    init:\n    region:\n    line:\n    table:\n    recognize:\n    update:\n    text:\n    json:\n'

In [14]:
def processTask(cfgfile, executeTask, infolder, inpage, outpage,
                tryMerge=None, reading_order=None, combine=None):
    """Load a configuration and run one pipeline task on a folder.

    :param cfgfile: configuration file path passed to ``Options``.
    :param executeTask: task name or alias, case-insensitive (see the alias
        table in the body).
    :param infolder: folder to process.  When it is empty or None the task
        is skipped and a warning is logged.
    :param inpage: input XML name (logged as "XML-input"); "" selects the
        task's default.  Not used by the "init" task.
    :param outpage: output XML name (logged as "XML-output"); "" selects the
        task's default.
    :param tryMerge: forwarded to ``pipelineTask`` for "region" and "line".
    :param reading_order: forwarded to ``pipelineTask`` for "region",
        "line", "recognize" and "update".
    :param combine: forwarded to ``pipelineTask`` for "update".

    The last three parameters replace free variables that the function used
    to read without any definition in the notebook.  When left as None, a
    notebook-level global of the same name is used if one exists; otherwise
    the keyword is not passed and ``pipelineTask`` applies its own default.

    Returns:
        None.  Nothing is run when ``configuration_ok`` rejects the
        configuration, when the task is unknown (an error is logged) or when
        no input folder is given.

    Raises:
        ValueError: if ``cfgfile`` is None.
    """
    logging.info(f"Configuration file: {cfgfile}")
    if cfgfile is None:
        # Previously this fell through to an unbound ``opts``.
        logging.error("No configuration file given.")
        raise ValueError("cfgfile must not be None")
    opts = Options(cfgfile)

    requested = executeTask.lower()
    if not configuration_ok(opts, requested):
        return

    # (aliases, canonical task name, default inpage, default outpage).
    # Canonical None: the requested name is used as is (text/json/csv).
    # Default inpage None: the task takes no input XML (init).
    task_table = (
        (("init", "i"), "init", None, "page"),
        (("text", "json", "csv"), None, "pageText", "text"),
        (("detectregion", "dr", "region"), "region", "page", "pageRD"),
        (("detectlines", "dl", "line"), "line", "pageRD", "pageLD"),
        (("recognizetext", "rt", "recognize"), "recognize", "pageLD", "pageText"),
        (("update", "u"), "update", "pageLD", "pageU"),
        (("table", "t"), "table", "page", "pageTbl"),
        (("cell", "c"), "cell", "pageTbl", "pageCell"),
        (("cellrecognize", "cr"), "cellrecognize", "pageCell", "pageText"),
    )
    for aliases, canonical, default_in, default_out in task_table:
        if requested in aliases:
            break
    else:
        logging.error(f"Task not recognized: {executeTask}")
        return

    task = canonical or requested
    if default_in is not None and not inpage:
        inpage = default_in
    if not outpage:
        outpage = default_out

    # One uniform folder check; the branches used to mix `!= ""` and
    # `!= None`, and "update" only ran when outpage had been left empty.
    if not infolder:
        logging.warning(f"Task:{task} skipped: no input folder given")
        return

    if task == "init":
        logging.info(f"Task:{task} Folder:{infolder} XML-output: {outpage}")
        initFolder(opts, infolder, outpage)
        return

    logging.info(f"Task:{task} Folder:{infolder} XML-input:{inpage} XML-output: {outpage}")
    if task in ("text", "json", "csv"):
        extractText(opts, infolder, inpage, outpage)
        return

    if task in ("table", "cell"):
        # These two tasks always run with both settings switched off.
        extra = {"reading_order": False, "combine": False}
    else:
        forwarded = {
            "region": ("tryMerge", "reading_order"),
            "line": ("tryMerge", "reading_order"),
            "recognize": ("reading_order",),
            "update": ("reading_order", "combine"),
            "cellrecognize": (),
        }[task]
        supplied = {"tryMerge": tryMerge,
                    "reading_order": reading_order,
                    "combine": combine}
        extra = {}
        for name in forwarded:
            value = supplied[name]
            if value is None:
                # Backward compatibility with notebooks that set the value
                # as a global before calling processTask.
                value = globals().get(name)
            if value is not None:
                extra[name] = value
        if task == "line":
            extra["line_model"] = "mask r-cnn"

    pipelineTask(opts, task, infolder, inpage=inpage, outpage=outpage, **extra)


In [None]:
# Load the 775 table configuration into a notebook-level `opts` object.
# The exploratory cells further down read this global (opts.device,
# opts.text_recognize_processor, opts.text_recognize_model); processTask
# builds its own Options instance on every call.
cfgfile = "/users/vesalaia/config/config_table_775.ini"
opts = Options(cfgfile)

In [None]:
# Show the installed_pckgs attribute of the loaded options, then run the
# configuration check for the "cellrecognize" task (its result is the cell output).
print(opts.installed_pckgs)
configuration_ok(opts, "cellrecognize")


In [None]:
# Task "init" on the Pielavesi folder, configured by config_table_775.ini.
cfgfile = "/users/vesalaia/config/config_table_775.ini"
infolder = "/scratch/project_2005488/Muutto/Pielavesi"
executeTask, inpage, outpage = "init", "", "page"
processTask(cfgfile, executeTask, infolder, inpage, outpage)



In [9]:
# Task "table" on the Pielavesi folder: "page" -> "pageTbl", config_hd_table.ini.
cfgfile = "/users/vesalaia/config/config_hd_table.ini"
infolder = "/scratch/project_2005488/Muutto/Pielavesi"
executeTask, inpage, outpage = "table", "page", "pageTbl"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

Loading YOLO table models
[{'type': 'content', 'coords': [1103, 187, 2119, 1655]}, {'type': 'content', 'coords': [82, 188, 1079, 1660]}, {'type': 'header', 'coords': [374, 93, 764, 178]}, {'type': 'header', 'coords': [1393, 107, 1761, 188]}]
[[1353, 1073, 1822, 1134], [311, 682, 775, 741], [312, 853, 774, 910], [1356, 683, 1824, 743], [1356, 854, 1823, 912], [1353, 1187, 1821, 1246], [1358, 508, 1823, 573], [1352, 1243, 1821, 1305], [312, 962, 773, 1018], [1357, 740, 1824, 798], [1355, 1016, 1821, 1075], [313, 1133, 772, 1190], [1356, 962, 1823, 1019], [311, 1073, 772, 1136], [311, 1246, 771, 1309], [313, 797, 772, 855], [315, 1015, 775, 1075], [317, 503, 774, 567], [313, 564, 776, 624], [312, 1305, 771, 1363], [316, 738, 775, 799], [1354, 1131, 1821, 1190], [315, 905, 773, 964], [312, 331, 778, 386], [313, 1465, 770, 1518], [1358, 571, 1822, 627], [1357, 625, 1822, 683], [80, 1566, 224, 1660], [313, 1189, 772, 1246], [1357, 796, 1823, 855], [1352, 1299, 1821, 1362], [312, 1361, 771, 1

In [None]:
# Build a dataset over the end-to-end-printed folder.  The single
# dataset_files entry is [folder, folder/"pageTbl"].
# NOTE(review): the second constructor argument is passed as an empty dict;
# its meaning is not visible in this notebook.
infolder = "/scratch/project_2005488/Muutto/end-to-end-printed" 
inpage = "pageTbl"
dataset_files = [[infolder, os.path.join(infolder, inpage)]]
dataset = OCRDatasetInstanceSeg(dataset_files, {})

In [None]:
# Fetch item 0 of the dataset through its __getXMLitem__ accessor.
page = dataset.__getXMLitem__(0)

In [None]:
# Display the 'regions' entry of the page fetched in the previous cell.
page['regions']

In [None]:
from text_recognition.line2text import TRline2Text, text_recognition

In [None]:
from model.inference import load_text_recognize_model 
# Load the text recognition model using the notebook-level `opts`.
# NOTE(review): presumably this populates opts.text_recognize_processor and
# opts.text_recognize_model, which a later cell passes to htr() - confirm.
load_text_recognize_model(opts, opts.device)

In [None]:
def htr(image, processor, model, device):
    """
    Recognise the text in a single (already cropped) image.

    :param image: image handed to the processor (documented by the author
        as a PIL Image).
    :param processor: Huggingface OCR processor; called to produce
        ``pixel_values`` and used again to decode the generated ids.
    :param model: Huggingface OCR model providing ``generate``.
    :param device: device the pixel values are moved to before generation.

    Returns:
        generated_text: the first decoded string for the image.
    """
    # Pre-process the image and move the resulting tensor to the target device.
    encoded = processor(image, return_tensors='pt')
    inputs_on_device = encoded.pixel_values.to(device)

    # Generate token ids, then decode them without special tokens.
    token_ids = model.generate(inputs_on_device)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
# Load page `idx`, cut out its first region and display it.
idx = 4
page = dataset.__getXMLitem__(idx)
image_path = dataset.__getfullname__(idx)
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports an unreadable or missing file by returning None.
    raise FileNotFoundError(f"Could not read image: {image_path}")
c_image = crop_polygon_from_image(image, page['regions'][0]['polygon'])
# cv2.imread yields BGR channel order while matplotlib expects RGB.  Convert
# for display only, so c_image itself is unchanged for the following cells.
plt.imshow(cv2.cvtColor(c_image, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
# Run text recognition on the cropped region and print the result.
# NOTE(review): c_image is a NumPy array produced from cv2.imread (BGR),
# whereas htr() documents a PIL Image - confirm the processor accepts it
# and whether the channel order matters.
extracted_text = htr(c_image, opts.text_recognize_processor, opts.text_recognize_model, opts.device)
print(extracted_text)

In [10]:
# Task "cell" on the Pielavesi folder: "pageTbl" -> "pageCell", config_table_line.ini.
cfgfile = "/users/vesalaia/config/config_table_line.ini"
infolder = "/scratch/project_2005488/Muutto/Pielavesi"
executeTask, inpage, outpage = "cell", "pageTbl", "pageCell"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [11]:
# Task "cellrecognize": "pageCell" -> "pageText", config_trocr2.ini.
# `infolder` is not set here; the value left by an earlier cell is used.
cfgfile = "/users/vesalaia/config/config_trocr2.ini"
executeTask, inpage, outpage = "cellrecognize", "pageCell", "pageText"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

In [15]:
# Task "csv" on the Pielavesi folder: "pageText" -> "pageCsv", config_hd_table.ini.
cfgfile = "/users/vesalaia/config/config_hd_table.ini"
infolder = "/scratch/project_2005488/Muutto/Pielavesi"
executeTask, inpage, outpage = "csv", "pageText", "pageCsv"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

In [None]:
# Task "cell" on man-ds-test1-all-printed: "pageTbl" -> "pageCell", config_table_line.ini.
cfgfile = "/users/vesalaia/config/config_table_line.ini"
infolder = "/scratch/project_2005488/Muutto/man-ds-test1-all-printed"
executeTask, inpage, outpage = "cell", "pageTbl", "pageCell"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
#




In [None]:
# Task "cell" on sample10-all-printed: "pageTbl" -> "pageCell", config_table_line.ini.
# print_time() is expected to be defined by an earlier cell.
cfgfile = "/users/vesalaia/config/config_table_line.ini"
infolder = "/scratch/project_2005488/Muutto/sample10-all-printed"
executeTask, inpage, outpage = "cell", "pageTbl", "pageCell"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
print_time()

In [None]:
# sample5-all-printed with config_table_275.ini:
# "init" (-> "page") followed by "table" ("page" -> "pageTbl2").
infolder = "/scratch/project_2005488/Muutto/sample5-all-printed"
cfgfile = "/users/vesalaia/config/config_table_275.ini"
for executeTask, inpage, outpage in (
    ("init", "", "page"),
    ("table", "page", "pageTbl2"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)

In [None]:
# sample6-all-printed with config_table_275.ini:
# "init" (-> "page") followed by "table" ("page" -> "pageTbl2").
infolder = "/scratch/project_2005488/Muutto/sample6-all-printed"
cfgfile = "/users/vesalaia/config/config_table_275.ini"
for executeTask, inpage, outpage in (
    ("init", "", "page"),
    ("table", "page", "pageTbl2"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)

In [None]:
# sample7-all-printed with config_table_275.ini:
# "init" (-> "page") followed by "table" ("page" -> "pageTbl2").
infolder = "/scratch/project_2005488/Muutto/sample7-all-printed"
cfgfile = "/users/vesalaia/config/config_table_275.ini"
for executeTask, inpage, outpage in (
    ("init", "", "page"),
    ("table", "page", "pageTbl2"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)

In [None]:
# sample8-all-printed with config_table_275.ini:
# "init" (-> "page") followed by "table" ("page" -> "pageTbl2").
infolder = "/scratch/project_2005488/Muutto/sample8-all-printed"
cfgfile = "/users/vesalaia/config/config_table_275.ini"
for executeTask, inpage, outpage in (
    ("init", "", "page"),
    ("table", "page", "pageTbl2"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)

In [None]:
# sample10-all-printed: "init", then the "table" task run four times with
# different table configurations, each writing to its own output name
# (pageTbl2 .. pageTbl5).
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/sample10-all-printed" 

# Stage 1: init -> "page" (config_table_275.ini)
executeTask = "init"
inpage = ""
outpage = "page"
cfgfile = "/users/vesalaia/config/config_table_275.ini"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

# Stage 2: table, "page" -> "pageTbl2" (config_table_275.ini)
executeTask = "table"
inpage = "page"
outpage = "pageTbl2"
cfgfile = "/users/vesalaia/config/config_table_275.ini"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
# Stage 3: table, "page" -> "pageTbl3" (config_table_373.ini)
executeTask = "table"
inpage = "page"
outpage = "pageTbl3"
cfgfile = "/users/vesalaia/config/config_table_373.ini"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
# Stage 4: table, "page" -> "pageTbl4" (config_table_1100.ini)

print_time()
executeTask = "table"
inpage = "page"
outpage = "pageTbl4"
cfgfile = "/users/vesalaia/config/config_table_1100.ini"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
# Stage 5: table, "page" -> "pageTbl5" (config_table_1492.ini)
print_time()
executeTask = "table"
inpage = "page"
outpage = "pageTbl5"
cfgfile = "/users/vesalaia/config/config_table_1492.ini"
processTask(cfgfile, executeTask, infolder, inpage, outpage)



In [None]:
# Full chain on the "skewed" folder: init -> table -> cell -> cellrecognize,
# with print_time() before the first stage and after every stage.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/skewed"
for executeTask, inpage, outpage, cfgfile in (
    ("init", "", "page", "/users/vesalaia/config/config_table_1492.ini"),
    ("table", "page", "pageTbl", "/users/vesalaia/config/config_table_1492.ini"),
    ("cell", "pageTbl", "pageCell", "/users/vesalaia/config/config_table_1492.ini"),
    ("cellrecognize", "pageCell", "pageText", "/users/vesalaia/config/config_trocr.ini"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)
    print_time()

In [None]:
# Full chain on the "deskewed" folder: init -> table -> cell -> cellrecognize,
# with print_time() before the first stage and after every stage.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/deskewed"
for executeTask, inpage, outpage, cfgfile in (
    ("init", "", "page", "/users/vesalaia/config/config_table_1492.ini"),
    ("table", "page", "pageTbl", "/users/vesalaia/config/config_table_1492.ini"),
    ("cell", "pageTbl", "pageCell", "/users/vesalaia/config/config_table_1492.ini"),
    ("cellrecognize", "pageCell", "pageText", "/users/vesalaia/config/config_trocr.ini"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)
    print_time()

In [None]:
# sample9-all-printed with config_table.ini: "init" (-> "page2"), then
# "table" ("page2" -> "pageTbl"), each preceded by print_time().
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/sample9-all-printed"
cfgfile = "/users/vesalaia/config/config_table.ini"
executeTask, inpage, outpage = "init", "", "page2"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

print_time()
executeTask, inpage, outpage = "table", "page2", "pageTbl"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
#
#print_time()
#executeTask = "cell"
#inpage = "pageTbl"
#outpage = "pageCell"
#cfgfile = "/users/vesalaia/config/config_table.ini"
#processTask(cfgfile, executeTask, infolder, inpage, outpage)
#
#print_time()

#executeTask = "cellrecognize"
#inpage = "pageCell"
#outpage = "pageText"
#cfgfile = "/users/vesalaia/config/config_trocr.ini"
#processTask(cfgfile, executeTask, infolder, inpage, outpage)
#print_time()

In [None]:
# Full chain on sample6-all-printed: init -> table -> cell -> cellrecognize,
# with print_time() before the first stage and after every stage.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/sample6-all-printed"
for executeTask, inpage, outpage, cfgfile in (
    ("init", "", "page", "/users/vesalaia/config/config_table.ini"),
    ("table", "page", "pageTbl", "/users/vesalaia/config/config_table.ini"),
    ("cell", "pageTbl", "pageCell", "/users/vesalaia/config/config_table.ini"),
    ("cellrecognize", "pageCell", "pageText", "/users/vesalaia/config/config_trocr.ini"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)
    print_time()

In [None]:
# Full chain on sample7-all-printed: init -> table -> cell -> cellrecognize,
# with print_time() before the first stage and after every stage.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/sample7-all-printed"
for executeTask, inpage, outpage, cfgfile in (
    ("init", "", "page", "/users/vesalaia/config/config_table.ini"),
    ("table", "page", "pageTbl", "/users/vesalaia/config/config_table.ini"),
    ("cell", "pageTbl", "pageCell", "/users/vesalaia/config/config_table.ini"),
    ("cellrecognize", "pageCell", "pageText", "/users/vesalaia/config/config_trocr.ini"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)
    print_time()

In [None]:
# sample9-all-printed with config_table.ini:
# "init" (-> "pageTest") followed by "table" ("pageTest" -> "pageTbl").
infolder = "/scratch/project_2005488/Muutto/sample9-all-printed"
cfgfile = "/users/vesalaia/config/config_table.ini"
for executeTask, inpage, outpage in (
    ("init", "", "pageTest"),
    ("table", "pageTest", "pageTbl"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)


In [None]:
# Task "cellrecognize" on sample3-all-printed: "pageCell" -> "pageText2",
# config_trocr.ini, bracketed by print_time() calls.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/sample3-all-printed"
cfgfile = "/users/vesalaia/config/config_trocr.ini"
executeTask, inpage, outpage = "cellrecognize", "pageCell", "pageText2"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
print_time()

In [None]:
# Task "cellrecognize" on the Test folder: "pageCell" -> "pageText", config_trocr.ini.
infolder = "/scratch/project_2005488/Muutto/Test"
cfgfile = "/users/vesalaia/config/config_trocr.ini"
executeTask, inpage, outpage = "cellrecognize", "pageCell", "pageText"
processTask(cfgfile, executeTask, infolder, inpage, outpage)

In [None]:
# Task "cellrecognize" on the Test folder: "pageCell" -> "pageText",
# config_trocr.ini, bracketed by print_time() calls.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/Test"
cfgfile = "/users/vesalaia/config/config_trocr.ini"
executeTask, inpage, outpage = "cellrecognize", "pageCell", "pageText"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
print_time()

In [None]:
# "debug" folder with config_table.ini: init -> table -> cell, with
# print_time() before the first stage and after every stage.
# print_time() is expected to be defined by an earlier cell.
print_time()
infolder = "/scratch/project_2005488/Muutto/debug"
cfgfile = "/users/vesalaia/config/config_table.ini"
for executeTask, inpage, outpage in (
    ("init", "", "page"),
    ("table", "page", "pageTbl"),
    ("cell", "pageTbl", "pageCell"),
):
    processTask(cfgfile, executeTask, infolder, inpage, outpage)
    print_time()



In [None]:
# Task "cellrecognize" on the "debug" folder: "pageCell" -> "pageText",
# config_trocr.ini, followed by print_time().
# print_time() is expected to be defined by an earlier cell.
infolder = "/scratch/project_2005488/Muutto/debug"
cfgfile = "/users/vesalaia/config/config_trocr.ini"
executeTask, inpage, outpage = "cellrecognize", "pageCell", "pageText"
processTask(cfgfile, executeTask, infolder, inpage, outpage)
print_time()

In [None]:
# Build a dataset over the "pageText" sub-path of the current folder.
# `infolder` is not set here; the value left by an earlier cell is used.
inpage = "pageText"
dataset_files = [[infolder, os.path.join(infolder, inpage)]]
dataset = OCRDatasetInstanceSeg(dataset_files, {})

In [None]:
# Fetch item 0 of the dataset built in the previous cell.
page = dataset.__getXMLitem__(0)

In [None]:
# Display the fetched page as the cell output.
page