In [1]:
import os
import subprocess
from subprocess import list2cmdline


def _rm_on_fail(name, ground_truth_dir):
    print(f"rm {name} files (png, box, gt.txt, lstmf)")
    os.remove(os.path.join(ground_truth_dir, f"{name}.box"))
    os.remove(os.path.join(ground_truth_dir, f"{name}.png"))
    os.remove(os.path.join(ground_truth_dir, f"{name}.gt.txt"))
    os.remove(os.path.join(ground_truth_dir, f"{name}.lstmf"))


def _run_prepare(name, ground_truth_dir, tesstrain_dir):
    """Produce the ``.box`` and ``.lstmf`` files for a single sample.

    Step 1 runs ``generate_line_box.py`` from ``tesstrain_dir`` on
    ``<name>.png`` / ``<name>.gt.txt`` with ``PYTHONIOENCODING=utf-8`` set,
    redirecting its stdout into ``<name>.box``.
    Step 2 runs ``tesseract <name>.png <name> --psm 13 lstm.train``.

    If a step cannot be started or exits with a non-zero status, the error is
    printed, the sample's files are removed with ``_rm_on_fail`` and the
    remaining step is skipped.

    Args:
        name: Sample base name (file name without any extension).
        ground_truth_dir: Directory containing ``<name>.png`` and
            ``<name>.gt.txt``; the outputs are written there as well.
        tesstrain_dir: Directory containing ``generate_line_box.py``.

    Returns:
        None.
    """
    png_path = os.path.join(ground_truth_dir, f"{name}.png")
    gt_path = os.path.join(ground_truth_dir, f"{name}.gt.txt")
    box_path = os.path.join(ground_truth_dir, f"{name}.box")

    box_cmd = [
        "python3",
        os.path.join(tesstrain_dir, "generate_line_box.py"),
        "-i",
        png_path,
        "-t",
        gt_path,
    ]
    # Pass the variable through ``env`` instead of a shell ``VAR=value`` prefix
    # so that no shell (and no shell quoting of the paths) is involved.
    box_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}

    box_failed = False
    with open(box_path, "w") as box_file:
        try:
            # check=True: a non-zero exit status must count as a failure,
            # otherwise the cleanup below is never reached.
            subprocess.run(box_cmd, env=box_env, stdout=box_file, text=True, check=True)
        except Exception as e:
            print(e)
            box_failed = True
    if box_failed:
        # Clean up only after the .box handle is closed, and do not run
        # tesseract on files that have just been deleted.
        _rm_on_fail(name, ground_truth_dir)
        return

    # Command for tesseract
    lstmf_cmd = [
        "tesseract",
        png_path,
        os.path.join(ground_truth_dir, f"{name}"),
        "--psm",
        "13",
        "lstm.train",
    ]

    try:
        subprocess.run(lstmf_cmd, text=True, check=True)
    except Exception as e:
        print(e)
        _rm_on_fail(name, ground_truth_dir)

In [2]:
import concurrent
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Number of worker threads: passed as max_workers to the ThreadPoolExecutor in run_prepare.
par_factor = 4


def run_prepare(boxes, ground_truth_dir, tesstrain_dir):
    """Run ``_run_prepare`` for every sample name in ``boxes`` on a thread pool.

    The pool uses ``par_factor`` worker threads. A tqdm progress bar sized to
    ``len(boxes)`` advances by one for each task that finishes without
    raising; the first exception raised by a task is propagated to the caller.

    Args:
        boxes: Sized collection of sample base names.
        ground_truth_dir: Forwarded unchanged to ``_run_prepare``.
        tesstrain_dir: Forwarded unchanged to ``_run_prepare``.

    Returns:
        None.
    """
    with tqdm(total=len(boxes)) as progress:
        with ThreadPoolExecutor(max_workers=par_factor) as pool:
            pending = []
            for sample_name in boxes:
                task = pool.submit(_run_prepare, sample_name, ground_truth_dir, tesstrain_dir)
                pending.append(task)

            for finished in concurrent.futures.as_completed(pending):
                # result() re-raises whatever the worker raised; the bar is
                # only advanced when the task completed normally.
                finished.result()
                progress.update(1)

In [None]:
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
tesstrain_dir = "/Users/panagoa/PycharmProjects/tesstrain"
tess_data_dir = "/Users/panagoa/tesstrain_data"

# For each ground-truth directory, find the samples that have a .png but no
# .lstmf yet and run the preparation step for them.
for dir_name in [
    "joined_words_freq_600000_oshamaho_aa",
    "joined_words_freq_600000_oshamaho_ab",
    "joined_words_freq_900000_adyghepsale_ru_aa",
    "joined_words_freq_900000_adyghepsale_ru_ab",
    "joined_words_freq_900000_adyghepsale_ru_ac",
]:
    ground_truth_dir = os.path.join(tess_data_dir, dir_name)
    ground_truth = os.listdir(ground_truth_dir)

    txt = [f for f in ground_truth if f.endswith(".txt")]
    png = [f for f in ground_truth if f.endswith(".png")]
    box = [f for f in ground_truth if f.endswith(".box")]
    lstmf = [f for f in ground_truth if f.endswith(".lstmf")]

    # Strip only the trailing extension. str.replace would also delete
    # ".png" / ".lstmf" occurring inside a name and yield a base name that
    # matches no file on disk.
    png_names = {f[: -len(".png")] for f in png}
    lstmf_names = {f[: -len(".lstmf")] for f in lstmf}

    # Despite the variable name, these are the samples still lacking a .lstmf.
    wo_boxes = png_names.difference(lstmf_names)
    print(len(wo_boxes))
    run_prepare(wo_boxes, ground_truth_dir, tesstrain_dir)

0
0
0


In [None]:
# Point ground_truth_dir at data/kbd-ground-truth-ng-font inside the tesstrain
# checkout and print that directory's listing followed by an extra newline.
# NOTE(review): tesstrain_dir is an absolute, machine-specific path.
tesstrain_dir = "/Users/panagoa/PycharmProjects/tesstrain"
ground_truth_dir = os.path.join(tesstrain_dir, "data", "kbd-ground-truth-ng-font")
print(os.listdir(ground_truth_dir), "\n")