Fetching contributors…
Cannot retrieve contributors at this time
executable file 162 lines (125 sloc) 5.21 KB
#!/usr/bin/env python
import __builtin__ as python
import random as pyrandom
import sys
import os.path
import re
import glob
import argparse
import codecs
import numpy as np
from matplotlib.pyplot import imread
import ocrolib
from ocrolib import hocr
# Command-line interface.
#
# The usage text is passed as ``description``: the first positional
# parameter of ArgumentParser is ``prog``, so handing the text over
# positionally would make it the program name shown in usage messages.
parser = argparse.ArgumentParser(description="""
Construct an HTML output file in hOCR format by putting together
the recognition results for each page in sequence.
You should usually invoke this program as
ocropus-hocr 'book/????.bin.png'
For each page like 'book/0001.bin.png', it uses the following files:
book/0001.bin.png # page image
book/0001.pseg.png # page segmentation
book/0001/010001.txt # recognizer output for lines
""")
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
# NOTE(review): ``args.files`` is read right below, but no matching argument
# was declared in the source as received; without it parse_args() rejects the
# page arguments and ``args.files`` does not exist.
parser.add_argument("files",nargs="+",help="page images, e.g. 'book/????.bin.png'")
args = parser.parse_args()
# presumably expands the (quoted) glob patterns into file names -- see
# ocrolib.glob_all
args.files = ocrolib.glob_all(args.files)
# NOTE(review): this call was truncated in the source as received
# (``ostream =,"w","utf-8")``); reconstructed as a UTF-8 text writer on the
# file named by --output -- confirm.
ostream = codecs.open(args.output,"w","utf-8")
def E(*args):
    """Write a diagnostic message to standard error.

    Every argument is converted with ``str``; the pieces are joined with
    single spaces and terminated with a newline, so that consecutive
    messages do not run together on one line.
    """
    sys.stderr.write(" ".join([str(x) for x in args]) + "\n")
def P(*args):
    """Write the given strings to the output stream, then end the line.

    Called without arguments it emits just the line terminator.

    NOTE(review): the body of this function was missing from the source as
    received.  It is reconstructed from the call sites, which pass ready-made
    markup fragments (hence no separator between them) -- confirm against
    the upstream script.
    """
    ostream.write("".join(args)+"\n")
def PN(*args):
    """Write the given strings to the output stream without ending the line.

    NOTE(review): the body of this function was missing from the source as
    received.  It is reconstructed from the call sites, which use it to build
    one tag from several fragments (hence no separator and no newline) --
    confirm against the upstream script.
    """
    ostream.write("".join(args))
E("writing to",args.output)

# Estimate a median x-height for the whole input; it is the yardstick for
# the paragraph-break thresholds and the relative font sizes computed for
# each text line.
median_xheight = None
dirs = [ocrolib.allsplitext(name)[0] for name in args.files]

# A flat comprehension replaces sum(list_of_lists, []), which re-copies the
# accumulated list for every directory; the resulting order is the same.
xhfiles = [f for d in dirs for f in glob.glob(d+"/??????.xheight")]
if len(xhfiles)>5:
    # enough per-line measurements exist: take their median directly
    median_xheight = np.median([float(ocrolib.read_text(f)) for f in xhfiles])
else:
    # NOTE(review): indentation was lost in the source as received; the
    # image-based estimate is treated here as a fallback that is used only
    # when too few .xheight files exist -- confirm.
    lfiles = [f for d in dirs for f in glob.glob(d+"/??????.bin.png")]
    if len(lfiles)>0:
        # half of the median first image dimension (presumably the line
        # height in pixels) of up to 100 line images
        median_xheight = 0.5*np.median([imread(f).shape[0] for f in lfiles[:100]])
# (x0,y0) of the most recently processed text line.  It is not reset between
# pages, so the first line of a page is compared with the last line of the
# page before it.
last_coords = None

# Emit one ocr_page DIV per page image and, inside it, one ocr_line SPAN per
# recognized text line.
for arg in args.files:
    base,_ = ocrolib.allsplitext(arg)
    image = ocrolib.read_image_binary(arg)
    height, width = image.shape
    P("<div class='ocr_page' title='image %s; bbox 0 0 %d %d'>"%(arg,width,height))

    # to proceed, we need a pseg file and a
    # subdirectory containing text lines
    page_ok = True
    if not os.path.exists(base+".pseg.png"):
        E("%s: no such file"%(base+".pseg.png",))
        page_ok = False
    if not os.path.isdir(base):
        E("%s: no such directory"%base)
        page_ok = False

    # Only touch the segmentation when both inputs exist; otherwise the page
    # is written as an empty DIV instead of failing on the missing file.
    if page_ok:
        # iterate through the text lines in reading order, based
        # on the page segmentation file
        pseg = ocrolib.read_page_segmentation(base+".pseg.png")
        regions = ocrolib.RegionExtractor()
        # NOTE(review): ``pseg`` was read but never handed to the extractor in
        # the source as received; setPageLines() is the assumed loader --
        # confirm against ocrolib.RegionExtractor.
        regions.setPageLines(pseg)
        for i in range(1,regions.length()):
            # keep track of the bounding box information for each line
            # and insert paragraph breaks as needed
            # NOTE(review): the right-hand side of this assignment was
            # truncated in the source (``id =``); regions.id(i) is the
            # assumed accessor -- confirm.  The local is renamed so it no
            # longer shadows the ``id`` builtin.
            line_id = regions.id(i)
            y0,x0,y1,x1 = regions.bbox(i)
            if last_coords is not None:
                lx0,ly0 = last_coords
                dx,dy = x0-lx0,y1-ly0
                par = 0
                # dy>0: column break... moving upwards, never a paragraph.
                # NOTE(review): the nesting of the threshold tests was lost
                # with the indentation; they are applied only when dy<=0,
                # because otherwise the dy>0 test in the source would have
                # had no effect -- confirm.
                if dy<=0 and median_xheight is not None:
                    if abs(dy)>5*median_xheight: par = 1 # whitespace separator
                    if dx>2*median_xheight: par = 1 # indented paragraph
                    if abs(dx)>10*median_xheight: par = 1 # something else
                if par and not args.nopars: P("<p />")
            last_coords = (x0,y0)

            # get the text for the line itself
            lbase = "%s/%06x"%(base,line_id)
            if not os.path.exists(lbase+".txt"):
                E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
                # nothing to read for this line; move on instead of failing
                # on the missing file
                continue
            text = ocrolib.read_text(lbase+".txt")
            # Escape markup-significant characters, "&" first so that the
            # entity produced for "<" is not escaped again.  Plain string
            # replacement is used because re.sub() keeps the backslash of an
            # unknown escape such as '\&' in the replacement text.
            text = text.replace("&","&amp;").replace("<","&lt;")

            # accumulate information for each line here
            style = ""
            info = ""

            # estimate the font size for this line
            if median_xheight is not None and os.path.exists(lbase+".xheight"):
                xheight = float(ocrolib.read_text(lbase+".xheight"))
                # size relative to the median, limited to 30%..300% and
                # rounded to the nearest multiple of 10
                perc = int(np.clip(xheight*100.0/median_xheight,30,300))
                perc = 10*((perc+5)//10)
                if perc!=100:
                    style += "font-size:%d%%;"%perc

            # output geometric information
            info += "bbox %d %d %d %d"%(x0,y0,x1,y1)
            if os.path.exists(lbase+".baseline"):
                info += "; baseline "+ocrolib.read_text(lbase+".baseline")

            # put it all together into a SPAN
            # The opening tag was absent in the source as received, although
            # its attributes and the closing </span> are written below.
            PN("<span")
            if style!="": PN(" style='"+style+"'")
            PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
            if not args.nobreaks: P("<br />")
            else: P()

    # NOTE(review): no closing tag for the page DIV is visible in the source
    # as received; it is written here so every opened ocr_page is closed --
    # drop it if the remainder of the file already does so.
    P("</div>")