Permalink
Fetching contributors…
Cannot retrieve contributors at this time
executable file 114 lines (101 sloc) 4.15 KB
#!/usr/bin/env python
from __future__ import print_function
import glob
import sys
import os
import signal
import argparse
import matplotlib
matplotlib.use("AGG")
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import interpolation
import ocrolib
from ocrolib import morph
# Exit with status 1 on Ctrl-C (SIGINT) instead of raising KeyboardInterrupt.
signal.signal(signal.SIGINT, lambda *args: sys.exit(1))

# Command-line interface: one book directory plus an optional page limit.
parser = argparse.ArgumentParser(description = """
Generate HTML for debugging a book directory.
Input: a directory in standard OCRopus book format
Output: index.html files and thumbnails showing recognition results
""")
# nargs="?" makes the positional optional so that `default` actually applies;
# without it argparse ignores the default and always requires the argument.
parser.add_argument("book", nargs="?", default="book",
                    help="book directory, default: %(default)s")
parser.add_argument("-N", "--npages", type=int, default=100000,
                    help="max number of pages, default: %(default)s")
# Parsed once at import time; `args` is read by the rest of the script.
args = parser.parse_args()
def write_cseg(stream, cseg_file):
    """Write a one-row HTML table of character-segment thumbnails.

    The segmentation stored in `cseg_file` is read, split into segments,
    and each segment is saved as an inverted grayscale PNG named
    ".__" + cseg_file + "_NNN.png" (NNN is the zero-padded segment index).
    One table cell with an <img> tag is written to `stream` per segment.

    Args:
        stream: writable text stream that receives the HTML.
        cseg_file: path of the character-segmentation file to visualize.
    """
    # The segmentation is read once; the original read the same file twice.
    cseg = ocrolib.read_line_segmentation(cseg_file)
    # NOTE(review): `linerec` is not imported in the visible part of this
    # file -- confirm where it is expected to come from (this call raises
    # NameError if it is not in scope).
    csegs = linerec.extract_csegs(cseg)
    stream.write("<table><tr>")
    for i, c in enumerate(csegs):
        out = ".__" + cseg_file + "_%03d.png" % i
        # Subtracting from the maximum inverts the segment's intensities.
        plt.imsave(out, np.amax(c.img) - c.img, cmap=plt.cm.gray)
        # Display at half the segment height, but never below 2 pixels.
        height = max(2, c.img.shape[0] // 2)
        stream.write("<td><img src=%s height=%d style='border: 1px #ccccff solid;'></td>" % (out, height))
    stream.write("</tr></table>")
    stream.write("\n")
def genpage(d):
    """Generate `index.html` inside page directory `d`.

    The function changes into `d`, writes an HTML page with one section per
    line image matching "??????.bin.png", and always restores the previous
    working directory afterwards. Each section contains, when the files
    exist, the "txt" variant and the raw "txt" variant of the line, the line
    image at half width, navigation links, the character-segment thumbnails
    (via `write_cseg`) and a rendered picture of the "rseg" segmentation.

    Args:
        d: path of the page directory, relative to the current directory.

    Reads the module-level `args.book` for the label of the link back to
    the book index.
    """
    print("===", d)
    here = os.getcwd()
    try:
        # All file names below are relative to the page directory.
        os.chdir(d)
        with open("index.html", "w") as stream:
            stream.write("<h1>%s</h1>\n" % d)
            images = sorted(glob.glob("??????.bin.png"))
            for img in images:
                # presumably fvariant maps the image name to the name of a
                # sibling file of the given kind -- confirm in ocrolib.
                txt = ocrolib.fvariant(img, "txt", "")
                if os.path.exists(txt):
                    with open(txt) as tf:
                        text = tf.read()
                    stream.write("<font color='#000066'><b>%s</b></font><br>\n" % text)
                rtxt = ocrolib.fvariant(img, "txt", "raw")
                if os.path.exists(rtxt):
                    with open(rtxt) as tf:
                        rtext = tf.read()
                    stream.write("<font color='gray'><b>%s</b></font><br>\n" % rtext)
                stream.write("<p />\n")
                image = ocrolib.read_image_gray(img)
                # Show the line at half its pixel width, at least 10 wide.
                stream.write("<img width='%d' src='%s'>\n" % (max(10, image.shape[1] // 2), img))
                stream.write("<br />\n")
                # Breadcrumb links: book index / this page / the line image.
                stream.write("<font size=-2>")
                stream.write("<a href=%s>%s</a> / " % ("..", args.book))
                stream.write("<a href=%s>%s</a> / " % ("../" + d, d))
                stream.write("<a href=%s>%s</a>" % (img, img))
                stream.write("</font>")
                stream.write("<p />\n")
                cseg = ocrolib.fvariant(img, "cseg")
                if os.path.exists(cseg):
                    write_cseg(stream, cseg)
                rseg_file = ocrolib.fvariant(img, "rseg")
                if os.path.exists(rseg_file):
                    rseg = ocrolib.read_line_segmentation(rseg_file)
                    fig = plt.figure(figsize=(20, 1), dpi=150)
                    try:
                        morph.showlabels(rseg)
                        figfile = ".__" + rseg_file + "_.png"
                        plt.savefig(figfile)
                    finally:
                        # Close the figure; otherwise one figure per line
                        # stays open for the rest of the run.
                        plt.close(fig)
                    stream.write("<img height='50' src='%s'><br>\n" % figfile)
                stream.write("<hr>\n")
    finally:
        # Restore the caller's working directory even if generation fails.
        os.chdir(here)
# Build the book-level index.html: one bordered table per page directory,
# holding a linked thumbnail and up to ten text snippets from that page.
os.chdir(args.book)
with open("index.html", "w") as index:
    page_dirs = glob.glob("????")
    page_dirs.sort()
    for page_dir in page_dirs[:args.npages]:
        # Write the per-page index.html first.
        genpage(page_dir)

        # Thumbnail source: the binarized page image, or a blank 300x300
        # array when the page has none.
        bin_png = page_dir + ".bin.png"
        page_img = (ocrolib.read_image_gray(bin_png)
                    if os.path.exists(bin_png)
                    else np.zeros((300, 300)))
        thumb_name = ".__" + page_dir + ".png"
        # Scale to one eighth in both dimensions with linear interpolation.
        thumb = interpolation.zoom(page_img, (0.125, 0.125), order=1)
        plt.imsave(thumb_name, thumb, cmap=plt.cm.gray)

        # Left cell: thumbnail linking to the page's own index.
        index.write("<table border=1><tr>\n")
        index.write("<td>")
        index.write("<a href='%s/index.html'><img src='%s'></a>" % (page_dir, thumb_name))
        index.write("<br>%s<br>" % page_dir)
        index.write("</td>\n")

        # Right cell: the first 100 characters of up to ten line texts,
        # ignoring texts shorter than 20 characters.
        index.write("<td>")
        shown = 0
        for txt_path in sorted(glob.glob(page_dir + "/??????.txt")):
            with open(txt_path) as handle:
                content = handle.read()
            if len(content) >= 20:
                index.write("%s<br>\n" % content[:100])
                shown += 1
                if shown >= 10:
                    break
        index.write("</td>\n")
        index.write("</tr></table>\n")