More cleanup.

ocropus-archive · Nov 2, 2014 · 0918118 · 0918118
1 parent 758e023
commit 0918118
Show file tree

Hide file tree

Showing 9 changed files with 94 additions and 68 deletions.
diff --git a/OLD/README b/OLD/README
@@ -1,53 +1,3 @@
-To install, use:
-
-    $ sudo apt-get install $(cat PACKAGES)
-    $ python setup.py download_models
-    $ sudo python setup.py install
-
-To test the recognizer, run:
-
-    $ ./run-test
-
-OCRopus is really a collection of document analysis programs, not a turn-key OCR system.
-
-In addition to the recognition scripts themselves, there are a number of scripts for
-ground truth editing and correction, measuring error rates, determining confusion matrices, etc.
-OCRopus commands will generally print a stack trace along with an error message;
-this is not generally indicative of a problem (in a future release, we'll suppress the stack
-trace by default since it seems to confuse too many users).
-
-To recognize pages of text, you need to run separate commands: binarization, page layout
-analysis, and text line recognition. 
-
-    # perform binarization
-    ./ocoropus-nlbin tests/ersch.png -o book
-
-    # perform page layout analysis
-    ./ocropus-gpageseg 'book/????.bin.png'
-
-    # perform text line recognition (on four cores, with a fraktur model)
-    ./ocropus-rpred -Q 4 -m models/fraktur.pyrnn.gz 'book/????/??????.bin.png'
-
-    # generate HTML output
-    ./ocropus-hocr 'book/????.bin.png' -o ersch.html
-
-    # display the output
-    firefox ersch.html
-
-There are also a number of older commands for text line recognition,
-layout analysis, etc., kept for backwards compatibility. The binarization
-and layout analysis commands of the current release will be replaced in 
-the next release with entirely new, trainable commands.
-
-The main feature of this release is ocropus-rpred, which achieves very
-low error rates on a wide variety of fonts and inputs (even degraded)
-of body text, even without language models or dictionaries. The model
-has been trained on UW3 and UNLV data.  Test set error on UW3 is
-about 0.5% without a language model or dictionary.
-
-There are some things the currently trained models for ocropus-rpred
-will not handle well, largely because they are nearly absent in the
-current training data. That includes all-caps text, some special symbols
-(including "?"), typewriter fonts, and subscripts/superscripts. This will
-be addressed in a future release, and, of course, you are welcome to contribute
-new, trained models.
+This directory contains a lot of old Python code. This code isn't generally
+runnable anymore, but it's still included (for the time being) because bits
+and pieces may be useful.
diff --git a/OLD/README.2014 b/OLD/README.2014
@@ -0,0 +1,53 @@
+To install, use:
+
+    $ sudo apt-get install $(cat PACKAGES)
+    $ python setup.py download_models
+    $ sudo python setup.py install
+
+To test the recognizer, run:
+
+    $ ./run-test
+
+OCRopus is really a collection of document analysis programs, not a turn-key OCR system.
+
+In addition to the recognition scripts themselves, there are a number of scripts for
+ground truth editing and correction, measuring error rates, determining confusion matrices, etc.
+OCRopus commands will generally print a stack trace along with an error message;
+this is not generally indicative of a problem (in a future release, we'll suppress the stack
+trace by default since it seems to confuse too many users).
+
+To recognize pages of text, you need to run separate commands: binarization, page layout
+analysis, and text line recognition. 
+
+    # perform binarization
+    ./ocropus-nlbin tests/ersch.png -o book
+
+    # perform page layout analysis
+    ./ocropus-gpageseg 'book/????.bin.png'
+
+    # perform text line recognition (on four cores, with a fraktur model)
+    ./ocropus-rpred -Q 4 -m models/fraktur.pyrnn.gz 'book/????/??????.bin.png'
+
+    # generate HTML output
+    ./ocropus-hocr 'book/????.bin.png' -o ersch.html
+
+    # display the output
+    firefox ersch.html
+
+There are also a number of older commands for text line recognition,
+layout analysis, etc., kept for backwards compatibility. The binarization
+and layout analysis commands of the current release will be replaced in 
+the next release with entirely new, trainable commands.
+
+The main feature of this release is ocropus-rpred, which achieves very
+low error rates on a wide variety of fonts and inputs (even degraded)
+of body text, even without language models or dictionaries. The model
+has been trained on UW3 and UNLV data.  Test set error on UW3 is
+about 0.5% without a language model or dictionary.
+
+There are some things the currently trained models for ocropus-rpred
+will not handle well, largely because they are nearly absent in the
+current training data. That includes all-caps text, some special symbols
+(including "?"), typewriter fonts, and subscripts/superscripts. This will
+be addressed in a future release, and, of course, you are welcome to contribute
+new, trained models.
diff --git a/ocropus-gpageseg b/ocropus-gpageseg
@@ -13,12 +13,13 @@
 
 from pylab import *
 import argparse,glob,os,os.path
+import traceback
 from scipy.ndimage import measurements
 from scipy.misc import imsave
 from scipy.ndimage.filters import gaussian_filter,uniform_filter,maximum_filter
-import ocrolib
-from ocrolib import psegutils,morph,improc,sl
 from multiprocessing import Pool
+import ocrolib
+from ocrolib import psegutils,morph,sl
 from ocrolib.toplevel import *
 
 parser = argparse.ArgumentParser()
@@ -85,6 +86,8 @@ parser.add_argument('files',nargs='+')
 args = parser.parse_args()
 args.files = ocrolib.glob_all(args.files)
 
+def norm_max(v):  # NOTE(review): the call sites visible in this diff use ocrolib.norm_max, not this local copy -- confirm it is needed
+    return v/amax(v)  # divide by the maximum so the largest element becomes 1; assumes amax(v) != 0
 def check_page(image):
     if len(image.shape)==3: return "input image is color image %s"%(image.shape,)
     if mean(image)<median(image): return "image may be inverted"
@@ -237,8 +240,8 @@ def compute_gradmaps(binary,scale):
         grad = gaussian_filter(1.0*cleaned,(max(4,args.vscale*0.3*scale),
                                             args.hscale*scale),order=(1,0))
         grad = uniform_filter(grad,(args.vscale,args.hscale*6*scale))
-    bottom = improc.norm_max((grad<0)*(-grad))
-    top = improc.norm_max((grad>0)*grad)
+    bottom = ocrolib.norm_max((grad<0)*(-grad))
+    top = ocrolib.norm_max((grad>0)*grad)
     return bottom,top,boxmap
 
 def compute_line_seeds(binary,bottom,top,colseps,scale):
@@ -391,7 +394,7 @@ def process1(job):
         os.mkdir(outputdir)
     lines = [lines[i] for i in lsort]
     ocrolib.write_page_segmentation("%s.pseg.png"%outputdir,segmentation)
-    cleaned = improc.remove_noise(binary,args.noise)
+    cleaned = ocrolib.remove_noise(binary,args.noise)
     for i,l in enumerate(lines):
         binline = psegutils.extract_masked(1-cleaned,l,pad=args.pad,expand=args.expand)
         ocrolib.write_image_binary("%s/01%04x.bin.png"%(outputdir,i+1),binline)

diff --git a/ocropus-nlbin b/ocropus-nlbin
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 from pylab import *
+from numpy.ctypeslib import ndpointer
 import argparse,os,os.path
 from scipy.ndimage import filters,interpolation,morphology,measurements
 from scipy import stats

diff --git a/ocropus-rpred b/ocropus-rpred
@@ -5,7 +5,6 @@ import codecs
 from pylab import *
 import os.path
 import ocrolib
-from ocrolib import lineest
 import argparse
 import matplotlib
 from multiprocessing import Pool
@@ -21,12 +20,10 @@ parser.add_argument('-n','--nocheck',action="store_true",
                     help="disable error checking on inputs")
 
 # line dewarping (usually contained in model)
-parser.add_argument("-e","--lineest",default=None,
-                    help="line dewarping model (overrides recognizer)")
+parser.add_argument("-e","--nolineest",action="store_true",
+                    help="target line height (overrides recognizer)")
 parser.add_argument("-l","--height",default=-1,type=int,
                     help="target line height (overrides recognizer)")
-parser.add_argument("-E","--nolineest",action='store_true',
-                    help="skip dewarping (lines are already dewarped)")
 
 # recognition
 parser.add_argument('-m','--model',default=ocrolib.default.rnnmodel,
@@ -108,8 +105,6 @@ network = ocrolib.load_object(args.model,verbose=1)
 # let the user override it (this is not very useful)
 
 lnorm = getattr(network,"lnorm",None)
-if args.lineest is not None:
-    lnorm = lineest.load_normalizer(args.lineest)
 
 if args.height>0:
     lnorm.setHeight(args.height)

diff --git a/ocropus-rtrain b/ocropus-rtrain
@@ -91,7 +91,10 @@ print "# tests",(len(tests) if tests is not None else "None")
 
 # load the line normalizer
 
-lnorm = lineest.load_normalizer(args.lineest)
+if args.lineest=="center":  # "center" is the only normalizer name accepted here
+  lnorm = lineest.CenterNormalizer()
+else:
+  raise Exception(args.lineest+": unknown line normalizer")  # any other value is rejected
 lnorm.setHeight(args.height)
 
 # The `codec` maps between strings and arrays of integers.

diff --git a/ocropus-visualize-results b/ocropus-visualize-results
@@ -1,14 +1,14 @@
 #!/usr/bin/python
 
+import glob
 import matplotlib
 matplotlib.use("AGG")
 import sys,os,signal
 from scipy.ndimage import interpolation
 import pylab
 from pylab import *
-import glob
 import ocrolib
-from ocrolib import linerec,morph
+from ocrolib import morph
 from scipy.misc import imsave
 
 signal.signal(signal.SIGINT,lambda *args:sys.exit(1))

diff --git a/run-coverage b/run-coverage
@@ -0,0 +1,17 @@
+#!/bin/bash -e
+# Run the OCR pipeline and helper tools under coverage.py, then build one HTML report.
+rm -rf .coverage
+rm -rf .coverage.*
+rm -rf temp
+python -m coverage run -p ocropus-nlbin tests/testpage.png -o temp
+python -m coverage run -p ocropus-gpageseg 'temp/????.bin.png'
+python -m coverage run -p ocropus-rpred -n 'temp/????/??????.bin.png'
+python -m coverage run -p ocropus-hocr 'temp/????.bin.png' -o temp.html
+python -m coverage run -p ocropus-visualize-results temp
+python -m coverage run -p ocropus-gtedit html temp/????/??????.bin.png -o temp-correction.html
+python -m coverage run -p ocropus-rpred tests/0079-01000d.png
+python -m coverage run -p ocropus-errs tests/0079-01000d.gt.txt
+python -m coverage run -p ocropus-econf tests/0079-01000d.gt.txt
+python -m coverage combine  # fold the per-run .coverage.* files written by `run -p` into .coverage
+rm -rf htmlcov
+python -m coverage html
diff --git a/run-rtrain b/run-rtrain
@@ -0,0 +1,4 @@
+#!/bin/bash -ex
+# Unpack tests/uw3-500.tgz, then run ocropus-rtrain on the book/*/*.bin.png files.
+tar -zxvf tests/uw3-500.tgz
+ocropus-rtrain 'book/*/*.bin.png' -d 5 -o uw3-500-model  # NOTE(review): presumably book/ is what the archive above extracts to -- confirm