Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

First draft of OCR cleanup ... Docsplit::TextCleaner

  • Loading branch information...
commit 41e257a869a07d2bb207f2eefb770b62e98d66e5 1 parent e2d8c2d
@jashkenas jashkenas authored
View
6 lib/docsplit.rb
@@ -72,6 +72,11 @@ def self.extract_#{key}(pdfs, opts={})
EOS
end
# Convenience entry point for scrubbing OCR output: builds a fresh
# `TextCleaner` and returns whatever its `clean` method produces for `text`.
def self.clean_text(text)
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
+
private
@@ -103,3 +108,4 @@ def self.normalize_value(value)
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
View
5 lib/docsplit/command_line.rb
@@ -71,7 +71,7 @@ def usage
# Use the OptionParser library to parse out all supported options. Return
# options formatted for the Ruby API.
def parse_options
- @options = {:ocr => :default}
+ @options = {:ocr => :default, :clean => true}
@option_parser = OptionParser.new do |opts|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
@options[:output] = d
@@ -88,6 +88,9 @@ def parse_options
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
@options[:ocr] = o
end
+ opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
+ @options[:clean] = false
+ end
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
@options[:rolling] = true
end
View
90 lib/docsplit/text_cleaner.rb
@@ -0,0 +1,90 @@
+require 'strscan'
+
module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[^a-z0-9\s]/i
    REPEAT      = /(.)\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    # An all-uppercase token, optionally parenthesised, pluralised or followed
    # by a single '.' or ','. Runs of capitals may be joined by '.' or '-' so
    # that initialisms such as "U.S.", "U.S.A." and "AFL-CIO" count as
    # acronyms instead of being thrown away by the uppercase heuristic.
    ACRONYM     = /^\(?[A-Z]+(?:[.-][A-Z]+)*('?s|[.,])?\)?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_4     = /[aeiou]{4}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Returns a copy of `text` with garbage words removed.
    #
    # The text is walked as alternating runs of whitespace and non-whitespace.
    # Words rejected by `garbage` are dropped. A whitespace run that follows
    # another whitespace run (because the word between them was dropped) is
    # only kept if it contains a line break, so removed words do not leave
    # doubled spaces but line structure survives. Finally, long runs of very
    # short tokens (see REPEATED) are stripped from the joined result.
    #
    # For the time being, `clean` uses the regular StringScanner, and not the
    # multibyte-aware version.
    def clean(text)
      scanner = StringScanner.new(text)
      cleaned = []
      spaced  = false
      until scanner.eos?
        if (space = scanner.scan(SPACE))
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif (word = scanner.scan(WORD)) && !garbage(word)
          cleaned.push(word)
          spaced = false
        end
      end
      cleaned.join.gsub(REPEATED, '')
    end

    # Is a given word OCR garbage? Returns true or false.
    def garbage(w)
      acronym = w =~ ACRONYM
      # `w[1...-1]` is nil for an empty string; treat that as "no interior".
      interior = w[1...-1] || ''

      !!(
        # More than 20 characters in length.
        (w.length > 20) ||

        # If there are three or more identical characters in a row in the string.
        (w =~ REPEAT) ||

        # More punctuation than alpha numerics.
        (w.scan(ALNUM).length < w.scan(PUNCT).length) ||

        # Ignoring the first and last characters in the string, if there are
        # three or more different punctuation characters in the string.
        (interior.scan(PUNCT).uniq.length >= 3) ||

        # Four or more consecutive vowels, or five or more consecutive consonants.
        (w =~ VOWEL_4) || (w =~ CONSONANT_5) ||

        # Number of uppercase letters greater than lowercase letters, but the
        # word is not all uppercase + punctuation.
        (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

        # Single letters that are not A or I.
        ((w.length == 1) && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

        # All characters are alphabetic and there are 8 times more vowels than
        # consonants, or 8 times more consonants than vowels.
        (!acronym && (w.length > 2) && (w =~ ALL_ALPHA) && lopsided?(w))
      )
    end

    private

    # True when vowels outnumber consonants by more than 8 to 1, or the
    # other way round.
    def lopsided?(w)
      vowels     = w.scan(VOWEL).length
      consonants = w.scan(CONSONANT).length
      (vowels > consonants * 8) || (consonants > vowels * 8)
    end

  end

end
View
15 lib/docsplit/text_extractor.rb
@@ -62,14 +62,17 @@ def extract_from_ocr(pdf, pages)
if pages
pages.each do |page|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+ file = "#{base_path}_#{page}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
- run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+ run "tesseract #{tiff} #{file} 2>&1"
+ clean_text(file + '.txt') if @clean_ocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
+ clean_text(base_path + '.txt') if @clean_ocr
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -78,6 +81,15 @@ def extract_from_ocr(pdf, pages)
private
# Rewrites the text file at path `file` in place, replacing its contents
# with the result of passing them through `Docsplit.clean_text`.
#
# Any exception raised while reading or cleaning propagates to the caller;
# in that case the file is left untouched.
def clean_text(file)
  File.open(file, 'r+') do |f|
    # Clean BEFORE truncating: if the cleaner raises, the original OCR
    # output is still on disk rather than an empty file.
    cleaned = Docsplit.clean_text(f.read)
    f.rewind
    f.truncate(0)
    f.write(cleaned)
  end
end
+
# Run an external process and raise an exception if it fails.
def run(command)
result = `#{command}`
@@ -106,6 +118,7 @@ def extract_options(options)
@pages = options[:pages]
@force_ocr = options[:ocr] == true
@forbid_ocr = options[:ocr] == false
+ @clean_ocr = options[:clean]
end
end
Please sign in to comment.
Something went wrong with that request. Please try again.