-
Notifications
You must be signed in to change notification settings - Fork 168
/
Copy pathtesseract1.py
102 lines (90 loc) · 3.49 KB
/
tesseract1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Demo script using Tesseract OCR.
Extract text of a page and interpret unrecognized characters using Tesseract.
MuPDF codes unrecognizable characters as 0xFFFD = 65533.
Extraction option is "dict", which delivers contiguous text pieces within one
line, that have the same font properties (color, fontsize, etc.). Together with
the language parameter, this helps Tesseract finding the correct character.
The basic approach is to only invoke OCR, if the span text contains
chr(65533). Because Tesseract's response ignores leading spaces and appends
line break characters, some adjustments are made.
--------------
This demo will OCR only text, that is known to be text. This means, it
does not look at parts of a page containing images or text encoded as drawings.
--------------
Dependencies:
Tesseract must be installed and invocable via Python's 'subprocess' module.
You also must install all the Tesseract language support you need to detect.
"""
import fitz
import subprocess
import time
# Tesseract invocation command (Windows version)
# Assume: language English. Detect more languages by add e.g. '+deu' for German.
# Assume: text represents one line (--psm 7)
# Note: Language mix spec increases duration by >40% - only use when needed!
tess = "tesseract stdin stdout --psm 7 -l eng"
mat = fitz.Matrix(4, 4) # high resolution matrix
ocr_time = 0
pix_time = 0
def get_tessocr(page, bbox):
"""Return OCR-ed span text using Tesseract.
Args:
page: fitz.Page
bbox: fitz.Rect or its tuple
Returns:
The OCR-ed text of the bbox.
"""
global ocr_time, pix_time, tess, mat
# Step 1: Make a high-resolution image of the bbox.
t0 = time.perf_counter()
pix = page.get_pixmap(
colorspace=fitz.csGRAY, # we need no color
matrix=mat,
clip=bbox,
)
image = pix.tobytes("png") # make a PNG image
t1 = time.perf_counter()
# Step 2: Invoke Tesseract to OCR the image. Text is stored in stdout.
rc = subprocess.run(
tess, # the command
input=image, # the pixmap image
stdout=subprocess.PIPE, # find the text here
shell=True,
)
# because we told Tesseract to interpret the image as one line, we now need
# to strip off the line break characters from the tail.
text = rc.stdout.decode() # convert to string
text = text[:-3] # remove line end characters
t2 = time.perf_counter()
ocr_time += t2 - t1
pix_time += t1 - t0
return text
doc = fitz.open("v110-changes.pdf")
ocr_count = 0
for page in doc:
blocks = page.get_text("dict", flags=0)["blocks"]
for b in blocks:
for l in b["lines"]:
for s in l["spans"]:
text = s["text"]
if chr(65533) in text: # invalid characters encountered!
# invoke OCR
ocr_count += 1
print("before: '%s'" % text)
text1 = text.lstrip()
sb = " " * (len(text) - len(text1)) # leading spaces
text1 = text.rstrip()
sa = " " * (len(text) - len(text1)) # trailing spaces
new_text = sb + get_tessocr(page, s["bbox"]) + sa
print(" after: '%s'" % new_text)
print("-------------------------")
print("OCR invocations: %i." % ocr_count)
print(
"Pixmap time: %g (avg %g) seconds."
% (round(pix_time, 5), round(pix_time / ocr_count, 5))
)
print(
"OCR time: %g (avg %g) seconds."
% (round(ocr_time, 5), round(ocr_time / ocr_count, 5))
)