-
Notifications
You must be signed in to change notification settings - Fork 589
/
Copy pathtest_mupdf_regressions.py
116 lines (97 loc) · 3.68 KB
/
test_mupdf_regressions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pymupdf
import os
import gentle_compare
# Absolute path of the directory containing this test module; the tests below
# build their PDF fixture paths from it (under "resources").
scriptdir = os.path.abspath(os.path.dirname(__file__))
def test_707448():
    """Check that sanitizing page contents keeps the extracted words intact.

    The words of the first page of ``resources/test-707448.pdf`` are
    extracted before and after ``clean_contents(sanitize=True)``; both word
    lists must be accepted as equal by ``gentle_compare.gentle_compare``.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707448.pdf")
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    words_before = first_page.get_text("words")
    first_page.clean_contents(sanitize=True)
    words_after = first_page.get_text("words")

    assert gentle_compare.gentle_compare(words_before, words_after)
def test_707673():
    """Check that sanitizing page contents keeps the extracted words intact.

    Fails starting with MuPDF v1.23.9.

    Fixed in:
        commit 779b8234529cb82aa1e92826854c7bb98b19e44b (golden/master)

    The test therefore expects the before/after word lists to compare equal
    for MuPDF >= 1.24.1 and to differ for older versions.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test-707673.pdf")
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    words_before = first_page.get_text("words")
    first_page.clean_contents(sanitize=True)
    words_after = first_page.get_text("words")

    unchanged = gentle_compare.gentle_compare(words_before, words_after)
    has_fix = pymupdf.mupdf_version_tuple >= (1, 24, 1)
    # With the fix the words must survive cleaning; without it they must not.
    assert unchanged if has_fix else not unchanged
def test_707727():
    """Check that sanitizing page contents keeps the rendered page intact.

    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707727

    The first page of ``resources/test_3362.pdf`` is rendered before and
    after ``clean_contents(sanitize=True)``. Both pixmaps are saved as PNG
    files for inspection and their RMS difference is checked against the
    expectation for the MuPDF version in use.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3362.pdf")
    document = pymupdf.open(pdf_path)
    page = document[0]

    pix_before = page.get_pixmap()
    page.clean_contents(sanitize=True)
    # Reloading the page is required to prevent re-use.
    page = document.reload_page(page)
    pix_after = page.get_pixmap()

    # The name `rms` is part of the printed output via the `=` specifier.
    rms = gentle_compare.pixmaps_rms(pix_before, pix_after)
    print(f'{rms=}', flush=True)

    pix_before.save(os.path.normpath(f'{__file__}/../../tests/test_707727_pix0.png'))
    pix_after.save(os.path.normpath(f'{__file__}/../../tests/test_707727_pix1.png'))

    version = pymupdf.mupdf_version_tuple
    if version >= (1, 25, 2):
        # New sanitising gives small fp rounding errors.
        assert rms < 0.05
    elif version > (1, 24, 1):
        assert rms == 0
    else:
        # version <= (1, 24, 1): rendering is expected to change ...
        assert rms != 0
        # ... and we expect warnings.
        wt = pymupdf.TOOLS.mupdf_warnings()
        print(f"{wt=}")
        assert wt
def test_707721():
    """Confirm text extraction works for nested MCID with Type 3 fonts.

    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3357
    MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707721

    The test is skipped (with a printed notice) for MuPDF versions older
    than 1.24.2. Otherwise it extracts the text of the first page of
    ``resources/test_3357.pdf`` and requires a non-empty result.
    """
    if pymupdf.mupdf_version_tuple < (1, 24, 2):
        # The message must be an f-string, otherwise the placeholder is
        # printed literally instead of the actual MuPDF version.
        print(
            f"test_707721(): not running because MuPDF-{pymupdf.mupdf_version} known to hang."
        )
        return
    filename = os.path.join(scriptdir, "resources", "test_3357.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]
    ok = page.get_text()
    assert ok
def test_3376():
    """Check fix of MuPDF bug 707733.

    https://bugs.ghostscript.com/show_bug.cgi?id=707733
    PyMuPDF issue https://github.com/pymupdf/PyMuPDF/issues/3376

    Test file contains a redaction for the first 3 words: "Table of Contents".

    Test strategy:
    - extract all words (sorted)
    - apply redactions
    - extract words again
    - confirm: we now have 3 words less and remaining words are equal.

    For MuPDF versions older than 1.24.2 the comparison is expected to fail.
    """
    pdf_path = os.path.join(scriptdir, "resources", "test_3376.pdf")
    document = pymupdf.open(pdf_path)
    first_page = document[0]

    all_words = first_page.get_text("words", sort=True)
    # Split into the 3 words covered by the redaction and everything else.
    redacted_words, kept_words = all_words[:3], all_words[3:]
    # Item 4 of each word entry is the text compared against the heading.
    heading = " ".join(entry[4] for entry in redacted_words)
    assert heading == "Table of Contents"

    first_page.apply_redactions()
    words_after = first_page.get_text("words", sort=True)

    unchanged = gentle_compare.gentle_compare(kept_words, words_after)
    has_fix = pymupdf.mupdf_version_tuple >= (1, 24, 2)
    # With the fix only the redacted words disappear; without it more changes.
    assert unchanged if has_fix else not unchanged