/
TidyTools.py
executable file
·128 lines (102 loc) · 3.21 KB
/
TidyTools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#
# ElementTree
# $Id: TidyTools.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# tools to run the "tidy" command on an HTML or XHTML file, and return
# the contents as an XHTML element tree.
#
# history:
# 2002-10-19 fl added to ElementTree library; added getzonebody function
#
# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
##
# Tools to build element trees from HTML, using the external <b>tidy</b>
# utility.
##
import glob, string, os, sys
from ElementTree import ElementTree, Element
NS_XHTML = "{http://www.w3.org/1999/xhtml}"
##
# Convert an HTML or HTML-like file to XHTML, using the <b>tidy</b>
# command line utility.
#
# @param file Filename.
# @param new_inline_tags An optional list of valid but non-standard
# inline tags.
# @return An element tree, or None if not successful.
def tidy(file, new_inline_tags=None):
    """Run the external tidy utility on *file* and parse its XHTML output.

    tidy's standard output and standard error are captured in
    ``<file>.out`` and ``<file>.err``.  Both files are removed once the
    output has been parsed successfully; after a failure they are left
    in place so the problem can be inspected.

    file -- name of the HTML (or HTML-like) file to convert.
    new_inline_tags -- optional list of non-standard inline tag names,
        handed to tidy through its ``--new-inline-tags`` option.

    Returns an ElementTree instance, or None if not successful.
    """
    # Imported locally so this function does not depend on a change to
    # the module's import block.
    import subprocess

    command = ["tidy", "-qn", "-asxml"]
    if new_inline_tags:
        command.append("--new-inline-tags")
        command.append(",".join(new_inline_tags))

    # FIXME: support more tidy options!

    # The file name is passed as its own argument and no shell is
    # involved, so names containing spaces or shell metacharacters are
    # handled literally.
    command.append(file)

    out_name = file + ".out"
    err_name = file + ".err"

    try:
        # convert
        with open(out_name, "w") as out:
            with open(err_name, "w") as err:
                try:
                    # The exit status is ignored; whether the output
                    # parses as XML decides success.
                    subprocess.call(command, stdout=out, stderr=err)
                except OSError as exc:
                    # tidy could not be started: record why in the .err
                    # file and let the parse below report the failure.
                    err.write("%s\n" % exc)

        # check that the result is valid XML
        tree = ElementTree()
        tree.parse(out_name)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** %s is not valid XML "
              "(check %s.err for info)" % (file, file))
        tree = None
    else:
        # success: the intermediate files are no longer needed
        if os.path.isfile(out_name):
            os.remove(out_name)
        if os.path.isfile(err_name):
            os.remove(err_name)

    return tree
##
# Get document body from an HTML or HTML-like file. This function
# uses the <b>tidy</b> function to convert HTML to XHTML, and cleans
# up the resulting XML tree.
#
# @param file Filename.
# @return A <b>body</b> element, or None if not successful.
def getbody(file, **options):
    """Return the cleaned-up body element of an HTML or HTML-like file.

    The file is converted with tidy(); the XHTML namespace prefix is
    then stripped from every tag in the resulting tree, so elements can
    be looked up by their plain names.

    file -- name of the file to convert.
    options -- keyword arguments passed through to tidy().

    Returns the result of looking up "body" under the root element, or
    None if the conversion was not successful.
    """
    # get xhtml tree
    try:
        tree = tidy(file, **options)
    except IOError as v:
        print("*** %s" % (v,))
        return None
    if tree is None:
        return None

    # Newer ElementTree releases provide iter() in place of
    # getiterator(); use whichever this tree object offers.
    walk = getattr(tree, "iter", None) or tree.getiterator

    # remove namespace uris
    prefix = NS_XHTML
    for node in walk():
        if node.tag.startswith(prefix):
            node.tag = node.tag[len(prefix):]

    return tree.getroot().find("body")
##
# Same as <b>getbody</b>, but turns plain text at the start of the
# document into an H1 tag. This function can be used to parse zone
# documents.
#
# @param file Filename.
# @return A <b>body</b> element, or None if not successful.
def getzonebody(file, **options):
    """Like getbody(), but turn leading plain text into an h1 element.

    If the body starts with non-blank text, that text (stripped of
    surrounding whitespace) is moved into a new h1 element inserted as
    the first child of the body, and the body's own text is cleared.

    file -- name of the file to convert.
    options -- keyword arguments passed through to getbody().

    Returns the body element, or None if getbody() returned None.
    """
    body = getbody(file, **options)
    if body is None:
        return None

    # Strip once, with the string method rather than the deprecated
    # string.strip function; text that is missing or whitespace-only
    # leaves the body untouched.
    heading = (body.text or "").strip()
    if heading:
        title = Element("h1")
        title.text = heading
        title.tail = "\n\n"
        body.insert(0, title)
        body.text = None

    return body
if __name__ == "__main__":
    # Command-line use: run tidy() on every file matching each argument
    # (arguments are glob patterns) and print the result per file.
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            # Write the file name before tidy() runs, so any diagnostics
            # it prints follow the name, then finish the line with the
            # result (an element tree object or None).
            sys.stdout.write("%s ... " % filename)
            sys.stdout.write("%s\n" % (tidy(filename),))