/
strip_tags.py
48 lines (37 loc) · 1.71 KB
/
strip_tags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
'''
strip_tags maintains the byte position of all visible text in an HTML
document and removes all of the parts that are not visible in a web
browser.  This allows taggers to operate on the visible-only text, and
any standoff tags can refer to the original byte positions.
'''
import re
## regex to identify all HTML tags, including SCRIPT and STYLE tags.
## Each match consumes the visible text that precedes an invisible span
## (named group 'before') followed by the invisible span itself (named
## group 'invisible'), so successive matches tile the document from its
## first character through the end of the last tag.
invisible = re.compile(
    ## visible text leading up to the next invisible span; (.|\n) lets
    ## the lazy match run across newlines
    '(?P<before>(.|\n)*?)'
    ## everything from an opening SCRIPT or STYLE tag through its
    ## closing tag or, failing that, any single <...> tag
    '(?P<invisible>(<script(.|\n)*?</script>|<style(.|\n)*?</style>|<(.|\n)*?>))',
    ## tag names are matched case-insensitively
    re.I)


def strip_tags(html):
    '''
    Takes an HTML-like string as input and returns a string of the
    same length with all tags replaced by whitespace.

    This also detects script and style tags, and replaces the text
    between them with whitespace.

    Every character inside an invisible span (including any newlines
    or tabs found there) becomes a single space ' ', which has the
    same length.  Visible text, including its own whitespace, is
    copied through unchanged so that offsets into the result line up
    with offsets into the input.

    Note: character entities in the visible text are not decoded, so
    taggers operating on this text must cope with such symbols.
    Converting them to some other character would change their
    length, even if equivalent from a character perspective.

    Raises AssertionError if the result does not have the same length
    as the input; this is an internal sanity check.
    '''
    pieces = []
    ## offset in html just past the last tag seen so far
    consumed = 0
    for m in invisible.finditer(html):
        pieces.append(m.group('before'))
        pieces.append(' ' * len(m.group('invisible')))
        consumed = m.end()

    ## capture any characters after the last tag... such as newlines.
    ## Slicing from a forward offset is deliberate: a negative slice
    ## such as html[-tail:] returns the *whole* string when tail is 0,
    ## which used to duplicate documents that end with a tag.
    pieces.append(html[consumed:])
    text = ''.join(pieces)

    ## now they must be equal
    assert len(html) == len(text), '%d != %d' % (len(html), len(text))
    return text
if __name__ == '__main__':
    ## Script entry point: blank out the tags in the sample document
    ## and write the result to stdout.  The context manager closes the
    ## file handle promptly instead of leaving that to garbage
    ## collection, and the single-argument parenthesised print runs
    ## under both Python 2 and Python 3.
    with open('sample-input.html') as sample:
        print(strip_tags(sample.read()))