Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

making a tool for generating visible text without using boilerpipe

  • Loading branch information...
commit e0ff8488b09c29ef1c386667d6e37a6017e6f95a 1 parent fc09866
John R. Frank authored
Showing with 48 additions and 0 deletions.
  1. +48 −0 strip_tags.py
View
48 strip_tags.py
@@ -0,0 +1,48 @@
+'''
+strip_tags maintains byte position of all visible text in an HTML
+document and removes all of parts that are not visible in a web
+browser. This allows taggers to operate on the visible-only text and
+any standoff taggs can refer to the original byte positions.
+'''
+
+import re
+## regex to identify all HTML tags, including SCRIPT and STYLE tags
+invisible = re.compile('(?P<before>(.|\n)*?)(?P<invisible>(<script(.|\n)*?</script>|<style(.|\n)*?</style>|<(.|\n)*?>))', re.I)
+
+def strip_tags(html):
+ '''
+ Takes an HTML-like binary string as input and returns a binary
+ string of the same length with all tags replaced by whitespace.
+ This also detects script and style tags, and replaces the text
+ between them with whitespace.
+
+ Pre-existing whitespace of any kind (newlines, tabs) is converted
+ to single spaces ' ', which has the same byte length (and
+ character length).
+
+ Note: this does not change any characters like &rsquo; and &nbsp;,
+ so taggers operating on this text must cope with such symbols.
+ Converting them to some other character would change their byte
+ length, even if equivalent from a character perspective.
+ '''
+ text = ''
+ for m in invisible.finditer(html):
+ text += m.group('before')
+ text += ' ' * len(m.group('invisible'))
+
+ ## text better be >= original
+ assert len(html) >= len(text), '%d !>= %d' % (len(html), len(text))
+
+ ## capture any characters after the last tag... such as newlines
+ tail = len(html) - len(text)
+ text += html[-tail:]
+
+ ## now they must be equal
+ assert len(html) == len(text), '%d != %d' % (len(html), len(text))
+
+ return text
+
+
+if __name__ == '__main__':
+ print strip_tags(open('sample-input.html').read())
+
Please sign in to comment.
Something went wrong with that request. Please try again.