
Commit

Upgraded to html2text v3.01
turbodog committed Feb 17, 2011
1 parent 7a992d8 commit 23a4d02
Showing 1 changed file with 63 additions and 41 deletions.
104 changes: 63 additions & 41 deletions html2text.py
@@ -1,18 +1,36 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.37"
__version__ = "3.01"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]

# TODO:
# Support decoded entities with unifiable.

if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
import urlparse
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
try:
True
except NameError:
setattr(__builtins__, 'True', 1)
setattr(__builtins__, 'False', 0)

def has_key(x, y):
if hasattr(x, 'has_key'): return x.has_key(y)
else: return y in x

try:
import htmlentitydefs
import urlparse
import HTMLParser
except ImportError: #Python3
import html.entities as htmlentitydefs
import urllib.parse as urlparse
import html.parser as HTMLParser
try: #Python3
import urllib.request as urllib
except:
import urllib
import re, sys, codecs, types

try: from textwrap import wrap
except: pass
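
Aside on the import block above: the try/except ImportError fallback plus the new has_key() shim is what keeps one source file running on both Python 2 and 3. A minimal, self-contained sketch of that pattern, using only names taken from the hunk above:

# Illustrative sketch, not part of the commit: the Python 2/3 import fallback
# and the has_key() shim used throughout the new code.
try:
    import htmlentitydefs                # Python 2 module names
    import urlparse
    import HTMLParser
except ImportError:                      # Python 3 locations
    import html.entities as htmlentitydefs
    import urllib.parse as urlparse
    import html.parser as HTMLParser

def has_key(x, y):
    # dict.has_key() is gone in Python 3; fall back to the `in` operator.
    if hasattr(x, 'has_key'): return x.has_key(y)
    else: return y in x

print(has_key({'href': '/'}, 'href'))    # True on both interpreters
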
@@ -64,15 +82,22 @@ def charref(name):
if not UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
return unichr(c)
try:
return unichr(c)
except NameError: #Python3
return chr(c)

def entityref(c):
if not UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c
else: return unichr(name2cp(c))
except KeyError: return "&" + c + ';'
else:
try:
return unichr(name2cp(c))
except NameError: #Python3
return chr(name2cp(c))

def replaceEntities(s):
s = s.group(1)
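
The charref()/entityref() changes above follow one pattern: call unichr() where it exists (Python 2) and fall back to chr() on Python 3, where chr() already returns a text character for any code point. A tiny sketch; codepoint_to_char is a hypothetical helper name, not from the commit:

# Illustrative sketch, not part of the commit.
def codepoint_to_char(c):
    try:
        return unichr(c)     # Python 2
    except NameError:        # Python 3: unichr() no longer exists
        return chr(c)

# codepoint_to_char(0x2019) -> the right single quotation mark on both versions
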
@@ -83,14 +108,6 @@ def replaceEntities(s):
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)

def fixattrs(attrs):
# Fix bug in sgmllib.py
if not attrs: return attrs
newattrs = []
for attr in attrs:
newattrs.append((attr[0], unescape(attr[1])))
return newattrs

### End Entity Nonsense ###

@@ -133,13 +150,16 @@ def hn(tag):
if n in range(1, 10): return n
except ValueError: return 0

class _html2text(sgmllib.SGMLParser):
class _html2text(HTMLParser.HTMLParser):
def __init__(self, out=None, baseurl=''):
sgmllib.SGMLParser.__init__(self)
HTMLParser.HTMLParser.__init__(self)

if out is None: self.out = self.outtextf
else: self.out = out
self.outtext = u''
try:
self.outtext = unicode()
except NameError: # Python3
self.outtext = str()
self.quiet = 0
self.p_p = 0
self.outcount = 0
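
The base-class switch above means the parser now relies on HTMLParser's callback names (handle_starttag/handle_endtag, seen in a later hunk) instead of sgmllib's unknown_starttag/unknown_endtag. A minimal sketch of that callback API; TagLogger and the sample markup are illustrative, not from the commit:

# Illustrative sketch, not part of the commit: the HTMLParser hooks that
# replace sgmllib's unknown_* methods.
try:
    import HTMLParser                    # Python 2
except ImportError:
    import html.parser as HTMLParser     # Python 3

class TagLogger(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)     # same base-class call as the diff
        self.events = []
    def handle_starttag(self, tag, attrs):       # was unknown_starttag
        self.events.append(('start', tag))
    def handle_endtag(self, tag):                # was unknown_endtag
        self.events.append(('end', tag))
    def handle_data(self, data):
        self.events.append(('data', data))

p = TagLogger()
p.feed('<p>hello <b>world</b></p>')
p.close()
print(p.events)   # [('start', 'p'), ('data', 'hello '), ('start', 'b'), ...]
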
@@ -162,7 +182,7 @@ def outtextf(self, s):
self.outtext += s

def close(self):
sgmllib.SGMLParser.close(self)
HTMLParser.HTMLParser.close(self)

self.pbr()
self.o('', 0, 'end')
@@ -175,10 +195,10 @@ def handle_charref(self, c):
def handle_entityref(self, c):
self.o(entityref(c))

def unknown_starttag(self, tag, attrs):
def handle_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)

def unknown_endtag(self, tag):
def handle_endtag(self, tag):
self.handle_tag(tag, None, 0)

def previousIndex(self, attrs):
@@ -187,16 +207,16 @@ def previousIndex(self, attrs):
If the set of attributes is not found, returns None
"""
if not attrs.has_key('href'): return None
if not has_key(attrs, 'href'): return None

i = -1
for a in self.a:
i += 1
match = 0

if a.has_key('href') and a['href'] == attrs['href']:
if a.has_key('title') or attrs.has_key('title'):
if (a.has_key('title') and attrs.has_key('title') and
if has_key(a, 'href') and a['href'] == attrs['href']:
if has_key(a, 'title') or has_key(attrs, 'title'):
if (has_key(a, 'title') and has_key(attrs, 'title') and
a['title'] == attrs['title']):
match = True
else:
@@ -205,7 +225,7 @@ def previousIndex(self, attrs):
if match: return i

def handle_tag(self, tag, attrs, start):
attrs = fixattrs(attrs)
#attrs = fixattrs(attrs)

if hn(tag):
self.p()
@@ -246,7 +266,7 @@ def handle_tag(self, tag, attrs, start):

self.abbr_title = None
self.abbr_data = ''
if attrs.has_key('title'):
if has_key(attrs, 'title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
@@ -259,7 +279,7 @@ def handle_tag(self, tag, attrs, start):
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.o("[")
else:
@@ -276,13 +296,13 @@ def handle_tag(self, tag, attrs, start):
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + `a['count']` + "]")
self.o("][" + str(a['count']) + "]")

if tag == "img" and start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('src'):
if has_key(attrs, 'src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
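
The `a['count']` to str(a['count']) edits in this and the following hunks are all the same fix: backtick repr expressions were removed in Python 3, so the counters are formatted with str() instead. For example (the value is illustrative):

# Illustrative sketch, not part of the commit: `x` was Python 2 shorthand for
# repr(x) and is a SyntaxError on Python 3; str() works on both.
count = 3
print("][" + str(count) + "]")   # ][3]
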
@@ -295,7 +315,7 @@ def handle_tag(self, tag, attrs, start):
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+`attrs['count']`+"]")
self.o("]["+ str(attrs['count']) +"]")

if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
@@ -319,7 +339,7 @@ def handle_tag(self, tag, attrs, start):
if li['name'] == "ul": self.o("* ")
elif li['name'] == "ol":
li['num'] += 1
self.o(`li['num']`+". ")
self.o(str(li['num'])+". ")
self.start = 1
else:
self.pbr()
@@ -388,8 +408,8 @@ def o(self, data, puredata=0, force=0):
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
if has_key(link, 'title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
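
The reference-link footer above still resolves each stored href against the base URL with urlparse.urljoin, which the import block maps to urllib.parse on Python 3. A small usage sketch with an illustrative URL:

# Illustrative sketch, not part of the commit.
try:
    import urlparse                      # Python 2
except ImportError:
    import urllib.parse as urlparse      # Python 3

print(urlparse.urljoin('http://example.com/docs/', '../img/logo.png'))
# http://example.com/img/logo.png
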
@@ -413,7 +433,7 @@ def handle_data(self, data):

def unknown_decl(self, data): pass

def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
def wrapwrite(text): sys.stdout.write(text)

def html2text_file(html, out=wrapwrite, baseurl=''):
h = _html2text(out, baseurl)
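
Dropping the explicit .encode('utf8') in wrapwrite() matches Python 3, where sys.stdout is a text stream that encodes on the way out and would reject bytes. A sketch of the new behaviour; the sample string is illustrative, and on Python 2 non-ASCII output may still depend on the locale:

# Illustrative sketch, not part of the commit.
import sys

def wrapwrite(text):
    sys.stdout.write(text)   # Python 3: pass str, the stream handles encoding
                             # Python 2: byte strings pass through unchanged

wrapwrite("Markdown output\n")
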
@@ -428,7 +448,7 @@ def html2text(html, baseurl=''):
baseurl = ''
if sys.argv[1:]:
arg = sys.argv[1]
if arg.startswith('http://'):
if arg.startswith('http://') or arg.startswith('https://'):
baseurl = arg
j = urllib.urlopen(baseurl)
try:
@@ -444,8 +464,10 @@ def html2text(html, baseurl=''):
encoding = 'utf8'
if len(sys.argv) > 2:
encoding = sys.argv[2]
data = open(arg, 'r').read().decode(encoding)
try: #Python3
data = open(arg, 'r', encoding=encoding).read()
except TypeError:
data = open(arg, 'r').read().decode(encoding)
else:
data = sys.stdin.read().decode('utf8')
data = sys.stdin.read()
wrapwrite(html2text(data, baseurl))
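
The file-reading branch now tries Python 3's open(..., encoding=...) first and falls back to read-then-decode when the keyword raises TypeError on Python 2. The same pattern as a standalone sketch; read_text and the filename are placeholders, not from the commit:

# Illustrative sketch, not part of the commit.
def read_text(path, encoding='utf8'):
    try:                                 # Python 3: open() accepts encoding=
        return open(path, 'r', encoding=encoding).read()
    except TypeError:                    # Python 2: no encoding keyword
        return open(path, 'r').read().decode(encoding)

# data = read_text('page.html'); then wrapwrite(html2text(data, baseurl))
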
