Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

We’re showing branches in this repository, but you can also compare across forks.

...
  • 2 commits
  • 3 files changed
  • 0 commit comments
  • 1 contributor
Showing with 786 additions and 6 deletions.
  1. +1 −0  config.yaml
  2. +9 −6 exitwp.py
  3. +776 −0 html2text.py
1  config.yaml
View
@@ -5,6 +5,7 @@ wp_exports: wordpress-xml
build_dir: build
# Output format: primary choices are html or markdown.
+#target_format: markdown
target_format: markdown
# The date format of the wikipedia export file.
15 exitwp.py
View
@@ -12,7 +12,7 @@
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse, urljoin
from urllib import urlretrieve
-from html2text import html2text
+from html2text import html2text_file
'''
exitwp - Wordpress xml exports to Jekykll blog format conversion
@@ -37,12 +37,14 @@
date_fmt=config['date_format']
def html2fmt(html, target_format):
- html = html.replace("\n\n", '<br>')
+# html = html.replace("\n\n", '<br/><br/>')
+ # html = html.replace('<pre lang="xml">', '<pre lang="xml"><![CDATA[')
+ # html = html.replace('</pre>', ']]></pre>')
if target_format=='html':
return html
else:
# This is like very stupid but I was having troubles with unicode encodings and process.POpen
- return html2text(html, '')
+ return html2text_file(html, None)
def parse_wp_xml(file):
ns = {
@@ -107,7 +109,6 @@ def gi(q, unicode_wrap=True):
export_item = {
'title' : gi('title'),
- 'author' : gi('dc:creator'),
'date' : gi('wp:post_date'),
'slug' : gi('wp:post_name'),
'status' : gi('wp:status'),
@@ -237,7 +238,6 @@ def get_attachment_path(src, dir, dir_prefix='a'):
yaml_header = {
'title' : i['title'],
'date' : i['date'],
- 'author' : i['author'],
'slug' : i['slug'],
'status' : i['status'],
'wordpress_id' : i['wp_id'],
@@ -285,7 +285,10 @@ def toyaml(data):
if len(tax_out)>0: out.write(toyaml(tax_out))
out.write('---\n\n')
- out.write(html2fmt(i['body'], target_format))
+ try:
+ out.write(html2fmt(i['body'], target_format))
+ except:
+ print "\n Parse error on: "+i['title']
out.close()
print "\n"
776 html2text.py
View
@@ -0,0 +1,776 @@
+#!/usr/bin/env python
+"""html2text: Turn HTML into equivalent Markdown-structured text."""
+__version__ = "3.1"
+__author__ = "Aaron Swartz (me@aaronsw.com)"
+__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
+__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
+
+# TODO:
+# Support decoded entities with unifiable.
+
+try:
+ True
+except NameError:
+ setattr(__builtins__, 'True', 1)
+ setattr(__builtins__, 'False', 0)
+
+def has_key(x, y):
+ if hasattr(x, 'has_key'): return x.has_key(y)
+ else: return y in x
+
+try:
+ import htmlentitydefs
+ import urlparse
+ import HTMLParser
+except ImportError: #Python3
+ import html.entities as htmlentitydefs
+ import urllib.parse as urlparse
+ import html.parser as HTMLParser
+try: #Python3
+ import urllib.request as urllib
+except:
+ import urllib
+import optparse, re, sys, codecs, types
+
+try: from textwrap import wrap
+except: pass
+
+# Use Unicode characters instead of their ascii psuedo-replacements
+UNICODE_SNOB = 0
+
+# Put the links after each paragraph instead of at the end.
+LINKS_EACH_PARAGRAPH = 0
+
+# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
+BODY_WIDTH = 78
+
+# Don't show internal links (href="#local-anchor") -- corresponding link targets
+# won't be visible in the plain text file anyway.
+SKIP_INTERNAL_LINKS = True
+
+# Use inline, rather than reference, formatting for images and links
+INLINE_LINKS = True
+
+# Number of pixels Google indents nested lists
+GOOGLE_LIST_INDENT = 36
+
+IGNORE_ANCHORS = False
+IGNORE_IMAGES = False
+
+### Entity Nonsense ###
+
+def name2cp(k):
+ if k == 'apos': return ord("'")
+ if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
+ return htmlentitydefs.name2codepoint[k]
+ else:
+ k = htmlentitydefs.entitydefs[k]
+ if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
+ return ord(codecs.latin_1_decode(k)[0])
+
+unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
+'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
+'ndash':'-', 'oelig':'oe', 'aelig':'ae',
+'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
+'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
+'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
+'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
+'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
+'lrm':'', 'rlm':''}
+
+unifiable_n = {}
+
+for k in unifiable.keys():
+ unifiable_n[name2cp(k)] = unifiable[k]
+
+def charref(name):
+ if name[0] in ['x','X']:
+ c = int(name[1:], 16)
+ else:
+ c = int(name)
+
+ if not UNICODE_SNOB and c in unifiable_n.keys():
+ return unifiable_n[c]
+ else:
+ try:
+ return unichr(c)
+ except NameError: #Python3
+ return chr(c)
+
+def entityref(c):
+ if not UNICODE_SNOB and c in unifiable.keys():
+ return unifiable[c]
+ else:
+ try: name2cp(c)
+ except KeyError: return "&" + c + ';'
+ else:
+ try:
+ return unichr(name2cp(c))
+ except NameError: #Python3
+ return chr(name2cp(c))
+
+def replaceEntities(s):
+ s = s.group(1)
+ if s[0] == "#":
+ return charref(s[1:])
+ else: return entityref(s)
+
+r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+def unescape(s):
+ return r_unescape.sub(replaceEntities, s)
+
+### End Entity Nonsense ###
+
+def onlywhite(line):
+ """Return true if the line does only consist of whitespace characters."""
+ for c in line:
+ if c is not ' ' and c is not ' ':
+ return c is ' '
+ return line
+
+def optwrap(text):
+ """Wrap all paragraphs in the provided text."""
+ if not BODY_WIDTH:
+ return text
+
+ assert wrap, "Requires Python 2.3."
+ result = ''
+ newlines = 0
+ for para in text.split("\n"):
+ if len(para) > 0:
+ if para[0] != ' ' and para[0] != '-' and para[0] != '*':
+ for line in wrap(para, BODY_WIDTH):
+ result += line + "\n"
+ result += "\n"
+ newlines = 2
+ else:
+ if not onlywhite(para):
+ result += para + "\n"
+ newlines = 1
+ else:
+ if newlines < 2:
+ result += "\n"
+ newlines += 1
+ return result
+
+def hn(tag):
+ if tag[0] == 'h' and len(tag) == 2:
+ try:
+ n = int(tag[1])
+ if n in range(1, 10): return n
+ except ValueError: return 0
+
+def dumb_property_dict(style):
+ """returns a hash of css attributes"""
+ return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
+
+def dumb_css_parser(data):
+ """returns a hash of css selectors, each of which contains a hash of css attributes"""
+ # remove @import sentences
+ importIndex = data.find('@import')
+ while importIndex != -1:
+ data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
+ importIndex = data.find('@import')
+
+ # parse the css. reverted from dictionary compehension in order to support older pythons
+ elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
+ elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
+
+ return elements
+
+def element_style(attrs, style_def, parent_style):
+ """returns a hash of the 'final' style attributes of the element"""
+ style = parent_style.copy()
+ if 'class' in attrs:
+ for css_class in attrs['class'].split():
+ css_style = style_def['.' + css_class]
+ style.update(css_style)
+ if 'style' in attrs:
+ immediate_style = dumb_property_dict(attrs['style'])
+ style.update(immediate_style)
+ return style
+
+def google_list_style(style):
+ """finds out whether this is an ordered or unordered list"""
+ if 'list-style-type' in style:
+ list_style = style['list-style-type']
+ if list_style in ['disc', 'circle', 'square', 'none']:
+ return 'ul'
+ return 'ol'
+
+def google_nest_count(style):
+ """calculate the nesting count of google doc lists"""
+ nest_count = 0
+ if 'margin-left' in style:
+ nest_count = int(style['margin-left'][:-2]) / GOOGLE_LIST_INDENT
+ return nest_count
+
+def google_has_height(style):
+ """check if the style of the element has the 'height' attribute explicitly defined"""
+ if 'height' in style:
+ return True
+ return False
+
+def google_text_emphasis(style):
+ """return a list of all emphasis modifiers of the element"""
+ emphasis = []
+ if 'text-decoration' in style:
+ emphasis.append(style['text-decoration'])
+ if 'font-style' in style:
+ emphasis.append(style['font-style'])
+ if 'font-weight' in style:
+ emphasis.append(style['font-weight'])
+ return emphasis
+
+def google_fixed_width_font(style):
+ """check if the css of the current element defines a fixed width font"""
+ font_family = ''
+ if 'font-family' in style:
+ font_family = style['font-family']
+ if 'Courier New' == font_family or 'Consolas' == font_family:
+ return True
+ return False
+
+def list_numbering_start(attrs):
+ """extract numbering from list element attributes"""
+ if 'start' in attrs:
+ return int(attrs['start']) - 1
+ else:
+ return 0
+
+class _html2text(HTMLParser.HTMLParser):
+ def __init__(self, out=None, baseurl=''):
+ HTMLParser.HTMLParser.__init__(self)
+
+ if out is None: self.out = self.outtextf
+ else: self.out = out
+ self.outtextlist = [] # empty list to store output characters before they are "joined"
+ try:
+ self.outtext = unicode()
+ except NameError: # Python3
+ self.outtext = str()
+ self.quiet = 0
+ self.p_p = 0 # number of newline character to print before next output
+ self.outcount = 0
+ self.start = 1
+ self.space = 0
+ self.a = []
+ self.astack = []
+ self.acount = 0
+ self.list = []
+ self.blockquote = 0
+ self.pre = 0
+ self.startpre = 0
+ self.code = False
+ self.br_toggle = ''
+ self.lastWasNL = 0
+ self.lastWasList = False
+ self.style = 0
+ self.style_def = {}
+ self.tag_stack = []
+ self.emphasis = 0
+ self.drop_white_space = 0
+ self.inheader = False
+ self.abbr_title = None # current abbreviation definition
+ self.abbr_data = None # last inner HTML (for abbr being defined)
+ self.abbr_list = {} # stack of abbreviations to write later
+ self.baseurl = baseurl
+
+ if options.google_doc:
+ del unifiable_n[name2cp('nbsp')]
+ unifiable['nbsp'] = '&nbsp_place_holder;'
+
+ def feed(self, data):
+ data = data.replace("</' + 'script>", "</ignore>")
+ HTMLParser.HTMLParser.feed(self, data)
+
+ def outtextf(self, s):
+ self.outtextlist.append(s)
+ if s: self.lastWasNL = s[-1] == '\n'
+
+ def close(self):
+ HTMLParser.HTMLParser.close(self)
+
+ self.pbr()
+ self.o('', 0, 'end')
+
+ self.outtext = self.outtext.join(self.outtextlist)
+
+ if options.google_doc:
+ self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ');
+
+ return self.outtext
+
+ def handle_charref(self, c):
+ self.o(charref(c), 1)
+
+ def handle_entityref(self, c):
+ self.o(entityref(c), 1)
+
+ def handle_starttag(self, tag, attrs):
+ self.handle_tag(tag, attrs, 1)
+
+ def handle_endtag(self, tag):
+ self.handle_tag(tag, None, 0)
+
+ def previousIndex(self, attrs):
+ """ returns the index of certain set of attributes (of a link) in the
+ self.a list
+
+ If the set of attributes is not found, returns None
+ """
+ if not has_key(attrs, 'href'): return None
+
+ i = -1
+ for a in self.a:
+ i += 1
+ match = 0
+
+ if has_key(a, 'href') and a['href'] == attrs['href']:
+ if has_key(a, 'title') or has_key(attrs, 'title'):
+ if (has_key(a, 'title') and has_key(attrs, 'title') and
+ a['title'] == attrs['title']):
+ match = True
+ else:
+ match = True
+
+ if match: return i
+
+ def drop_last(self, nLetters):
+ if not self.quiet:
+ self.outtext = self.outtext[:-nLetters]
+
+ def handle_emphasis(self, start, tag_style, parent_style):
+ """handles various text emphases"""
+ tag_emphasis = google_text_emphasis(tag_style)
+ parent_emphasis = google_text_emphasis(parent_style)
+
+ # handle Google's text emphasis
+ strikethrough = 'line-through' in tag_emphasis and options.hide_strikethrough
+ bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
+ italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
+ fixed = google_fixed_width_font(tag_style) and not \
+ google_fixed_width_font(parent_style) and not self.pre
+
+ if start:
+ # crossed-out text must be handled before other attributes
+ # in order not to output qualifiers unnecessarily
+ if bold or italic or fixed:
+ self.emphasis += 1
+ if strikethrough:
+ self.quiet += 1
+ if italic:
+ self.o("_")
+ self.drop_white_space += 1
+ if bold:
+ self.o("**")
+ self.drop_white_space += 1
+ if fixed:
+ self.o('`')
+ self.drop_white_space += 1
+ self.code = True
+ else:
+ if bold or italic or fixed:
+ # there must not be whitespace before closing emphasis mark
+ self.emphasis -= 1
+ self.space = 0
+ self.outtext = self.outtext.rstrip()
+ if fixed:
+ if self.drop_white_space:
+ # empty emphasis, drop it
+ self.drop_last(1)
+ self.drop_white_space -= 1
+ else:
+ self.o('`')
+ self.code = False
+ if bold:
+ if self.drop_white_space:
+ # empty emphasis, drop it
+ self.drop_last(2)
+ self.drop_white_space -= 1
+ else:
+ self.o("**")
+ if italic:
+ if self.drop_white_space:
+ # empty emphasis, drop it
+ self.drop_last(1)
+ self.drop_white_space -= 1
+ else:
+ self.o("_")
+ # space is only allowed after *all* emphasis marks
+ if (bold or italic) and not self.emphasis:
+ self.o(" ")
+ if strikethrough:
+ self.quiet -= 1
+
+ def handle_tag(self, tag, attrs, start):
+ if attrs is None:
+ attrs = {}
+ else:
+ attrs = dict(attrs)
+
+ if self.pre and not tag == "pre":
+ if start:
+ attrs_string = ""
+ for k, v in attrs.items():
+ attrs_string += " " + k + "=\"" + v + "\""
+ self.o("<"+tag+attrs_string+">")
+ else:
+ self.o("</"+tag+">")
+ return None
+
+ if options.google_doc:
+ # the attrs parameter is empty for a closing tag. in addition, we
+ # need the attributes of the parent nodes in order to get a
+ # complete style description for the current element. we assume
+ # that google docs export well formed html.
+ parent_style = {}
+ if start:
+ if self.tag_stack:
+ parent_style = self.tag_stack[-1][2]
+ tag_style = element_style(attrs, self.style_def, parent_style)
+ self.tag_stack.append((tag, attrs, tag_style))
+ else:
+ dummy, attrs, tag_style = self.tag_stack.pop()
+ if self.tag_stack:
+ parent_style = self.tag_stack[-1][2]
+
+ if hn(tag):
+ self.p()
+ if start:
+ self.inheader = True
+ self.o(hn(tag)*"#" + ' ')
+ else:
+ self.inheader = False
+ return # prevent redundant emphasis marks on headers
+
+ if tag in ['p', 'div']:
+ if options.google_doc:
+ if start and google_has_height(tag_style):
+ self.p()
+ else:
+ self.soft_br()
+ else:
+ self.p()
+
+ if tag == "br" and start: self.o(" \n")
+
+ if tag == "hr" and start:
+ self.p()
+ self.o("* * *")
+ self.p()
+
+ if tag in ["head", "style", 'script']:
+ if start: self.quiet += 1
+ else: self.quiet -= 1
+
+ if tag == "style":
+ if start: self.style += 1
+ else: self.style -= 1
+
+ if tag in ["body"]:
+ self.quiet = 0 # sites like 9rules.com never close <head>
+
+ if tag == "blockquote":
+ if start:
+ self.p(); self.o('> ', 0, 1); self.start = 1
+ self.blockquote += 1
+ else:
+ self.blockquote -= 1
+ self.p()
+
+ if tag in ['em', 'i', 'u']: self.o("_")
+ if tag in ['strong', 'b']: self.o("**")
+ if tag in ['del', 'strike']:
+ if start:
+ self.o("<"+tag+">")
+ else:
+ self.o("</"+tag+">")
+
+ if options.google_doc:
+ if not self.inheader:
+ # handle some font attributes, but leave headers clean
+ self.handle_emphasis(start, tag_style, parent_style)
+
+ if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
+ if tag == "abbr":
+ if start:
+ self.abbr_title = None
+ self.abbr_data = ''
+ if has_key(attrs, 'title'):
+ self.abbr_title = attrs['title']
+ else:
+ if self.abbr_title != None:
+ self.abbr_list[self.abbr_data] = self.abbr_title
+ self.abbr_title = None
+ self.abbr_data = ''
+
+ if tag == "a" and not IGNORE_ANCHORS:
+ if start:
+ if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
+ self.astack.append(attrs)
+ self.o("[")
+ else:
+ self.astack.append(None)
+ else:
+ if self.astack:
+ a = self.astack.pop()
+ if a:
+ if INLINE_LINKS:
+ self.o("](" + a['href'] + ")")
+ else:
+ i = self.previousIndex(a)
+ if i is not None:
+ a = self.a[i]
+ else:
+ self.acount += 1
+ a['count'] = self.acount
+ a['outcount'] = self.outcount
+ self.a.append(a)
+ self.o("][" + str(a['count']) + "]")
+
+ if tag == "img" and start and not IGNORE_IMAGES:
+ if has_key(attrs, 'src'):
+ attrs['href'] = attrs['src']
+ alt = attrs.get('alt', '')
+ if INLINE_LINKS:
+ self.o("![")
+ self.o(alt)
+ self.o("]("+ attrs['href'] +")")
+ else:
+ i = self.previousIndex(attrs)
+ if i is not None:
+ attrs = self.a[i]
+ else:
+ self.acount += 1
+ attrs['count'] = self.acount
+ attrs['outcount'] = self.outcount
+ self.a.append(attrs)
+ self.o("![")
+ self.o(alt)
+ self.o("]["+ str(attrs['count']) +"]")
+
+ if tag == 'dl' and start: self.p()
+ if tag == 'dt' and not start: self.pbr()
+ if tag == 'dd' and start: self.o(' ')
+ if tag == 'dd' and not start: self.pbr()
+
+ if tag in ["ol", "ul"]:
+ # Google Docs create sub lists as top level lists
+ if (not self.list) and (not self.lastWasList):
+ self.p()
+ if start:
+ if options.google_doc:
+ list_style = google_list_style(tag_style)
+ else:
+ list_style = tag
+ numbering_start = list_numbering_start(attrs)
+ self.list.append({'name':list_style, 'num':numbering_start})
+ else:
+ if self.list: self.list.pop()
+ self.lastWasList = True
+ else:
+ self.lastWasList = False
+
+ if tag == 'li':
+ self.pbr()
+ if start:
+ if self.list: li = self.list[-1]
+ else: li = {'name':'ul', 'num':0}
+ if options.google_doc:
+ nest_count = google_nest_count(tag_style)
+ else:
+ nest_count = len(self.list)
+ self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
+ if li['name'] == "ul": self.o(options.ul_item_mark + " ")
+ elif li['name'] == "ol":
+ li['num'] += 1
+ self.o(str(li['num'])+". ")
+ self.start = 1
+
+ if tag in ["table", "tr"] and start: self.p()
+ if tag == 'td': self.pbr()
+
+ if tag == "pre":
+ if start:
+ self.startpre = 1
+ self.pre = 1
+ else:
+ self.pre = 0
+ self.p()
+
+ def pbr(self):
+ if self.p_p == 0: self.p_p = 1
+
+ def p(self): self.p_p = 2
+
+ def soft_br(self):
+ self.pbr()
+ self.br_toggle = ' '
+
+ def o(self, data, puredata=0, force=0):
+ if self.abbr_data is not None: self.abbr_data += data
+
+ if not self.quiet:
+ if options.google_doc:
+ # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
+ lstripped_data = data.lstrip()
+ if self.drop_white_space and not (self.pre or self.code):
+ data = lstripped_data
+ if lstripped_data != '':
+ self.drop_white_space = 0
+
+ if puredata and not self.pre:
+# data = re.sub('\s+', ' ', data)
+ if data and data[0] == ' ':
+ self.space = 1
+ data = data[1:]
+ if not data and not force: return
+
+ if self.startpre:
+ #self.out(" :") #TODO: not output when already one there
+ self.startpre = 0
+
+ bq = (">" * self.blockquote)
+ if not (force and data and data[0] == ">") and self.blockquote: bq += " "
+
+ if self.pre:
+ bq += " "
+ data = data.replace("\n", "\n"+bq)
+
+ if self.start:
+ self.space = 0
+ self.p_p = 0
+ self.start = 0
+
+ if force == 'end':
+ # It's the end.
+ self.p_p = 0
+ self.out("\n")
+ self.space = 0
+
+ if self.p_p:
+ self.out((self.br_toggle+'\n'+bq)*self.p_p)
+ self.space = 0
+ self.br_toggle = ''
+
+ if self.space:
+ if not self.lastWasNL: self.out(' ')
+ self.space = 0
+
+ if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
+ if force == "end": self.out("\n")
+
+ newa = []
+ for link in self.a:
+ if self.outcount > link['outcount']:
+ self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
+ if has_key(link, 'title'): self.out(" ("+link['title']+")")
+ self.out("\n")
+ else:
+ newa.append(link)
+
+ if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
+
+ self.a = newa
+
+ if self.abbr_list and force == "end":
+ for abbr, definition in self.abbr_list.items():
+ self.out(" *[" + abbr + "]: " + definition + "\n")
+
+ self.p_p = 0
+ self.out(data)
+ self.outcount += 1
+
+ def handle_data(self, data):
+ if r'\/script>' in data: self.quiet -= 1
+
+ if self.style:
+ self.style_def.update(dumb_css_parser(data))
+
+ self.o(data, 1)
+
+ def unknown_decl(self, data): pass
+
+def wrapwrite(text):
+ text = text.encode('utf-8')
+ try: #Python3
+ sys.stdout.buffer.write(text)
+ except AttributeError:
+ sys.stdout.write(text)
+
+def html2text_file(html, out=wrapwrite, baseurl=''):
+ h = _html2text(out, baseurl)
+ h.feed(html)
+ h.feed("")
+ return h.close()
+
+def html2text(html, baseurl=''):
+ return optwrap(html2text_file(html, None, baseurl))
+
+class Storage: pass
+options = Storage()
+options.google_doc = False
+options.ul_item_mark = '*'
+
+if __name__ == "__main__":
+ baseurl = ''
+
+ p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
+ version='%prog ' + __version__)
+ p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
+ default=False, help="convert an html-exported Google Document")
+ p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
+ default=False, help="use a dash rather than a star for unordered list items")
+ p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
+ default=78, help="number of characters per output line, 0 for no wrap")
+ p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
+ default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
+ p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
+ default=False, help="hide strike-through text. only relevent when -g is specified as well")
+ (options, args) = p.parse_args()
+
+ # handle options
+ if options.ul_style_dash:
+ options.ul_item_mark = '-'
+ else:
+ options.ul_item_mark = '*'
+
+ BODY_WIDTH = options.body_width
+ GOOGLE_LIST_INDENT = options.list_indent
+
+ # process input
+ if len(args) > 0:
+ file_ = args[0]
+ encoding = None
+ if len(args) == 2:
+ encoding = args[1]
+ if len(args) > 2:
+ p.error('Too many arguments')
+
+ if file_.startswith('http://') or file_.startswith('https://'):
+ baseurl = file_
+ j = urllib.urlopen(baseurl)
+ text = j.read()
+ if encoding is None:
+ try:
+ from feedparser import _getCharacterEncoding as enc
+ except ImportError:
+ enc = lambda x, y: ('utf-8', 1)
+ encoding = enc(j.headers, text)[0]
+ if encoding == 'us-ascii':
+ encoding = 'utf-8'
+ data = text.decode(encoding)
+
+ else:
+ data = open(file_, 'rb').read()
+ if encoding is None:
+ try:
+ from chardet import detect
+ except ImportError:
+ detect = lambda x: {'encoding': 'utf-8'}
+ encoding = detect(data)['encoding']
+ data = data.decode(encoding)
+ else:
+ data = sys.stdin.read()
+ wrapwrite(html2text(data, baseurl))

No commit comments for this range

Something went wrong with that request. Please try again.