Skip to content

Commit

Permalink
Wrote common.html2text.boilerpipe_html2text
Browse files Browse the repository at this point in the history
  • Loading branch information
turian committed Jul 15, 2011
1 parent fab1899 commit a6664c0
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from common.stats import stats
import common.json
import re
import urllib,urllib2

def html2text(html, html2textrc=os.path.expanduser("~/dev/common-scripts/html2text/html2textrc"), forceoutput=True, veryquiet=True):
"""
Expand Down Expand Up @@ -74,11 +75,16 @@ def batch_nclean(htmls, strip_html_output=True, ncleaner=os.path.join(os.environ
shutil.rmtree(outdir, ignore_errors=False, onerror=lambda function, path, excinfo: sys.stderr.write("Could not shutil.rmtree, function=%s, path=%s, excinfo=%s\n" % function, path, excinfo))
return txts

def boilerpipe_html2text(html):
    """
    Extract text from an HTML string using a boilerpipe HTTP service.

    The HTML is UTF-8 encoded and sent, form-urlencoded, to
    http://localhost:8080/boilerpipe-api/extract together with the
    "DefaultExtractor" extractor and "text" output settings. Because a
    data payload is passed to urllib2.urlopen, the request is a POST.

    Returns the response body decoded from UTF-8.

    Any error raised by urllib2.urlopen (e.g. the service not being
    reachable) propagates to the caller.
    """
    values = {"text": html.encode("utf-8"), "extractor": "DefaultExtractor", "output": "text"}
    data = urllib.urlencode(values)
    boilerpipe_response = urllib2.urlopen("http://localhost:8080/boilerpipe-api/extract", data)
    try:
        return boilerpipe_response.read().decode("utf-8")
    finally:
        # Always release the underlying socket, even if read() or decode() fails.
        boilerpipe_response.close()

def boilerpipe_url2text(url):
"""
Use Kohlschuetter Search Intelligence's boilerpipe boilerplate stripper.
"""
import urllib,urllib2
newurl = "http://boilerpipe-web.appspot.com/extract?url=%s+&extractor=ArticleExtractor&output=text" % urllib.quote_plus(url)
# print newurl
f = urllib2.urlopen(newurl)
Expand Down

0 comments on commit a6664c0

Please sign in to comment.