
Commit

merge scrapeshell
jamesturk committed Feb 14, 2011
2 parents a971a77 + 8076592 commit 34b00ad
Showing 2 changed files with 52 additions and 12 deletions.
CHANGELOG (12 changes: 11 additions & 1 deletion)
@@ -1,6 +1,16 @@
 scrapelib changelog
 ===================
 
+0.4.3 - 11 February 2011
+------------------------
+* fix retry on certain httplib2 errors
+* add a top-level urlopen function
+
+0.4.2 - 8 February 2011
+-----------------------
+* fix retry on socket errors
+* close temporary file handle
+
 0.4.1 - 7 December 2010
 -----------------------
 * support retry of requests that produce socket timeouts
@@ -23,4 +33,4 @@ scrapelib changelog
 -------------------
 * use_cache_first option to avoid extra HTTP HEAD requests
 * raise_errors option to treat HTTP errors as exceptions
-* addition of urlretrieve
\ No newline at end of file
+* addition of urlretrieve
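The 0.4.3 entry "add a top-level urlopen function" refers to the module-level helper introduced in the scrapelib.py hunk further down, which delegates to a shared Scraper(follow_robots=False, requests_per_minute=0). A minimal usage sketch (the URL is a placeholder and a working network connection is assumed):

import scrapelib

# fetch a page through the module-level default scraper added in 0.4.3
html = scrapelib.urlopen('http://example.com/')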
scrapelib.py (52 changes: 41 additions & 11 deletions)
@@ -21,7 +21,7 @@
 except ImportError:
     USE_HTTPLIB2 = False
 
-__version__ = '0.4.1'
+__version__ = '0.4.3'
 _user_agent = 'scrapelib %s' % __version__
 
 
@@ -378,8 +378,13 @@ def _do_request(self, url, method, body, headers, use_httplib2):
                     # return on a success/redirect/404
                     if resp.status < 400 or resp.status == 404:
                         return resp, content
-                except socket.timeout, e:
+                except socket.error, e:
                     exception_raised = True
+                except AttributeError, e:
+                    if e.message == "'NoneType' object has no attribute 'makefile'":
+                        exception_raised = True
+                    else:
+                        raise
         else:
             try:
                 _log.info("getting %s using urllib2" % url)
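The new AttributeError branch is the "fix retry on certain httplib2 errors" item from the changelog: certain httplib2 connection failures surface as this exact AttributeError rather than as a socket error, so they are now also treated as retryable. A simplified sketch of the pattern (not scrapelib's actual retry loop; fetch and retry_attempts are illustrative names):

import socket

def fetch_with_retries(fetch, retry_attempts=3):
    last_exc = None
    for _ in xrange(retry_attempts + 1):
        try:
            return fetch()
        except socket.error, e:
            last_exc = e
        except AttributeError, e:
            # only the known httplib2 failure mode is retried
            if e.message == "'NoneType' object has no attribute 'makefile'":
                last_exc = e
            else:
                raise
    raise last_exc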
@@ -514,9 +519,11 @@ def urlretrieve(self, url, filename=None, method='GET', body=None):
         result = self.urlopen(url, method, body)
 
         if not filename:
-            _, filename = tempfile.mkstemp()
+            fd, filename = tempfile.mkstemp()
+            f = os.fdopen(fd, 'w')
+        else:
+            f = open(filename, 'w')
 
-        f = open(filename, 'w')
         f.write(result)
         f.close()
 
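This hunk is the "close temporary file handle" fix from 0.4.2: tempfile.mkstemp() returns an already-open OS-level descriptor, and the old code discarded it and open()ed the path a second time, leaking the original handle. Wrapping the descriptor with os.fdopen() reuses and then closes it. A standalone sketch of the pattern:

import os
import tempfile

fd, path = tempfile.mkstemp()
f = os.fdopen(fd, 'w')   # wrap the existing descriptor instead of reopening the path
f.write('data')
f.close()                # closes the descriptor returned by mkstemp
os.remove(path)          # cleanup for this throwaway example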
@@ -541,26 +548,48 @@ def _save_error(self, url, body):
         with open(path, 'w') as fp:
             json.dump(out, fp, ensure_ascii=False)
 
+_default_scraper = Scraper(follow_robots=False, requests_per_minute=0)
+
+def urlopen(url):
+    return _default_scraper.urlopen(url)
 
 def scrapeshell():
     try:
         from IPython.Shell import IPShellEmbed
     except ImportError:
         print 'scrapeshell requires ipython'
         return
+    try:
+        import argparse
+    except ImportError:
+        print 'scrapeshell requires argparse'
+        return
     try:
         import lxml.html
         USE_LXML = True
     except ImportError:
         USE_LXML = False
 
-    scraper = Scraper(follow_robots=False)
+    parser = argparse.ArgumentParser(description='interactive python shell for'
+                                     ' scraping')
+    parser.add_argument('url', help="url to scrape")
+    parser.add_argument('--ua', dest='user_agent', default=_user_agent,
+                        help='user agent to make requests with')
+    parser.add_argument('--robots', dest='robots', action='store_true',
+                        default=False, help='obey robots.txt')
+    parser.add_argument('--noredirect', dest='redirects', action='store_false',
+                        default=True, help="don't follow redirects")
+
+    args = parser.parse_args()
+
-    import sys
-    for arg in sys.argv[1:]:
-        url = arg
-        html = scraper.urlopen(url)
-        if USE_LXML:
-            doc = lxml.html.fromstring(html)
+    scraper = Scraper(user_agent=args.user_agent,
+                      follow_robots=args.robots,
+                      follow_redirects=args.redirects)
+    url = args.url
+    html = scraper.urlopen(args.url)
+
+    if USE_LXML:
+        doc = lxml.html.fromstring(html)
 
     print 'local variables'
     print '---------------'
@@ -569,3 +598,4 @@ def scrapeshell():
     if USE_LXML:
         print 'doc: `lxml HTML element`'
     IPShellEmbed()()
+
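For reference, a hedged sketch of the session the reworked scrapeshell builds from its argparse options (the URL and user agent are placeholder values, and lxml is assumed to be installed):

import lxml.html
from scrapelib import Scraper

scraper = Scraper(user_agent='scrapelib-demo',   # --ua
                  follow_robots=False,           # default; --robots enables robots.txt
                  follow_redirects=True)         # default; --noredirect disables
url = 'http://example.com/'
html = scraper.urlopen(url)
doc = lxml.html.fromstring(html)   # exposed as `doc` in the embedded IPython shell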