Allow POST requests, and passing arbitrary opts to curl.setopt #4

Merged
merged 1 commit on May 25, 2016
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
fetcher
=======

pycurl wrapper
concurrent pycurl wrapper
28 changes: 28 additions & 0 deletions example.py
@@ -0,0 +1,28 @@
import sys
import time

from fetcher import fetch


def get_requests(count, url):
print 'GETting %s from %s' % (count, url)
return ((url, 'request-%s' % i) for i in range(count))


def post_requests(count, url):
print 'POSTing %s to %s' % (count, url)
return ((url, i, 'request=%s' % i) for i in range(count))


def make_requests(requests):
start = time.time()
for ok, resp in fetch(requests, concurrent=100):
print ok, resp
delta = time.time() - start
print '%.02f req/s' % (count / delta)

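# Usage: python example.py COUNT URL [POST]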
if __name__ == '__main__':
count = int(sys.argv[1])
url = sys.argv[2]
requests_method = post_requests if sys.argv[3:] == ['POST'] else get_requests
sys.exit(make_requests(requests_method(count, url)))
60 changes: 36 additions & 24 deletions fetcher/__init__.py
@@ -1,14 +1,28 @@
import pycurl
import sys
import time

from cStringIO import StringIO


def fetch(requests, concurrent=50, timeout_ms=1000, follow_redirects=True):
def fetch(requests, concurrent=50, timeout_ms=1000, follow_redirects=True,
curlopts=None):
"""
requests argument is a generator with the following structure:

(url, echo_field) - for GET requests
Member: any reason why these are a tuple and not just separate args?

@rupa (Contributor Author, May 24, 2016): because the elements can and will be different for each request. The args to fetch apply to all of the requests.
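A hedged illustration of that point (a hypothetical generator, not part of the PR): each yielded tuple carries its own per-request data, while fetch's keyword arguments apply to the whole batch.

    def mixed_requests(url, count):
        # Per-request elements differ: 2-tuples are GETs, 3-tuples are POSTs.
        for i in range(count):
            if i % 2:
                yield (url, 'req-%s' % i, 'request=%s' % i)
            else:
                yield (url, 'req-%s' % i)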

(url, echo_field, post_data) - for POST requests

curlopts allows arbitrary options to be passed to pycurl.setopt. It is a list
of two-tuples, e.g.:

(pycurl.HTTPHEADER, ['Content-Type: application/javascript'])

responses:
success: (True, (echo_field, server_response))
error: (False, (echo_field, error, effective_URL))
"""
multi = pycurl.CurlMulti()

# Sadly, we need to track of pending curls, or they'll get CG'd and
# Sadly, we need to keep track of pending curls, or they'll get GC'd and
# mysteriously disappear. Don't ask me!
curls = []
num_handles = 0
@@ -18,26 +32,39 @@ def fetch(requests, concurrent=50, timeout_ms=1000, follow_redirects=True):
# If the concurrency cap hasn't been reached yet, another request can be
# pulled off and added to the multi.
if unscheduled_reqs and num_handles < concurrent:

try:
url, payload = requests.next()
request = requests.next()
except StopIteration:
unscheduled_reqs = False
continue

if len(request) == 3:
Member: this check would be a lot more readable if they were args.

    if request.post_data:
      # POST
    else:
      # GET

@rupa (Contributor Author): see above - each "multi" contains len(requests) actual requests, each with (presumably) different POST data.

@rupa (Contributor Author): I could add a "method" argument, but the requests arity stuff would still need to happen, so I don't see that helping any ...
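For comparison, a hypothetical sketch of the reviewer's alternative (not what this PR implements): wrapping requests in a namedtuple with a defaulted post_data field turns the arity check into an attribute check.

    from collections import namedtuple

    Request = namedtuple('Request', ['url', 'echo_field', 'post_data'])

    def make_request(url, echo_field, post_data=None):
        # GETs and POSTs share one shape; post_data is simply None for GETs.
        return Request(url, echo_field, post_data)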

url, payload, post_data = request
elif len(request) == 2:
url, payload = request
post_data = None
else:
raise Exception('Bad request: {}'.format(repr(request)))

body = StringIO()

curl = pycurl.Curl()
curl.setopt(pycurl.URL, url)
curl.setopt(pycurl.WRITEFUNCTION, body.write)
curl.setopt(pycurl.TIMEOUT_MS, timeout_ms)
curl.setopt(pycurl.CONNECTTIMEOUT_MS, timeout_ms)
curl.setopt(pycurl.FOLLOWLOCATION, 1 if follow_redirects else 0)
curl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64;' +\
' rv:21.0) Gecko/20100101 Firefox/21.0')

if follow_redirects:
curl.setopt(pycurl.FOLLOWLOCATION, 1)
else:
curl.setopt(pycurl.FOLLOWLOCATION, 0)
# arbitrary options
if curlopts is not None:
for option, value in curlopts:
curl.setopt(option, value)

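# Setting POSTFIELDS switches this handle's method to POST in libcurl.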
if post_data is not None:
curl.setopt(pycurl.POSTFIELDS, post_data)

curl.body = body
curl.payload = payload
@@ -72,18 +99,3 @@ def fetch(requests, concurrent=50, timeout_ms=1000, follow_redirects=True):

if not num_q:
break

def main(count, url):
print 'Getting %s from %s' % (count, url)

requests = ((url, 'req-%s' % i) for i in range(count))
start = time.time()
for ok, resp in fetch(requests, concurrent=100):
print ok, resp
delta = time.time() - start
print '%.02f req/s' % (count / delta)

if __name__ == '__main__':
count = int(sys.argv[1])
url = sys.argv[2]
sys.exit(main(count, url))
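Putting the PR's two additions together, a minimal usage sketch (hypothetical URL and header, assuming fetcher is importable):

    import pycurl

    from fetcher import fetch

    # Ten POST requests: the 3-tuples carry per-request bodies.
    requests = (('http://example.com/', i, 'request=%s' % i) for i in range(10))

    # Extra curl options, applied to every handle.
    opts = [(pycurl.HTTPHEADER, ['Content-Type: application/x-www-form-urlencoded'])]

    for ok, resp in fetch(requests, concurrent=10, curlopts=opts):
        print ok, resp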