#!/usr/bin/env python
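"""Replay a sample of logged search requests against a MediaWiki Special:Search endpoint.

Reads tab separated log lines from stdin whose second, third, and fourth
fields are a timestamp, a wiki identifier, and a URL encoded search term.
A configurable percentage of the lines is kept, paced to match the spacing
of the logged timestamps, and fetched by a pool of daemon worker processes.
"""
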
import sys
import random
import urllib
import urllib2
from multiprocessing import Process, Queue
from Queue import Full
import time
import calendar
def send_line(search, destination):
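    """Fetch one search from the destination and print how long it took."""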
    # Since requests are logged with one second timestamp resolution, assume
    # each one arrived at some random point within its second.
    time.sleep(random.uniform(0, 1))
    start = time.time()
    params = "fulltext=Search&srbackend=CirrusSearch"
    url = "%s/%s?%s" % (destination, search, params)
    urllib2.urlopen(url)
    print "Fetched ({:07.3f}) {}".format(time.time() - start, url)


def hostname(wiki):
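    """Guess the hostname to query from the wiki identifier in the log line."""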
    wiki = wiki.split(":")[1]
    if wiki == "commonswiki":
        return "commons.wikimedia.org"
    if wiki[2:] == "wiki":
        return wiki[0:2] + ".wikipedia.org"
    # Not perfect but decent: "frwiktionary" becomes "fr.wiktionary.org".
    return wiki[0:2] + "." + wiki[2:] + ".org"


def send_lines(percent, jobs, destination):
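    """Replay a sample of the logged searches read from stdin.

    Keeps roughly `percent` percent of the lines, paces them to match the
    spacing of the logged timestamps, and hands them to `jobs` daemon
    worker processes to fetch from `destination`.
    """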
    queue = Queue(jobs)  # Only allow a backlog of one per job
    # Spawn jobs. Note that we just spawn them as daemons because we don't
    # want to bother signaling them when the main process is done and we don't
    # care if they die when it finishes either. In fact, we'd love for them
    # to die immediately because we want to stop sending requests when the
    # main process stops.

    def work(queue):
        """Pull (host, search) pairs off the queue and fetch each one."""
        while True:
            try:
                (host, search) = queue.get()
                if "%s" in destination:
                    resolved_destination = destination % host
                else:
                    resolved_destination = destination
                if host == "commons.wikimedia.org":
                    search = "File:" + search
                send_line(search, resolved_destination)
            except (KeyboardInterrupt, SystemExit):
                break
            except:
                # Ignore failed fetches and keep draining the queue.
                continue

    for i in range(jobs):
        p = Process(target=work, args=(queue,))
        p.daemon = True
        p.start()

    # Have to read stdin line by line, even on old Pythons....
    line = sys.stdin.readline()
    target_lag = None
    while line:
        if random.uniform(0, 100) > percent:
            line = sys.stdin.readline()
            continue
        s = line.strip().split("\t")
        # Timestamps look like 2014-01-01T12:34:56Z; parse them as UTC.
        target_time = calendar.timegm(
            time.strptime(s[1][:-1] + "UTC", "%Y-%m-%dT%H:%M:%S%Z"))
        lag = time.time() - target_time
        if target_lag is None:
            target_lag = time.time() - target_time
        wait_time = target_lag - lag
        if wait_time >= 0:
            print "Sleeping %s to stay %s ahead of the logged time." % \
                (wait_time, target_lag)
            time.sleep(wait_time)
        try:
            queue.put((hostname(s[2]), urllib.unquote(s[3])), False)
        except Full:
            print "Couldn't keep up so dropping the request"
        line = sys.stdin.readline()


if __name__ == "__main__":
    from optparse import OptionParser
    parser = OptionParser(usage="usage: %prog [options]")
    parser.add_option("-p", dest="percent", type="int", default=1, metavar="N",
                      help="send this percent of the logged searches")
    parser.add_option("-j", "--jobs", type="int", default=1, metavar="JOBS",
                      help="number of processes used to send searches")
    parser.add_option("-d", "--destination", dest="destination", type="string",
                      metavar="DESTINATION",
                      default="http://127.0.0.1:8080/wiki/Special:Search",
                      help="Where to send the searches. Include %s to "
                           "substitute the hostname from the log line.")
    (options, args) = parser.parse_args()
    try:
        send_lines(options.percent, options.jobs, options.destination)
    except KeyboardInterrupt:
        pass  # This is how we expect to exit anyway
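
# Illustrative invocation (the script and log file names here are placeholders,
# not names taken from this repository):
#   cat searches.log | python replay_searches.py -p 5 -j 4 -d "http://%s/wiki/Special:Search"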