From bac52476b361c92672866eb3b55ad81664a8a781 Mon Sep 17 00:00:00 2001
From: thequbit
Date: Tue, 20 Jan 2015 16:14:28 -0500
Subject: [PATCH] added json output dump to cli tool

---
 barking_owl/scraper/cli-scraper.py | 54 +++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/barking_owl/scraper/cli-scraper.py b/barking_owl/scraper/cli-scraper.py
index a972db1..80fc285 100644
--- a/barking_owl/scraper/cli-scraper.py
+++ b/barking_owl/scraper/cli-scraper.py
@@ -1,3 +1,4 @@
+import json
 import datetime
 from optparse import OptionParser
 from scraper import Scraper
@@ -7,8 +8,6 @@ def print_doc_info(_data, document_url):
 
 if __name__ == '__main__':
 
-    print " -- CLI BarkingOwl Scraper -- "
-
     parser = OptionParser()
     parser.add_option("-u", "--target-url", dest="target_url",
@@ -20,11 +19,24 @@ def print_doc_info(_data, document_url):
     parser.add_option("-l", "--max-link-level", dest="max_link_level",
         help="Maximum links to follow.", metavar="MAXLEVEL")
 
+    parser.add_option("-j", "--json-output", action="store_true",
+        dest="json_output", help="Produce Pretty JSON output.",
+        default=False)
+
     (options, args) = parser.parse_args()
 
     if not options.target_url == '' and not options.target_url == None and \
         not options.doc_type == '' and not options.doc_type == None and \
-        not options.max_link_level == '' and not options.max_link_level == None:
+        not options.max_link_level == '' and \
+        not options.max_link_level == None and \
+        not options.json_output == '' and not options.json_output == None:
+
+        DEBUG = False
+        if options.json_output == False:
+            DEBUG = True
+
+        if DEBUG == True:
+            print " -- CLI BarkingOwl Scraper -- "
 
         url = {
             'target_url': options.target_url,
@@ -39,23 +51,39 @@ def print_doc_info(_data, document_url):
             ],
         }
 
-        try:
+        #try:
+        if True:
             scraper = Scraper()
             scraper.set_callbacks(
                 found_doc_callback = print_doc_info,
             )
             scraper.set_url_data(url)
-            print "\nStarting Scraper on {0} ...\n\n".format(options.target_url)
+            if DEBUG == True:
+                print "\nStarting Scraper on {0} ...\n\n".format(options.target_url)
             data = scraper.start()
-            print "\n\nScraper complete.\n"
+            if DEBUG == True:
+                print "\n\nScraper complete.\n"
 
-            print "BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
-                len(data['documents']),
-                options.target_url,
-            )
-        except:
-            print "Yikes! An error occured while the scraper was running. Exiting."
-            pass
+            if DEBUG == True:
+                print "BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
+                    len(data['documents']),
+                    options.target_url,
+                )
+
+            if options.json_output == True:
+                data = scraper._data
+                for key in data:
+                    if isinstance(data[key], datetime.datetime) or \
+                       isinstance(data[key], datetime.timedelta):
+                        data[key] = str(data[key])
+                print json.dumps(scraper._data, sort_keys=True,
+                    indent=4, separators=(',', ': '))
+
+        #except:
+        #    if DEBUG == True:
+        #        print "Yikes! An error occured while the scraper was running. Exiting."
+        #    else:
+        #        print '{"error_text": " An error occured while the scraper was running."}'
     else:
         print "Error: missing CLI arguments. Try -h for help."