Skip to content

Commit

Permalink
added json output dump to cli tool
Browse files: browse the repository at this point in the history
  • Loading branch information
thequbit committed Jan 20, 2015
1 parent 270854a commit bac5247
Showing 1 changed file with 41 additions and 13 deletions.
54 changes: 41 additions & 13 deletions barking_owl/scraper/cli-scraper.py
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
import json
import datetime
from optparse import OptionParser
from scraper import Scraper
Expand All @@ -7,8 +8,6 @@ def print_doc_info(_data, document_url):

if __name__ == '__main__':

print " -- CLI BarkingOwl Scraper -- "

parser = OptionParser()

parser.add_option("-u", "--target-url", dest="target_url",
Expand All @@ -20,11 +19,24 @@ def print_doc_info(_data, document_url):
parser.add_option("-l", "--max-link-level", dest="max_link_level",
help="Maximum links to follow.", metavar="MAXLEVEL")

parser.add_option("-j", "--json-output", action="store_true",
dest="json_output", help="Produce Pretty JSON output.",
default=False)

(options, args) = parser.parse_args()

if not options.target_url == '' and not options.target_url == None and \
not options.doc_type == '' and not options.doc_type == None and \
not options.max_link_level == '' and not options.max_link_level == None:
not options.max_link_level == '' and \
not options.max_link_level == None and \
not options.json_output == '' and not options.json_output == None:

DEBUG = False
if options.json_output == False:
DEBUG = True

if DEBUG == True:
print " -- CLI BarkingOwl Scraper -- "

url = {
'target_url': options.target_url,
Expand All @@ -39,23 +51,39 @@ def print_doc_info(_data, document_url):
],
}

try:
#try:
if True:
scraper = Scraper()
scraper.set_callbacks(
found_doc_callback = print_doc_info,
)
scraper.set_url_data(url)

print "\nStarting Scraper on {0} ...\n\n".format(options.target_url)
if DEBUG == True:
print "\nStarting Scraper on {0} ...\n\n".format(options.target_url)
data = scraper.start()
print "\n\nScraper complete.\n"
if DEBUG == True:
print "\n\nScraper complete.\n"

print "BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
len(data['documents']),
options.target_url,
)
except:
print "Yikes! An error occured while the scraper was running. Exiting."
pass
if DEBUG == True:
print "BarkingOwl Scraper found {0} documents on {1}.\n\n".format(
len(data['documents']),
options.target_url,
)

if options.json_output == True:
data = scraper._data
for key in data:
if isinstance(data[key], datetime.datetime) or \
isinstance(data[key], datetime.timedelta):
data[key] = str(data[key])
print json.dumps(scraper._data, sort_keys=True,
indent=4, separators=(',', ': '))

#except:
# if DEBUG == True:
# print "Yikes! An error occured while the scraper was running. Exiting."
# else:
# print '{"error_text": " An error occured while the scraper was running."}'
else:
print "Error: missing CLI arguments. Try -h for help."

0 comments on commit bac5247

Please sign in to comment.