Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2858b0d
commit 15b4cff
Showing
1 changed file
with
94 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#!/usr/bin/env python | ||
# | ||
# safebro_lookup.py | ||
# Dave Lassalle (@superponible) | ||
# | ||
# This script can either crawl a website from a starting URL or accept a file | ||
# containing one URL per line, and submit the crawled or provided URLs | ||
# to Google's Safe Browsing Lookup API and provide the results. | ||
# | ||
# A Google API key is needed from: | ||
# https://console.developers.google.com/project | ||
# A guide on the API is available at: | ||
# https://developers.google.com/safe-browsing/lookup_guide | ||
# | ||
# basic crawler modified from https://pypi.python.org/creepy | ||
# | ||
|
||
from creepy import Crawler | ||
import urllib | ||
import urllib2 | ||
import sys | ||
import argparse | ||
|
||
API_KEY = '' | ||
|
||
def print_result(result, url): | ||
print '-{}- {}'.format(result, url) | ||
|
||
def sb_lookup(url): | ||
global API_KEY | ||
url = urllib.quote_plus(url) | ||
result = "" | ||
try: | ||
result = urllib2.urlopen('https://sb-ssl.google.com/safebrowsing/api/lookup?client=safe-browsing&key=' + API_KEY + '&appver=1.0&pver=3.1&url=' + url) | ||
except urllib2.HTTPError, e: | ||
if e.code == 400: | ||
print e | ||
print "Bad Request - The HTTP request was not correctly formed." | ||
elif e.code == 401 or e.code == 403: | ||
print e | ||
print "Not Authorized - The API key is not authorized." | ||
elif e.code == 503: | ||
print e | ||
print 'Service Unavailable - The server cannot handle the request. Besides the normal server failures, this could also indicate that the client has been "throttled" for sending too many requests.' | ||
else: | ||
print "Unknown error: ", e | ||
exit(1) | ||
status = result.getcode() | ||
if status == 200: | ||
result = result.read() | ||
if status == 204: | ||
result = "none" | ||
return result | ||
|
||
class MyCrawler(Crawler): | ||
def process_document(self, doc): | ||
result = sb_lookup(doc.url) | ||
print_result(result, doc.url) | ||
|
||
def cliargs(): | ||
'''Parse CLI args''' | ||
global API_KEY | ||
parser = argparse.ArgumentParser(description="safebro_lookup.py -- Google Safe Browsing Lookup tool") | ||
parser.add_argument('-u', '--url', required=False, action='store', dest='start_url', help='Base URL for crawler (will add http:// if not given)') | ||
parser.add_argument('-f', '--file', required=False, action='store', dest='infile', help='File containing URLs to lookup') | ||
parser.add_argument('-a', '--api_key', required=False, action='store', dest='api_key', help='Specify/override API_KEY hardcoded in the script') | ||
args = parser.parse_args() | ||
if not (args.start_url or args.infile): | ||
parser.error('Specify at either a base URL, a file of URLs, or both') | ||
if args.api_key: | ||
API_KEY = args.api_key | ||
elif not API_KEY: | ||
parser.error('Google API Key must be specified in the script or provided with -a') | ||
return args | ||
|
||
def main(argv): | ||
args = cliargs() | ||
if args.infile: | ||
f = open(args.infile, 'r') | ||
lines = f.readlines() | ||
for line in lines: | ||
url = line.strip('\n') | ||
result = sb_lookup(url) | ||
print_result(result, url) | ||
if args.start_url: | ||
if not (args.start_url.startswith('http://') or args.start_url.startswith('https://')): | ||
args.start_url = 'http://' + args.start_url | ||
crawler = MyCrawler() | ||
crawler.set_follow_mode(Crawler.F_SAME_HOST) | ||
crawler.add_url_filter('\.(jpg|jpeg|gif|png|js|css|swf)$') | ||
crawler.crawl(args.start_url) | ||
|
||
if __name__ == '__main__': | ||
main(sys.argv[1:]) |