add commoncrawl indexer, drop dependency on cc.py

jgor committed Jun 24, 2019
1 parent bbb7f3f commit c4709a9a6907df13f124c6f3f3ff2fbb108c5bb6
Showing with 90 additions and 72 deletions.
  1. +5 −6 README.md
  2. +0 −66 dorkbot/indexers/cc_py.py
  3. +85 −0 dorkbot/indexers/commoncrawl.py
@@ -51,7 +51,6 @@ Tools
 * [PhantomJS](http://phantomjs.org/)
 * [Arachni](http://www.arachni-scanner.com/)
 * [Wapiti](http://wapiti.sourceforge.net/)
-* [cc.py](https://github.com/si9int/cc.py)

 As needed, dorkbot will search for tools in the following order:
 * Directory specified via relevant module option
@@ -122,15 +121,15 @@ Options:
 * **query** - search query
 * domain - limit searches to specified domain

-### cc_py ###
-Search for targets within commoncrawl.org results via cc.py.
+### commoncrawl ###
+Search for targets within commoncrawl.org results.

-Requirements: [cc.py](https://github.com/si9int/cc.py)
+Requirements: none

 Options:
 * **domain** - pull all results for given domain or subdomain
-* cc_py_dir - cc.py base directory containing the file cc.py (default: tools/cc.py/)
-* year - limit results to data sets from given year (17 or 18, defaults to all)
+* index - search a specific index, e.g. CC-MAIN-2019-22 (default: latest)
+* filter - query filter to apply to the search

 ### bing_api ###
 Search for targets via Bing Web Search API.

dorkbot/indexers/cc_py.py: this file was deleted.
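
For orientation before the new file: the commoncrawl options documented in the README above translate into a single query against the Common Crawl index API, which the module below builds and paginates. A minimal sketch of the equivalent request (example.com and the CC-MAIN-2019-22 index name are placeholders):

try:
    from urllib.request import urlopen
    from urllib.parse import urlencode
except ImportError:
    from urllib import urlencode
    from urllib2 import urlopen

# Query parameters matching the README options above; the optional
# "filter" option would be passed as an additional "filter" parameter.
params = urlencode({
    "url": "*.example.com",  # domain option: every captured URL under the domain
    "fl": "url",             # return only the url field of each record
    "output": "json",        # one JSON record per line
})
response = urlopen("https://index.commoncrawl.org/CC-MAIN-2019-22-index?" + params)
print(response.read().decode("utf-8"))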

dorkbot/indexers/commoncrawl.py
@@ -0,0 +1,85 @@
from __future__ import print_function
try:
    from urllib.request import urlopen
    from urllib.parse import urlencode, urlparse
    from urllib.error import HTTPError
except ImportError:
    from urllib import urlencode
    from urllib2 import urlopen, HTTPError
    from urlparse import urlparse
import json
import sys

def run(args):
    # Require a target domain; all other options are optional
    required = ["domain"]
    for r in required:
        if r not in args:
            print("ERROR: %s must be set" % r, file=sys.stderr)
            sys.exit(1)

    domain = args["domain"]
    index = args.get("index", get_latest_index())
    url_filter = args.get("filter", "")

    # Build the index query: all URLs under the domain, returning
    # only the url field, one JSON record per line
    data = {}
    data["url"] = "*.%s" % domain
    data["fl"] = "url"
    data["output"] = "json"
    if url_filter:
        data["filter"] = url_filter

    results = get_results(index, data)
    return results

def get_latest_index():
    # collinfo.json lists all crawl indexes, newest first
    url = "https://index.commoncrawl.org/collinfo.json"

    try:
        response_str = urlopen(url)
        response_str = response_str.read().decode("utf-8")
        response = json.loads(response_str)
    except HTTPError as e:
        print("error: %s" % str(e), file=sys.stderr)
        sys.exit(1)

    index = response[0]["id"]
    return index

def get_num_pages(index, data):
    # showNumPages makes the index return the page count for the
    # query instead of the results themselves
    data["showNumPages"] = "true"
    url = "https://index.commoncrawl.org/" + index + "-index?" + urlencode(data)

    try:
        response_str = urlopen(url)
        response_str = response_str.read().decode("utf-8")
        response = json.loads(response_str)
    except HTTPError as e:
        print("error: %s" % str(e), file=sys.stderr)
        sys.exit(1)

    num_pages = response["pages"]
    return num_pages

def get_results(index, data):
    num_pages = get_num_pages(index, data)
    del data["showNumPages"]

    # Fetch each page of results; each line of a response is one JSON record
    results = []
    for page in range(0, num_pages):
        data["page"] = page
        url = "https://index.commoncrawl.org/" + index + "-index?" + urlencode(data)
        try:
            response_str = urlopen(url)
            response_str = response_str.read().decode("utf-8")
            response = response_str.splitlines()
        except HTTPError as e:
            print("error: %s" % str(e), file=sys.stderr)
            sys.exit(1)

        for item in response:
            item_json = json.loads(item)
            url = urlparse(item_json["url"].strip()).geturl()
            results.append(url)

    return results
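
A quick usage sketch of the new module, assuming dorkbot/indexers is importable as a package (a hypothetical direct invocation; in normal use dorkbot loads the indexer by name and passes options parsed from the command line):

from dorkbot.indexers import commoncrawl

# "domain" is required; "index" falls back to the latest crawl via
# collinfo.json, and "filter" is passed through to the index query.
urls = commoncrawl.run({"domain": "example.com"})
for url in urls:
    print(url)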
