Skip to content

Commit

Permalink
First attempt at a Bing Web Search API module
Browse files Browse the repository at this point in the history
  • Loading branch information
jgor committed Oct 16, 2018
1 parent 236c81d commit 870eb65
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ Options:
* cc_py_dir - cc.py base directory containing the file cc.py (default: tools/cc.py/)
* year - limit results to data sets from given year (17 or 18, defaults to all)

### bing_api ###
Search for targets via the Bing Web Search API.

Requirements: none

Options:
* **key** - API key
* **query** - search query

### stdin ###
Read targets from standard input, one per line.

Expand Down
64 changes: 64 additions & 0 deletions indexers/bing_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from __future__ import print_function
try:
from urllib.request import Request, urlopen
from urllib.parse import urlencode,urlparse
from urllib.error import HTTPError
except ImportError:
from urllib import urlencode
from urllib2 import Request, urlopen, HTTPError
from urlparse import urlparse
import json
import sys
import time

def run(args):
    """Validate the module options and return the Bing search results.

    Args:
        args: Mapping of option names to values. The "key" and "query"
            options are required and must be non-empty.

    Returns:
        Whatever ``get_results(args)`` returns.

    Prints an error to stderr and exits the process with status 1 when a
    required option is missing or empty.
    """
    for name in ("key", "query"):
        # An empty or None value is as unusable as a missing one, so reject
        # it here instead of sending a request that cannot succeed.
        if name not in args or not args[name]:
            print("ERROR: %s must be set" % name, file=sys.stderr)
            sys.exit(1)

    return get_results(args)

def get_results(args):
    """Collect search results for ``args["query"]`` by paging through the API.

    Pages of 50 results are requested starting at offset 0. Paging stops at
    the first empty page, or once the offset reaches 1000, whichever comes
    first.

    Args:
        args: Mapping providing the "query" string and the API "key".

    Returns:
        A flat list of every item returned by ``issue_request`` across pages.
    """
    query = args["query"]
    collected = []

    for offset in range(0, 1000, 50):
        page = issue_request({"q": query, "count": 50, "offset": offset},
                             args["key"])
        if not page:
            # An empty page means there is nothing further to fetch.
            break
        collected.extend(page)

    return collected

def issue_request(data, key):
    """Issue one Bing Web Search API request and return the parsed result URLs.

    Args:
        data: Mapping of query-string parameters; it is URL-encoded as-is.
            Its "offset" entry is also compared against the estimated total
            number of matches to detect paging past the end.
        key: Value sent in the "Ocp-Apim-Subscription-Key" request header.

    Returns:
        A list with one ``urlparse`` result per entry in the response's
        ``webPages.value``. The list is empty when the response has no
        "webPages" section, when the estimated total is below the requested
        offset, or when the API answers with an HTTP error other than 429.
    """
    url = "https://api.cognitive.microsoft.com/bing/v7.0/search?" + urlencode(data)
    request = Request(url)
    request.add_header("Ocp-Apim-Subscription-Key", key)

    while True:
        try:
            handle = urlopen(request)
        except HTTPError as e:
            if e.code == 429:
                # Rate limited: back off briefly, then retry the same request.
                time.sleep(0.5)
                continue
            # Any other HTTP error will not resolve by retrying; looping here
            # would hammer the API forever. Report it and give up on this page.
            body = e.read().decode("utf-8", "replace")
            print("ERROR: Bing API request failed with HTTP %d: %s"
                  % (e.code, body), file=sys.stderr)
            return []

        try:
            response = json.loads(handle.read().decode("utf-8"))
        finally:
            # Always release the connection, even if decoding/parsing fails.
            handle.close()
        break

    if "webPages" not in response or response["webPages"]["totalEstimatedMatches"] < data["offset"]:
        return []

    # NOTE(review): the URL is encoded to UTF-8 before parsing, so on Python 3
    # urlparse receives bytes and returns a bytes-based result. Kept as-is to
    # preserve the return type; confirm that callers expect this.
    return [urlparse(item["url"].encode("utf-8"))
            for item in response["webPages"]["value"]]

0 comments on commit 870eb65

Please sign in to comment.