#!/usr/bin/python3
'''
Created on Oct 25, 2015
@author: teaddict

Searches Google for a "site:" query via the (since retired) AJAX Web Search
API and downloads every result URL into a timestamped directory.
'''
import json
import urllib.request, urllib.parse
import os, datetime
import sys
downloadList = []


def search(searchfor):
    query = urllib.parse.urlencode({'q': searchfor})
    # the API returns at most 8 results per request, even with rsz=large
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s&rsz=large' % query
    search_response = urllib.request.urlopen(url)
    search_results = search_response.read().decode("utf8")
    results = json.loads(search_results)
    data = results['responseData']
    total = int(data['cursor']['estimatedResultCount'])
    print('Total results: %d' % total)
    getAllUrls(data)
    # 8 results per page -> number of further pages to fetch
    # (note: the old API reportedly capped retrievable results at ~64,
    # so very large estimates could never be fully paged through)
    pages = int(total / 8)
    for i in range(pages):
        # we already have the first page, so continue with the next ones;
        # start is a zero-based result offset: 8, 16, 24, ...
        url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&%s&start=%d' % (query, (i + 1) * 8)
        search_response = urllib.request.urlopen(url)
        search_results = search_response.read().decode("utf8")
        results = json.loads(search_results)
        data = results['responseData']
        getAllUrls(data)
    # at the end, download everything we collected
    downloadAll()
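
# NOTE: the AJAX Web Search API used above was retired by Google, so the
# requests in search() will no longer succeed. As a rough, untested sketch,
# the same first-page query against the current Custom Search JSON API
# would look roughly like this (YOUR_API_KEY and YOUR_CSE_ID are
# placeholders you must supply yourself):
#
#   params = urllib.parse.urlencode(
#       {'key': 'YOUR_API_KEY', 'cx': 'YOUR_CSE_ID', 'q': searchfor})
#   url = 'https://www.googleapis.com/customsearch/v1?%s' % params
#   results = json.loads(urllib.request.urlopen(url).read().decode('utf8'))
#   for item in results.get('items', []):
#       downloadList.append(item['link'])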


def getAllUrls(data):
    hits = data['results']
    for h in hits:
        downloadList.append(h['url'])


def downloadAll():
    # create a timestamped directory so every file from this run lands in one place
    mydir = os.path.join(os.getcwd(), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(mydir)
    for url in downloadList:
        fileName = url.split('/')[-1]
        print("downloading file: " + fileName + " url: " + url)
        urllib.request.urlretrieve(url, os.path.join(mydir, fileName))
    print("all downloaded!!!")


# the site to restrict the search to comes from the first command-line argument
search("site:" + sys.argv[1])
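
# Usage sketch (hypothetical domain):
#   python3 search.py example.com
# searches Google for "site:example.com" and downloads every result URL
# into a freshly created timestamped directory.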