-
Notifications
You must be signed in to change notification settings - Fork 0
/
mightyFinder.py
61 lines (48 loc) · 1.68 KB
/
mightyFinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import re
import time
import unicodedata
import urllib
import urllib2

import lxml.html
from bs4 import BeautifulSoup
from bs4 import Comment
from lxml import html
from lxml.html import clean
class Googlear:
    """Run a Google search and save the text of every result page to disk.

    All of the work happens in the constructor: it fetches the results page
    for ``query``, extracts the result links and calls ``guardar`` once per
    link. Output goes under the relative directory ``Restaurantes/``.
    """

    def __init__(self, query, nameID):
        """Search for ``query`` and store each hit, numbered from 1.

        query  -- search terms handed to ``google(query, True)``.
        nameID -- identifier used for the output directory and file names.

        Result pages that fail with ``urllib2.HTTPError`` are skipped; any
        other exception propagates to the caller.
        """
        carpeta = "Restaurantes/" + nameID
        # os.makedirs raises OSError if the directory is already there, which
        # would make a second run for the same nameID crash.
        if not os.path.isdir(carpeta):
            os.makedirs(carpeta)

        respuesta = google(query, True)
        try:
            soup = BeautifulSoup(respuesta)
        finally:
            respuesta.close()

        i = 0
        for enlace in soup.select("h3.r a[href]"):
            # Result links are wrapped as "/url?q=<target>&sa=U..."; strip the
            # wrapper to recover the target address.
            url = enlace.get("href").replace("/url?q=", "")
            if not url.startswith('http'):
                continue
            destino = re.sub('&sa=U.*$', '', url)
            # The counter advances before the download, so a failed page
            # leaves a gap in the numbering instead of shifting later files.
            i += 1
            try:
                self.guardar(destino, nameID, i)
            except urllib2.HTTPError:
                # Best effort: an unreachable result must not stop the rest.
                continue

    def guardar(self, url, nameID, i):
        """Download ``url`` and write its cleaned text to a numbered file.

        url    -- address of the page to fetch via ``google(url, False)``.
        nameID -- identifier used as the file-name prefix.
        i      -- sequence number of the result (an int) used in the name.

        Raises whatever the fetch, the lxml parsing or the file write raise.
        """
        # Fetch the page
        respuesta = google(url, False)
        try:
            contenido = respuesta.read()
        finally:
            respuesta.close()
        pagina = html.fromstring(contenido)

        # Get clean text (without HTML, JavaScript)
        limpiador = clean.Cleaner(links=False, page_structure=False)
        pagina = limpiador.clean_html(pagina)
        texto = lxml.html.tostring(pagina, encoding='utf-8', pretty_print=True, method='text')

        # Save
        # NOTE(review): the file is written next to the "Restaurantes/<nameID>"
        # directory created in __init__, not inside it -- confirm which
        # location downstream tooling expects before changing the path.
        # str(i) is required: i is an int and cannot be concatenated directly.
        ruta = "Restaurantes/" + nameID + "_result_" + str(i) + ".txt"
        with open(ruta, 'w') as archivo:
            archivo.write(texto)
            # Same trailing newline the original "print >>archivo" emitted.
            archivo.write("\n")
def google(query, op):
    """Open a URL with a fixed User-agent header and return the response.

    query -- the search terms when ``op`` is true, otherwise a complete URL.
    op    -- true: request the Google results page for ``query``;
             false: request ``query`` itself, unchanged.

    Returns the object produced by ``opener.open``. Exceptions raised by
    urllib2 while opening the URL are not caught here.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/19.0')]
    if not op:
        return opener.open(query)

    # urllib.quote works on byte strings; encode unicode text first so that
    # non-ASCII search terms do not make it fail.
    if not isinstance(query, str):
        query = query.encode('utf-8')
    # Spaces and other unsafe characters would yield a malformed request line.
    # '%', '+', '&', '=' and '/' are left alone so callers that already pass
    # an escaped query (or extra parameters) get the same URL as before.
    consulta = urllib.quote(query, safe="/%+&=")
    return opener.open('http://www.google.com/search?q=' + consulta)
def visible(element):
    """Return True if a parsed text node should count as visible page text.

    element -- a text node exposing ``parent.name``; presumably a bs4
               NavigableString -- confirm against the callers.

    Returns False for text inside style, script, head or title elements, for
    text attached directly to the document, and for HTML comments; True for
    everything else.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # A bs4 Comment holds only the comment's inner text; the "<!--" / "-->"
    # delimiters are not part of the string, so the pattern below cannot
    # recognise it. Check the node type explicitly.
    if isinstance(element, Comment):
        return False
    # Kept from the original: plain text that itself looks like comment markup.
    plano = unicodedata.normalize('NFKD', element).encode('ascii', 'ignore')
    if re.match('<!--.*-->', plano):
        return False
    return True