Skip to content

Commit

Permalink
fix(googlesearch): captcha handling using selenium
Browse files Browse the repository at this point in the history
Fix #6 -- refactored Googlesearch module to use Selenium and Firefox webdriver to handle Google captcha.
  • Loading branch information
sundowndev committed Jun 10, 2019
1 parent 9400fa0 commit 6a6f0db
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 23 deletions.
34 changes: 11 additions & 23 deletions lib/googlesearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,40 +13,28 @@
from lib.request import send
from config import *

googleAbuseToken = ''
from selenium import webdriver

browser = webdriver.Firefox()

def search(req, stop):
global googleAbuseToken

if google_api_key and google_cx_id:
return searchApi(req, stop)

headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us,en;q=0.5',
'Accept-Encoding': 'gzip,deflate',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Keep-Alive': '115',
'Connection': 'keep-alive',
'Cache-Control': 'no-cache',
'Cookie': 'Cookie: CGIC=Ij90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0bWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjg; CONSENT=YES+RE.fr+20150809-08-0; 1P_JAR=2018-11-28-14; NID=148=aSdSHJz71rufCokaUC93nH3H7lOb8E7BNezDWV-PyyiHTXqWK5Y5hsvj7IAzhZAK04-QNTXjYoLXVu_eiAJkiE46DlNn6JjjgCtY-7Fr0I4JaH-PZRb7WFgSTjiFqh0fw2cCWyN69DeP92dzMd572tQW2Z1gPwno3xuPrYC1T64wOud1DjZDhVAZkpk6UkBrU0PBcnLWL7YdL6IbEaCQlAI9BwaxoH_eywPVyS9V; SID=uAYeu3gT23GCz-ktdGInQuOSf-5SSzl3Plw11-CwsEYY0mqJLSiv7tFKeRpB_5iz8SH5lg.; HSID=AZmH_ctAfs0XbWOCJ; SSID=A0PcRJSylWIxJYTq_; APISID=HHB2bKfJ-2ZUL5-R/Ac0GK3qtM8EHkloNw; SAPISID=wQoxetHBpyo4pJKE/A2P6DUM9zGnStpIVt; SIDCC=ABtHo-EhFAa2AJrJIUgRGtRooWyVK0bAwiQ4UgDmKamfe88xOYBXM47FoL5oZaTxR3H-eOp7-rE; OTZ=4671861_52_52_123900_48_436380; OGPC=873035776-8:; OGP=-873035776:;'
}

try:
REQ = urlencode({ 'q': req, 'num': stop })
REQ = urlencode({ 'q': req, 'num': stop, 'hl': 'en' })
URL = 'https://www.google.com/search?tbs=li:1&{}&gws_rd=ssl&gl=us'.format(
REQ)
r = send('GET', URL + googleAbuseToken, headers=headers)
browser.get(URL)
htmlBody = browser.find_element_by_css_selector("body").get_attribute('innerHTML')

while r.status_code != 200:
warn('You are temporary blacklisted from Google search. Complete the captcha at the following URL and copy/paste the content of GOOGLE_ABUSE_EXEMPTION cookie : {}'.format(URL))
info('Need help ? Read https://github.com/sundowndev/PhoneInfoga/wiki')
token = ask('\nGOOGLE_ABUSE_EXEMPTION=')
googleAbuseToken = '&google_abuse=' + token
r = send('GET', URL + googleAbuseToken, headers=headers)
soup = BeautifulSoup(htmlBody, 'html5lib')

soup = BeautifulSoup(r.text, 'html5lib')
while soup.find("div", id="recaptcha") is not None:
warn('You are temporary blacklisted from Google search. Complete the captcha then press ENTER.')
token = ask('>')
htmlBody = browser.find_element_by_css_selector("body").get_attribute('innerHTML')
soup = BeautifulSoup(htmlBody, 'html5lib')

results = soup.find("div", id="search").find_all("div", class_="g")

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ phonenumbers==8.10.2
argparse==1.2.1
urllib3==1.24.2
colorama==0.4.1
selenium==3.141.0

0 comments on commit 6a6f0db

Please sign in to comment.