fix(googlesearch): captcha handling using selenium

Fix #6 -- refactored Googlesearch module to use Selenium and Firefox webdriver to handle Google captcha.
sundowndev · Jun 10, 2019 · 6a6f0db · 6a6f0db
1 parent 9400fa0
commit 6a6f0db
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 23 deletions.
diff --git a/lib/googlesearch.py b/lib/googlesearch.py
@@ -13,40 +13,28 @@
 from lib.request import send
 from config import *
 
-googleAbuseToken = ''
+from selenium import webdriver
 
+browser = webdriver.Firefox()
 
 def search(req, stop):
-    global googleAbuseToken
-
     if google_api_key and google_cx_id:
         return searchApi(req, stop)
 
-    headers = {
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-us,en;q=0.5',
-        'Accept-Encoding': 'gzip,deflate',
-        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
-        'Keep-Alive': '115',
-        'Connection': 'keep-alive',
-        'Cache-Control': 'no-cache',
-        'Cookie': 'Cookie: CGIC=Ij90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0bWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjg; CONSENT=YES+RE.fr+20150809-08-0; 1P_JAR=2018-11-28-14; NID=148=aSdSHJz71rufCokaUC93nH3H7lOb8E7BNezDWV-PyyiHTXqWK5Y5hsvj7IAzhZAK04-QNTXjYoLXVu_eiAJkiE46DlNn6JjjgCtY-7Fr0I4JaH-PZRb7WFgSTjiFqh0fw2cCWyN69DeP92dzMd572tQW2Z1gPwno3xuPrYC1T64wOud1DjZDhVAZkpk6UkBrU0PBcnLWL7YdL6IbEaCQlAI9BwaxoH_eywPVyS9V; SID=uAYeu3gT23GCz-ktdGInQuOSf-5SSzl3Plw11-CwsEYY0mqJLSiv7tFKeRpB_5iz8SH5lg.; HSID=AZmH_ctAfs0XbWOCJ; SSID=A0PcRJSylWIxJYTq_; APISID=HHB2bKfJ-2ZUL5-R/Ac0GK3qtM8EHkloNw; SAPISID=wQoxetHBpyo4pJKE/A2P6DUM9zGnStpIVt; SIDCC=ABtHo-EhFAa2AJrJIUgRGtRooWyVK0bAwiQ4UgDmKamfe88xOYBXM47FoL5oZaTxR3H-eOp7-rE; OTZ=4671861_52_52_123900_48_436380; OGPC=873035776-8:; OGP=-873035776:;'
-    }
-
     try:
-        REQ = urlencode({ 'q': req, 'num': stop })
+        REQ = urlencode({ 'q': req, 'num': stop, 'hl': 'en' })
         URL = 'https://www.google.com/search?tbs=li:1&{}&amp;gws_rd=ssl&amp;gl=us'.format(
             REQ)
-        r = send('GET', URL + googleAbuseToken, headers=headers)
+        browser.get(URL)
+        htmlBody = browser.find_element_by_css_selector("body").get_attribute('innerHTML')
 
-        while r.status_code != 200:
-            warn('You are temporary blacklisted from Google search. Complete the captcha at the following URL and copy/paste the content of GOOGLE_ABUSE_EXEMPTION cookie : {}'.format(URL))
-            info('Need help ? Read https://github.com/sundowndev/PhoneInfoga/wiki')
-            token = ask('\nGOOGLE_ABUSE_EXEMPTION=')
-            googleAbuseToken = '&google_abuse=' + token
-            r = send('GET', URL + googleAbuseToken, headers=headers)
+        soup = BeautifulSoup(htmlBody, 'html5lib')
 
-        soup = BeautifulSoup(r.text, 'html5lib')
+        while soup.find("div", id="recaptcha") is not None:
+            warn('You are temporary blacklisted from Google search. Complete the captcha then press ENTER.')
+            token = ask('>')
+            htmlBody = browser.find_element_by_css_selector("body").get_attribute('innerHTML')
+            soup = BeautifulSoup(htmlBody, 'html5lib')
 
         results = soup.find("div", id="search").find_all("div", class_="g")
 

diff --git a/requirements.txt b/requirements.txt
@@ -5,3 +5,4 @@ phonenumbers==8.10.2
 argparse==1.2.1
 urllib3==1.24.2
 colorama==0.4.1
+selenium==3.141.0