### How to scrape Google search results

https://hackernoon.com/how-to-scrape-google-with-python-bo7d2tal

https://serpwow.com/

https://pythondata.com/quick-tip-consuming-google-search-results-to-use-for-web-scraping/


In [1]:
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup

In [2]:
# User-agent string of a desktop browser (Firefox on macOS).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) "
    "Gecko/20100101 Firefox/65.0"
)

# User-agent string of a mobile browser (Chrome on Android).
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/59.0.3071.125 Mobile Safari/537.36"
)

# Request headers; the desktop user-agent is the one actually sent.
HEADERS = {"user-agent": USER_AGENT}

# Exclusive upper bound on the result offset when paging through results.
MAX_RESULTS = 100

In [3]:
def _get_search_url(query, start=0, engine='google'):
    """
    Get search result
    args:
        query: search text
        start: starting result index
        engine: bing or google
    """
    if engine == 'bing':
        SEARCH_URL = "https://www.bing.com/search"
        url = SEARCH_URL + "?q="+ urllib.parse.quote(query) + \
                ("" if start==0 else f"&first={start}")
    elif engine == 'google':
        SEARCH_URL = "https://google.com/search"
        url = SEARCH_URL + "?q="+ urllib.parse.quote(query) + \
                ("" if start==0 else f"&start={start}")
    else:
        raise ValueError('invalid search engine', engine)
    
    return url

In [4]:
def _get_search_result(url, headers, timeout=10):
    """
    Fetch one search page and extract its results.

    args:
        url: search URL to request
        headers: HTTP headers sent with the request
        timeout: seconds passed to requests.get as its timeout

    returns:
        list of {"title": ..., "link": ...} dicts, one per <div class="r">
        block that has both an anchor with an href and an <h3> heading.
        The list is empty when the response status is not 200 or when no
        block matched.

    raises:
        requests exceptions (including timeouts) propagate to the caller
    """
    results = []
    # Without a timeout, requests can wait indefinitely on a stalled server.
    resp = requests.get(url, headers=headers, timeout=timeout)

    if resp.status_code != 200:
        return results

    soup = BeautifulSoup(resp.content, "html.parser")

    # NOTE(review): the 'r' class is tied to the result page's markup; an
    # empty list may also mean that markup has changed — confirm.
    for g in soup.find_all('div', class_='r'):
        anchor = g.find('a')
        heading = g.find('h3')
        # Skip incomplete blocks instead of failing on a missing tag.
        if anchor is None or heading is None:
            continue

        link = anchor.get('href')
        if link is None:
            continue

        results.append({"title": heading.text, "link": link})

    return results

In [5]:
# Search engine to query.
engine = "google"

# Text to search for.
search_text = "tensorflow"

In [6]:
# Pass the query as-is: _get_search_url percent-encodes it, so encoding it
# here as well would double-encode it (a space would end up as "%2520"), and
# overwriting `search_text` would re-encode it on every re-run of this cell.
results = []
for start in range(0, MAX_RESULTS, 10):
    URL = _get_search_url(search_text, start=start, engine=engine)
    res = _get_search_result(URL, HEADERS)
    # Extending with an empty list is a no-op, so no emptiness check is needed.
    results.extend(res)


In [7]:
# Show the number of collected results followed by the results themselves.
(len(results), results)

(95,
 [{'title': 'TensorFlow', 'link': 'https://www.tensorflow.org/'},
  {'title': 'tensorflow/tensorflow: An Open Source Machine ... - GitHub',
   'link': 'https://github.com/tensorflow/tensorflow'},
  {'title': 'TensorFlow - Wikipedia',
   'link': 'https://en.wikipedia.org/wiki/TensorFlow'},
  {'title': 'TensorFlow YouTube channel - TensorFlow - YouTube',
   'link': 'https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ'},
  {'title': 'TensorFlow for R', 'link': 'https://tensorflow.rstudio.com/'},
  {'title': 'Introduction to TensorFlow | Machine Learning Crash Course',
   'link': 'https://developers.google.com/machine-learning/crash-course/first-steps-with-tensorflow/toolkit'},
  {'title': 'TensorFlow (@TensorFlow) | Twitter',
   'link': 'https://twitter.com/tensorflow?lang=en'},
  {'title': 'Google Just Open Sourced TensorFlow, Its Artificial ... - Wired',
   'link': 'https://www.wired.com/2015/11/google-open-sources-its-artificial-intelligence-engine/'},
  {'title': 'What is Te

In [8]:
import json

# Build a file name that is safe on disk. Decoding first gives the same name
# whether or not `search_text` is already percent-encoded; encoding with
# safe="" also escapes "/", so the query can never be read as a path.
safe_query = urllib.parse.quote(urllib.parse.unquote(search_text), safe="")

# An explicit encoding keeps the file independent of the platform default;
# json.dump writes straight to the file instead of building the string first.
with open(f"{safe_query}-{engine}.json", "w", encoding="utf-8") as f:
    json.dump(results, f)