### How to scrape Google search results

https://hackernoon.com/how-to-scrape-google-with-python-bo7d2tal

https://serpwow.com/

https://pythondata.com/quick-tip-consuming-google-search-results-to-use-for-web-scraping/


In [5]:
import urllib
import requests
from bs4 import BeautifulSoup

In [13]:
from concurrent.futures import ThreadPoolExecutor

In [6]:
# Static settings shared by the cells below.
CONFIG = dict(
    # Browser identification strings, keyed by device class.
    USER_AGENT=dict(
        desktop="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0",
        mobile="Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36",
    ),
    # Search endpoint of each supported engine.
    SEARCH_BASE_URL=dict(
        bing="https://www.bing.com/search",
        google="https://google.com/search",
    ),
)

# Request headers sent with every search request: the desktop user agent.
HEADERS = dict([("user-agent", CONFIG["USER_AGENT"]["desktop"])])

In [7]:
def get_search_url(query:str, start:int = 0, engine:str = "google") -> str:
    """Build the search-results URL for ``query`` on the chosen engine.

    The query text is percent-encoded exactly once, so a space becomes
    ``%20`` (encoding it twice would yield ``%2520`` and search for the
    literal text "%20").

    args:
        query (str): search text
        start (int): starting result index; 0 leaves the paging parameter out
        engine (str): "bing" or "google" (case-insensitive)

    returns:
        str: base URL from CONFIG["SEARCH_BASE_URL"] followed by the query string

    raises:
        ValueError: if engine is neither "bing" nor "google"
    """
    # Imported here because a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import quote

    # Name of the paging query parameter used for each engine.
    offset_params = {"bing": "first", "google": "start"}

    engine = engine.lower()
    if engine not in offset_params:
        raise ValueError(f"Unimplemented search engine {engine}")

    url = CONFIG["SEARCH_BASE_URL"][engine] + "?q=" + quote(query)
    if start != 0:
        url += f"&{offset_params[engine]}={start}"

    return url

# get_search_url("tensorflow", start=10, engine="google")   # 'https://google.com/search?q=tensorflow&start=10'
# get_search_url("tensor flow", start=10, engine="bing")   # 'https://www.bing.com/search?q=tensor%20flow&first=10'

In [21]:
def _collect_links(containers, title_from_h3=False):
    """Map the first link inside each container element to a title.

    args:
        containers: iterable of BeautifulSoup elements to inspect
        title_from_h3 (bool): take the title from the container's first
            <h3> descendant instead of from the container's own text

    returns:
        dict: key=link, val=title. A container is skipped when it has no
        anchor, the anchor has no href, the href does not contain "http",
        or (with title_from_h3) it has no <h3>.
    """
    found = {}
    for box in containers:
        anchor = box.find('a')
        if anchor is None:
            continue
        # .get() instead of ['href'] so an anchor without href is skipped
        # rather than raising KeyError.
        link = anchor.get('href')
        if not link or "http" not in link:
            continue
        heading = box.find('h3') if title_from_h3 else box
        if heading is None:
            continue
        found[link] = heading.text
    return found


def get_search_result(url_header, timeout=10):
    """Fetch one search-results page and collect its (link, title) pairs.

    The pairs are merged into the module-level ``link_title_map`` dict, which
    must already exist when links are found, and are also returned so callers
    can use the result directly.

    args:
        url_header (tuple): (url, headers) pair; the url decides whether the
            page is parsed as a Google or a Bing results page
        timeout (float): seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller indefinitely

    returns:
        dict: key=link, val=title for this page; empty when the response
        status is not 200 or the url matches neither engine

    raises:
        requests.RequestException: propagated from ``requests.get``
            (connection failure, timeout, ...)
    """
    global link_title_map
    url, headers = url_header
    resp = requests.get(url, headers=headers, timeout=timeout)

    res_dic = {}  # key=link, val=title
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")

        if "google.com/search" in url:     # pickup link,title from Google page
            res_dic.update(_collect_links(soup.find_all('div', class_='r'), title_from_h3=True))
            res_dic.update(_collect_links(soup.find_all('h3', class_='r')))

        elif "bing.com/search" in url:   # pickup link,title from Bing page
            for tag in ["h2", "h3"]:
                res_dic.update(_collect_links(soup.find_all(tag)))

    # Only touch the shared dict when there is something to add.
    if res_dic:
        link_title_map.update(res_dic)

    return res_dic

In [35]:
search_text = "tensorflow"
MAX_RESULTS = 50

# One URL per (result offset, engine) pair: offsets advance in steps of 10,
# and for every offset the Google URL comes before the Bing URL.
search_urls = [
    get_search_url(search_text, start=offset, engine=engine_name)
    for offset in range(0, MAX_RESULTS, 10)
    for engine_name in ("google", "bing")
]

# Pair each URL with the shared request headers; get_search_result expects
# a single (url, headers) tuple per call.
url_headers = [(search_url, HEADERS) for search_url in search_urls]

In [36]:
%%time

# Start from an empty result map; get_search_result fills it as a side effect.
link_title_map = {}

# Fetch the pages one after another (sequential baseline for the timing).
for url_header in url_headers:
    get_search_result(url_header)

len(link_title_map), link_title_map

CPU times: user 9.23 s, sys: 47.9 ms, total: 9.28 s
Wall time: 17.8 s


(93,
 {'https://www.tensorflow.org/': 'TensorFlow',
  'https://github.com/tensorflow/tensorflow': 'GitHub - tensorflow/tensorflow: An Open Source Machine ...',
  'https://github.com/tensorflow': 'tensorflow Â· GitHub',
  'https://en.wikipedia.org/wiki/TensorFlow': 'TensorFlow - Wikipedia',
  'https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ': 'TensorFlow - YouTube',
  'https://www.tensorflow.org/tutorials': 'Tutorials',
  'https://www.tensorflow.org/overview': 'TensorFlow Core',
  'https://www.tensorflow.org/install': 'Install',
  'https://www.tensorflow.org/guide': 'Guide',
  'https://www.tensorflow.org/learn': 'Learn',
  'https://www.tensorflow.org/tutorials/keras/classification': 'Basic classification: Classify ...',
  'https://pypi.org/project/tensorflow/': 'tensorflow · PyPI',
  'https://www.tensorflow.org/about': 'About',
  'https://www.tensorflow.org/versions': 'API',
  'https://www.tensorflow.org/tfx': 'TFX',
  'https://www.tensorflow.org/resources/learn-ml': 'Learn Ml'

In [37]:
%%time

# Start from an empty result map; get_search_result fills it as a side effect.
link_title_map = {}
with ThreadPoolExecutor() as executor:
    # Executor.map only reports worker exceptions, and only enforces the
    # timeout, while its result iterator is being consumed. Drain it so that
    # failed requests are not silently dropped. The returned values themselves
    # are not used here.
    list(executor.map(get_search_result, url_headers, timeout=60))

len(link_title_map), link_title_map

CPU times: user 12.5 s, sys: 171 ms, total: 12.7 s
Wall time: 13.3 s


(95,
 {'https://tensorflow.org': 'TensorFlow',
  'https://pypi.org/project/tensorflow/': 'tensorflow · PyPI',
  'https://aws.amazon.com/tensorflow/': 'TensorFlow on AWS - Deep Learning on the Cloud',
  'https://www.datacamp.com/community/tutorials/tensorflow-tutorial': 'TensorFlow Tutorial For Beginners - DataCamp',
  'https://medium.com/tensorflow': 'TensorFlow – Medium',
  'https://opensource.com/article/17/11/intro-tensorflow': 'What is TensorFlow? | Opensource.com',
  'https://en.wikipedia.org/wiki/Tensor_Flow': 'TensorFlow - Wikipedia',
  'https://www.edx.org/course/deep-learning-with-tensorflow': 'Deep Learning with Tensorflow | edX',
  'https://playground.tensorflow.org/': 'Tensorflow - A Neural Network Playground',
  'https://software.intel.com/content/www/us/en/develop/articles/intel-optimization-for-tensorflow-installation-guide.html': 'Intel® Optimization for TensorFlow* Installation Guide',
  'https://www.zhihu.com/question/49909565': 'TensorFlow 如何入门，如何快速学习？ - 知乎',
  'http