### How to scrape Google search results

https://hackernoon.com/how-to-scrape-google-with-python-bo7d2tal

https://serpwow.com/

https://pythondata.com/quick-tip-consuming-google-search-results-to-use-for-web-scraping/


In [1]:
import urllib
import requests
from bs4 import BeautifulSoup

In [2]:
# Static settings shared by the cells below: the browser identities that can
# be sent with a request and the search endpoint of each supported engine.
CONFIG = {
    "USER_AGENT": {
        "desktop": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) "
            "Gecko/20100101 Firefox/65.0"
        ),
        "mobile": (
            "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/59.0.3071.125 Mobile Safari/537.36"
        ),
    },
    "SEARCH_BASE_URL": {
        "bing": "https://www.bing.com/search",
        "google": "https://google.com/search",
    },
}

# Request headers built from the desktop browser identity.
HEADERS = {"user-agent": CONFIG["USER_AGENT"]["desktop"]}

In [3]:
def get_search_url(query: str, start: int = 0, engine: str = "google") -> str:
    """Build the result-page URL for a search query.

    The query is percent-encoded exactly once, so "tensor flow" becomes
    "tensor%20flow" rather than the double-encoded "tensor%2520flow".

    args:
        query (str): search text
        start (int): starting result index; 0 leaves the paging parameter
            off the URL
        engine (str): "bing" or "google" (case-insensitive)

    returns:
        str: the engine's base URL from CONFIG["SEARCH_BASE_URL"] followed
        by the encoded query and, when start is non-zero, the paging
        parameter

    raises:
        ValueError: if engine is neither "bing" nor "google"
    """
    # Imported here because a bare `import urllib` does not by itself load
    # the `urllib.parse` submodule.
    from urllib.parse import quote

    # Name of the paging query parameter used for each engine.
    paging_param = {"bing": "first", "google": "start"}

    engine = engine.lower()
    if engine not in paging_param:
        raise ValueError(f"Unimplemented search engine {engine}")

    url = CONFIG["SEARCH_BASE_URL"][engine] + "?q=" + quote(query)
    if start != 0:
        url += f"&{paging_param[engine]}={start}"
    return url

# get_search_url("tensorflow", start=10, engine="google")   # 'https://google.com/search?q=tensorflow&start=10'
# get_search_url("tensor flow", start=10, engine="bing")   # 'https://www.bing.com/search?q=tensor%20flow&first=10'

In [4]:
def get_search_result_google(url, headers):
    """Fetch a Google result page and extract the result links it contains.

    args:
        url (str): search URL to request
        headers (dict): HTTP headers sent with the request

    returns:
        list of {"title": ..., "link": ...} dicts, one per distinct link;
        empty when the response status is not 200
    """
    titles_by_link = {}  # key=link, val=title; the dict de-duplicates links
    resp = requests.get(url, headers=headers)

    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")

        # Case 1: <div class="r"> holding the anchor and an <h3> title.
        for block in soup.find_all("div", class_="r"):
            anchor = block.find("a")
            heading = block.find("h3")
            # Skip incomplete blocks instead of raising on a missing tag.
            if anchor is None or heading is None:
                continue
            link = anchor.get("href")  # None when the anchor has no href
            if link and "http" in link:
                titles_by_link[link] = heading.text

        # Case 2: <h3 class="r"> whose own text is the title.
        for heading in soup.find_all("h3", class_="r"):
            anchor = heading.find("a")
            if anchor is None:
                continue
            link = anchor.get("href")  # None when the anchor has no href
            if link and "http" in link:
                titles_by_link[link] = heading.text

    return [
        {"title": title, "link": link}
        for link, title in titles_by_link.items()
    ]

In [5]:
# Parameters for the Google run.
search_text = "tensorflow"  # text to search for
engine = "google"           # which entry of CONFIG["SEARCH_BASE_URL"] to use
MAX_RESULTS = 10            # end of the paging range (walked in steps of 10)

In [6]:
# Walk the result pages in steps of 10 and pool whatever each page yields.
results = []
for start in range(0, MAX_RESULTS, 10):
    URL = get_search_url(search_text, start=start, engine=engine)
    res = get_search_result_google(URL, HEADERS)
    results += res  # an empty page adds nothing


In [7]:
# Display the number of collected results together with the results themselves.
len(results), results

(11,
 [{'title': 'TensorFlow', 'link': 'https://www.tensorflow.org/'},
  {'title': 'tensorflow/tensorflow: An Open Source Machine ... - GitHub',
   'link': 'https://github.com/tensorflow/tensorflow'},
  {'title': 'tensorflow Â· GitHub', 'link': 'https://github.com/tensorflow'},
  {'title': 'TensorFlow - Wikipedia',
   'link': 'https://en.wikipedia.org/wiki/TensorFlow'},
  {'title': 'TensorFlow YouTube channel - TensorFlow - YouTube',
   'link': 'https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ'},
  {'title': 'Tutorials', 'link': 'https://www.tensorflow.org/tutorials'},
  {'title': 'TensorFlow Core', 'link': 'https://www.tensorflow.org/overview'},
  {'title': 'Install', 'link': 'https://www.tensorflow.org/install'},
  {'title': 'Guide', 'link': 'https://www.tensorflow.org/guide'},
  {'title': 'Learn', 'link': 'https://www.tensorflow.org/learn'},
  {'title': 'Basic classification: Classify ...',
   'link': 'https://www.tensorflow.org/tutorials/keras/classification'}])

In [8]:
import json

# Save the collected results as "<search_text>-<engine>.json" (relative path).
with open(f"{search_text}-{engine}.json", "w") as f:
    json.dump(results, f)

In [9]:
def get_search_result_bing(url, headers):
    """Fetch a Bing result page and extract the result links it contains.

    Every <h2> and <h3> heading whose first anchor has a link containing
    "http" is treated as a result; the heading text is used as the title.

    args:
        url (str): search URL to request
        headers (dict): HTTP headers sent with the request

    returns:
        list of {"title": ..., "link": ...} dicts, one per distinct link;
        empty when the response status is not 200
    """
    titles_by_link = {}  # key=link, val=title; the dict de-duplicates links
    resp = requests.get(url, headers=headers)

    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")

        # All <h2> headings are scanned before any <h3> heading.
        for tag in ("h2", "h3"):
            for heading in soup.find_all(tag):
                anchor = heading.find("a")
                if anchor is None:
                    continue
                # .get() yields None for an anchor without href, so such
                # headings are skipped instead of raising KeyError.
                link = anchor.get("href")
                if link and "http" in link:
                    titles_by_link[link] = heading.text

    return [
        {"title": title, "link": link}
        for link, title in titles_by_link.items()
    ]

In [10]:
# Parameters for the Bing run.
search_text = "tensorflow"  # text to search for
engine = "bing"             # alternative: "google"
MAX_RESULTS = 10            # end of the paging range (walked in steps of 10)

In [11]:
# Walk the result pages in steps of 10 and pool whatever each page yields.
results = []
for start in range(0, MAX_RESULTS, 10):
    URL = get_search_url(search_text, start=start, engine=engine)
    res = get_search_result_bing(URL, HEADERS)
    results += res  # an empty page adds nothing


In [12]:
# Display the number of collected results together with the results themselves.
len(results), results

(15,
 [{'title': 'TensorFlow', 'link': 'https://www.tensorflow.org/'},
  {'title': 'TensorFlow - Wikipedia',
   'link': 'https://en.wikipedia.org/wiki/TensorFlow'},
  {'title': 'GitHub - tensorflow/tensorflow: An Open Source Machine ...',
   'link': 'https://github.com/tensorflow/tensorflow'},
  {'title': 'What is TensorFlow? Introduction, Architecture & Example',
   'link': 'https://www.guru99.com/what-is-tensorflow.html'},
  {'title': 'tensorflow · PyPI',
   'link': 'https://pypi.org/project/tensorflow/'},
  {'title': 'Install', 'link': 'https://www.tensorflow.org/install'},
  {'title': 'Learn', 'link': 'https://www.tensorflow.org/learn'},
  {'title': 'The CORE Open Source Ml Li…',
   'link': 'https://www.tensorflow.org/overview/'},
  {'title': 'About', 'link': 'https://www.tensorflow.org/about'},
  {'title': 'API', 'link': 'https://www.tensorflow.org/versions'},
  {'title': 'TFX', 'link': 'https://www.tensorflow.org/tfx'},
  {'title': 'Learn Ml',
   'link': 'https://www.tensorflow.o

In [13]:
import json

# Save the collected results as "<search_text>-<engine>.json" (relative path).
with open(f"{search_text}-{engine}.json", "w") as f:
    json.dump(results, f)

In [14]:
!pwd

/home/devopsgong/projects/py4kids/lesson-30-search
