In [1]:
import requests
import selenium
import os
import time
import io
from PIL import Image
from selenium import webdriver
from bs4 import BeautifulSoup
import hashlib

In [2]:
# setting directory for chromedriver
DRIVER_PATH = "./chromedriver"
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [3]:
wd.get('https://google.com')

In [4]:
search_box = wd.find_element_by_css_selector('input.gLFyf')
search_box.send_keys('masks on face')

In [5]:
wd.quit()

In [6]:
# fetch images by scrolling through results
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)    
    
    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)
        
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        
        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls    
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            return
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

In [7]:
# function for downloading the image and raise errors whenever necessary 
def persist_image(folder_path:str,url:str):
    try:
        image_content = requests.get(url).content

    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [8]:
# combining the previous two functions 
# google search and download 

def search_and_download(search_term:str,driver_path:str,target_path:str,number_images):
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)
        
    for elem in res:
        persist_image(target_folder,elem)

In [11]:
SEARCH_TERM_MASK = 'masks on face'
SEARCH_TERM_MASKOFF_1 = 'portrait'

TARGET_PATH = './images'


In [12]:
# demo for 10 mask images
search_and_download(
    search_term = SEARCH_TERM_MASK,
    driver_path = DRIVER_PATH,
    target_path = TARGET_PATH,
    number_images = 10
)

Found: 200 search results. Extracting links from 0:200
Found: 10 image links, done!
SUCCESS - saved https://images.squarespace-cdn.com/content/v1/57868893725e258ab3883472/1585973391951-ZXA7T36LILHGL92OONDK/ke17ZwdGBToddI8pDm48kGX3mPjCxRpnpZFLehGnHRF7gQa3H78H3Y0txjaiv_0fDoOvxcdMmMKkDsyUqMSsMWxHk725yiiHCCLfrh8O1z5QPOohDIaIeljMHgDF5CVlOqpeNLcJ80NK65_fV7S1UeXRcnltpXi7pYmG01K9Sxdr9jvdzJo9yQKZsg8lxCX0Dk-aW0WIdZ70CxZYZblwSA/DSC04504.jpg?format=2500w - as ./images/masks_on_face/010dafa0d1.jpg
SUCCESS - saved https://www.headcovers.com/media/catalog/product/cache/ba642c93a0efc71830935b1d4e0de39d/f/a/face-masks-for-coronavirus-protecive-facemasks-pink-flowers.jpg - as ./images/masks_on_face/62b8fce1a4.jpg
SUCCESS - saved https://apicms.thestar.com.my/uploads/images/2020/01/28/529574.jpg - as ./images/masks_on_face/0bd06910d6.jpg
SUCCESS - saved https://i.guim.co.uk/img/media/6b887b1b3b5bdec0921e4ef28ade834e2327e2f7/0_0_2500_1500/master/2500.jpg?width=1200&height=630&quality=85&auto=format&fit=cr

In [13]:
# demo for 10 images without mask
search_and_download(
    search_term = SEARCH_TERM_MASKOFF_1,
    driver_path = DRIVER_PATH,
    target_path = TARGET_PATH,
    number_images = 10
)

Found: 200 search results. Extracting links from 0:200
Found: 11 image links, done!
SUCCESS - saved https://images.unsplash.com/photo-1527082395-e939b847da0d?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80 - as ./images/portrait/8fce557b76.jpg
SUCCESS - saved https://i.pinimg.com/originals/dd/59/4e/dd594e241abf617abed2b7d586c19ef9.jpg - as ./images/portrait/f7aba13d3e.jpg
SUCCESS - saved https://i.pinimg.com/originals/03/33/21/033321112df5d5a0392082517cbe5143.jpg - as ./images/portrait/21e3b4fef5.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcR4Muui0K_Z5zZPReB1s04rLKY1otDBcUvsAq3knHP-1Y6BKrYy&usqp=CAU - as ./images/portrait/052a67584a.jpg
SUCCESS - saved https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/Poster-sized_portrait_of_Barack_Obama.jpg/1200px-Poster-sized_portrait_of_Barack_Obama.jpg - as ./images/portrait/22e2a718f4.jpg
SUCCESS - saved https://i0.wp.com/zsuttonphoto.com/wp-content/uploads/2019/11/Los-Angeles-Beauty-Photography-19.jpg?