In [1]:
import requests
import selenium
import os
import time
import io
from PIL import Image
from selenium import webdriver
from bs4 import BeautifulSoup
import hashlib

In [2]:
# setting directory for chromedriver
DRIVER_PATH = "./chromedriver"
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [3]:
wd.get('https://google.com')

In [4]:
search_box = wd.find_element_by_css_selector('input.gLFyf')
search_box.send_keys('masks on face')

In [5]:
wd.quit()

In [6]:
# fetch images by scrolling through results
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)    
    
    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)
        
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        
        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls    
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            return
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

In [7]:
# function for downloading the image and raise errors whenever necessary 
def persist_image(folder_path:str,url:str):
    try:
        image_content = requests.get(url).content

    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [8]:
# combining the previous two functions 
# google search and download 

def search_and_download(search_term:str,driver_path:str,target_path:str,number_images):
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)
        
    for elem in res:
        persist_image(target_folder,elem)

In [9]:
SEARCH_TERM_MASK = 'masks on face'
SEARCH_TERM_MASKOFF_1 = 'portrait'

TARGET_PATH = './images'


In [11]:
# demo for 10 mask images
search_and_download(
    search_term = SEARCH_TERM_MASK,
    driver_path = DRIVER_PATH,
    target_path = TARGET_PATH,
    number_images = 200
)

Found: 200 search results. Extracting links from 0:200
Found: 200 image links, done!
SUCCESS - saved https://www.thetimes.co.uk/imageserver/image/%2Fmethode%2Ftimes%2Fprod%2Fweb%2Fbin%2F38d0166c-4471-11ea-a083-1ec392b38124.jpg?crop=1999%2C1125%2C0%2C104&resize=1180 - as ./images/masks_on_face/2ddb72da62.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRNwEEpobH-HiALWSvkByi4xYwwu-bM9B1gmyoZJyrlGjPsICW4&usqp=CAU - as ./images/masks_on_face/a5585af8aa.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQvN4Gmx2ZARUkSMzgGCImcyMltIINf0q1Je1YoOXnlyF3b9tqD&usqp=CAU - as ./images/masks_on_face/736148582f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQuGJmSsIMElxmzyEsb2N3vhU193CPtaBil7Xp00ivOQpgnaeiN&usqp=CAU - as ./images/masks_on_face/3a3c6cde64.jpg
SUCCESS - saved https://www.headcovers.com/media/magefan_blog/surgical-masks-for-coronavirus.jpg - as ./images/masks_on_face/da41a92fb8.jpg
SUCCESS - saved https://im

SUCCESS - saved https://img.topchinasupplier.com/file/upload/2020/03/31/Anti-Pollution-Unisex-Reusable-Face-Mask-Washable-Nouth-Mask-Funny-Free-Printable-Face-Masks.jpg - as ./images/masks_on_face/ef8254cef2.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQQmfE49gwweU3ziOrH-73rrHQhOhC8T8A7190WIhwd7BNFhFqP&usqp=CAU - as ./images/masks_on_face/acc87a6f7f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcSKdhSUo_aZdoSJi66PByGYxJO9G3y2qZNeGQKOeiR6coFToMLz&usqp=CAU - as ./images/masks_on_face/5e51ab7f77.jpg
SUCCESS - saved https://specials-images.forbesimg.com/imageserve/5ea32f91c6cf13000769e8aa/960x0.jpg?fit=scale - as ./images/masks_on_face/8b884c7a3d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcS8ZIRrAvmZQGPuVaZCTpXIyjWr2RtH4iTzH8c290XbHXiDCHgH&usqp=CAU - as ./images/masks_on_face/741f1a9745.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRhU9FZvaJs0AtcNmPSTzecwldlDfywdCZCoIlQ

SUCCESS - saved https://media.gq.com/photos/5e8b80d9b985a70009dd6b77/master/w_1280%2Cc_limit/2020-04_GQ_Face-Masks_3x2-7.jpg - as ./images/masks_on_face/9585637d8d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcSDgGjMcznvAgM1yE1Bb0JT04Cmu8s-O5fap079E892RqvSdws1&usqp=CAU - as ./images/masks_on_face/2617b0adc9.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQU0OpXl1TZKLlc3ucjNmiVl1dVXcg5-nuABAWUSpj7NNcpSkgl&usqp=CAU - as ./images/masks_on_face/a8515194da.jpg
SUCCESS - saved https://cdn.5280.com/2020/04/covid_face_mask_fancy_tiger_crafts-375x250.jpg - as ./images/masks_on_face/cff7b7729f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcS9aKvhQ3EoR60cF6Z44j8z4PWVNBtEAmbAsEVGxyI9P3QZT52o&usqp=CAU - as ./images/masks_on_face/9afab7fa9e.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcSt3sJBvh3cLPTR0DI7nuP-gnR8xbPXAS5FVv5J7j0crisvzW_P&usqp=CAU - as ./images/masks_on_face/d82df7ada9.j



ERROR - Could not save https://www.refinery29.com/images/9500862.jpg?format=webp&width=760&height=912&quality=85&crop=5%3A6 - cannot identify image file <_io.BytesIO object at 0x10f30edb0>
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTygljuirps9NeBYKf9IdQFeMCGLmGlvmiEst3eqG2ZUq-f17I1&usqp=CAU - as ./images/masks_on_face/b24da596ad.jpg
SUCCESS - saved https://i.insider.com/5e8b62f0b3b092570e349002?width=600&format=jpeg&auto=webp - as ./images/masks_on_face/1eb548c288.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcRe1bJnj-Z2xmmBkfxZHm6q45DEOIaff040Qo7VuPAwtOyNPp4c&usqp=CAU - as ./images/masks_on_face/61bd2e40d7.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQaBQ-Z6uhim6pZ-C8hPOFbG9j4OyCQ0q4vVyR6hlrC8qSjfudm&usqp=CAU - as ./images/masks_on_face/ab6bcf665c.jpg
SUCCESS - saved https://images.squarespace-cdn.com/content/v1/58fd82dbbf629ab224f81b68/1556267018304-QHLZL6XZK7P2KD86AEPY/ke17ZwdGBToddI8pDm48kHgfR5

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTwlcEBYg1K3N-Xh-6K8zGVKyHUqAsywlv0Tgd5EI3CZ1kuWwbr&usqp=CAU - as ./images/masks_on_face/3422ede11d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTf3xf_2zzdZt2E3op5m9EpojbD-y2lTG5Uyeu_Rsml5tE0mNPs&usqp=CAU - as ./images/masks_on_face/8a1c4db76d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcTxyyjE-1alxsIZvFZu-22ChH-73M6c6-7rvnX0qOHL-4BWGE7h&usqp=CAU - as ./images/masks_on_face/458019f857.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQ1Xx96BqJqZM7XZ8laI8o23b4IBoystUouKNmjT0ohkPhGxbPy&usqp=CAU - as ./images/masks_on_face/2610154fd4.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcR_Jq-d_MXuHXxwO_J0kMoKMJF-WLp7rCd5bxCxpY7DgHHl2dtH&usqp=CAU - as ./images/masks_on_face/fb93397ab4.jpg
SUCCESS - saved https://i.insider.com/5e5e8b7cfee23d2c6d22c6d3?width=1100&format=jpeg&auto=webp - as ./images/masks_on_face/c

In [13]:
# demo for 10 images without mask
search_and_download(
    search_term = SEARCH_TERM_MASKOFF_1,
    driver_path = DRIVER_PATH,
    target_path = TARGET_PATH,
    number_images = 10
)

Found: 200 search results. Extracting links from 0:200
Found: 11 image links, done!
SUCCESS - saved https://images.unsplash.com/photo-1527082395-e939b847da0d?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80 - as ./images/portrait/8fce557b76.jpg
SUCCESS - saved https://i.pinimg.com/originals/dd/59/4e/dd594e241abf617abed2b7d586c19ef9.jpg - as ./images/portrait/f7aba13d3e.jpg
SUCCESS - saved https://i.pinimg.com/originals/03/33/21/033321112df5d5a0392082517cbe5143.jpg - as ./images/portrait/21e3b4fef5.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcR4Muui0K_Z5zZPReB1s04rLKY1otDBcUvsAq3knHP-1Y6BKrYy&usqp=CAU - as ./images/portrait/052a67584a.jpg
SUCCESS - saved https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/Poster-sized_portrait_of_Barack_Obama.jpg/1200px-Poster-sized_portrait_of_Barack_Obama.jpg - as ./images/portrait/22e2a718f4.jpg
SUCCESS - saved https://i0.wp.com/zsuttonphoto.com/wp-content/uploads/2019/11/Los-Angeles-Beauty-Photography-19.jpg?