In [None]:
# Using the WebDriver in Selenium as the browser automation framework for this webscraping project.
from selenium import webdriver

# Using service to start and stop local drivers, which in this case would be Geckodriver. 
from selenium.webdriver.firefox.service import Service as FIREFOXSERVICE

# Using By to locate elements using their class names.
from selenium.webdriver.common.by import By

# Using Beautiful Soup to parse the HTML of a webpage.
import bs4

import time
import requests
import io
from PIL import Image, UnidentifiedImageError

This is the function that is used to obtain all of the image URLs. Here, the WebDriver first verifies that an element of the webpage is an image using its class name (to avoid clicking on a related search box). The WebDriver then clicks on it and verifies that its URL is usable. The URL is then stored and this process is repeated until the desired number of image URLs is obtained. Once the function has finished this process, all of the stored image URLs are returned.

In [None]:
def get_images(driver, thumbnails, max_images):
    """Click through result thumbnails and collect full-size image URLs.

    Each thumbnail is clicked so that the page shows the full-size preview,
    whose ``src`` attribute is then read. Only non-empty URLs containing
    ``'http'`` are kept, and every URL is stored once.

    Args:
        driver: Selenium WebDriver currently displaying the results page.
        thumbnails: Clickable thumbnail elements found on that page.
        max_images: Number of distinct image URLs to collect.

    Returns:
        A set with at most ``max_images`` URLs. It holds fewer only when the
        thumbnails run out before enough usable URLs were found.
    """
    image_URLs = set()  # Where all of the usable image URLs are stored.

    # Walk over every thumbnail and stop on the collected count. Slicing the
    # list to max_images up front cannot make up for duplicates or unusable
    # URLs, because the slice bounds are fixed before the loop starts.
    for img in thumbnails:
        if len(image_URLs) >= max_images:
            break

        try:
            img.click()
            time.sleep(0.5)  # Gives the full-size preview time to load.
        except Exception:
            # Best effort: a thumbnail that cannot be clicked is skipped.
            # KeyboardInterrupt / SystemExit are intentionally not caught.
            continue

        # This class name is obtained from the actual image and not its thumbnail.
        for image in driver.find_elements(By.CLASS_NAME, "iPVvYb"):
            src = image.get_attribute('src')  # Read once; each call is a round trip to the browser.

            # Skips missing or unusable URLs and URLs that are already stored.
            if not src or 'http' not in src or src in image_URLs:
                continue

            image_URLs.add(src)
            print(f"Found {len(image_URLs)}")

            if len(image_URLs) >= max_images:
                break

    return image_URLs  # Returns all of the usable image URLs.
    


        

This is the function that downloads an obtained image to a folder requested by the user. Here, an HTTP GET request obtains the content of the image, which is then wrapped in a BytesIO object and opened with Pillow. The image is then saved in JPEG format to the requested folder. The function also checks that the image can actually be downloaded and that it is not in an invalid format or mode.

In [None]:
def download_images(download_path, urlsToBeDownloaded, file_name):
    """Download one image and save it as a JPEG file.

    Args:
        download_path: Folder the image is saved in. It is created if it
            does not exist yet.
        urlsToBeDownloaded: URL of the single image to download.
        file_name: Name of the file written inside ``download_path``.

    Returns:
        None. Failures are reported with a printed message instead of being
        raised, so that one bad URL does not stop a batch of downloads.
    """
    import os  # Local import: only needed for building the target path.

    try:
        # The timeout keeps one unresponsive host from blocking the whole run.
        response = requests.get(urlsToBeDownloaded, timeout=30)
        response.raise_for_status()  # Treats HTTP error pages as failures.

        image_file = io.BytesIO(response.content)  # In-memory file holding the image bytes.
        image = Image.open(image_file)  # Opens the image from the BytesIO object.

        # JPEG cannot store palette ('P') or alpha ('RGBA', 'LA') images, so
        # every non-RGB image is converted before any file is created. This
        # avoids both skipped images and empty files left by a failed save.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        if download_path:
            os.makedirs(download_path, exist_ok=True)
        # Joining the parts works with or without a trailing slash on the folder.
        file_path = os.path.join(download_path, file_name)

        with open(file_path, "wb") as f:  # Saves the image to the folder in a JPEG format.
            image.save(f, "JPEG")

    except UnidentifiedImageError:  # Prints an error message if the image format cannot be identified.
        print(f"Error: Unable to identify the image format for URL: {urlsToBeDownloaded}")
    except Exception as e:  # Prints an error message if the image cannot be downloaded or saved.
        print(f"Error occurred: {e}")

First, the WebDriver is initialized to open Firefox. This is done by configuring FirefoxService to use GeckoDriver, which is located on my computer.

Next, the WebDriver navigates to a webpage with a Google Image Search of surprised faces and executes JavaScript code that scrolls down until every image is loaded. The while loop repeats this process until the page height stops changing, so the program does not continue before all of the images have loaded.

In [None]:
# Location of the GeckoDriver binary on this machine; the Firefox service is
# configured to launch it.
geckodriver_path = '/Users/keigotamaki/UCLA/NSDC/Drivers/geckodriver'
firefoxService = FIREFOXSERVICE(executable_path=geckodriver_path)
driver = webdriver.Firefox(service=firefoxService)

# Google Images results page for the query "human surprised face".
search_URL = "https://www.google.com/search?client=firefox-b-d&sca_esv=f917e2823a31333e&q=human+surprised+face&tbm=isch&source=lnms&sa=X&ved=2ahUKEwjMq7WOgqiEAxWNIEQIHXkQDyQQ0pQJegQICBAB&biw=1280&bih=701&dpr=2"
driver.get(search_URL)

# Scroll to the bottom over and over. The page counts as fully loaded once a
# scroll followed by a 5 second wait no longer changes the document height.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    page_stopped_growing = new_height == last_height
    last_height = new_height
    if page_stopped_growing:
        break

# Jump back to the top of the page before working with the thumbnails.
driver.execute_script("window.scrollTo(0, 0);")

The WebDriver then stores the HTML of the webpage, and BeautifulSoup parses it. The image candidates themselves are located with the WebDriver: it finds all elements with a class name of "Q4LuWd", each of which is a candidate to be an image with a usable URL, and stores them in a variable.

In [None]:
# Snapshot of the fully scrolled page and its parsed form.
page_html = driver.page_source
pageSoup = bs4.BeautifulSoup(page_html, features='html.parser')

# Every element carrying the "Q4LuWd" class is treated as an image candidate.
thumbnails = driver.find_elements(by=By.CLASS_NAME, value="Q4LuWd")
time.sleep(3)

# Report how many candidates were found.
len_thumbnails = len(thumbnails)
print("Found %s image candidates" % len_thumbnails)

Next, the get_images function (implemented above) goes through each image candidate found above and returns their image URLs if they are usable.

In [None]:
# The goal is around 300 images; asking for 325 leaves a margin for the
# images that cannot be downloaded.
image_request = 325

# Obtains and stores the requested number of image URLs.
urlsToBeDownloaded = get_images(
    driver=driver,
    thumbnails=thumbnails,
    max_images=image_request,
)

Then, another function (also implemented above) downloads the images into a chosen folder. Once every image is downloaded, the driver quits and the Firefox browser is closed.

In [None]:
# The loop variable has its own name so that the collection of URLs is not
# overwritten by a single URL; the cell can then be re-run safely.
try:
    for i, image_url in enumerate(urlsToBeDownloaded):
        # Downloads each obtained image into the requested folder.
        download_images("surprised_images/", image_url, str(i) + ".jpg")
finally:
    # Closes the Firefox browser even if the downloads are interrupted.
    driver.quit()