# Download Google Images by Keyword using Selenium

**Steps:**
- Check Google Chrome version, and download [ChromeDriver](https://chromedriver.chromium.org/downloads) according to your version
- Take note of where you put your ChromeDriver.exe, you will need it :)
- `pip install selenium` and `pip install Pillow`
- Funny, Pillow is one of the powerful tools that can colorize grayscale pictures, but here we are using it to save images, both as the normal color version and as a grayscale version: `img = img.convert("L")`
- Here I focus on downloading images, not yet on gray-scaling them lol


### Import Libraries

In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import io
import os
from PIL import Image
import hashlib

### Example of Selenium Webdriver
- Open a specific website in your web browser; this is just an example to better understand Selenium lol, you don't need to run it if you don't want to

In [2]:
# Location of chromedriver.exe on this machine
DRIVER_PATH = "C:\\Users\\Alice\\Desktop\\PYTHON\\PIC 16B\\PROJECT\\chromedriver.exe"

# Start Chrome through the driver, then load the Google main search page.
# NOTE: the `executable_path` keyword is only accepted by Selenium releases before 4.10.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
wd.get('https://google.com')

In [3]:
# Inspect the Google search page --> the search box matches the CSS selector 'input.gLFyf'.
# find_element(By.CSS_SELECTOR, ...) is used because the find_element_by_* helpers
# no longer exist in current Selenium releases.
search_box = wd.find_element(By.CSS_SELECTOR, 'input.gLFyf')

# Type the keyword 'Portraits' into the search box
search_box.send_keys('Portraits')

In [4]:
# Close the browser and end the WebDriver session opened in the example above
wd.quit()

# Fetch Image URLs from Google Image Search

In [5]:
# Path to chromedriver.exe; passed as `driver_path` to search_and_download in the cells below
DRIVER_PATH = "C:\\Users\\Alice\\Desktop\\PYTHON\\PIC 16B\\PROJECT\\chromedriver.exe"

In [6]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver,
                     max_stalled_rounds:int=30, stall_wait:float=0.5):
    """
    Fetch Google Images links for a search term.

    Arguments:
    - query: search term, example: nature, portrait,...
    - max_links_to_fetch: stop once at least this many distinct links were collected
      (the result can hold a few more, because one click may reveal several images)
    - wd: an already started webdriver instance,
        e.g. wd = webdriver.Chrome(executable_path = DRIVER_PATH)
    - max_stalled_rounds: give up after this many consecutive scroll rounds that
      produced no new link, so the loop ends when Google has no more results
    - stall_wait: seconds to wait after a round without new links, giving the
      page time to load more thumbnails

    Returns:
    - a set of image urls (str)
    """
    # local imports: the notebook's import cell does not provide these names
    import time
    from urllib.parse import quote_plus

    # Indicate to keep scrolling to get more images
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # build the google query, just an image search url with the input query {q}
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page; the query is url-encoded so spaces, '&', '#', ... stay inside q=
    wd.get(search_url.format(q=quote_plus(query)))

    # initiate
    image_urls = set()
    image_count = 0
    results_start = 0
    stalled_rounds = 0

    # loop until enough image links were collected
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results, css of the small images = img.Q4LuWd
        thumbnail_results = wd.find_elements(By.CSS_SELECTOR, "img.Q4LuWd")
        count_before_round = image_count

        # only look at the thumbnails that were not handled in a previous round
        for img in thumbnail_results[results_start:]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
            # some thumbnails can't be clicked, skip them
            except Exception:
                continue

            # after the click a bigger image shows up --> css = img.n3VNCb
            # its src attribute holds the actual image url to download
            for actual_image in wd.find_elements(By.CSS_SELECTOR, "img.n3VNCb"):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            image_count = len(image_urls)

            # break the loop when hitting the max number of links
            if image_count >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            # After scrolling down quite a lot, there will be a "show more results" button --> css = .mye4qd
            # find_elements returns an empty list (instead of raising) when the button is missing
            if wd.find_elements(By.CSS_SELECTOR, ".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

            if image_count == count_before_round:
                # nothing new this round: wait for the page, or stop if it keeps happening
                stalled_rounds += 1
                if stalled_rounds >= max_stalled_rounds:
                    print(f"Found: {len(image_urls)} image links, no new results after {max_stalled_rounds} attempts, giving up.")
                    break
                time.sleep(stall_wait)
            else:
                stalled_rounds = 0

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

# Downloading the Images

In [7]:
def download_image(folder_path:str, gray_folder_path:str, url:str, timeout:float=10):
    """
    Download one image and save it as color and as gray to train/test.

    Arguments:
    - folder_path = color folder (image saved as it is online, converted to RGB JPEG)
    - gray_folder_path = gray images folder (grayscale copy, stored as 3-channel RGB JPEG)
    - url = address of the image to download
    - timeout = seconds to wait for the server before giving up on this url

    Both files share the same name: the first 10 hex characters of the SHA-1
    of the downloaded bytes + '.jpg'.

    Returns:
    - True when both files were written, False when the download or the save failed
      (the error is printed, never raised, so a batch of urls keeps going)
    """
    # Step 1: download the raw bytes
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # an HTTP error page is not an image
        image_content = response.content

    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # nothing to save; stop here instead of failing again on a missing variable
        return False

    # Step 2: decode once, derive the gray version, write both files
    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        # gray values replicated over 3 channels, so color and gray files have the same shape
        gray_image = image.convert('L').convert('RGB')

        file_name = hashlib.sha1(image_content).hexdigest()[:10] + '.jpg'
        file_path = os.path.join(folder_path, file_name)
        gray_file_path = os.path.join(gray_folder_path, file_name)

        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)

        with open(gray_file_path, 'wb') as f:
            gray_image.save(f, "JPEG", quality=85)

    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        return False

    return True

# Combine Search URL and Download Images:
* Returns 1 large folder with name = search keyword
* Inside that folder are 2 others: a color folder and a gray folder

In [8]:
def _open_chrome(driver_path:str):
    """Start Chrome with the given ChromeDriver, on both current and legacy Selenium releases."""
    from selenium.webdriver.chrome.service import Service
    try:
        # Selenium 4 style: the driver location is wrapped in a Service object
        return webdriver.Chrome(service=Service(executable_path=driver_path))
    except TypeError:
        # older Selenium: Chrome() has no `service` keyword, use the legacy argument
        return webdriver.Chrome(executable_path=driver_path)


def search_and_download(search_term:str, driver_path:str, target_path='./images', number_images=5):
    """
    Combine search and download and drop all in 1 large folder with name = search keyword.

    Arguments:
    - search_term: search keyword for google image
    - driver_path: where the ChromeDriver executable is located
    - target_path: root folder for the downloads; inside it a folder named after
      search_term (lower-cased, spaces replaced by '_') gets a 'Color' and a 'Gray' subfolder
    - number_images: how many image links to collect and download

    Returns nothing; the images are written to disk.
    """

    # create folders (exist_ok: fine if a previous run already created them)
    keyword_folder = '_'.join(search_term.lower().split(' '))
    target_folder = os.path.join(target_path, keyword_folder, 'Color')
    gray_target_folder = os.path.join(target_path, keyword_folder, 'Gray')

    os.makedirs(target_folder, exist_ok=True)
    os.makedirs(gray_target_folder, exist_ok=True)

    # use webdriver with google search and fetch image links;
    # the browser is closed as soon as the links are collected
    with _open_chrome(driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd)

    # download every collected link into the color and gray folders
    for elem in res:
        download_image(target_folder, gray_target_folder, elem)
    

In [9]:
# Collect 100 'cat' image links and save color + gray copies of each
search_and_download('cat', DRIVER_PATH, number_images=100)

Found: 84 image links, looking for more ...
Found: 100 image links, done!


In [10]:
# Collect 100 'portrait' image links and save color + gray copies of each
search_and_download('portrait', DRIVER_PATH, number_images=100)

Found: 100 image links, done!


In [11]:
# Collect 100 'nature' image links and save color + gray copies of each
search_and_download('nature', DRIVER_PATH, number_images=100)

Found: 81 image links, looking for more ...
Found: 100 image links, done!


In [12]:
# Collect 100 'dog' image links and save color + gray copies of each
search_and_download('dog', DRIVER_PATH, number_images=100)

Found: 40 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 59 image links, looking for more ...
Found: 70 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 79 image links, looking for more ...
Found: 90 image links, looking for more ...
Found: 100 image links, done!


In [13]:
# Collect 100 'photograph' image links and save color + gray copies of each
search_and_download('photograph', DRIVER_PATH, number_images=100)

Found: 29 image links, looking for more ...
Found: 30 image links, looking for more ...
Found: 30 image links, looking for more ...
Found: 30 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 42 image links, looking for more ...
Found: 53 image links, looking for more ...
Found: 63 image links, looking for more ...
Found: 63 image links, looking for more ...
Found: 63 image links, looking for more ...
Found: 63 image links, looking for more ...
Found: 63 image links, looking for more ...
Found: 75 image links, looking for more ...
Found: 88 image links, looking for more ...
Found: 99 image links, looking for more ...
Found: 102 image links, done!


### Hey, 1000 images (500 color and 500 gray) in around 3-5 minutes actually, so all fine! 

Just a little calculation here: 10,000 pictures (5,000 color and 5,000 gray) cost around 50 minutes. Lmao, so let's only do that much. =)) I think that's plenty already!<br>
Also, we can add more subjects for sure, or just search something random like "photograph" as above, it's pretty random, and "portrait" is a must! lol HAVE FUNNNNNN