In [3]:
# Install third-party dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter; a bare `!pip3` may hit another one).
%pip install pillow
# Pinned to the version recorded in this notebook's output: the code below
# calls find_elements_by_css_selector() and webdriver.Chrome(executable_path=...),
# which belong to the Selenium 3 API.
%pip install selenium==3.141.0

Collecting selenium
  Using cached selenium-3.141.0-py2.py3-none-any.whl (904 kB)
Installing collected packages: selenium
Successfully installed selenium-3.141.0


In [4]:
import os
import time
import io
import hashlib
import signal
import requests

from glob import glob
from PIL import Image
from selenium import webdriver

In [6]:
# --- Locations (adjust for your machine) --------------------------------------
# Root folder that receives one sub-folder of images per search term.
OUTPUT_PATH = '.\\ads-image-classifier\\food'
# Path to the chromedriver executable handed to Selenium.
DRIVER_PATH = '.\\ads-image-classifier\\chromedriver.exe'

# --- Scraping parameters ------------------------------------------------------
# How many image links to collect for every search term.
number_of_images = 200
# Pause, in seconds, after each browser interaction (scroll / thumbnail click).
SLEEP_BETWEEN_INTERACTIONS = 0.1
# Pause, in seconds, before checking for further results.
SLEEP_BEFORE_MORE = 5
# Timeout, in seconds, intended for fetching a single image.
GET_IMAGE_TIMEOUT = 2

# --- Output format ------------------------------------------------------------
# JPEG quality setting used when saving downloaded images.
IMAGE_QUALITY = 85

In [7]:
# Food categories to scrape; each entry is used as one image search query.
search_terms = [
    "grapes",
    "lasagna",
    "tacos",
    "pizza",
    "sushi",
    "salad",
]

In [8]:
# Drop every search term that already has an image folder inside OUTPUT_PATH,
# so re-running the notebook does not scrape the same category twice.
#
# The folders are looked up *inside* OUTPUT_PATH (os.path.join, not string
# concatenation) and their names are taken with os.path.basename so the check
# works with both "/" and "\\" path separators.
existing_folders = {
    os.path.basename(path)
    for path in glob(os.path.join(OUTPUT_PATH, "*"))
    if os.path.isdir(path)
}

# Human-readable names of the folders that were found ("ice_cream" -> "ice cream").
dirs = [folder.replace("_", " ") for folder in sorted(existing_folders)]

# search_and_download() names a term's folder "_".join(term.lower().split(" ")),
# so apply the same normalisation before comparing.
search_terms = [
    term
    for term in search_terms
    if "_".join(term.lower().split(" ")) not in existing_folders
]

In [10]:
# Start a Chrome session through the chromedriver binary at DRIVER_PATH and
# open Google's home page in it.
# NOTE(review): in the cells visible here, search_and_download() opens its own
# browser and this session is never quit -- confirm whether this cell is still
# needed beyond checking that the driver starts.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
wd.get("https://google.com")

In [11]:
class timeout:
    """Context manager that raises ``TimeoutError`` if its body runs too long.

    Usage::

        with timeout(seconds=2):
            slow_call()

    The implementation arms ``signal.alarm`` and installs a ``SIGALRM``
    handler on entry. ``signal.SIGALRM`` only exists on Unix; on platforms
    without it (e.g. Windows) entering the context raises ``AttributeError``.

    Args:
        seconds: Whole number of seconds before the alarm fires.
        error_message: Message carried by the raised ``TimeoutError``.
    """

    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message
        # Handler that was active before __enter__; restored in __exit__.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: abort the guarded block with ``TimeoutError``."""
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # signal.signal returns the handler it replaces, so it can be put back.
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Cancel the pending alarm first so it cannot fire while restoring.
        signal.alarm(0)
        # A previous handler of None means it was not installed from Python
        # and cannot be re-installed through signal.signal.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None

In [12]:
def fetch_image_urls(
    query: str,
    max_links_to_fetch: int,
    wd: "webdriver.Chrome",
    sleep_between_interactions: float = 1,
):
    """Collect full-size image URLs from a Google Images search.

    The function opens the image search for ``query``, then repeatedly scrolls
    to the bottom of the page, clicks every not-yet-visited thumbnail
    (``img.Q4LuWd``) and records the ``src`` of the enlarged images
    (``img.n3VNCb``) whose value contains ``"http"``.

    Args:
        query: Search term; it is URL-encoded before being placed in the URL.
        max_links_to_fetch: Stop once at least this many URLs were collected.
            The result can hold a few more, because one click may expose
            several images before the count is checked.
        wd: Selenium driver exposing the Selenium 3 ``find_elements_by_*`` API.
        sleep_between_interactions: Seconds to wait after each scroll / click.

    Returns:
        A ``set`` of URL strings (possibly fewer than requested when the
        result page runs out of thumbnails).

    Note:
        Reads the module-level ``SLEEP_BEFORE_MORE`` (seconds) before looking
        for further results.
    """
    # Local import: the notebook's import cell does not provide urllib.
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Build the Google query; encode the term so spaces, "&", "#", ... cannot
    # break or alter the query string.
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=quote_plus(query)))

    # A set, to prevent duplicates.
    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results currently on the page.
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        # Only visit the thumbnails that appeared since the previous pass.
        for img in thumbnail_results[results_start:number_results]:
            # Click the thumbnail to reveal the real image behind it; a
            # thumbnail that cannot be clicked is simply skipped.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Extract image urls (read the attribute once per element).
            for actual_image in wd.find_elements_by_css_selector("img.n3VNCb"):
                src = actual_image.get_attribute("src")
                if src and "http" in src:
                    image_urls.add(src)

            # Enough links collected: end the search.
            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # The thumbnails ran out before the target was reached.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(SLEEP_BEFORE_MORE)

            # An element with this class signals that there are no more
            # images. find_elements_* returns [] instead of raising when
            # nothing matches, so no blanket except is needed.
            if wd.find_elements_by_css_selector(".r0zKGf"):
                print("No more images available.")
                return image_urls

            # If there is a "Load More" button, click it; its absence is not
            # an error.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

            # No new thumbnails showed up in this pass: give up.
            if results_start == number_results:
                print("No more images found")
                return image_urls

        # Move the result start point further down.
        results_start = len(thumbnail_results)

    return image_urls


def persist_image(folder_path: str, url: str):
    """Download ``url`` and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex digits of the SHA-1 of the downloaded
    bytes plus ``.jpg``, so identical content maps to the same file. Failures
    are reported with ``print`` and never raised, so one bad URL does not
    stop a batch.

    Args:
        folder_path: Existing directory that receives the image.
        url: Address of the image to download.

    Note:
        Reads the module-level ``GET_IMAGE_TIMEOUT`` (seconds, passed to
        ``requests.get`` as ``timeout``) and ``IMAGE_QUALITY`` (JPEG quality).
    """
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(url, timeout=GET_IMAGE_TIMEOUT)
        # Treat HTTP error statuses as download failures instead of trying
        # to decode an error page as an image.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save.
        return

    try:
        # Decode the bytes and normalise to RGB so the image can be written as JPEG.
        image = Image.open(io.BytesIO(image_content)).convert("RGB")

        # Encode in memory first: if encoding fails, no empty file is left behind.
        encoded = io.BytesIO()
        image.save(encoded, "JPEG", quality=IMAGE_QUALITY)

        # Create a unique filepath from the contents of the image.
        file_path = os.path.join(
            folder_path, hashlib.sha1(image_content).hexdigest()[:10] + ".jpg"
        )
        with open(file_path, "wb") as f:
            f.write(encoded.getvalue())
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

def search_and_download(search_term: str, target_path="./images", number_images=5):
    """Scrape image links for ``search_term`` and save the images to disk.

    Images are written to ``<target_path>/<folder>``, where ``<folder>`` is
    the lower-cased search term with spaces replaced by underscores.

    Args:
        search_term: Query passed to ``fetch_image_urls``.
        target_path: Parent directory for the per-term image folder.
        number_images: Number of image links to collect.

    Note:
        Starts Chrome through the chromedriver at the module-level
        ``DRIVER_PATH`` and uses ``SLEEP_BETWEEN_INTERACTIONS`` as the pause
        between browser interactions.
    """
    # Create a folder name.
    target_folder = os.path.join(target_path, "_".join(search_term.lower().split(" ")))

    # Create image folder if needed (no error when it already exists).
    os.makedirs(target_folder, exist_ok=True)

    # Open Chrome with the configured driver; the browser is only needed
    # while collecting links, so it is closed before the downloads start.
    with webdriver.Chrome(executable_path=DRIVER_PATH) as wd:
        # Search for images URLs.
        res = fetch_image_urls(
            search_term,
            number_images,
            wd=wd,
            sleep_between_interactions=SLEEP_BETWEEN_INTERACTIONS,
        )

    # fetch_image_urls returns a set; an empty one means nothing was found.
    if not res:
        print(f"Failed to return links for term: {search_term}")
        return

    # Download the images.
    for elem in res:
        persist_image(target_folder, elem)


In [13]:
# Loop through all the search terms and download the images of each one into
# its own sub-folder of OUTPUT_PATH (the constant from the configuration cell;
# the lower-case name `output_path` is not defined anywhere in this notebook).
for term in search_terms:
    print("Searching images for " + term)
    search_and_download(term, OUTPUT_PATH, number_of_images)
    print("Done saving images for " + term)

print("Done")

Searching images for grapes
Found: 100 search results. Extracting links from 0:100
Found: 107 image links, looking for more ...
Found: 212 search results. Extracting links from 100:212
Found: 200 image links, done!
ERROR - Could not save https://www.veg2you.net/wp-content/uploads/Seedless-Grape.jpg - cannot identify image file <_io.BytesIO object at 0x0000022EC32249F0>
ERROR - Could not save https://www.collinsdictionary.com/images/full/grape_229112122.jpg - cannot identify image file <_io.BytesIO object at 0x0000022EC38DDB80>
Done saving images for grapes
Searching images for lasagna
Found: 100 search results. Extracting links from 0:100
Found: 130 image links, looking for more ...
Found: 212 search results. Extracting links from 100:212
Found: 200 image links, done!
ERROR - Could not save https://cdn3.tmbi.com/toh/GoogleImages/Traditional-Lasagna_EXPS_THND16_12003_C07_26_6b.jpg - cannot identify image file <_io.BytesIO object at 0x0000022EC49C87C0>




ERROR - Could not save https://www.afamilyfeast.com/wp-content/uploads/2013/12/lasagna1.jpg - cannot identify image file <_io.BytesIO object at 0x0000022EC387EEF0>
Done saving images for lasagna
Searching images for tacos
Found: 100 search results. Extracting links from 0:100
Found: 137 image links, looking for more ...
Found: 212 search results. Extracting links from 100:212
Found: 200 image links, done!
Done saving images for tacos
Searching images for pizza
Found: 100 search results. Extracting links from 0:100
Found: 128 image links, looking for more ...
Found: 212 search results. Extracting links from 100:212
Found: 200 image links, done!




ERROR - Could not save https://www.lieferando.de/foodwiki/uploads/sites/8/2018/01/pizza-3.jpg - cannot identify image file <_io.BytesIO object at 0x0000022EC38DD130>
ERROR - Could not save https://rezept-db.womenshealth.de/image/rezept-db/fullWidthMobile/sh_Marzia-Giacobbe_172628543_Pizza-Capricciosa_Belag_Rezept_800x462.jpg.webp - cannot identify image file <_io.BytesIO object at 0x0000022EC387EEF0>
ERROR - Could not save https://static.wixstatic.com/media/b4dd99_cad6e95fb459427eb398b38d68fde76f~mv2_d_2048_1292_s_2.jpg/v1/fill/w_640,h_722,al_b,q_85,usm_0.66_1.00_0.01/b4dd99_cad6e95fb459427eb398b38d68fde76f~mv2_d_2048_1292_s_2.webp - cannot identify image file <_io.BytesIO object at 0x0000022EC49D37C0>
ERROR - Could not save https://static.wixstatic.com/media/77991f_021b94a2072d4d66a2c0397417f72cacf002.jpg/v1/fill/w_320,h_244,al_c,q_80,usm_0.33_1.00_0.00/77991f_021b94a2072d4d66a2c0397417f72cacf002.webp - cannot identify image file <_io.BytesIO object at 0x0000022EC387EEF0>
Done saving 