In [4]:
! pip install requests  bs4  selenium  tqdm 

Collecting requests
  Using cached requests-2.28.1-py3-none-any.whl (62 kB)
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting selenium
  Using cached selenium-4.7.2-py3-none-any.whl (6.3 MB)
Collecting tqdm
  Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
Collecting charset-normalizer<3,>=2
  Using cached charset_normalizer-2.1.1-py3-none-any.whl (39 kB)
Collecting urllib3<1.27,>=1.21.1
  Using cached urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
Collecting trio-websocket~=0.9
  Using cached trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting trio~=0.17
  Using cached trio-0.22.0-py3-none-any.whl (384 kB)
Collecting exceptiongroup>=1.0.0rc9
  Using cached exceptiongroup-1.1.0-py3-none-any.whl (14 kB)
Collecting async-generator>=1.9
  Using cached async_generator-1.10-py3-none-any.whl (18 kB)
Collecting sortedcontainers
  Using cached sortedcontainers-2.4.0-p

In [1]:
import os
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm


In [2]:
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium import webdriver
from selenium.webdriver.common.by import By

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

buffer_size = 1024

In [3]:

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_images_from_link(url):
    """
    Function to find image links in a given website
    :param url: the URL of the desired website
    :return: a list of image links (URLs)
    """

    try:
        soup = bs(requests.get(url).content, "html.parser")
    except:
        return []

    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src")
        alt = img.attrs.get("alt")
        if not img_url:
            # if img does not contain src attribute, just skip
            continue
        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(url, img_url)

        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass

            # finally, if the url is valid
            if is_valid(img_url):
                urls.append(img_url)
    return urls


def download_image_from_url(url, pathname):
    """
    Download an image online from a URL and save it locally
    :param url: the image URL
    :param pathname: where to save the image
    :return: non
    """

    # if path doesn't exist, make that path dir
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    # download the body of response by chunk, not immediately
    try:
        response = requests.get(url, stream=True)
    except:
        return 0
    # get the total file size
    file_size = int(response.headers.get("Content-Length", 0))
    if file_size < 30 * 1024:
        return 0

    try:
        img_name = url.split("/")[-1]
        img_name = img_name[:150]
        img_name = re.sub(r'[\\:/*"<>]', '_', img_name)

        # get the file name
        filename = os.path.join(pathname, img_name)

        # progress bar, changing the unit to bytes instead of iteration (default by tqdm)
        progress = tqdm(response.iter_content(buffer_size), f"Downloading {filename}", total=file_size, unit="B",
                        unit_scale=True, unit_divisor=1024)
        with open(filename, "wb") as f:
            for data in progress:
                # write data read to the file
                f.write(data)
                # update the progress bar manually
                progress.update(len(data))
    except:
        return 0


def download_images_from_website(url, path):
    """
    Download all the images from a given URL
    :param url: the website URL
    :param path: where to save these images
    :return: none
    """
    imgs = get_all_images_from_link(url)
    imgs = list(set(imgs))
    for img in imgs:
        # for each image, download it
        if img.split('.')[-1] in ['jpg', 'png', 'jpeg', 'jfif']:
            download_image_from_url(img, path)


def search_google_images(browser, url, query, max_google_pages=1):
    """
    Search a query keywork in Google images
    :param url: the URL of google images
    :param query: the keyword to search for
    :return: the HTML code of the Google Image result page
    """

    # Open the link
    browser.get(url)
    time.sleep(1)
    
    # Accept Google cookies:
    browser.find_element(By.XPATH,"//.[@aria-label='Accept all']").click()

    print("[%] Successfully opened link.")

    element = browser.find_element(By.TAG_NAME,'body')# browser.find_element_by_name("body")

    print("[%] Scrolling down.")
    # Scroll down
    for i in range(max_google_pages):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)  # bot id protection

    try:
        for i in range(max_google_pages):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)  # bot id protection
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)  # bot id protection

    print("[%] Reached end of Page.")

    time.sleep(1)
    # Get page source and close the browser
    source = browser.page_source
    with open("{}/dataset/soups/{}.html".format(os.getcwd(), query), 'w', encoding='utf-8',
              errors='replace') as f:
        f.write(source)
    browser.close()
    print("[%] Closed Driver.")

    return source


In [4]:

# List of queries to search in Google images
queries = ['cute funny dogs']
max_google_pages = 1

google_url_base = "https://www.google.com/search?q={}&source=lnms&tbm=isch"

# Create a folder to save the output
cwd = os.getcwd()
if not os.path.isdir("{}/dataset/".format(cwd)):
    os.makedirs("{}/dataset/".format(cwd))
if not os.path.isdir("{}/dataset/soups/".format(cwd)):
    os.makedirs("{}/dataset/soups/".format(cwd))


# Open FireFox browser
firefox_binary = FirefoxBinary()
browser = webdriver.Firefox(firefox_binary=firefox_binary)
#     browser.set_window_size(1024, 768)
#     browser.minimize_window()
print("\n===============================================\n")
print("[%] Successfully launched FirefoxBinary")


for query in queries:
    url = google_url_base.format(query)

    # Return the HTML page of Google Image search of the query
    source_html = search_google_images(browser, url, query.replace(' ', '_'), max_google_pages=max_google_pages)

    # Parse the HTML using BS4
    soup = bs(source_html, "html.parser")

    # Find all images href
    all_href = soup.find_all('a', href=True)

    # Return the original Web pages of these images
    pages = []
    for a in all_href:
        link = a.attrs.get('href')
        if link.startswith('http'):
            pages.append(link)
    pages = list(set(pages))

    # download the images from their official website
    for p in pages:
        download_images_from_website(p, './dataset/images/{}/'.format(query))


  browser = webdriver.Firefox(firefox_binary=firefox_binary)




[%] Successfully launched FirefoxBinary
[%] Successfully opened link.
[%] Scrolling down.
[%] Reached end of Page.
[%] Closed Driver.


Extracting images: 0it [00:00, ?it/s]
Extracting images: 0it [00:00, ?it/s]
Extracting images: 100%|████████████████████████████████████████████████████████████| 38/38 [00:00<00:00, 31099.23it/s]
Downloading ./dataset/images/cute funny dogs/lovable-dogs.jpg:   0%|               | 31.0/31.0k [00:00<00:11, 2.72kB/s]
Downloading ./dataset/images/cute funny dogs/cute-dog-1.jpg:   0%|                 | 52.0/51.9k [00:00<00:20, 2.55kB/s]
Downloading ./dataset/images/cute funny dogs/pets-world-sale.jpg:   0%|              | 168/167k [00:00<00:42, 4.04kB/s]
Downloading ./dataset/images/cute funny dogs/cute-dog-2.jpg:   0%|                 | 74.0/73.3k [00:00<00:30, 2.47kB/s]
Downloading ./dataset/images/cute funny dogs/lovable-dog-2.jpg:   0%|              | 39.0/38.5k [00:00<00:12, 3.10kB/s]
Downloading ./dataset/images/cute funny dogs/img1^4EE404791E61AB2248A58D4F1A624BF4B1E8B53797A5E9921F^pimgpsh_fullsize_d
Downloading ./dataset/images/cute funny dogs/lovable-dog-3.jpg:   0%|              |

Extracting images: 100%|████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 15982.11it/s]
Extracting images: 100%|████████████████████████████████████████████████████████████| 88/88 [00:00<00:00, 85045.80it/s]
Extracting images: 0it [00:00, ?it/s]
Extracting images: 100%|████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 64795.57it/s]
Extracting images: 100%|████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 19355.35it/s]
Extracting images: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1906.94it/s]
Downloading ./dataset/images/cute funny dogs/0014f0380d573bb968de4c60949f2768.jpg:   0%| | 70.0/69.4k [00:00<00:14, 5.0
Extracting images: 0it [00:00, ?it/s]
Extracting images: 100%|██████████████████████████████████████████████████████████| 226/226 [00:00<00:00, 18230.49it/s]
Downloading ./dataset/images/cute funny dogs/hilarious-dogs-snapchats-24-5c740301acf

Downloading ./dataset/images/cute funny dogs/funny-beagle_working_from_home.jpg:   0%| | 31.0/30.3k [00:00<00:04, 7.36k
Downloading ./dataset/images/cute funny dogs/funny-gsp_bubble_wrap.jpg:   0%|      | 33.0/32.6k [00:00<00:03, 8.38kB/s]
Extracting images: 0it [00:00, ?it/s]
Extracting images: 100%|███████████████████████████████████████████████████████████████████████| 43/43 [00:00<?, ?it/s]
Extracting images: 100%|████████████████████████████████████████████████████████████| 72/72 [00:00<00:00, 68900.27it/s]
Downloading ./dataset/images/cute funny dogs/HD-wallpaper-funny-dog-cute-dog-dogs-lick-puppies.jpg:   0%| | 50.0/49.8k 
Extracting images: 100%|██████████████████████████████████████████████████████████| 397/397 [00:00<00:00, 15638.63it/s]
Downloading ./dataset/images/cute funny dogs/20160504_200401-57a8c42b3dea3__700.jpg:   0%| | 39.0/38.5k [00:00<00:07, 5
Downloading ./dataset/images/cute funny dogs/DSC_11-57aa01e00516f__700.jpg:   0%|    | 168/167k [00:00<00:57, 2.98kB/s]
Do

Downloading ./dataset/images/cute funny dogs/Who-Were-The-Main-Characters-In-Sofia-The-First-370x297.jpg:   0%| | 31.0/
Downloading ./dataset/images/cute funny dogs/wet-dogs-before-after-bath-102-57a87ea89170d__700.jpg:   0%| | 66.0/65.6k 
Downloading ./dataset/images/cute funny dogs/wet-dogs-before-after-bath-40-57a439c780f8b__700.jpg:   0%| | 117/117k [00
Downloading ./dataset/images/cute funny dogs/wet-dogs-before-after-bath-104-57a87fdb8135e__700.jpg:   0%| | 77.0/76.7k 
Downloading ./dataset/images/cute funny dogs/wet-dogs-before-after-bath-25-57a4399f4c85c__700.jpg:   0%| | 43.0/42.5k [
Downloading ./dataset/images/cute funny dogs/image-57abedf71ec33__700.jpg:   0%|     | 88.0/87.5k [00:00<03:02, 490B/s]
Downloading ./dataset/images/cute funny dogs/image-57a93b5bb6ff9__700.jpg:   0%|   | 49.0/48.1k [00:00<00:16, 3.05kB/s]
Downloading ./dataset/images/cute funny dogs/wet-dogs-before-after-bath-33-57a439b19ef7e__700.jpg:   0%| | 112/112k [00
Downloading ./dataset/images/cute funny 

Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-136-5aa2631619aa6__700.jpg:   0%| | 408/408k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-7-5a9d5a81d4c18__700.jpg:   0%| | 97.0/96.4k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-62-5aa0013b2ca65__700.jpg:   0%| | 355/355k [0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-6-5a9d588bdd5f4__700.jpg:   0%| | 162/162k [00
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-139-5aa26619e6994__700.jpg:   0%| | 262/262k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-171-5aa2a1fc12850__700.jpg:   0%| | 372/372k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-118-5aa158a04f8b6__700.jpg:   0%| | 388/388k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-213-5a9eb68fda2f7__700.jpg:   0%| | 193/193k [
Downloading ./dataset/images/cute funny 

Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-49-5a9fb508d557c__700.jpg:   0%| | 325/324k [0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-192-5aa7e9016994b__700.jpg:   0%| | 784/784k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-29-5a9d55aad3435__700.jpg:   0%| | 262/261k [0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-203-5a9ea317e20e2__700.jpg:   0%| | 127/126k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-239-5a9fc743e33c6__700.jpg:   0%| | 101/101k [
Downloading ./dataset/images/cute funny dogs/slovakian-asks-questions-about-us-suburbs-thumb.jpg:   0%| | 45.0/44.2k [0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-32-5a9d5ad763780__700.jpg:   0%| | 266/266k [0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-128-5aa25a0c9ce8c__700.jpg:   0%| | 377/376k [
Downloading ./dataset/images/cute funny 

Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-5a9d43944a0be__700.jpg:   0%| | 422/422k [00:0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-98-5aa1449de05d3__700.jpg:   0%| | 466/465k [0
Downloading ./dataset/images/cute funny dogs/celebrities-with-their-parents-hidreley-latest.jpg:   0%| | 33.0/32.5k [00
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-178-5aa68d965aa8b__700.jpg:   0%| | 854/854k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-5a9d471fc8ba5__700.jpg:   0%| | 378/378k [00:0
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-27-5a9d548112a01__700.jpg:   0%| | 365/365k [0
Downloading ./dataset/images/cute funny dogs/Funny-Dogs-Photos-Snapchats19-5a9e44c927f19__700.jpg:   0%| | 77.0/76.1k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-132-5aa26029ebc94__700.jpg:   0%| | 415/414k [
Downloading ./dataset/images/cute funny 

Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-236-5a9fbfbf19eb9__700.jpg:   0%| | 128/128k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-90-5aa13b6518b7f__700.jpg:   0%| | 392/392k [0
Downloading ./dataset/images/cute funny dogs/1-5a9eab8d5d1a4-png__700.jpg:   0%|     | 238/238k [00:00<00:36, 6.67kB/s]
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-52-5a9fb46552f02-png__700.jpg:   0%| | 117/117
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-206-5a9ea56eb39ac__700.jpg:   0%| | 176/176k [
Downloading ./dataset/images/cute funny dogs/funny-dogs-photos-snapchats-36-5a9fff683cfd2__700.jpg:   0%| | 511/511k [0
Extracting images: 100%|███████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 2990.24it/s]
Extracting images: 100%|████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 34877.80it/s]
Extracting images: 0it [00:00, ?it/s]
