<a href="https://colab.research.google.com/github/toan-ly/Image-Retrieval/blob/feature%2Ftraditional_retrieval/Crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tqdm

!apt-get update
!apt-get install -y wget
!pip install selenium
!apt-get install -y chromium-browser
!apt-get install -y chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup # for parsing HTML content
from urllib.parse import urljoin, urlparse # for handling URLs
import urllib.request # for making HTTP requests
import time # for handling time-related operations
import os # for interacting with the operating system (related to dir, folder, file)
from tqdm import tqdm
import concurrent.futures # for multi-threading
import json # for writing to a text file
from PIL import Image

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Ign:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,553 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,423 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,447 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,133 kB]
Get:14 

In [2]:
# Example query term and the Flickr search endpoint with an empty `text` parameter.
search_term = "cat"
url = "https://www.flickr.com/search/?text="

In [3]:
class UrlScraper:
    """
    Collect image URLs from a search-results page rendered in headless Chrome.

    Parameters:
    url_template (str): Search URL containing a ``{search_term}`` placeholder
    max_images (int): Maximum number of image URLs to collect per term
    max_workers (int): Number of terms scraped concurrently (one browser per term)
    """

    # Directories appended to PATH so Selenium can locate Chromium / chromedriver.
    _CHROMIUM_PATHS = (
        '/usr/lib/chromium-browser/',
        '/usr/lib/chromium-browser/chromedriver/',
    )

    # Site assets that show up as <img> tags but are not search results.
    _IGNORED_URLS = frozenset({
        "https://combo.staticflickr.com/ap/build/images/getty/IStock_corporate_logo.svg",
    })

    # NOTE(review): this id looks auto-generated and may not match on a fresh
    # page load; when it does not match, each "load more" attempt waits the
    # full timeout before falling back to scrolling -- confirm the selector.
    _LOAD_MORE_XPATH = '//button[@id="yui_3_16_0_1_1721642285931_28620"]'

    def __init__(self, url_template, max_images=50, max_workers=4):
        self.url_template = url_template  # search URL with a {search_term} placeholder
        self.max_images = max_images  # max image URLs per term
        self.max_workers = max_workers  # max threads
        self.setup_environment()

    def setup_environment(self):
        """
        Make the Chromium directories visible on PATH for Selenium.

        Idempotent: a directory that is already on PATH is not appended again,
        so creating several scrapers does not keep growing the variable.

        Returns:
        None
        """
        entries = os.environ.get('PATH', '').split(':')
        for directory in self._CHROMIUM_PATHS:
            if directory not in entries:
                entries.append(directory)
        os.environ['PATH'] = ':'.join(entries)

    @staticmethod
    def _normalize_image_url(page_url, src):
        """Resolve `src` against `page_url` and swap the _m/_n/_w size suffix for _b."""
        img_path = urljoin(page_url, src)
        for suffix in ('_m.jpg', '_n.jpg', '_w.jpg'):
            img_path = img_path.replace(suffix, '_b.jpg')
        return img_path

    def _collect_new_urls(self, srcs, page_url, urls, seen):
        """
        Append not-yet-seen image URLs to `urls`, stopping at `self.max_images`.

        Parameters:
        srcs (iterable of str): Raw ``src`` attribute values taken from the page
        page_url (str): URL of the page, used to resolve relative sources
        urls (list): Result list, extended in place
        seen (set): Normalized URLs collected so far, updated in place

        Returns:
        added (int): Number of URLs appended by this call
        """
        added = 0
        for src in srcs:
            if len(urls) >= self.max_images:
                break
            if not src:
                continue
            img_path = self._normalize_image_url(page_url, src)
            # The whole page is re-read on every pass, so skip anything already
            # collected as well as known non-result assets.
            if img_path in self._IGNORED_URLS or img_path in seen:
                continue
            seen.add(img_path)
            urls.append(img_path)
            added += 1
        return added

    def _load_more(self, driver):
        """Click the "load more" button if it is clickable, otherwise scroll to the bottom."""
        from selenium.common.exceptions import WebDriverException

        try:
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, self._LOAD_MORE_XPATH))
            )
            load_more_button.click()
        except WebDriverException:
            # Button missing or not clickable (includes the wait timing out):
            # fall back to scrolling to the bottom of the page.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to render whatever the click / scroll triggered.
        time.sleep(2)

    def get_url_images(self, term):
        """
        Crawl the urls of images by term

        Parameters:
        term (str): The name of animal, plant, scenery, furniture

        Returns:
        urls (list): List of unique image urls, at most `max_images` long
        """

        # Initialize Chrome driver
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)

        urls = []
        seen = set()
        try:
            url = self.url_template.format(search_term=term)
            driver.get(url)

            with tqdm(total=self.max_images, desc=f'Fetching images for {term}') as pbar:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                while len(urls) < self.max_images:
                    img_tags = soup.find_all('img')
                    srcs = [img.attrs['src'] for img in img_tags if 'src' in img.attrs]
                    added = self._collect_new_urls(srcs, url, urls, seen)
                    pbar.update(added)
                    if len(urls) >= self.max_images:
                        break  # quota met: skip the (slow) load-more attempt

                    self._load_more(driver)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    # Same selector before and after, so the counts are comparable.
                    page_grew = len(soup.find_all('img')) > len(img_tags)
                    if added == 0 and not page_grew:
                        break  # nothing new was found and the page stopped growing
        finally:
            # Always release the browser, even when scraping fails midway.
            driver.quit()
        return urls

    def scrape_urls(self, categories):
        """
        Call get_url_images for every term of every category, using a thread pool.

        Parameters:
        categories (dict): Terms to collect, as {"category_name": [term1, term2, ...]}

        Returns:
        all_urls (dict): {category: {term: [urls]}}; a term whose scrape raised
        is reported on stdout and left out of the result
        """
        all_urls = {category: {} for category in categories}

        # One task per term; up to max_workers terms are scraped at the same time
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_term = {executor.submit(self.get_url_images, term): (category, term)
                              for category, terms in categories.items() for term in terms}

            for future in tqdm(concurrent.futures.as_completed(future_to_term), total=len(future_to_term), desc="Overall Progress"):
                category, term = future_to_term[future]
                try:
                    urls = future.result()
                    all_urls[category][term] = urls
                    print(f"\nNumber of images retrieved for {term}: {len(urls)}")
                except Exception as exc:
                    print(f"\n{term} generated an exception: {exc}")
        return all_urls

    def save_to_file(self, data, filename):
        """
        Save the data to a JSON file.

        Parameters:
        data (dict): The data to be saved.
        filename (str): The name of the JSON file.

        Returns:
        None
        """
        with open(filename, 'w') as file:
            json.dump(data, file, indent=4)
        print(f"Data saved to {filename}")


In [4]:
# Search terms to crawl, grouped by top-level category.
categories = dict(
    animal=[
        "Monkey", "Elephant", "cows", "Cat", "Dog", "bear", "fox", "Civet", "Pangolins", "Rabbit",
        "Bats", "Whale", "Cock", "Owl", "flamingo", "Lizard", "Turtle", "Snake", "Frog", "Fish",
        "shrimp", "Crab", "Snail", "Coral", "Jellyfish", "Butterfly", "Flies", "Mosquito", "Ants", "Cockroaches",
        "Spider", "scorpion", "tiger", "bird", "horse", "pig", "Alligator", "Alpaca", "Anteater", "donkey",
        "Bee", "Buffalo", "Camel", "Caterpillar", "Cheetah", "Chicken", "Dragonfly", "Duck", "panda", "Giraffe",
    ],
    plant=[
        "Bamboo", "Apple", "Apricot", "Banana", "Bean", "Wildflower", "Flower", "Mushroom", "Weed", "Fern",
        "Reed", "Shrub", "Moss", "Grass", "Palmtree", "Corn", "Tulip", "Rose", "Clove", "Dogwood",
        "Durian", "Ferns", "Fig", "Flax", "Frangipani", "Lantana", "Hibiscus", "Bougainvillea", "Pea", "OrchidTree",
        "RangoonCreeper", "Jackfruit", "Cottonplant", "Corneliantree", "Coffeeplant", "Coconut", "wheat", "watermelon", "radish", "carrot",
    ],
    furniture=[
        "bed", "cabinet", "chair", "chests", "clock", "desks", "table", "Piano", "Bookcase", "Umbrella",
        "Clothes", "cart", "sofa", "ball", "spoon", "Bowl", "fridge", "pan", "book",
    ],
    scenery=[
        "Cliff", "Bay", "Coast", "Mountains", "Forests", "Waterbodies", "Lake", "desert", "farmland", "river",
        "hedges", "plain", "sky", "cave", "cloud", "flowergarden", "glacier", "grassland", "horizon", "lighthouse",
        "plateau", "savannah", "valley", "volcano", "waterfall",
    ],
)

In [5]:
# Search-URL templates keyed by site; {search_term} is substituted per query.
urltopic = dict(flickr="https://www.flickr.com/search/?text={search_term}")

# Collect up to 20 image URLs per term, scraping 5 terms concurrently.
scraper = UrlScraper(
    max_workers=5,
    max_images=20,
    url_template=urltopic["flickr"],
)
image_urls = scraper.scrape_urls(categories=categories)

Overall Progress:   0%|          | 0/134 [00:00<?, ?it/s]
Fetching images for Monkey:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Dog:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Monkey:   5%|▌         | 1/20 [00:02<00:52,  2.75s/it][A


Fetching images for Elephant:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for Cat:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A

Fetching images for Dog:   5%|▌         | 1/20 [00:02<00:38,  2.03s/it][A[A


Fetching images for Elephant:   5%|▌         | 1/20 [00:01<00:28,  1.52s/it][A[A[A



Fetching images for Cat:   5%|▌         | 1/20 [00:01<00:25,  1.33s/it][A[A[A[A




Fetching images for cows:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Monkey: 100%|██████████| 20/20 [00:15<00:00,  1.25it/s]
Overall Progress:   1%|          | 1/134 [01:05<2:25:47, 65.77s/it]


Number of images retrieved for Monkey: 20


Fetching images for Cat: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Fetching images for Dog: 100%|██████████| 20/20 [00:17<00:00,  1.11it/s]
Fetching images for Elephant: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]


Number of images retrieved for Dog: 20

Number of images retrieved for Cat: 20



Overall Progress:   3%|▎         | 4/134 [01:10<24:14, 11.19s/it]  


Number of images retrieved for Elephant: 20


Fetching images for cows: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Overall Progress:   4%|▎         | 5/134 [01:11<17:19,  8.06s/it]


Number of images retrieved for cows: 20



Fetching images for Civet:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for fox:   0%|          | 0/20 [00:00<?, ?it/s][A[A


Fetching images for Pangolins:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for Rabbit:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A

Fetching images for fox:   5%|▌         | 1/20 [00:01<00:36,  1.91s/it][A[A
Fetching images for Civet:   5%|▌         | 1/20 [00:02<00:42,  2.22s/it][A


Fetching images for Pangolins:   5%|▌         | 1/20 [00:02<00:40,  2.14s/it][A[A[A



Fetching images for Civet: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Fetching images for fox: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Fetching images for Pangolins: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]


Number of images retrieved for Civet: 20

Number of images retrieved for fox: 20



Fetching images for Rabbit: 100%|██████████| 20/20 [00:15<00:00,  1.25it/s]
Overall Progress:   7%|▋         | 9/134 [01:43<10:49,  5.20s/it]


Number of images retrieved for Pangolins: 20

Number of images retrieved for Rabbit: 20



Fetching images for Owl:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Owl:   5%|▌         | 1/20 [00:01<00:25,  1.32s/it][A


Fetching images for Cock:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A

Fetching images for Bats:   0%|          | 0/20 [00:00<?, ?it/s][A[A



Fetching images for Whale:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A


Fetching images for Cock:   5%|▌         | 1/20 [00:01<00:31,  1.68s/it][A[A[A



Fetching images for Whale:   5%|▌         | 1/20 [00:01<00:37,  1.97s/it][A[A[A[A

Fetching images for Owl: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:   7%|▋         | 10/134 [02:12<26:02, 12.60s/it]


Number of images retrieved for Owl: 20


Fetching images for Cock: 100%|██████████| 20/20 [00:15<00:00,  1.33it/s]
Overall Progress:   8%|▊         | 11/134 [02:15<19:39,  9.59s/it]


Number of images retrieved for Cock: 20


Fetching images for Whale: 100%|██████████| 20/20 [00:15<00:00,  1.27it/s]
Overall Progress:   9%|▉         | 12/134 [02:16<14:02,  6.91s/it]


Number of images retrieved for Whale: 20


Fetching images for Bats: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  10%|▉         | 13/134 [02:17<10:19,  5.12s/it]


Number of images retrieved for Bats: 20



Fetching images for flamingo:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for flamingo:   5%|▌         | 1/20 [00:02<00:39,  2.06s/it][A

Fetching images for Turtle:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Turtle:   5%|▌         | 1/20 [00:00<00:15,  1.19it/s][A[A


Fetching images for Snake:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for flamingo: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]
Overall Progress:  10%|█         | 14/134 [02:37<19:23,  9.70s/it]


Number of images retrieved for flamingo: 20



Fetching images for Frog:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Turtle: 100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
Overall Progress:  11%|█         | 15/134 [02:45<18:07,  9.14s/it]


Number of images retrieved for Turtle: 20


Fetching images for Snake: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s]
Overall Progress:  12%|█▏        | 16/134 [02:46<13:13,  6.72s/it]


Number of images retrieved for Snake: 20




Fetching images for Fish:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Fish:   5%|▌         | 1/20 [00:00<00:09,  1.95it/s][A[A


Fetching images for shrimp:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Frog: 100%|██████████| 20/20 [00:13<00:00,  1.46it/s]
Overall Progress:  13%|█▎        | 17/134 [02:56<14:53,  7.64s/it]


Number of images retrieved for Frog: 20



Fetching images for Fish: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]

Overall Progress:  13%|█▎        | 18/134 [03:06<16:33,  8.56s/it]


Number of images retrieved for Fish: 20


Fetching images for shrimp: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  14%|█▍        | 19/134 [03:08<12:22,  6.46s/it]


Number of images retrieved for shrimp: 20




Fetching images for Snail:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Snail:   5%|▌         | 1/20 [00:00<00:05,  3.18it/s][A[A


Fetching images for Coral:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Crab: 100%|██████████| 20/20 [00:15<00:00,  1.25it/s]
Overall Progress:  15%|█▍        | 20/134 [03:19<15:00,  7.90s/it]


Number of images retrieved for Crab: 20



Fetching images for Jellyfish:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Jellyfish:   5%|▌         | 1/20 [00:00<00:12,  1.58it/s][A



Fetching images for Snail: 100%|██████████| 20/20 [00:13<00:00,  1.47it/s]
Overall Progress:  16%|█▌        | 21/134 [03:29<15:53,  8.44s/it]


Number of images retrieved for Snail: 20






Fetching images for Coral: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]
Overall Progress:  16%|█▋        | 22/134 [03:31<12:14,  6.55s/it]


Number of images retrieved for Coral: 20


Fetching images for Jellyfish: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]
Overall Progress:  17%|█▋        | 23/134 [03:37<11:49,  6.39s/it]


Number of images retrieved for Jellyfish: 20



Fetching images for Butterfly:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Butterfly:   5%|▌         | 1/20 [00:02<00:48,  2.54s/it][A

Fetching images for bear: 100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
Overall Progress:  18%|█▊        | 24/134 [03:47<13:25,  7.32s/it]


Number of images retrieved for bear: 20




Fetching images for Flies:   5%|▌         | 1/20 [00:01<00:23,  1.25s/it][A[A


Fetching images for Mosquito:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Mosquito:   5%|▌         | 1/20 [00:00<00:08,  2.14it/s][A[A[A



Fetching images for Ants:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Butterfly: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Overall Progress:  19%|█▊        | 25/134 [03:58<15:37,  8.60s/it]


Number of images retrieved for Butterfly: 20


Fetching images for Flies: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]
Overall Progress:  19%|█▉        | 26/134 [04:01<12:22,  6.88s/it]


Number of images retrieved for Flies: 20


Fetching images for Mosquito: 100%|██████████| 20/20 [00:16<00:00,  1.25it/s]
Overall Progress:  20%|██        | 27/134 [04:06<11:19,  6.35s/it]


Number of images retrieved for Mosquito: 20


Fetching images for Ants: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  21%|██        | 28/134 [04:10<09:44,  5.51s/it]


Number of images retrieved for Ants: 20



Fetching images for Cockroaches:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Cockroaches:   5%|▌         | 1/20 [00:02<00:39,  2.09s/it][A

Fetching images for Spider:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Spider:   5%|▌         | 1/20 [00:01<00:28,  1.49s/it][A[A


Fetching images for tiger:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Cockroaches: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]
Overall Progress:  22%|██▏       | 29/134 [04:25<14:51,  8.49s/it]


Number of images retrieved for Cockroaches: 20



Fetching images for bird:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Spider: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  22%|██▏       | 30/134 [04:32<13:59,  8.07s/it]


Number of images retrieved for Spider: 20


Fetching images for tiger: 100%|██████████| 20/20 [00:13<00:00,  1.45it/s]
Overall Progress:  23%|██▎       | 31/134 [04:35<11:14,  6.55s/it]


Number of images retrieved for tiger: 20




Fetching images for scorpion:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for scorpion:   5%|▌         | 1/20 [00:01<00:19,  1.02s/it][A[A


Fetching images for horse:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for bird: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  24%|██▍       | 32/134 [04:46<13:11,  7.76s/it]


Number of images retrieved for bird: 20



Fetching images for pig:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for scorpion: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Overall Progress:  25%|██▍       | 33/134 [04:54<13:15,  7.88s/it]


Number of images retrieved for scorpion: 20




Fetching images for Alligator:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for horse: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  25%|██▌       | 34/134 [04:56<10:17,  6.18s/it]


Number of images retrieved for horse: 20





Fetching images for Alpaca:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Alpaca:   5%|▌         | 1/20 [00:00<00:09,  1.92it/s][A[A[A



Fetching images for pig: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  26%|██▌       | 35/134 [05:03<10:19,  6.26s/it]


Number of images retrieved for pig: 20






Fetching images for Alligator: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]

Overall Progress:  27%|██▋       | 36/134 [05:11<11:15,  6.89s/it]


Number of images retrieved for Alligator: 20




Fetching images for donkey:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Lizard:   5%|▌         | 1/20 [00:01<00:21,  1.12s/it][A

Fetching images for Alpaca: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Overall Progress:  28%|██▊       | 37/134 [05:16<10:13,  6.32s/it]


Number of images retrieved for Alpaca: 20


Fetching images for Anteater: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  28%|██▊       | 38/134 [05:19<08:34,  5.36s/it]


Number of images retrieved for Anteater: 20





Fetching images for Bee:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for donkey: 100%|██████████| 20/20 [00:18<00:00,  1.07it/s]
Fetching images for Lizard: 100%|██████████| 20/20 [00:19<00:00,  1.04it/s]
Overall Progress:  29%|██▉       | 39/134 [05:30<11:14,  7.10s/it]


Number of images retrieved for donkey: 20



Overall Progress:  30%|██▉       | 40/134 [05:30<07:52,  5.03s/it]


Number of images retrieved for Lizard: 20



Fetching images for Buffalo:   5%|▌         | 1/20 [00:00<00:13,  1.41it/s][A

Fetching images for Camel:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Bee: 100%|██████████| 20/20 [00:18<00:00,  1.06it/s]



Overall Progress:  31%|███       | 41/134 [05:45<12:06,  7.81s/it]


Number of images retrieved for Bee: 20






Fetching images for Cheetah:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A


Fetching images for Caterpillar:   5%|▌         | 1/20 [00:01<00:24,  1.28s/it][A[A[A



Fetching images for Buffalo: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  31%|███▏      | 42/134 [05:47<09:40,  6.31s/it]


Number of images retrieved for Buffalo: 20


Fetching images for Camel: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  32%|███▏      | 43/134 [05:52<08:34,  5.65s/it]


Number of images retrieved for Camel: 20



Fetching images for Chicken:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Chicken:   5%|▌         | 1/20 [00:00<00:13,  1.43it/s][A

Fetching images for Dragonfly:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Caterpillar: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  33%|███▎      | 44/134 [06:01<10:04,  6.72s/it]


Number of images retrieved for Caterpillar: 20


Fetching images for Cheetah: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Overall Progress:  34%|███▎      | 45/134 [06:03<07:46,  5.25s/it]


Number of images retrieved for Cheetah: 20





Fetching images for Duck:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Chicken: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  34%|███▍      | 46/134 [06:13<09:52,  6.74s/it]


Number of images retrieved for Chicken: 20


Fetching images for Dragonfly: 100%|██████████| 20/20 [00:18<00:00,  1.11it/s]

Overall Progress:  35%|███▌      | 47/134 [06:15<07:53,  5.44s/it]


Number of images retrieved for Dragonfly: 20




Fetching images for Giraffe:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for panda:   5%|▌         | 1/20 [00:01<00:31,  1.63s/it][A

Fetching images for Duck: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  36%|███▌      | 48/134 [06:20<07:33,  5.27s/it]


Number of images retrieved for Duck: 20





Fetching images for Bamboo:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Bamboo:   5%|▌         | 1/20 [00:00<00:16,  1.17it/s][A[A[A



Fetching images for Apple:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Apple:   5%|▌         | 1/20 [00:01<00:34,  1.83s/it][A[A[A[A




Fetching images for panda: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Fetching images for Giraffe: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Overall Progress:  37%|███▋      | 49/134 [06:32<10:21,  7.31s/it]


Number of images retrieved for Giraffe: 20

Number of images retrieved for panda: 20







Fetching images for Apricot:   5%|▌         | 1/20 [00:00<00:13,  1.39it/s][A[A[A[A[A
Fetching images for Banana:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Bean:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Banana:   5%|▌         | 1/20 [00:01<00:33,  1.74s/it][A

Fetching images for Bamboo: 100%|██████████| 20/20 [00:18<00:00,  1.08it/s]
Overall Progress:  38%|███▊      | 51/134 [06:46<09:48,  7.09s/it]


Number of images retrieved for Bamboo: 20


Fetching images for Apple: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  39%|███▉      | 52/134 [06:47<07:30,  5.49s/it]


Number of images retrieved for Apple: 20


Fetching images for Apricot: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]
Overall Progress:  40%|███▉      | 53/134 [06:48<05:59,  4.44s/it]


Number of images retrieved for Apricot: 20


Fetching images for Banana: 100%|██████████| 20/20 [00:19<00:00,  1.01it/s]
Overall Progress:  40%|████      | 54/134 [07:00<08:38,  6.49s/it]


Number of images retrieved for Banana: 20



Fetching images for Bean: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


Overall Progress:  41%|████      | 55/134 [07:01<06:34,  4.99s/it]


Number of images retrieved for Bean: 20



Fetching images for Wildflower:   5%|▌         | 1/20 [00:01<00:32,  1.69s/it][A

Fetching images for Flower:   5%|▌         | 1/20 [00:01<00:30,  1.62s/it][A[A


Fetching images for Mushroom:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Mushroom:   5%|▌         | 1/20 [00:01<00:23,  1.22s/it][A[A[A



Fetching images for Weed:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Weed:   5%|▌         | 1/20 [00:00<00:12,  1.54it/s][A[A[A[A




Fetching images for Fern:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Wildflower: 100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
Overall Progress:  42%|████▏     | 56/134 [07:16<10:20,  7.95s/it]


Number of images retrieved for Wildflower: 20


Fetching images for Flower: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Fetching images for Mushroom: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]


Number of images retrieved for Flower: 20



Overall Progress:  43%|████▎     | 58/134 [07:18<05:35,  4.41s/it]


Number of images retrieved for Mushroom: 20


Fetching images for Weed: 100%|██████████| 20/20 [00:19<00:00,  1.05it/s]
Overall Progress:  44%|████▍     | 59/134 [07:27<07:00,  5.61s/it]


Number of images retrieved for Weed: 20


Fetching images for Fern: 100%|██████████| 20/20 [00:18<00:00,  1.11it/s]
Overall Progress:  45%|████▍     | 60/134 [07:28<05:09,  4.19s/it]


Number of images retrieved for Fern: 20



Fetching images for Reed:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Shrub:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Reed:   5%|▌         | 1/20 [00:00<00:16,  1.15it/s][A

Fetching images for Shrub:   5%|▌         | 1/20 [00:01<00:24,  1.31s/it][A[A


Fetching images for Moss:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Moss:   5%|▌         | 1/20 [00:01<00:24,  1.29s/it][A[A[A



Fetching images for Grass:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for Palmtree:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A



Fetching images for Grass:   5%|▌         | 1/20 [00:00<00:14,  1.28it/s][A[A[A[A




Fetching images for Reed: 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]
Overall Progress:  46%|████▌     | 61/134 [07:47<10:32,  8.66s/it]


Number of images retrieved for Reed: 20


Fetching images for Shrub: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]
Overall Progress:  46%|████▋     | 62/134 [07:50<08:22,  6.99s/it]


Number of images retrieved for Shrub: 20


Fetching images for Moss: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  47%|████▋     | 63/134 [07:52<06:42,  5.67s/it]


Number of images retrieved for Moss: 20


Fetching images for Grass: 100%|██████████| 20/20 [00:19<00:00,  1.05it/s]
Fetching images for Palmtree: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


Number of images retrieved for Grass: 20




Overall Progress:  49%|████▊     | 65/134 [08:00<05:07,  4.46s/it]


Number of images retrieved for Palmtree: 20




Fetching images for Corn:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Corn:   5%|▌         | 1/20 [00:01<00:28,  1.50s/it][A[A
Fetching images for Tulip:   5%|▌         | 1/20 [00:02<00:51,  2.71s/it][A


Fetching images for Rose:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Rose:   5%|▌         | 1/20 [00:00<00:11,  1.69it/s][A[A[A



Fetching images for Clove:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Clove:   5%|▌         | 1/20 [00:00<00:13,  1.39it/s][A[A[A[A




Fetching images for Dogwood:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Corn: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]
Overall Progress:  49%|████▉     | 66/134 [08:16<08:56,  7.89s/it]


Number of images retrieved for Corn: 20


Fetching images for Tulip: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  50%|█████     | 67/134 [08:17<06:21,  5.69s/it]


Number of images retrieved for Tulip: 20


Fetching images for Rose: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Overall Progress:  51%|█████     | 68/134 [08:23<06:23,  5.81s/it]


Number of images retrieved for Rose: 20


Fetching images for Clove: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Overall Progress:  51%|█████▏    | 69/134 [08:28<05:59,  5.54s/it]


Number of images retrieved for Clove: 20


Fetching images for Dogwood: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
Overall Progress:  52%|█████▏    | 70/134 [08:30<04:53,  4.58s/it]


Number of images retrieved for Dogwood: 20



Fetching images for Durian:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Ferns:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Durian:   5%|▌         | 1/20 [00:03<01:09,  3.65s/it][A

Fetching images for Ferns:   5%|▌         | 1/20 [00:02<00:38,  2.05s/it][A[A


Fetching images for Fig:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Fig:   5%|▌         | 1/20 [00:00<00:17,  1.08it/s][A[A[A



Fetching images for Flax:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Flax:   5%|▌         | 1/20 [00:00<00:12,  1.46it/s][A[A[A[A




Fetching images for Frangipani:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Durian: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Overall Progress:  53%|█████▎    | 71/134 [08:47<08:39,  8.24s/it]


Number of images retrieved for Durian: 20


Fetching images for Ferns: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]
Overall Progress:  54%|█████▎    | 72/134 [08:48<06:28,  6.27s/it]


Number of images retrieved for Ferns: 20


Fetching images for Fig: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]
Overall Progress:  54%|█████▍    | 73/134 [08:55<06:26,  6.33s/it]


Number of images retrieved for Fig: 20


Fetching images for Flax: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  55%|█████▌    | 74/134 [08:59<05:29,  5.50s/it]


Number of images retrieved for Flax: 20


Fetching images for Frangipani: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]

Overall Progress:  56%|█████▌    | 75/134 [09:01<04:25,  4.50s/it]


Number of images retrieved for Frangipani: 20




Fetching images for Hibiscus:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Lantana:   5%|▌         | 1/20 [00:06<01:59,  6.27s/it][A

Fetching images for Hibiscus:   5%|▌         | 1/20 [00:03<01:00,  3.16s/it][A[A


Fetching images for Bougainvillea:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Bougainvillea:   5%|▌         | 1/20 [00:00<00:10,  1.81it/s][A[A[A



Fetching images for Pea:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Pea:   5%|▌         | 1/20 [00:00<00:16,  1.18it/s][A[A[A[A




Fetching images for OrchidTree:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Lantana: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]
Overall Progress:  57%|█████▋    | 76/134 [09:20<08:44,  9.04s/it]


Number of images retrieved for Lantana: 20


Fetching images for Hibiscus: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  57%|█████▋    | 77/134 [09:25<07:25,  7.82s/it]


Number of images retrieved for Hibiscus: 20


Fetching images for Bougainvillea: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Overall Progress:  58%|█████▊    | 78/134 [09:29<06:06,  6.55s/it]


Number of images retrieved for Bougainvillea: 20



Fetching images for Pea: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Fetching images for OrchidTree: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]


Number of images retrieved for Pea: 20




Overall Progress:  60%|█████▉    | 80/134 [09:32<03:29,  3.88s/it]


Number of images retrieved for OrchidTree: 20




Fetching images for Jackfruit:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Jackfruit:   5%|▌         | 1/20 [00:02<00:46,  2.46s/it][A[A


Fetching images for Cottonplant:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Cottonplant:   5%|▌         | 1/20 [00:00<00:17,  1.06it/s][A[A[A



Fetching images for RangoonCreeper: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]



Number of images retrieved for RangoonCreeper: 20


Overall Progress:  60%|██████    | 81/134 [09:49<06:48,  7.71s/it]



Fetching images for Corneliantree:   5%|▌         | 1/20 [00:01<00:26,  1.38s/it][A[A[A[A





Fetching images for Coffeeplant:   5%|▌         | 1/20 [00:01<00:36,  1.94s/it][A




Fetching images for Coconut:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Jackfruit: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  61%|██████    | 82/134 [10:00<07:35,  8.75s/it]


Number of images retrieved for Jackfruit: 20


Fetching images for Cottonplant: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]
Overall Progress:  62%|██████▏   | 83/134 [10:01<05:24,  6.37s/it]


Number of images retrieved for Cottonplant: 20


Fetching images for Corneliantree:  85%|████████▌ | 17/20 [00:14<00:02,  1.17it/s]
Overall Progress:  63%|██████▎   | 84/134 [10:02<04:03,  4.88s/it]


Number of images retrieved for Corneliantree: 17


Fetching images for Coffeeplant: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  63%|██████▎   | 85/134 [10:05<03:39,  4.48s/it]


Number of images retrieved for Coffeeplant: 20



Fetching images for wheat:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for wheat:   5%|▌         | 1/20 [00:01<00:22,  1.16s/it][A

Fetching images for Coconut: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
Overall Progress:  64%|██████▍   | 86/134 [10:15<04:49,  6.03s/it]


Number of images retrieved for Coconut: 20




Fetching images for watermelon:   5%|▌         | 1/20 [00:02<00:38,  2.01s/it][A[A


Fetching images for radish:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for radish:   5%|▌         | 1/20 [00:03<00:59,  3.15s/it][A[A[A



Fetching images for carrot:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for wheat: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Overall Progress:  65%|██████▍   | 87/134 [10:26<05:50,  7.46s/it]


Number of images retrieved for wheat: 20



Fetching images for bed:   0%|          | 0/20 [00:00<?, ?it/s][A




Fetching images for cabinet:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A
Fetching images for watermelon: 100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
Overall Progress:  66%|██████▌   | 88/134 [10:32<05:23,  7.03s/it]


Number of images retrieved for watermelon: 20







Fetching images for radish: 100%|██████████| 20/20 [00:19<00:00,  1.05it/s]
Overall Progress:  66%|██████▋   | 89/134 [10:35<04:29,  5.98s/it]


Number of images retrieved for radish: 20


Fetching images for carrot: 100%|██████████| 20/20 [00:16<00:00,  1.25it/s]
Overall Progress:  67%|██████▋   | 90/134 [10:38<03:34,  4.88s/it]


Number of images retrieved for carrot: 20




Fetching images for chair:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for chair:   5%|▌         | 1/20 [00:00<00:18,  1.03it/s][A[A


Fetching images for bed: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  68%|██████▊   | 91/134 [10:48<04:37,  6.45s/it]


Fetching images for chests:   5%|▌         | 1/20 [00:00<00:16,  1.17it/s][A[A[A


Number of images retrieved for bed: 20



Fetching images for cabinet: 100%|██████████| 20/20 [00:18<00:00,  1.11it/s]
Overall Progress:  69%|██████▊   | 92/134 [10:49<03:17,  4.71s/it]


Number of images retrieved for cabinet: 20



Fetching images for chair: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Overall Progress:  69%|██████▉   | 93/134 [10:59<04:26,  6.49s/it]


Number of images retrieved for chair: 20




Fetching images for table:   0%|          | 0/20 [00:00<?, ?it/s][A[A



Fetching images for desks:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A

Fetching images for chests: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  70%|███████   | 94/134 [11:05<04:05,  6.14s/it]


Number of images retrieved for chests: 20


Fetching images for clock: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  71%|███████   | 95/134 [11:07<03:14,  4.99s/it]


Number of images retrieved for clock: 20






Fetching images for desks:   5%|▌         | 1/20 [00:04<01:17,  4.07s/it][A[A[A[A
Fetching images for Piano:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Piano:   5%|▌         | 1/20 [00:00<00:14,  1.30it/s][A


Fetching images for Bookcase:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A




Fetching images for Umbrella:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A


Fetching images for table: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Overall Progress:  72%|███████▏  | 96/134 [11:19<04:29,  7.09s/it]







Number of images retrieved for table: 20


Fetching images for desks: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  72%|███████▏  | 97/134 [11:20<03:21,  5.44s/it]


Number of images retrieved for desks: 20




Fetching images for Clothes:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Piano: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]

Overall Progress:  73%|███████▎  | 98/134 [11:30<03:57,  6.60s/it]


Number of images retrieved for Piano: 20



Fetching images for Bookcase: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  74%|███████▍  | 99/134 [11:34<03:28,  5.97s/it]


Number of images retrieved for Bookcase: 20


Fetching images for Umbrella: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Overall Progress:  75%|███████▍  | 100/134 [11:35<02:28,  4.37s/it]


Number of images retrieved for Umbrella: 20


Fetching images for Clothes: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Overall Progress:  75%|███████▌  | 101/134 [11:42<02:55,  5.31s/it]


Number of images retrieved for Clothes: 20




Fetching images for sofa:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for sofa:   5%|▌         | 1/20 [00:01<00:28,  1.47s/it][A[A
Fetching images for ball:   0%|          | 0/20 [00:00<?, ?it/s][A


Fetching images for cart: 100%|██████████| 20/20 [00:20<00:00,  1.03s/it]
Overall Progress:  76%|███████▌  | 102/134 [11:51<03:17,  6.17s/it]


Number of images retrieved for cart: 20





Fetching images for spoon:   5%|▌         | 1/20 [00:01<00:26,  1.39s/it][A[A[A
Fetching images for ball:   5%|▌         | 1/20 [00:01<00:28,  1.51s/it][A



Fetching images for Bowl:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Bowl:   5%|▌         | 1/20 [00:00<00:14,  1.31it/s][A[A[A[A




Fetching images for fridge:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for sofa: 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]
Overall Progress:  77%|███████▋  | 103/134 [11:59<03:34,  6.90s/it]


Number of images retrieved for sofa: 20




Fetching images for pan:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for spoon: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  78%|███████▊  | 104/134 [12:07<03:36,  7.21s/it]


Number of images retrieved for spoon: 20


Fetching images for ball: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Overall Progress:  78%|███████▊  | 105/134 [12:09<02:38,  5.48s/it]


Number of images retrieved for ball: 20


Fetching images for Bowl: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  79%|███████▉  | 106/134 [12:12<02:15,  4.84s/it]


Number of images retrieved for Bowl: 20


Fetching images for fridge: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  80%|███████▉  | 107/134 [12:15<01:57,  4.36s/it]


Number of images retrieved for fridge: 20


Fetching images for pan: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  81%|████████  | 108/134 [12:20<01:54,  4.42s/it]


Number of images retrieved for pan: 20



Fetching images for book:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for book:   5%|▌         | 1/20 [00:01<00:20,  1.05s/it][A

Fetching images for Cliff:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Cliff:   5%|▌         | 1/20 [00:01<00:20,  1.07s/it][A[A


Fetching images for Bay:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Bay:   5%|▌         | 1/20 [00:01<00:25,  1.37s/it][A[A[A



Fetching images for Coast:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Coast:   5%|▌         | 1/20 [00:00<00:18,  1.05it/s][A[A[A[A




Fetching images for Mountains:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for book: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]
Overall Progress:  81%|████████▏ | 109/134 [12:39<03:41,  8.84s/it]


Number of images retrieved for book: 20


Fetching images for Cliff: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:  82%|████████▏ | 110/134 [12:40<02:40,  6.68s/it]


Number of images retrieved for Cliff: 20


Fetching images for Bay: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  83%|████████▎ | 111/134 [12:45<02:20,  6.10s/it]


Number of images retrieved for Bay: 20


Fetching images for Coast: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  84%|████████▎ | 112/134 [12:47<01:47,  4.89s/it]


Number of images retrieved for Coast: 20


Fetching images for Mountains: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  84%|████████▍ | 113/134 [12:51<01:32,  4.39s/it]


Number of images retrieved for Mountains: 20



Fetching images for Forests:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Waterbodies:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Forests:   5%|▌         | 1/20 [00:01<00:22,  1.21s/it][A

Fetching images for Waterbodies:   5%|▌         | 1/20 [00:01<00:19,  1.01s/it][A[A


Fetching images for Lake:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Lake:   5%|▌         | 1/20 [00:00<00:16,  1.16it/s][A[A[A



Fetching images for desert:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for farmland:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A



Fetching images for desert:   5%|▌         | 1/20 [00:00<00:16,  1.13it/s][A[A[A[A




Fetching images for Forests: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  85%|████████▌ | 114/134 [13:09<02:54,  8.71s/it]


Number of images retrieved for Forests: 20


Fetching images for Waterbodies: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:  86%|████████▌ | 115/134 [13:10<02:00,  6.32s/it]


Number of images retrieved for Waterbodies: 20


Fetching images for Lake: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  87%|████████▋ | 116/134 [13:17<01:59,  6.61s/it]


Number of images retrieved for Lake: 20


Fetching images for desert: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Fetching images for farmland: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]


Number of images retrieved for desert: 20



Overall Progress:  88%|████████▊ | 118/134 [13:21<01:03,  3.95s/it]


Number of images retrieved for farmland: 20



Fetching images for river:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for river:   5%|▌         | 1/20 [00:01<00:19,  1.00s/it][A

Fetching images for hedges:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for hedges:   5%|▌         | 1/20 [00:00<00:17,  1.08it/s][A[A


Fetching images for plain:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for sky:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for cave:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A


Fetching images for plain:   5%|▌         | 1/20 [00:01<00:19,  1.00s/it][A[A[A



Fetching images for sky:   5%|▌         | 1/20 [00:00<00:16,  1.16it/s][A[A[A[A




Fetching images for river: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Overall Progress:  89%|████████▉ | 119/134 [13:39<02:05,  8.37s/it]


Number of images retrieved for river: 20


Fetching images for hedges: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:  90%|████████▉ | 120/134 [13:40<01:26,  6.19s/it]


Number of images retrieved for hedges: 20


Fetching images for plain: 100%|██████████| 20/20 [00:18<00:00,  1.11it/s]
Fetching images for cave: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Fetching images for sky: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]


Number of images retrieved for plain: 20



Overall Progress:  91%|█████████ | 122/134 [13:51<01:02,  5.21s/it]


Number of images retrieved for cave: 20


Overall Progress:  92%|█████████▏| 123/134 [13:51<00:40,  3.71s/it]


Number of images retrieved for sky: 20



Fetching images for cloud:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for cloud:   5%|▌         | 1/20 [00:02<00:46,  2.44s/it][A

Fetching images for flowergarden:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for flowergarden:   5%|▌         | 1/20 [00:01<00:25,  1.33s/it][A[A


Fetching images for glacier:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for horizon:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for grassland:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A


Fetching images for glacier:   5%|▌         | 1/20 [00:00<00:17,  1.08it/s][A[A[A



Fetching images for horizon:   5%|▌         | 1/20 [00:01<00:20,  1.06s/it][A[A[A[A




Fetching images for cloud: 100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
Overall Progress:  93%|█████████▎| 124/134 [14:06<01:12,  7.25s/it]


Number of images retrieved for cloud: 20


Fetching images for flowergarden: 100%|██████████| 20/20 [00:15<00:00,  1.33it/s]
Overall Progress:  93%|█████████▎| 125/134 [14:10<00:55,  6.14s/it]


Number of images retrieved for flowergarden: 20



Fetching images for lighthouse:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for glacier: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Overall Progress:  94%|█████████▍| 126/134 [14:19<00:56,  7.04s/it]


Number of images retrieved for glacier: 20


Fetching images for grassland: 100%|██████████| 20/20 [00:18<00:00,  1.07it/s]
Fetching images for horizon: 100%|██████████| 20/20 [00:19<00:00,  1.05it/s]
Overall Progress:  96%|█████████▌| 128/134 [14:22<00:24,  4.10s/it]


Number of images retrieved for grassland: 20

Number of images retrieved for horizon: 20




Fetching images for plateau:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for lighthouse: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Overall Progress:  96%|█████████▋| 129/134 [14:31<00:28,  5.62s/it]


Number of images retrieved for lighthouse: 20



Fetching images for savannah:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for savannah:   5%|▌         | 1/20 [00:00<00:18,  1.04it/s][A


Fetching images for volcano:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for volcano:   5%|▌         | 1/20 [00:01<00:22,  1.17s/it][A[A[A



Fetching images for valley:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for plateau: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  97%|█████████▋| 130/134 [14:39<00:25,  6.31s/it]


Number of images retrieved for plateau: 20




Fetching images for waterfall:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for savannah: 100%|██████████| 20/20 [00:13<00:00,  1.45it/s]
Overall Progress:  98%|█████████▊| 131/134 [14:47<00:20,  6.72s/it]


Number of images retrieved for savannah: 20


Fetching images for volcano: 100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
Overall Progress:  99%|█████████▊| 132/134 [14:49<00:10,  5.35s/it]


Number of images retrieved for volcano: 20


Fetching images for valley: 100%|██████████| 20/20 [00:14<00:00,  1.34it/s]
Overall Progress:  99%|█████████▉| 133/134 [14:51<00:04,  4.43s/it]


Number of images retrieved for valley: 20


Fetching images for waterfall: 100%|██████████| 20/20 [00:13<00:00,  1.53it/s]
Overall Progress: 100%|██████████| 134/134 [14:53<00:00,  6.67s/it]


Number of images retrieved for waterfall: 20





In [6]:
# Persist the collected image URLs to image_urls.json; this is the file the
# ImageDownloader cell below is pointed at. (`scraper` and `image_urls` come
# from earlier cells.)
scraper.save_to_file(image_urls, 'image_urls.json')

Data saved to image_urls.json


In [7]:
class ImageDownloader:
    """
    Download the images listed in a JSON file into a folder tree.

    The JSON file is expected to map category -> term -> list of image URLs.
    Each image is stored as <download_dir>/<category>/<term>/<URL file name>.
    Downloads run on a thread pool; the paths of the files that were actually
    written are collected in `self.filename` and can be exported to a text file.
    """

    def __init__(self, json_file, download_dir='Dataset', max_workers=4, delay=1):
        """
        Parameters:
        json_file (str): Path of the JSON file containing the image URLs.
        download_dir (str): Root folder in which the images are stored.
        max_workers (int): Number of download threads.
        delay (float): Seconds to sleep after submitting each download task.
        """
        self.json_file = json_file  # File containing the image URLs in JSON format
        self.download_dir = download_dir  # Root folder for the downloaded images
        self.max_workers = max_workers  # Number of threads
        # Polite delay: sending requests too quickly can overload the server
        # or get the client's IP blocked.
        self.delay = delay
        self.filename = set()  # Paths of the files that were downloaded successfully
        self.setup_directory()  # Set up the folder structure

    def setup_directory(self):
        """
        Create the root download folder if it does not exist yet.

        Returns:
        None
        """
        os.makedirs(self.download_dir, exist_ok=True)

    def read_json(self):
        """
        Read the JSON file and return the data.

        Returns:
        data (dict): The data read from the JSON file.
        """
        with open(self.json_file, 'r') as file:
            return json.load(file)

    def is_valid_url(self, url):
        """
        Check if the URL is valid.

        A URL is considered valid when it can be opened, answers with HTTP
        status 200 and reports a content type containing 'image'.

        Parameters:
        url (str): The URL to be checked.

        Returns:
        bool: True if the URL is valid, False otherwise.
        """
        try:
            with urllib.request.urlopen(url) as response:
                # Always return a real bool (never fall through to None).
                return response.status == 200 and 'image' in response.info().get_content_type()
        except Exception:
            return False

    def download_image(self, url, category, term, pbar):
        """
        Download the image from the given URL.

        Parameters:
        url (str): The URL of the image to be downloaded.
        category (str): The category of the image.
        term (str): The term or keyword associated with the image.
        pbar (tqdm): The progress bar object.

        Returns:
        str: A message indicating the status of the download.
        """
        try:
            return self._fetch(url, category, term)
        finally:
            # Advance the bar exactly once per URL, whatever the outcome.
            pbar.update(1)

    def _fetch(self, url, category, term):
        """Validate and download one URL; return the status message."""
        if not self.is_valid_url(url):
            return f"Invalid URL: {url}"

        # exist_ok=True makes this safe when several worker threads create the
        # same category/term folder at the same time.
        term_dir = os.path.join(self.download_dir, category, term)
        os.makedirs(term_dir, exist_ok=True)

        basename = os.path.basename(urlparse(url).path)
        if not basename:
            # e.g. a URL whose path ends in '/': there is nothing to name the file after.
            return f"Failed to download {url}: URL path has no file name"
        filename = os.path.join(term_dir, basename)

        try:
            urllib.request.urlretrieve(url, filename)
        except Exception as e:
            return f"Failed to download {url}: {str(e)}"

        # Record the path only once the file has really been written, so the
        # exported list never contains files that do not exist.
        self.filename.add(filename)
        return f"Downloaded: {url}"

    def download_images(self):
        """
        Download images from the JSON file.

        Prints one status message per URL and finally exports the list of
        downloaded files, even if the run is interrupted part-way.

        Returns:
        None
        """
        data = self.read_json()
        download_tasks = []

        total_images = sum(len(urls) for terms in data.values() for urls in terms.values())
        try:
            with tqdm(total=total_images, desc="Downloading images") as pbar:
                with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                    for category, terms in data.items():
                        for term, urls in terms.items():
                            for url in urls:
                                download_tasks.append(executor.submit(self.download_image, url, category, term, pbar))
                                time.sleep(self.delay)  # Polite delay

                    for future in concurrent.futures.as_completed(download_tasks):
                        try:
                            print(future.result())
                        except Exception as e:
                            # One unexpected task error must not hide the
                            # results of all remaining tasks.
                            print(f"Download task failed: {str(e)}")
        finally:
            self.export_filename()

    def export_filename(self, output_file='filename.txt'):
        """
        Export the filename directories to a text file.

        Parameters:
        output_file (str): Path of the text file to write (one path per line, sorted).

        Returns:
        None
        """
        with open(output_file, 'w') as file:
            for filename in sorted(self.filename):
                file.write(f"{filename}\n")


In [None]:
# Download every URL listed in image_urls.json into Dataset/<category>/<term>/
# using 4 worker threads; delay=1 sleeps one second after each task is
# submitted (polite rate limiting).
downloader = ImageDownloader(json_file='image_urls.json', download_dir='Dataset', max_workers=4, delay=1)
downloader.download_images()

In [9]:
# Write the recorded file paths to filename.txt. download_images() already
# calls this when it finishes; presumably this is a manual re-run after the
# download cell was interrupted -- TODO confirm.
downloader.export_filename()

In [10]:
# Mount Google Drive at /content/drive/ so that later cells can copy the
# archives and filename.txt to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [11]:
# List the working directory to check that Dataset, filename.txt and
# image_urls.json are present.
!ls

Dataset  drive	filename.txt  image_urls.json  sample_data


In [12]:
# Archive the Dataset folder as downloaded (before the cleaning step) to Google Drive.
!zip -r /content/drive/MyDrive/Dataset.zip Dataset

  adding: Dataset/ (stored 0%)
  adding: Dataset/animal/ (stored 0%)
  adding: Dataset/animal/shrimp/ (stored 0%)
  adding: Dataset/animal/shrimp/4898039985_e3eaca1c87_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/2144708533_8f37851185_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/17061896855_554c2b2b8a_b.jpg (deflated 1%)
  adding: Dataset/animal/shrimp/2344784902_8c2ddbed93_b.jpg (deflated 1%)
  adding: Dataset/animal/shrimp/233169073_da697f9b1b_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/53624724282_71047360df_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/39860909861_ac4dc6142b_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/3553852023_779b809b27_b.jpg (deflated 2%)
  adding: Dataset/animal/shrimp/52754170014_d0570ded3f_b.jpg (deflated 1%)
  adding: Dataset/animal/shrimp/50807936448_b99cb05879_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/50861234561_6cc092f2e5_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/15364085118_8a99f61fc8_b.jpg (defla

In [13]:
def check_and_preprocess_images(image_dir, min_size=50):
    """
    Check and preprocess images in the specified directory.

    Walks `image_dir` recursively and, for every file:
    - deletes it if it cannot be opened (or converted) as an image,
    - deletes it if its width or height is below `min_size` pixels,
    - otherwise converts it to RGB in place when it is in another mode.

    A file that is a valid image but cannot be re-saved as RGB is kept and
    reported instead of being deleted as "not an image".

    Parameters:
    image_dir (str): The directory containing the images to be checked and preprocessed.
    min_size (int): Minimum accepted width and height in pixels.

    Returns:
    None
    """
    for root, _, files in os.walk(image_dir):
        for file in files:
            file_path = os.path.join(root, file)
            rgb_copy = None
            try:
                # Only inspect/convert while the file is open; deleting or
                # overwriting happens after the handle has been closed.
                with Image.open(file_path) as img:
                    width, height = img.size
                    too_small = width < min_size or height < min_size
                    if not too_small and img.mode != 'RGB':
                        # convert() returns an independent in-memory image.
                        rgb_copy = img.convert('RGB')
            except Exception as e:
                # If file is not an image, delete it
                os.remove(file_path)
                print(f"Deleted {file_path}: Not an image or corrupted file ({str(e)})")
                continue

            if too_small:
                os.remove(file_path)
                print(f"Deleted {file_path}: Image too small ({width}x{height})")
                continue

            if rgb_copy is not None:
                try:
                    rgb_copy.save(file_path)
                except Exception as e:
                    # A save problem (e.g. unknown file extension) does not
                    # make the source an invalid image, so do not delete it.
                    print(f"Failed to convert {file_path} to RGB: {str(e)}")
                else:
                    print(f"Converted {file_path} to RGB")

In [14]:
# Clean the downloaded dataset in place: unreadable or too-small files are
# deleted and non-RGB images are converted to RGB.
check_and_preprocess_images('Dataset')

Converted Dataset/animal/Elephant/4839032364_8c521066b2_b.jpg to RGB
Converted Dataset/animal/Elephant/4839031518_3765f997a4_b.jpg to RGB


In [15]:
# Archive the cleaned Dataset folder to Google Drive under a separate name.
!zip -r /content/drive/MyDrive/Clean_Dataset.zip Dataset

  adding: Dataset/ (stored 0%)
  adding: Dataset/animal/ (stored 0%)
  adding: Dataset/animal/shrimp/ (stored 0%)
  adding: Dataset/animal/shrimp/4898039985_e3eaca1c87_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/2144708533_8f37851185_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/17061896855_554c2b2b8a_b.jpg (deflated 1%)
  adding: Dataset/animal/shrimp/2344784902_8c2ddbed93_b.jpg (deflated 1%)
  adding: Dataset/animal/shrimp/233169073_da697f9b1b_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/53624724282_71047360df_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/39860909861_ac4dc6142b_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/3553852023_779b809b27_b.jpg (deflated 2%)
  adding: Dataset/animal/shrimp/52754170014_d0570ded3f_b.jpg (deflated 1%)
  adding: Dataset/animal/shrimp/50807936448_b99cb05879_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/50861234561_6cc092f2e5_b.jpg (deflated 0%)
  adding: Dataset/animal/shrimp/15364085118_8a99f61fc8_b.jpg (defla

In [16]:
# Copy the list of downloaded file paths to Google Drive.
# NOTE(review): filename.txt was written before check_and_preprocess_images()
# ran, so it may still list files that the cleaning step deleted -- confirm
# whether it should be regenerated first.
!cp filename.txt /content/drive/MyDrive/filename.txt