<a href="https://colab.research.google.com/github/vutl/Image-Retrieval/blob/feature%2Fimg-retrieval/Image_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Truy vấn hình ảnh cơ bản**

*   Xây dựng chương trình truy vấn ảnh cơ bản.
*   Phát triển chương trình truy vấn ảnh nâng cao với CLIP model và vector database.
*   (Optional) Thu thập và xử lý dữ liệu nhằm mục đích xây dựng chương trình truy vấn ảnh cá nhân
hóa.





In [1]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

lấy danh sách các class của ảnh trong data

In [None]:
# Root folder of the dataset; the code below reads `<ROOT>/train` and `<ROOT>/test`.
ROOT = 'data'
# Class labels are the entry names found under the training folder, in sorted order.
CLASS_NAME = sorted(os.listdir(f'{ROOT}/train'))

đọc ảnh, resize về kích thước chung (thì
mới áp dụng được các phép đo) và chuyển đổi nó về dạng numpy:

In [None]:
def read_image_from_path(path, size):
  """Load an image file as an RGB NumPy array resized to `size`.

  Parameters:
  path (str): Path of the image file to read.
  size (tuple): Target size passed to PIL's `resize`, i.e. (width, height).

  Returns:
  numpy.ndarray: The resized RGB image converted with `np.array`.
  """
  # The context manager closes the underlying file as soon as the pixels have
  # been converted, instead of keeping one open handle per image read.
  with Image.open(path) as im:
    return np.array(im.convert('RGB').resize(size))

def folder_to_images(folder, size):
  """Read every file of `folder` into one stacked image array.

  Parameters:
  folder (str): Directory whose entries are all read as images.
  size (tuple): (width, height) every image is resized to.

  Returns:
  images_np (numpy.ndarray): Float array of shape (n_images, height, width, 3).
  images_path (numpy.ndarray): Array with the path of each image, in the same order.
  """
  list_dir = [folder + '/' + name for name in os.listdir(folder)]
  # PIL sizes are (width, height) while the arrays produced by np.array(image)
  # are (height, width, channels), so the batch buffer is laid out accordingly.
  width, height = size
  images_np = np.zeros(shape=(len(list_dir), height, width, 3))
  images_path = []
  for i, path in enumerate(list_dir):
    images_np[i] = read_image_from_path(path, size)
    images_path.append(path)
  images_path = np.array(images_path)
  return images_np, images_path

### **Truy vấn hình ảnh với độ đo L1**

In [None]:
def absolute_difference(query, data):
  """Return the L1 distance between `query` and each item of `data`.

  The absolute differences are summed over every axis of `data` except the
  first one, so the result holds one value per entry along axis 0.
  """
  reduce_axes = tuple(range(1, data.ndim))
  return np.abs(query - data).sum(axis=reduce_axes)

Hàm ***get_l1_score*** tính độ tương đồng giữa ảnh input và các hình ảnh trong
bộ dữ liệu. Hàm trả về ảnh ***query*** và ***ls_path_score***, là danh sách các cặp
(đường dẫn ảnh, giá trị độ tương đồng) ứng với từng ảnh.

In [None]:
def get_l1_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query with the L1 distance.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before comparison.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; smaller scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      rates = absolute_difference(query, images_np)
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

Đoạn code này thực hiện quá trình truy xuất hình ảnh bằng cách so sánh một hình ảnh truy vấn với
các hình ảnh trong tập huấn luyện dựa trên điểm L1. Đầu tiên, các hình ảnh được thay đổi cùng kích
thước. Tiếp theo hệ thống sẽ so sánh ảnh truy vấn với các hình ảnh trong thư mục huấn luyện để tính
điểm L1. Sau đó, kết quả truy vấn được trả về là danh sách các đường dẫn chứa hình ảnh và điểm số
tính theo L1. Cuối cùng 5 kết quả tốt nhất sẽ được hiển thị cùng với ảnh truy vấn

In [None]:
# Score every training image against one test image from the Orange_easy class using get_l1_score.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/Orange_easy/0_100.jpg"
size = (448, 448)
query, ls_path_score = get_l1_score(root_img_path, query_path, size)
# reverse=False is passed because a smaller L1 score means a closer match.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=False)

In [None]:
# Same L1 query as the previous cell, this time with a test image from the African_crocodile class.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/African_crocodile/n01697457_18534.JPEG"
size = (448, 448)
query, ls_path_score = get_l1_score(root_img_path, query_path, size)
# reverse=False is passed because a smaller L1 score means a closer match.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=False)

### **Truy vấn hình ảnh với độ đo L2**

In [None]:
def mean_square_difference(query, data):
  """Return the mean squared difference between `query` and each item of `data`.

  The squared differences are averaged over every axis of `data` except the
  first one, so the result holds one value per entry along axis 0.
  """
  reduce_axes = tuple(range(1, data.ndim))
  squared_error = (data - query) ** 2
  return squared_error.mean(axis=reduce_axes)

In [None]:
def get_l2_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query with the mean squared difference.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before comparison.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; smaller scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      rates = mean_square_difference(query, images_np)
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

In [None]:
# Score every training image against one test image from the Orange_easy class using get_l2_score.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/Orange_easy/0_100.jpg"
size = (448, 448)
query, ls_path_score = get_l2_score(root_img_path, query_path, size)
# reverse=False is passed because a smaller L2 score means a closer match.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=False)

In [None]:
# Same L2 query as the previous cell, this time with a test image from the African_crocodile class.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/African_crocodile/n01697457_18534.JPEG"
size = (448, 448)
query, ls_path_score = get_l2_score(root_img_path, query_path, size)
# reverse=False is passed because a smaller L2 score means a closer match.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=False)

### **Truy vấn hình ảnh với độ đo Cosine Similarity**

In [None]:
def cosine_similarity(query, data):
  """Return the cosine similarity between `query` and each item of `data`.

  Parameters:
  query (array-like): A single item (image or embedding vector).
  data (array-like): A batch of items stacked along axis 0.

  Returns:
  numpy.ndarray: One similarity per entry along axis 0 of `data`; larger
  values mean more similar items.
  """
  # Work in floating point: squaring a uint8 image array wraps around modulo 256
  # and silently corrupts the norms.
  query = np.asarray(query, dtype=np.float64)
  data = np.asarray(data, dtype=np.float64)
  axis_batch_size = tuple(range(1, len(data.shape)))
  query_norm = np.sqrt(np.sum(query**2))
  data_norm = np.sqrt(np.sum(data**2, axis=axis_batch_size))
  # The machine epsilon keeps the division defined when a norm is zero.
  return np.sum(data * query, axis=axis_batch_size) / (query_norm * data_norm + np.finfo(float).eps)

In [None]:
def get_cosine_similarity_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query with the cosine similarity.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before comparison.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; larger scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      rates = cosine_similarity(query, images_np)
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

Để hiển thị kết quả chúng ta sử dụng hàm plot_results(), tuy nhiên ở hàm này chúng ta sẽ sắp xếp giá
trị giảm dần từ lớn đến nhỏ vì với độ đo này thì giá trị càng lớn sẽ càng giống nhau, cho nên chúng ta
sử dụng reverse = True.

In [None]:
# Score every training image against one test image from the Orange_easy class using cosine similarity.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/Orange_easy/0_100.jpg"
size = (448, 448)
query, ls_path_score = get_cosine_similarity_score(root_img_path, query_path, size)
# A larger cosine similarity means a closer match, so the results are sorted in descending order.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=True)

In [None]:
# Same cosine-similarity query, this time with a test image from the African_crocodile class.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/African_crocodile/n01697457_18534.JPEG"
size = (448, 448)
query, ls_path_score = get_cosine_similarity_score(root_img_path, query_path, size)
# A larger cosine similarity means a closer match, so the results are sorted in descending order.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=True)

### **Truy vấn hình ảnh với độ đo Correlation Coefficient**

In [None]:
def correlation_coefficient(query, data):
  """Return the correlation coefficient between `query` and each item of `data`.

  Both inputs are centred on their own mean (per item for `data`) and the
  normalised dot product is taken over every axis of `data` except the first.
  Larger values mean more similar items.
  """
  reduce_axes = tuple(range(1, len(data.shape)))
  centred_query = query - np.mean(query)
  centred_data = data - np.mean(data, axis=reduce_axes, keepdims=True)
  query_scale = np.sqrt(np.sum(centred_query**2))
  data_scale = np.sqrt(np.sum(centred_data**2, axis=reduce_axes))
  covariance = np.sum(centred_data * centred_query, axis=reduce_axes)
  # The machine epsilon keeps the division defined for constant inputs.
  return covariance / (query_scale * data_scale + np.finfo(float).eps)

In [None]:
def get_correlation_coefficient_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query with the correlation coefficient.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before comparison.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; larger scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      rates = correlation_coefficient(query, images_np)
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

Để hiển thị kết quả chúng ta sử dụng hàm plot_results(), tuy nhiên ở hàm này chúng ta sẽ sắp xếp giá
trị giảm dần từ lớn đến nhỏ vì với độ đo này thì giá trị càng lớn sẽ càng giống nhau, cho nên chúng ta
sử dụng reverse = True

In [None]:
# Score every training image against one test image from the Orange_easy class using the correlation coefficient.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/Orange_easy/0_100.jpg"
size = (448, 448)
query, ls_path_score = get_correlation_coefficient_score(root_img_path, query_path, size)
# reverse=True: a larger correlation means a closer match, so results are sorted in descending order.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=True)

In [None]:
# Same correlation query, this time with a test image from the African_crocodile class.
root_img_path = f"{ROOT}/train/"
query_path = f"{ROOT}/test/African_crocodile/n01697457_18534.JPEG"
size = (448, 448)
query, ls_path_score = get_correlation_coefficient_score(root_img_path, query_path, size)
# reverse=True: a larger correlation means a closer match, so results are sorted in descending order.
# NOTE(review): plot_results is not defined in any cell visible here - confirm it is defined before this cell runs.
plot_results(query_path, ls_path_score, reverse=True)

# **Truy vấn hình ảnh nâng cao với Pretrained Deep Learning Model**

*   Thư viện chromadb hỗ trợ việc quản lý và truy xuất dữ liệu hình ảnh hiệu quả (sử dụng thêm với mục đích tạo vector
database)
*   chromadb có thể dùng open-clip-torch để cung cấp khả năng sử dụng mô hình CLIP đã
được đào tạo sẵn, đây là một công cụ mạnh mẽ để phân tích nội dung hình ảnh thông qua học sâu.

In [2]:
!pip install chromadb
!pip install open-clip-torch

Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.5-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_pro

In [4]:
import os

import chromadb
import matplotlib.pyplot as plt
import numpy as np
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from PIL import Image
from tqdm import tqdm

Tương tự như các bước ở phần cơ bản trước, nhưng chúng ta sẽ nâng cấp bằng cách thêm một hàm để trích xuất vector
đặc trưng cho mỗi hình ảnh. Mô hình CLIP sẽ được sử dụng để biến đổi hình ảnh thành các vector đặc
trưng đại diện cho nội dung và ngữ cảnh của hình ảnh đó. Sau đó, việc so sánh các hình ảnh không
được thực hiện trực tiếp trên ảnh gốc mà là thông qua việc tính sự tương đồng giữa các vector này.
Đoạn code bên dưới khởi tạo một hàm để trích xuất vector đặc trưng từ một hình ảnh sử dụng mô hình CLIP.
Tiếp theo, hàm get_single_image_embedding nhận một hình ảnh làm đầu vào và sử dụng phương thức
_encode_image của OpenCLIPEmbeddingFunction để trích xuất ảnh thành một vector đặc trưng.

In [None]:
# Shared CLIP embedding function, created once and reused by every call below.
embedding_function = OpenCLIPEmbeddingFunction()

def get_single_image_embedding(image):
  """Return the embedding of `image` as a NumPy array.

  The image is forwarded unchanged to the `_encode_image` method of the
  module-level `embedding_function`.
  """
  # NOTE(review): _encode_image is an underscore-prefixed (private) method of
  # OpenCLIPEmbeddingFunction - confirm it exists in the installed chromadb version.
  return np.array(embedding_function._encode_image(image=image))

### **Truy vấn embedding vector với độ đo L1**

Hàm ***get_l1_score*** được nâng cấp bằng cách sử dụng CLIP model để trích xuất
vector đặc trưng của từng ảnh, sau đó tính độ đo L1 trên các embedding vector này.

In [None]:
def get_l1_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query using L1 distance on CLIP embeddings.

  This definition replaces the earlier pixel-based `get_l1_score` of the same
  name and keeps its signature.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before embedding.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; smaller scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  query_embedding = get_single_image_embedding(query)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      # folder_to_images returns a float array, so each image is cast back to
      # uint8 pixels before it is embedded.
      embedding_list = [
          get_single_image_embedding(images_np[idx_img].astype(np.uint8))
          for idx_img in range(images_np.shape[0])
      ]
      if not embedding_list:
        continue  # empty class folder: np.stack would raise on an empty list
      rates = absolute_difference(query_embedding, np.stack(embedding_list))
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

### **Truy vấn embedding vector với độ đo L2**

In [None]:
def get_l2_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query using mean squared difference on CLIP embeddings.

  This definition replaces the earlier pixel-based `get_l2_score` of the same
  name and keeps its signature.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before embedding.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; smaller scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  query_embedding = get_single_image_embedding(query)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      # folder_to_images returns a float array, so each image is cast back to
      # uint8 pixels before it is embedded.
      embedding_list = [
          get_single_image_embedding(images_np[idx_img].astype(np.uint8))
          for idx_img in range(images_np.shape[0])
      ]
      if not embedding_list:
        continue  # empty class folder: np.stack would raise on an empty list
      rates = mean_square_difference(query_embedding, np.stack(embedding_list))
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

### **Truy vấn embedding vector với độ đo Cosine Similarity**

In [None]:
def get_cosine_similarity_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query using cosine similarity on CLIP embeddings.

  This definition replaces the earlier pixel-based `get_cosine_similarity_score`
  of the same name and keeps its signature.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before embedding.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; larger scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  query_embedding = get_single_image_embedding(query)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      # folder_to_images returns a float array, so each image is cast back to
      # uint8 pixels before it is embedded.
      embedding_list = [
          get_single_image_embedding(images_np[idx_img].astype(np.uint8))
          for idx_img in range(images_np.shape[0])
      ]
      if not embedding_list:
        continue  # empty class folder: np.stack would raise on an empty list
      rates = cosine_similarity(query_embedding, np.stack(embedding_list))
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

### **Truy vấn embedding vector với độ đo Correlation Coefficient**

In [None]:
def get_correlation_coefficient_score(root_img_path, query_path, size):
  """Score every image under `root_img_path` against the query using the correlation coefficient on CLIP embeddings.

  This definition replaces the earlier pixel-based
  `get_correlation_coefficient_score` of the same name and keeps its signature.

  Parameters:
  root_img_path (str): Directory containing one sub-folder per class.
  query_path (str): Path of the query image.
  size (tuple): (width, height) all images are resized to before embedding.

  Returns:
  query (numpy.ndarray): The resized query image.
  ls_path_score (list): (image_path, score) pairs; larger scores mean closer images.
  """
  query = read_image_from_path(query_path, size)
  query_embedding = get_single_image_embedding(query)
  ls_path_score = []
  for folder in os.listdir(root_img_path):
    # Only folders that are known class names are scanned.
    if folder in CLASS_NAME:
      # os.path.join works whether or not root_img_path ends with a separator.
      path = os.path.join(root_img_path, folder)
      images_np, images_path = folder_to_images(path, size)  # stacked images and their paths
      # folder_to_images returns a float array, so each image is cast back to
      # uint8 pixels before it is embedded.
      embedding_list = [
          get_single_image_embedding(images_np[idx_img].astype(np.uint8))
          for idx_img in range(images_np.shape[0])
      ]
      if not embedding_list:
        continue  # empty class folder: np.stack would raise on an empty list
      rates = correlation_coefficient(query_embedding, np.stack(embedding_list))
      ls_path_score.extend(list(zip(images_path, rates)))
  return query, ls_path_score

## **Tối ưu hoá quá trình truy vấn hình ảnh sử dụng mô hình CLIP và cơ sở dữ liệu vector**

Phương pháp này sẽ sử dụng một cơ sở dữ liệu vector (vector database) để quản lý các
embedding vector, giúp quá trình truy vấn được tối ưu hơn

In [None]:
def get_files_path(path):
  """List the path of every file stored in the class sub-folders of `path`.

  One sub-folder per entry of CLASS_NAME is visited, in CLASS_NAME order; the
  files of each folder follow in the order returned by os.listdir.
  """
  # Lazy generator: each folder path is built right before it is listed.
  label_dirs = (path + "/" + label for label in CLASS_NAME)
  return [
      label_dir + '/' + filename
      for label_dir in label_dirs
      for filename in os.listdir(label_dir)
  ]

# Paths of all training images; positions in this list are reused as ids by add_embedding.
data_path = f'{ROOT}/train'
files_path = get_files_path(data_path)

### **Truy vấn ảnh với L2 Collection**

Hàm giúp trích xuất và lưu trữ các vector
đặc trưng của ảnh vào một collection đã được tạo (collection -  tập hợp các
vector hoặc tài liệu được chỉ mục và lưu trữ cùng nhau dựa trên một số tiêu chí hoặc đặc điểm chung, dùng
để tổ chức và quản lý dữ liệu)

In [None]:
def add_embedding(collection, files_path):
  """Embed every image of `files_path` and add the vectors to `collection`.

  Parameters:
  collection: Chroma collection receiving the embeddings.
  files_path (list): Image paths; the position of a path in this list becomes
    its id (`id_<position>`).

  Returns:
  None
  """
  ids = []
  embeddings = []
  # tqdm wraps the list itself (not enumerate) so the bar knows the total.
  for id_filepath, filepath in enumerate(tqdm(files_path)):
    ids.append(f'id_{id_filepath}')
    # Close each image file once its embedding has been computed.
    with Image.open(filepath) as image:
      embedding = get_single_image_embedding(image=image)
    # Plain Python lists are the embedding format Chroma documents; this avoids
    # depending on NumPy-array support in the installed version.
    embeddings.append(np.asarray(embedding).tolist())
  collection.add(
      embeddings=embeddings,
      ids=ids
  )

Khởi tạo một client cho cơ sở dữ liệu Chroma và tạo một collection mới với cấu hình
sử dụng L2 để so sánh các embedding vector. Sau đó, gọi hàm add_embedding để thêm các vector đặc trưng của ảnh vào collection này, qua đó tạo điều kiện thuận lợi cho việc truy vấn nhanh chóng và hiệu quả

In [None]:
# Create a Chroma client
chroma_client = chromadb.Client()
# Create (or reuse) a collection whose index compares vectors with the L2 distance;
# "hnsw:space" is the metadata key Chroma reads the distance function from.
l2_collection = chroma_client.get_or_create_collection(name='l2_collection',
                                                      metadata={"hnsw:space": "l2"})
add_embedding(collection=l2_collection, files_path=files_path)

Hàm search được định nghĩa để thực hiện truy xuất các ảnh dựa trên embedding của ảnh truy vấn.
Hàm này nhận đường dẫn của ảnh truy vấn, collection cần truy vấn và số lượng kết quả trả về mong muốn, sau
đó trả về danh sách các kết quả phù hợp.

In [None]:
def search(image_path, collection, n_results):
  """Return the `n_results` nearest entries of `collection` for the image at `image_path`.

  Parameters:
  image_path (str): Path of the query image file.
  collection: Chroma collection to query.
  n_results (int): How many results to return.

  Returns:
  The value returned by `collection.query` for the single query embedding.
  """
  # Close the image file once its embedding has been computed.
  with Image.open(image_path) as query_image:
    query_embedding = get_single_image_embedding(query_image)
  results = collection.query(
      # Chroma documents query_embeddings as a list of embeddings, so the single
      # vector is wrapped explicitly as a list of plain floats.
      query_embeddings=[np.asarray(query_embedding).tolist()],
      n_results=n_results  # how many results to return
  )
  return results

In [None]:
test_path = f'{ROOT}/test'
test_files_path = get_files_path(path=test_path)
# Query with one concrete test image; search() needs a file path, not the test directory.
test_path = test_files_path[1]
l2_results = search(image_path=test_path, collection=l2_collection, n_results=5)
# NOTE(review): plot_results is not defined in any cell visible here, and this call uses different
# keyword arguments than the earlier plot_results(query_path, ls_path_score, reverse=...) calls - confirm its definition.
plot_results(image_path=test_path, files_path=files_path, results=l2_results)

### **Truy vấn ảnh với Cosine Similarity Collection**

In [None]:
# Create (or reuse) a collection whose index compares vectors with the cosine distance.
# It is bound to `cosine_collection` so the cosine_similarity() metric function is not overwritten.
cosine_collection = chroma_client.get_or_create_collection(name='cosine_similarity',
                                                      metadata={"hnsw:space": "cosine"})
add_embedding(collection=cosine_collection, files_path=files_path)

# **Chương trình Truy Vấn Ảnh Cá Nhân Hóa**

In [5]:
# Install the Python packages and system tools used by the crawler below:
# tqdm and selenium through pip, wget plus chromium and its chromedriver through apt.
# These are shell commands run from the notebook; their order is left unchanged.
!pip install tqdm
!apt-get update
!apt-get install -y wget
!pip install selenium
!apt-get install -y chromium-browser
!apt-get install -y chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup # For parsing HTML content
from urllib.parse import urljoin, urlparse # For handling URLs
import urllib.request # For making HTTP requests
import time # For handling time-related operations
import os # For interacting with the operating system (relate to dir, folder, file)
from tqdm import tqdm # For displaying progress bars (visualize progress)
import concurrent.futures # For multi-threading
import json # For writing to a text file
from PIL import Image # For handling images
from posix import terminal_size

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Ign:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [921 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,552 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2,841 kB]
Hit:12 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:13 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubun

**Thu thập dữ liệu - Crawl URL từ Website**

In [13]:
class UrlScraper:
  """Collect image URLs from a search-results page with a headless Chrome driver.

  Attributes:
  url_template (str): Search URL containing a `{search_term}` placeholder.
  max_images (int): Maximum number of URLs collected per search term.
  max_workers (int): Number of threads used by `scrape_urls`.
  """

  # Directories appended to PATH so the chromium binary and its driver can be found.
  # NOTE(review): assumed to be where the apt chromium packages install - confirm on the target machine.
  _CHROMIUM_DIRS = ('/usr/lib/chromium-browser', '/usr/lib/chromium-browser/chromedriver')

  # Image sources that are page decoration rather than search results.
  _IGNORED_URLS = frozenset([
      "https://combo.staticflickr.com/ap/build/images/getty/IStock_corporate_logo.svg",
  ])

  #Constructor
  def __init__(self, url_template, max_images=50, max_workers=4):
    self.url_template = url_template #link crawl
    self.max_images = max_images #Max images
    self.max_workers = max_workers #Thread
    self.setup_environment() #Call for set up environment

  #Set up environment
  def setup_environment(self):
    """
    Append the chromium directories to PATH, once.

    Entries are absolute paths and are only added when missing, so creating
    several scrapers does not keep growing PATH.
    """
    path_entries = os.environ.get('PATH', '').split(os.pathsep)
    for directory in self._CHROMIUM_DIRS:
      if directory not in path_entries:
        path_entries.append(directory)
    os.environ['PATH'] = os.pathsep.join(path_entries)

  def get_url_images(self, term):
    """
    Crawl the urls of images by term

    Parameters:
    term (str): The name of animal, plant, scenery, furniture

    Returns:
    urls (list): List of unique urls of images, at most `max_images` long
    """

    #Initialize Chrome driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    urls = []
    seen = set() # every pass re-parses the whole page, so known urls must be skipped
    pbar = tqdm(total=self.max_images, desc=f"Fetching images for {term}") #Set up for visualize progress

    # try/finally guarantees the browser and the progress bar are released even
    # when the page load or the parsing fails.
    try:
      url = self.url_template.format(search_term=term)
      driver.get(url)

      more_content_available = True
      while len(urls) < self.max_images and more_content_available:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        img_tags = soup.find_all("img")

        for img in img_tags:
          if len(urls) >= self.max_images:
            break
          href = img.attrs.get('src')
          if not href:
            continue
          img_path = urljoin(url, href)
          # Swap the size suffix of the file name for "_b".
          img_path = img_path.replace("_m.jpg", "_b.jpg").replace("_n.jpg", "_b.jpg").replace("_w.jpg", "_b.jpg")
          if img_path in self._IGNORED_URLS or img_path in seen:
            continue
          seen.add(img_path)
          urls.append(img_path)
          pbar.update(1)

        # Enough urls collected: skip the button wait and the extra scroll.
        if len(urls) >= self.max_images:
          break

        try:
          # NOTE(review): this element id looks auto-generated and may not match on other
          # page loads; when it does not, the 10 s wait times out and the scroll fallback runs.
          load_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[@id="yui_3_16_0_1_1721642285931_28620"]')))
          load_more_button.click()
          time.sleep(2)
        except Exception:
          # No clickable button: scroll to the bottom to trigger lazy loading.
          driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
          time.sleep(2)

          # Stop when scrolling did not add any <img> tag to the page.
          new_soup = BeautifulSoup(driver.page_source, "html.parser")
          new_img_tags = new_soup.find_all("img")
          if len(new_img_tags) == len(img_tags):
            more_content_available = False
    finally:
      pbar.close()
      driver.quit()
    return urls

  def scrape_urls(self, categories):
    """
    Call get_url_images method to get all urls of any object in categories

    Parameter:
    categories (dictionary): the dict of all object we need to collect image with format
      categories{"name_object": [value1, value2, ...]}

    Returns:
    all_urls (dictionary): Dictionary of urls of images, as
      {category: {term: [url, ...]}}; terms whose crawl raised are left out
    """

    all_urls = {category: {} for category in categories}

    #Handle multi-threading: one get_url_images task per (category, term)
    with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
      futures_to_term = {executor.submit(self.get_url_images, term): (category, term) for category, terms in categories.items() for term in terms}

      for future in tqdm(concurrent.futures.as_completed(futures_to_term), total=len(futures_to_term), desc="Overall Progress"):
        category, term = futures_to_term[future]

        try:
          urls = future.result()
          all_urls[category][term] = urls
          print(f"\nNumber of images retrieved for {term}: {len(urls)}")
        except Exception as exc:
          # A failing term is reported and skipped so the other terms still complete.
          print(f"\n{term} generated an exception: {exc}")

    return all_urls

  def save_to_file(self, data,  filename):
    """
    Save the data to a JSON file.

    Parameters:
    data (dict): The data to be saved.
    filename (str): The name of the JSON file.

    Returns:
    None

    """
    with open(filename, "w") as file:
      json.dump(data, file, indent=4)
    print(f"Data saved to {filename}")


# Search terms to crawl, grouped by category. Each term is substituted into the
# search URL template and becomes a key of the saved JSON, so the spelling is kept as is.
categories = {
    "animal": ["Monkey", "Elephant", "cows", "Cat", "Dog", "bear", "fox", "Civet", "Pangolins",
               "Rabbit", "Bats", "Whale", "Cock", "Owl", "flamingo", "Lizard", "Turtle", "Snake",
               "Frog", "Fish", "shrimp", "Crab", "Snail", "Coral", "Jellyfish", "Butterfly", "Flies",
               "Mosquito", "Ants", "Cockroaches", "Spider", "scorpion", "tiger", "bird", "horse",
               "pig", "Alligator", "Alpaca", "Anteater", "donkey", "Bee", "Buffalo", "Camel"],

    "plant": ["Bamboo", "Apple", "Apricot", "Banana", "Bean", "Wildflower", "Flower",
              "Mushroom", "Weed", "Fern", "Reed", "Shrub", "Moss", "Grass", "Palmtree", "Corn",
              "Tulip", "Rose", "Clove", "Dogwood", "Durian", "Ferns", "Fig", "Flax", "Frangipani",
              "Lantana", "Hibiscus", "Bougainvillea", "Pea", "OrchidTree", "RangoonCreeper",
              "Jackfruit", "Cottonplant", "Cornelainetree", "Coffeplant", "Coconut", "wheat",
              "watermelon", "radish", "carrot"],

    "furniture": ["bed", "cabinet", "chair", "chests", "clock", "desks", "table", "Piano",
                  "Bookcase", "Umbrella", "Clothes", "cart", "sofa", "ball", "spoon", "Bowl", "fridge",
                  "pan", "book"],

    "scenery": ["Cliff", "Bay", "Coast", "Mountains", "Forests", "Waterbodies", "Lake",
                "desert", "Farmland", "river", "hedges", "plain", "sky", "cave", "cloud", "flowergarden",
                "glacier", "grassland", "horizon", "lighthouse", "plateau", "savannah", "valley",
                "volcano", "waterfall"]
}

# Search URL template per source site; {search_term} is filled in by UrlScraper.get_url_images.
urltopic = {"flickr": "https://www.flickr.com/search/?text={search_term}"}
# Crawl at most 20 image urls per term, using 5 worker threads.
scraper = UrlScraper(url_template=urltopic["flickr"], max_images=20, max_workers=5)
image_urls = scraper.scrape_urls(categories)
# Persist the {category: {term: [urls]}} mapping as JSON.
scraper.save_to_file(image_urls, 'image_urls.json')

Overall Progress:   0%|          | 0/127 [00:00<?, ?it/s]
Fetching images for cows:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for cows:   5%|▌         | 1/20 [00:01<00:20,  1.09s/it][A

Fetching images for Dog:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Dog:   5%|▌         | 1/20 [00:02<00:43,  2.26s/it][A[A


Fetching images for Monkey:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for Elephant:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for Cat:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A


Fetching images for Monkey:   5%|▌         | 1/20 [00:01<00:30,  1.62s/it][A[A[A



Fetching images for Elephant:   5%|▌         | 1/20 [00:01<00:34,  1.80s/it][A[A[A[A




Fetching images for cows: 100%|██████████| 20/20 [00:14<00:00,  1.42it/s]
Overall Progress:   1%|          | 1/127 [00:43<1:32:21, 43.98s/it]


Number of images retrieved for cows: 20



Fetching images for bear:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Dog: 100%|██████████| 20/20 [00:14<00:00,  1.34it/s]
Overall Progress:   2%|▏         | 2/127 [00:48<43:02, 20.66s/it]  


Number of images retrieved for Dog: 20




Fetching images for fox:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Monkey: 100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
Overall Progress:   2%|▏         | 3/127 [00:52<26:51, 13.00s/it]


Number of images retrieved for Monkey: 20


Fetching images for Elephant: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Fetching images for Cat: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]


Number of images retrieved for Elephant: 20



Overall Progress:   4%|▍         | 5/127 [00:54<11:19,  5.57s/it]


Number of images retrieved for Cat: 20





Fetching images for bear: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]



Overall Progress:   5%|▍         | 6/127 [01:04<14:34,  7.23s/it]


Number of images retrieved for bear: 20



Fetching images for Pangolins:   0%|          | 0/20 [00:00<?, ?it/s][A



Fetching images for Rabbit:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A
Fetching images for Pangolins:   5%|▌         | 1/20 [00:02<00:51,  2.72s/it][A



Fetching images for fox: 100%|██████████| 20/20 [00:20<00:00,  1.01s/it]
Overall Progress:   6%|▌         | 7/127 [01:11<14:27,  7.23s/it]


Number of images retrieved for fox: 20




Fetching images for Bats:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Bats:   5%|▌         | 1/20 [00:00<00:10,  1.89it/s][A[A




Fetching images for Civet: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:   6%|▋         | 8/127 [01:17<13:15,  6.69s/it]




Fetching images for Whale:   5%|▌         | 1/20 [00:00<00:08,  2.23it/s]


Number of images retrieved for Civet: 20


[A[A[A[A[A


Fetching images for Pangolins: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]
Overall Progress:   7%|▋         | 9/127 [01:21<11:17,  5.74s/it]


Number of images retrieved for Pangolins: 20





Fetching images for Rabbit: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:   8%|▊         | 10/127 [01:22<08:52,  4.55s/it]


Number of images retrieved for Rabbit: 20



Fetching images for Bats: 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]
Overall Progress:   9%|▊         | 11/127 [01:27<08:33,  4.43s/it]


Number of images retrieved for Bats: 20




Fetching images for flamingo:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Owl:   5%|▌         | 1/20 [00:02<00:47,  2.52s/it][A

Fetching images for Whale: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:   9%|▉         | 12/127 [01:33<09:54,  5.17s/it]


Number of images retrieved for Whale: 20






Fetching images for Cock: 100%|██████████| 20/20 [00:17<00:00,  1.18it/s]
Overall Progress:  10%|█         | 13/127 [01:38<09:11,  4.83s/it]



Fetching images for Lizard:   5%|▌         | 1/20 [00:01<00:20,  1.06s/it][A[A[A[A


Number of images retrieved for Cock: 20





Fetching images for Turtle:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Owl: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  11%|█         | 14/127 [01:44<09:48,  5.21s/it]


Number of images retrieved for Owl: 20



Fetching images for flamingo: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  12%|█▏        | 15/127 [01:46<08:03,  4.31s/it]
Fetching images for Snake:   5%|▌         | 1/20 [00:01<00:21,  1.15s/it][A


Number of images retrieved for flamingo: 20




Fetching images for Frog:   0%|          | 0/20 [00:00<?, ?it/s][A[A




Fetching images for Fish:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A

Fetching images for Frog:   5%|▌         | 1/20 [00:00<00:08,  2.17it/s][A[A




Fetching images for Lizard: 100%|██████████| 20/20 [00:13<00:00,  1.47it/s]
Overall Progress:  13%|█▎        | 16/127 [01:50<08:06,  4.39s/it]


Number of images retrieved for Lizard: 20






Fetching images for shrimp:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Turtle: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s]
Overall Progress:  13%|█▎        | 17/127 [01:54<07:46,  4.24s/it]


Number of images retrieved for Turtle: 20





Fetching images for Crab:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Snake: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Overall Progress:  14%|█▍        | 18/127 [02:00<08:32,  4.70s/it]


Number of images retrieved for Snake: 20


Fetching images for Fish: 100%|██████████| 20/20 [00:13<00:00,  1.45it/s]
Fetching images for Frog: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:  16%|█▌        | 20/127 [02:02<04:53,  2.74s/it]


Number of images retrieved for Fish: 20

Number of images retrieved for Frog: 20


Fetching images for shrimp: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  17%|█▋        | 21/127 [02:09<06:59,  3.95s/it]


Number of images retrieved for shrimp: 20



Fetching images for Snail:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Snail:   5%|▌         | 1/20 [00:02<00:50,  2.67s/it][A

Fetching images for Coral:   0%|          | 0/20 [00:00<?, ?it/s][A[A



Fetching images for Crab: 100%|██████████| 20/20 [00:18<00:00,  1.07it/s]
Overall Progress:  17%|█▋        | 22/127 [02:16<08:30,  4.86s/it]


Number of images retrieved for Crab: 20






Fetching images for Jellyfish:   5%|▌         | 1/20 [00:01<00:25,  1.37s/it][A[A[A[A

Fetching images for Coral:   5%|▌         | 1/20 [00:03<00:57,  3.03s/it][A[A


Fetching images for Butterfly:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Butterfly:   5%|▌         | 1/20 [00:00<00:12,  1.58it/s][A[A[A




Fetching images for Flies:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Snail: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Overall Progress:  18%|█▊        | 23/127 [02:26<11:03,  6.38s/it]


Number of images retrieved for Snail: 20



Fetching images for Jellyfish: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Overall Progress:  19%|█▉        | 24/127 [02:33<11:08,  6.49s/it]
Fetching images for Mosquito:   5%|▌         | 1/20 [00:01<00:31,  1.64s/it][A


Number of images retrieved for Jellyfish: 20


Fetching images for Coral: 100%|██████████| 20/20 [00:18<00:00,  1.08it/s]
Overall Progress:  20%|█▉        | 25/127 [02:33<08:05,  4.76s/it]


Number of images retrieved for Coral: 20


Fetching images for Butterfly: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  20%|██        | 26/127 [02:37<07:31,  4.47s/it]


Number of images retrieved for Butterfly: 20


Fetching images for Flies: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  21%|██▏       | 27/127 [02:38<05:54,  3.54s/it]


Number of images retrieved for Flies: 20




Fetching images for Ants:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Ants:   5%|▌         | 1/20 [00:00<00:18,  1.02it/s][A[A


Fetching images for Cockroaches:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Cockroaches:   5%|▌         | 1/20 [00:00<00:17,  1.09it/s][A[A[A



Fetching images for Mosquito: 100%|██████████| 20/20 [00:17<00:00,  1.14it/s]
Overall Progress:  22%|██▏       | 28/127 [02:49<09:12,  5.58s/it]


Number of images retrieved for Mosquito: 20






Fetching images for Spider:   5%|▌         | 1/20 [00:01<00:37,  1.95s/it][A[A[A[A
Fetching images for scorpion:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for scorpion:   5%|▌         | 1/20 [00:01<00:20,  1.09s/it][A




Fetching images for tiger:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Ants: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]
Overall Progress:  23%|██▎       | 29/127 [02:56<09:57,  6.10s/it]


Number of images retrieved for Ants: 20


Fetching images for Cockroaches: 100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
Overall Progress:  24%|██▎       | 30/127 [02:57<07:21,  4.55s/it]


Number of images retrieved for Cockroaches: 20




Fetching images for bird:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for bird:   5%|▌         | 1/20 [00:02<00:38,  2.00s/it][A[A

Fetching images for bird:  10%|█         | 2/20 [00:02<00:16,  1.10it/s][A[A


Fetching images for Spider: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
Overall Progress:  24%|██▍       | 31/127 [03:06<09:26,  5.90s/it]


Number of images retrieved for Spider: 20





Fetching images for horse:   5%|▌         | 1/20 [00:02<00:44,  2.33s/it][A[A[A


Fetching images for scorpion: 100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
Overall Progress:  25%|██▌       | 32/127 [03:08<07:13,  4.57s/it]


Number of images retrieved for scorpion: 20


Fetching images for tiger: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Overall Progress:  26%|██▌       | 33/127 [03:08<05:23,  3.44s/it]


Number of images retrieved for tiger: 20



Fetching images for pig:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for pig:   5%|▌         | 1/20 [00:00<00:11,  1.66it/s][A



Fetching images for Alligator:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Alligator:   5%|▌         | 1/20 [00:00<00:15,  1.23it/s][A[A[A[A




Fetching images for Alpaca:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for bird: 100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
Overall Progress:  27%|██▋       | 34/127 [03:16<07:26,  4.80s/it]


Number of images retrieved for bird: 20


Fetching images for horse: 100%|██████████| 20/20 [00:15<00:00,  1.31it/s]
Overall Progress:  28%|██▊       | 35/127 [03:20<06:38,  4.33s/it]


Number of images retrieved for horse: 20




Fetching images for Anteater:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for pig: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  28%|██▊       | 36/127 [03:27<07:49,  5.16s/it]


Number of images retrieved for pig: 20


Fetching images for Alligator: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  29%|██▉       | 37/127 [03:31<07:27,  4.98s/it]
Fetching images for donkey:   0%|          | 0/20 [00:00<?, ?it/s][A


Number of images retrieved for Alligator: 20


Fetching images for Alpaca: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  30%|██▉       | 38/127 [03:33<05:59,  4.04s/it]
Fetching images for donkey:   5%|▌         | 1/20 [00:02<00:38,  2.02s/it][A


Number of images retrieved for Alpaca: 20





Fetching images for Bee:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Anteater: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  31%|███       | 39/127 [03:39<06:56,  4.74s/it]


Number of images retrieved for Anteater: 20




Fetching images for Buffalo:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Buffalo:   5%|▌         | 1/20 [00:00<00:09,  2.02it/s][A[A



Fetching images for Camel:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Camel:   5%|▌         | 1/20 [00:00<00:13,  1.40it/s][A[A[A[A




Fetching images for Bamboo:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for donkey: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  31%|███▏      | 40/127 [03:46<07:36,  5.25s/it]


Number of images retrieved for donkey: 20



Fetching images for Apple:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Bee: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Overall Progress:  32%|███▏      | 41/127 [03:51<07:38,  5.34s/it]


Number of images retrieved for Bee: 20


Fetching images for Buffalo: 100%|██████████| 20/20 [00:15<00:00,  1.33it/s]
Overall Progress:  33%|███▎      | 42/127 [03:56<07:07,  5.02s/it]


Number of images retrieved for Buffalo: 20


Fetching images for Camel: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  34%|███▍      | 43/127 [03:59<06:13,  4.44s/it]


Number of images retrieved for Camel: 20




Fetching images for Bamboo: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]
Overall Progress:  35%|███▍      | 44/127 [04:00<04:43,  3.41s/it]


Number of images retrieved for Bamboo: 20




Fetching images for Apple: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Overall Progress:  35%|███▌      | 45/127 [04:05<05:33,  4.07s/it]


Number of images retrieved for Apple: 20



Fetching images for Banana:   0%|          | 0/20 [00:00<?, ?it/s][A


Fetching images for Bean:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A
Fetching images for Banana:   5%|▌         | 1/20 [00:01<00:23,  1.25s/it][A



Fetching images for Wildflower:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A


Fetching images for Bean:   5%|▌         | 1/20 [00:00<00:17,  1.11it/s][A[A[A



Fetching images for Wildflower:   5%|▌         | 1/20 [00:02<00:41,  2.17s/it][A[A[A[A




Fetching images for Flower:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Apricot: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  36%|███▌      | 46/127 [04:14<07:09,  5.31s/it]


Number of images retrieved for Apricot: 20




Fetching images for Mushroom:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Banana: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]
Overall Progress:  37%|███▋      | 47/127 [04:23<08:49,  6.61s/it]


Number of images retrieved for Banana: 20


Fetching images for Bean: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Overall Progress:  38%|███▊      | 48/127 [04:25<06:56,  5.27s/it]


Number of images retrieved for Bean: 20


Fetching images for Wildflower: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Fetching images for Flower: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]


Number of images retrieved for Wildflower: 20



Overall Progress:  39%|███▉      | 50/127 [04:27<03:51,  3.00s/it]


Number of images retrieved for Flower: 20



Fetching images for Shrub:   0%|          | 0/20 [00:00<?, ?it/s][A


Fetching images for Mushroom: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  40%|████      | 51/127 [04:33<04:54,  3.87s/it]


Number of images retrieved for Mushroom: 20




Fetching images for Fern:   0%|          | 0/20 [00:00<?, ?it/s][A[A


Fetching images for Weed:   5%|▌         | 1/20 [00:05<01:37,  5.11s/it][A[A[A

Fetching images for Fern:   5%|▌         | 1/20 [00:02<00:46,  2.43s/it][A[A



Fetching images for Reed:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A
Fetching images for Shrub:   5%|▌         | 1/20 [00:10<03:20, 10.55s/it][A



Fetching images for Reed:   5%|▌         | 1/20 [00:01<00:29,  1.58s/it][A[A[A[A




Fetching images for Moss:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Weed: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Overall Progress:  41%|████      | 52/127 [04:49<09:30,  7.60s/it]


Number of images retrieved for Weed: 20


Fetching images for Fern: 100%|██████████| 20/20 [00:15<00:00,  1.29it/s]
Overall Progress:  42%|████▏     | 53/127 [04:52<07:23,  5.99s/it]


Number of images retrieved for Fern: 20


Fetching images for Shrub: 100%|██████████| 20/20 [00:24<00:00,  1.24s/it]
Overall Progress:  43%|████▎     | 54/127 [04:55<06:11,  5.08s/it]


Number of images retrieved for Shrub: 20


Fetching images for Reed: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  43%|████▎     | 55/127 [04:56<04:50,  4.04s/it]


Number of images retrieved for Reed: 20



Fetching images for Moss: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  44%|████▍     | 56/127 [05:00<04:30,  3.81s/it]


Number of images retrieved for Moss: 20



Fetching images for Grass:   5%|▌         | 1/20 [00:02<00:48,  2.56s/it][A

Fetching images for Palmtree:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Palmtree:   5%|▌         | 1/20 [00:02<00:45,  2.39s/it][A[A


Fetching images for Corn:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Corn:   5%|▌         | 1/20 [00:00<00:10,  1.86it/s][A[A[A



Fetching images for Tulip:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Tulip:   5%|▌         | 1/20 [00:00<00:12,  1.56it/s][A[A[A[A




Fetching images for Rose:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Grass: 100%|██████████| 20/20 [00:15<00:00,  1.27it/s]
Overall Progress:  45%|████▍     | 57/127 [05:14<08:15,  7.08s/it]


Number of images retrieved for Grass: 20


Fetching images for Palmtree: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Overall Progress:  46%|████▌     | 58/127 [05:18<07:05,  6.17s/it]


Number of images retrieved for Palmtree: 20


Fetching images for Corn: 100%|██████████| 20/20 [00:18<00:00,  1.06it/s]
Fetching images for Tulip: 100%|██████████| 20/20 [00:18<00:00,  1.10it/s]
Overall Progress:  46%|████▋     | 59/127 [05:24<06:41,  5.90s/it]


Number of images retrieved for Corn: 20


Fetching images for Rose: 100%|██████████| 20/20 [00:17<00:00,  1.14it/s]
Overall Progress:  48%|████▊     | 61/127 [05:24<03:21,  3.06s/it]


Number of images retrieved for Tulip: 20

Number of images retrieved for Rose: 20



Fetching images for Clove:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Clove:   5%|▌         | 1/20 [00:01<00:29,  1.57s/it][A

Fetching images for Dogwood:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Dogwood:   5%|▌         | 1/20 [00:01<00:32,  1.71s/it][A[A


Fetching images for Durian:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A




Fetching images for Fig:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A



Fetching images for Ferns:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A


Fetching images for Durian:   5%|▌         | 1/20 [00:01<00:12,  1.47it/s][A[A[A


Fetching images for Durian:  10%|█         | 2/20 [00:01<00:12,  1.40it/s][A[A[A




Fetching images for Fig:   5%|▌         | 1/20 [00:01<00:29,  1.57s/it][A[A[A[A[A



Fetching images for Clove: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]
Overall Progress:  49%|████▉     | 62/127 [05:40<07:25,  6.86s/it]


Number of images retrieved for Clove: 20



Fetching images for Flax:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Dogwood: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Overall Progress:  50%|████▉     | 63/127 [05:45<06:40,  6.25s/it]


Number of images retrieved for Dogwood: 20


Fetching images for Ferns: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Overall Progress:  50%|█████     | 64/127 [05:50<06:23,  6.09s/it]


Number of images retrieved for Ferns: 20


Fetching images for Durian: 100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
Fetching images for Fig: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Overall Progress:  51%|█████     | 65/127 [05:51<04:37,  4.47s/it]


Number of images retrieved for Fig: 20

Number of images retrieved for Durian: 20




Fetching images for Frangipani:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Frangipani:   5%|▌         | 1/20 [00:01<00:24,  1.26s/it][A[A


Fetching images for Bougainvillea:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for Lantana:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for Flax: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Overall Progress:  53%|█████▎    | 67/127 [06:01<04:44,  4.74s/it]


Number of images retrieved for Flax: 20






Fetching images for Lantana:   5%|▌         | 1/20 [00:01<00:26,  1.37s/it][A[A[A[A




Fetching images for Hibiscus:   5%|▌         | 1/20 [00:01<00:27,  1.42s/it][A[A[A[A[A


Fetching images for Bougainvillea:   5%|▌         | 1/20 [00:01<00:30,  1.63s/it][A[A[A
Fetching images for Pea:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for Frangipani: 100%|██████████| 20/20 [00:13<00:00,  1.47it/s]
Overall Progress:  54%|█████▎    | 68/127 [06:05<04:30,  4.59s/it]


Number of images retrieved for Frangipani: 20




Fetching images for OrchidTree:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for Hibiscus: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Fetching images for Lantana: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Fetching images for Bougainvillea: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Overall Progress:  56%|█████▌    | 71/127 [06:16<03:20,  3.58s/it]


Number of images retrieved for Hibiscus: 20

Number of images retrieved for Lantana: 20

Number of images retrieved for Bougainvillea: 20


Fetching images for Pea: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s]
Overall Progress:  57%|█████▋    | 72/127 [06:18<02:55,  3.18s/it]


Number of images retrieved for Pea: 20


Fetching images for OrchidTree: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
Overall Progress:  57%|█████▋    | 73/127 [06:27<04:01,  4.48s/it]


Number of images retrieved for OrchidTree: 20



Fetching images for Cottonplant:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for RangoonCreeper:   0%|          | 0/20 [00:00<?, ?it/s][A[A


Fetching images for Cornelainetree:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for Jackfruit:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A
Fetching images for Cottonplant:   5%|▌         | 1/20 [00:01<00:21,  1.16s/it][A

Fetching images for RangoonCreeper:   5%|▌         | 1/20 [00:01<00:25,  1.33s/it][A[A



Fetching images for Jackfruit:   5%|▌         | 1/20 [00:01<00:23,  1.25s/it][A[A[A[A




Fetching images for Coffeplant:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Cornelainetree:   0%|          | 0/20 [00:13<?, ?it/s]
Overall Progress:  58%|█████▊    | 74/127 [06:42<06:35,  7.46s/it]


Number of images retrieved for Cornelainetree: 0


Fetching images for RangoonCreeper: 100%|██████████| 20/20 [00:15<00:00,  1.27it/s]
Fetching images for Cottonplant: 100%|██████████| 20/20 [00:16<00:00,  1.25it/s]
Overall Progress:  60%|█████▉    | 76/127 [06:44<03:33,  4.18s/it]


Number of images retrieved for RangoonCreeper: 20

Number of images retrieved for Cottonplant: 20


Fetching images for Jackfruit: 100%|██████████| 20/20 [00:15<00:00,  1.29it/s]
Overall Progress:  61%|██████    | 77/127 [06:44<02:35,  3.12s/it]


Number of images retrieved for Jackfruit: 20


Fetching images for Coffeplant: 100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
Overall Progress:  61%|██████▏   | 78/127 [06:47<02:23,  2.93s/it]


Number of images retrieved for Coffeplant: 20



Fetching images for Coconut:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for watermelon:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Coconut:   5%|▌         | 1/20 [00:01<00:18,  1.01it/s][A


Fetching images for wheat:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for radish:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A


Fetching images for wheat:   5%|▌         | 1/20 [00:01<00:20,  1.07s/it][A[A[A

Fetching images for watermelon:   5%|▌         | 1/20 [00:01<00:24,  1.29s/it][A[A



Fetching images for radish:   5%|▌         | 1/20 [00:01<00:24,  1.27s/it][A[A[A[A




Fetching images for carrot:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Coconut: 100%|██████████| 20/20 [00:13<00:00,  1.46it/s]
Overall Progress:  62%|██████▏   | 79/127 [07:07<06:20,  7.93s/it]


Number of images retrieved for Coconut: 20


Fetching images for watermelon: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Fetching images for wheat: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]
Overall Progress:  63%|██████▎   | 80/127 [07:09<04:58,  6.35s/it]


Number of images retrieved for watermelon: 20


Overall Progress:  64%|██████▍   | 81/127 [07:09<03:28,  4.53s/it]


Number of images retrieved for wheat: 20


Fetching images for radish: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Overall Progress:  65%|██████▍   | 82/127 [07:13<03:06,  4.15s/it]


Number of images retrieved for radish: 20


Fetching images for carrot: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  65%|██████▌   | 83/127 [07:14<02:25,  3.32s/it]


Number of images retrieved for carrot: 20



Fetching images for bed:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for bed:   5%|▌         | 1/20 [00:01<00:29,  1.58s/it][A

Fetching images for chair:   0%|          | 0/20 [00:00<?, ?it/s][A[A


Fetching images for cabinet:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A

Fetching images for chair:   5%|▌         | 1/20 [00:01<00:21,  1.11s/it][A[A


Fetching images for cabinet:   5%|▌         | 1/20 [00:01<00:25,  1.33s/it][A[A[A



Fetching images for chests:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for clock:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A



Fetching images for chests:   5%|▌         | 1/20 [00:00<00:15,  1.21it/s][A[A[A[A




Fetching images for bed: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Overall Progress:  66%|██████▌   | 84/127 [07:31<05:15,  7.33s/it]


Number of images retrieved for bed: 20


Fetching images for cabinet: 100%|██████████| 20/20 [00:16<00:00,  1.22it/s]
Overall Progress:  67%|██████▋   | 85/127 [07:37<04:56,  7.06s/it]


Number of images retrieved for cabinet: 20



Fetching images for desks:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for clock: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Fetching images for chests: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  69%|██████▊   | 87/127 [07:41<02:51,  4.28s/it]


Number of images retrieved for clock: 20

Number of images retrieved for chests: 20




Fetching images for chair: 100%|██████████| 20/20 [00:22<00:00,  1.13s/it][A[A


Fetching images for table:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for table:   5%|▌         | 1/20 [00:01<00:19,  1.05s/it][A[A[A



Fetching images for Bookcase:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for Piano:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for Piano:   5%|▌         | 1/20 [00:00<00:17,  1.08it/s][A[A[A[A[A



Fetching images for desks: 100%|██████████| 20/20 [00:15<00:00,  1.29it/s]
Overall Progress:  69%|██████▉   | 88/127 [07:53<04:15,  6.55s/it]


Number of images retrieved for desks: 20



Fetching images for Umbrella:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for chair: 100%|██████████| 20/20 [00:36<00:00,  1.82s/it]
Overall Progress:  70%|███████   | 89/127 [07:57<03:41,  5.83s/it]


Number of images retrieved for chair: 20


Fetching images for table: 100%|██████████| 20/20 [00:15<00:00,  1.27it/s]
Overall Progress:  71%|███████   | 90/127 [08:01<03:19,  5.39s/it]


Number of images retrieved for table: 20




Fetching images for Bookcase: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Overall Progress:  72%|███████▏  | 91/127 [08:04<02:47,  4.64s/it]

Fetching images for Piano: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]


Number of images retrieved for Bookcase: 20



Overall Progress:  72%|███████▏  | 92/127 [08:05<01:56,  3.34s/it]


Number of images retrieved for Piano: 20





Fetching images for Umbrella: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]

Overall Progress:  73%|███████▎  | 93/127 [08:13<02:44,  4.85s/it]


Fetching images for cart:   5%|▌         | 1/20 [00:03<01:01,  3.25s/it][A[A[A


Number of images retrieved for Umbrella: 20






Fetching images for ball:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A
Fetching images for sofa:   5%|▌         | 1/20 [00:02<00:45,  2.40s/it][A



Fetching images for Clothes: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]
Overall Progress:  74%|███████▍  | 94/127 [08:18<02:38,  4.82s/it]


Number of images retrieved for Clothes: 20




Fetching images for spoon:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for spoon:   5%|▌         | 1/20 [00:00<00:05,  3.50it/s][A[A




Fetching images for Bowl:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for cart: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  75%|███████▍  | 95/127 [08:27<03:19,  6.24s/it]


Number of images retrieved for cart: 20


Fetching images for ball: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]
Fetching images for sofa: 100%|██████████| 20/20 [00:15<00:00,  1.25it/s]
Overall Progress:  76%|███████▌  | 96/127 [08:29<02:30,  4.84s/it]


Number of images retrieved for sofa: 20

Number of images retrieved for ball: 20


Fetching images for spoon: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]
Overall Progress:  77%|███████▋  | 98/127 [08:32<01:37,  3.38s/it]


Number of images retrieved for spoon: 20



Fetching images for pan:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Bowl: 100%|██████████| 20/20 [00:18<00:00,  1.08it/s]
Overall Progress:  78%|███████▊  | 99/127 [08:39<01:58,  4.22s/it]


Number of images retrieved for Bowl: 20




Fetching images for fridge:   5%|▌         | 1/20 [00:01<00:37,  1.97s/it][A[A


Fetching images for book:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A
Fetching images for pan:   5%|▌         | 1/20 [00:03<00:59,  3.13s/it][A


Fetching images for book:   5%|▌         | 1/20 [00:01<00:30,  1.62s/it][A[A[A



Fetching images for Cliff:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A



Fetching images for Cliff:   5%|▌         | 1/20 [00:00<00:11,  1.68it/s][A[A[A[A




Fetching images for Bay:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for fridge: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]
Overall Progress:  79%|███████▊  | 100/127 [08:55<03:15,  7.24s/it]


Number of images retrieved for fridge: 20


Fetching images for pan: 100%|██████████| 20/20 [00:18<00:00,  1.08it/s]
Overall Progress:  80%|███████▉  | 101/127 [08:56<02:26,  5.64s/it]


Number of images retrieved for pan: 20


Fetching images for book: 100%|██████████| 20/20 [00:17<00:00,  1.11it/s]
Overall Progress:  80%|████████  | 102/127 [08:58<01:55,  4.62s/it]


Number of images retrieved for book: 20


Fetching images for Cliff: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]
Overall Progress:  81%|████████  | 103/127 [08:59<01:22,  3.42s/it]


Number of images retrieved for Cliff: 20


Fetching images for Bay: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Overall Progress:  82%|████████▏ | 104/127 [09:01<01:11,  3.13s/it]


Number of images retrieved for Bay: 20



Fetching images for Coast:   0%|          | 0/20 [00:00<?, ?it/s][A

Fetching images for Mountains:   0%|          | 0/20 [00:00<?, ?it/s][A[A
Fetching images for Coast:   5%|▌         | 1/20 [00:02<00:50,  2.67s/it][A


Fetching images for Forests:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A

Fetching images for Mountains:   5%|▌         | 1/20 [00:01<00:35,  1.88s/it][A[A



Fetching images for Waterbodies:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




Fetching images for Lake:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A


Fetching images for Forests:   5%|▌         | 1/20 [00:01<00:22,  1.17s/it][A[A[A



Fetching images for Waterbodies:   5%|▌         | 1/20 [00:01<00:18,  1.01it/s][A[A[A[A




Fetching images for Coast: 100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
Overall Progress:  83%|████████▎ | 105/127 [09:21<03:00,  8.20s/it]


Number of images retrieved for Coast: 20


Fetching images for Mountains: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
Overall Progress:  83%|████████▎ | 106/127 [09:25<02:24,  6.89s/it]


Number of images retrieved for Mountains: 20


Fetching images for Forests: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]
Fetching images for Waterbodies: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]


Number of images retrieved for Forests: 20



Fetching images for Lake: 100%|██████████| 20/20 [00:16<00:00,  1.20it/s]


Number of images retrieved for Waterbodies: 20



Overall Progress:  86%|████████▌ | 109/127 [09:27<00:49,  2.75s/it]


Number of images retrieved for Lake: 20



Fetching images for desert:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for desert:   5%|▌         | 1/20 [00:04<01:16,  4.04s/it][A

Fetching images for river:   0%|          | 0/20 [00:00<?, ?it/s][A[A


Fetching images for Farmland:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A


Fetching images for Farmland:   5%|▌         | 1/20 [00:00<00:12,  1.57it/s][A[A[A



Fetching images for plain:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A

Fetching images for river:   5%|▌         | 1/20 [00:01<00:19,  1.03s/it][A[A




Fetching images for hedges:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A




Fetching images for hedges:   5%|▌         | 1/20 [00:01<00:19,  1.03s/it][A[A[A[A[A



Fetching images for desert: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Overall Progress:  87%|████████▋ | 110/127 [09:48<02:19,  8.19s/it]


Number of images retrieved for desert: 20


Fetching images for Farmland: 100%|██████████| 20/20 [00:15<00:00,  1.25it/s]
Overall Progress:  87%|████████▋ | 111/127 [09:52<01:51,  6.97s/it]


Number of images retrieved for Farmland: 20


Fetching images for river: 100%|██████████| 20/20 [00:16<00:00,  1.18it/s]
Fetching images for hedges: 100%|██████████| 20/20 [00:16<00:00,  1.24it/s]
Overall Progress:  89%|████████▉ | 113/127 [09:53<00:51,  3.66s/it]


Number of images retrieved for river: 20

Number of images retrieved for hedges: 20


Fetching images for plain: 100%|██████████| 20/20 [00:17<00:00,  1.14it/s]
Overall Progress:  90%|████████▉ | 114/127 [09:55<00:39,  3.01s/it]


Number of images retrieved for plain: 20



Fetching images for sky:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for sky:   5%|▌         | 1/20 [00:02<00:40,  2.12s/it][A

Fetching images for cave:   0%|          | 0/20 [00:00<?, ?it/s][A[A


Fetching images for cloud:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for flowergarden:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A

Fetching images for cave:   5%|▌         | 1/20 [00:02<00:44,  2.35s/it][A[A




Fetching images for glacier:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A



Fetching images for flowergarden:   5%|▌         | 1/20 [00:02<00:50,  2.66s/it][A[A[A[A


Fetching images for cloud:   5%|▌         | 1/20 [00:03<00:59,  3.11s/it][A[A[A




Fetching images for sky: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
Overall Progress:  91%|█████████ | 115/127 [10:12<01:25,  7.15s/it]


Number of images retrieved for sky: 20



Fetching images for grassland:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for cave: 100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
Overall Progress:  91%|█████████▏| 116/127 [10:20<01:21,  7.45s/it]


Number of images retrieved for cave: 20


Fetching images for glacier: 100%|██████████| 20/20 [00:16<00:00,  1.23it/s]
Fetching images for flowergarden: 100%|██████████| 20/20 [00:17<00:00,  1.13it/s]
Fetching images for cloud: 100%|██████████| 20/20 [00:18<00:00,  1.11it/s]
Overall Progress:  94%|█████████▎| 119/127 [10:22<00:25,  3.15s/it]


Number of images retrieved for glacier: 20

Number of images retrieved for flowergarden: 20

Number of images retrieved for cloud: 20




Fetching images for horizon:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Fetching images for horizon:   5%|▌         | 1/20 [00:00<00:10,  1.89it/s][A[A


Fetching images for plateau:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A



Fetching images for lighthouse:   0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A


Fetching images for plateau:   5%|▌         | 1/20 [00:00<00:15,  1.25it/s][A[A[A




Fetching images for grassland: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]




Overall Progress:  94%|█████████▍| 120/127 [10:30<00:31,  4.53s/it]




Fetching images for savannah:   5%|▌         | 1/20 [00:01<00:30,  1.60s/it][A[A[A[A[A


Number of images retrieved for grassland: 20



Fetching images for valley:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for horizon: 100%|██████████| 20/20 [00:13<00:00,  1.52it/s]
Overall Progress:  95%|█████████▌| 121/127 [10:38<00:31,  5.30s/it]


Number of images retrieved for horizon: 20




Fetching images for plateau: 100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
Overall Progress:  96%|█████████▌| 122/127 [10:44<00:27,  5.52s/it]


Number of images retrieved for plateau: 20




Fetching images for volcano:   5%|▌         | 1/20 [00:01<00:29,  1.53s/it][A[A

Fetching images for lighthouse: 100%|██████████| 20/20 [00:17<00:00,  1.12it/s]
Fetching images for savannah: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]
Overall Progress:  98%|█████████▊| 124/127 [10:46<00:10,  3.35s/it]


Number of images retrieved for lighthouse: 20

Number of images retrieved for savannah: 20


Fetching images for valley: 100%|██████████| 20/20 [00:15<00:00,  1.33it/s]
Overall Progress:  98%|█████████▊| 125/127 [10:47<00:05,  2.63s/it]


Number of images retrieved for valley: 20



Fetching images for waterfall:   0%|          | 0/20 [00:00<?, ?it/s][A
Fetching images for volcano: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]
Overall Progress:  99%|█████████▉| 126/127 [10:58<00:05,  5.01s/it]


Number of images retrieved for volcano: 20


Fetching images for waterfall: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s]
Overall Progress: 100%|██████████| 127/127 [11:02<00:00,  5.22s/it]


Number of images retrieved for waterfall: 20
Data saved to image_urls.json





**Thu thập dữ liệu - Crawl ảnh từ URL**

In [17]:
import os
import json
import time
import urllib.request
from collections import defaultdict
from tqdm import tqdm
import concurrent.futures
from urllib.parse import urlparse

class ImageDownloader:
    """Download the images listed in a JSON file into a folder tree.

    The JSON file is iterated as ``{category: {term: [url, ...]}}`` and each
    image is stored under ``download_dir/category/term/``. Paths of the files
    that were downloaded successfully are collected in ``self.filename`` and
    written to ``filename.txt`` by :meth:`export_filename`.
    """

    def __init__(self, json_file, download_dir='Dataset', max_workers=4, delay=1, timeout=10):
        """
        Parameters:
        json_file (str): Path of the JSON file holding the image URLs.
        download_dir (str): Root directory the images are stored in.
        max_workers (int): Number of worker threads used for downloading.
        delay (float): Seconds to sleep after queuing the URLs of each term,
            to avoid flooding the remote servers with requests.
        timeout (float): Seconds to wait for a server when validating a URL.
        """
        self.json_file = json_file
        self.download_dir = download_dir
        self.max_workers = max_workers
        self.delay = delay
        self.timeout = timeout
        self.filename = set()  # Paths of the files downloaded successfully
        self.setup_directory()

    def setup_directory(self):
        """Create the root download directory if it does not exist yet."""
        os.makedirs(self.download_dir, exist_ok=True)

    def read_json(self):
        """
        Read the JSON file and return its content.

        Returns:
        data (dict): Data parsed from the JSON file.
        """
        with open(self.json_file, 'r', encoding='utf-8') as file:
            return json.load(file)

    def is_valid_url(self, url):
        """
        Check whether the URL answers with HTTP 200 and an image content type.

        Parameters:
        url (str): URL to check.

        Returns:
        bool: True if the URL serves an image, False otherwise (including
        when the request fails or times out).
        """
        try:
            with urllib.request.urlopen(url, timeout=self.timeout) as response:
                # Always return a real bool; the previous version fell through
                # and returned None for non-200 / non-image responses.
                return (
                    response.status == 200
                    and 'image' in response.info().get_content_type()
                )
        except Exception:
            return False

    def download_image(self, url, category, term, pbar):
        """
        Download a single image into ``download_dir/category/term/``.

        Parameters:
        url (str): URL of the image to download.
        category (str): Category of the image (first folder level).
        term (str): Search term of the image (second folder level).
        pbar (tqdm): Progress bar; advanced by one whatever the outcome.

        Returns:
        str: Message describing the outcome of the download.
        """
        try:
            if not self.is_valid_url(url):
                return f"Invalid URL: {url}"

            basename = os.path.basename(urlparse(url).path)
            if not basename:
                # e.g. "https://host/images/" - there is no file name to save under.
                return f"Failed to download {url}: URL has no file name"

            try:
                term_dir = os.path.join(self.download_dir, category, term)
                # exist_ok avoids a race between worker threads creating the
                # same category/term directory at the same time.
                os.makedirs(term_dir, exist_ok=True)
                filename = os.path.join(term_dir, basename)
                urllib.request.urlretrieve(url, filename)
            except Exception as e:
                return f"Failed to download {url}: {str(e)}"

            # Record the path only once the file is really on disk, so that
            # filename.txt never lists images that failed to download.
            self.filename.add(filename)
            return f"Downloaded: {url}"
        finally:
            pbar.update(1)

    def download_images(self):
        """
        Download every image listed in the JSON file, then export the list of
        downloaded paths (also when the run is interrupted or fails).

        Returns:
        None
        """
        data = self.read_json()
        download_tasks = []

        total_images = sum(len(urls) for terms in data.values() for urls in terms.values())
        try:
            with tqdm(total=total_images, desc="Downloading images") as pbar:
                with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                    for category, terms in data.items():
                        for term, urls in terms.items():
                            for url in urls:
                                download_tasks.append(
                                    executor.submit(self.download_image, url, category, term, pbar)
                                )
                            # Throttle submission to avoid sending too many requests at once
                            time.sleep(self.delay)

                for future in concurrent.futures.as_completed(download_tasks):
                    print(future.result())
        finally:
            # Keep a manifest of whatever was downloaded, even on Ctrl-C.
            self.export_filename()

    def export_filename(self):
        """
        Write the downloaded file paths, sorted, one per line to ``filename.txt``.

        Returns:
        None
        """
        with open('filename.txt', 'w') as file:
            for filename in sorted(self.filename):
                file.write(f"{filename}\n")

In [18]:
downloader = ImageDownloader(json_file='image_urls.json', download_dir='Dataset', max_workers=4, delay=1)
try:
    downloader.download_images()
finally:
    # Always write filename.txt, even when the download is interrupted
    # (e.g. KeyboardInterrupt): the dataset-organization cell reads it.
    # export_filename() just rewrites the file, so calling it again is harmless.
    downloader.export_filename()

Downloading images:  17%|█▋        | 420/2520 [00:34<02:51, 12.21it/s]


KeyboardInterrupt: 

**Xử lí dữ liệu - Làm sạch bộ dữ liệu**

In [None]:
from google.colab import drive
# Mount at the absolute path that the later cells write to
# (/content/drive/MyDrive/...); a relative 'content/drive/' would be resolved
# against the current working directory instead.
drive.mount('/content/drive')

In [None]:
def check_and_preprocess_images(image_dir, min_size=50):
  """
  Check and preprocess, in place, every file found under a directory tree.

  For each file:
    * files that cannot be opened or processed as an image are deleted;
    * images narrower or shorter than ``min_size`` pixels are deleted;
    * remaining non-RGB images are converted to RGB and saved over the original.

  Parameters:
  image_dir (str): The directory containing the images to be checked and preprocessed.
  min_size (int): Minimum accepted width and height, in pixels.

  Returns:
  None
  """
  for root, _, files in os.walk(image_dir):
    for filename in files:
      filepath = os.path.join(root, filename)
      delete_reason = None
      try:
        with Image.open(filepath) as img:
          width, height = img.size
          too_small = width < min_size or height < min_size
          # convert() returns a new in-memory image, so the source file can be
          # closed before it is overwritten or removed below.
          rgb_img = None
          if not too_small and img.mode != 'RGB':
            rgb_img = img.convert('RGB')

        if too_small:
          delete_reason = f"Image too small ({width}x{height})"
        elif rgb_img is not None:
          rgb_img.save(filepath)
          print(f"Converted {filepath} to RGB")

      except Exception as e:
        # If the file is not an image (or cannot be re-saved), delete it
        delete_reason = f"Not an image or corrupted file ({str(e)})"

      # Delete only after the file handle has been closed.
      if delete_reason is not None:
        os.remove(filepath)
        print(f"Deleted {filepath}: {delete_reason}")


check_and_preprocess_images('Dataset')

In [None]:
# Recursively archive the Dataset/ folder to Clean_Dataset.zip in
# /content/drive/MyDrive (the path assumes Drive is mounted at /content/drive).
!zip -r /content/drive/MyDrive/Clean_Dataset.zip Dataset

**Xử lí dữ liệu - Tổ chức cấu trúc folder**

In [None]:
!gdown --id 1--6fe48D9ydnTpLV1GKKqJ0pqpOXB3z_

In [None]:
!unzip Clean_Dataset

In [None]:
import os
import shutil
from collections import defaultdict

# Define the source and target directories
source_dir = "Dataset"
train_dir = "data/train"
test_dir = "data/test"

# Per-class split: the first TEST_PER_CLASS images go to test, the following
# ones to train, and at most MAX_PER_CLASS images are used per class.
TEST_PER_CLASS = 1
MAX_PER_CLASS = 20

# Create the target directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Initialize a dictionary to hold file paths for each class
class_files = defaultdict(list)

# Read the file paths from the text file
with open('filename.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        # filename.txt is written at download time, so it can list files that
        # the cleaning step has deleted since; skip anything no longer on disk
        # instead of failing later in shutil.copy.
        if not os.path.isfile(line):
            continue
        # Expected structure: Dataset/category/class/image.jpg
        parts = line.replace('\\', '/').split('/')
        if len(parts) < 4:
            continue  # Path too short to carry a class folder
        class_name = parts[2]
        class_files[class_name].append(line)

# Copy images to the train and test directories
for class_name, files in class_files.items():
    # Create the train and test directories for the class
    train_class_dir = os.path.join(train_dir, class_name)
    test_class_dir = os.path.join(test_dir, class_name)
    os.makedirs(train_class_dir, exist_ok=True)
    os.makedirs(test_class_dir, exist_ok=True)

    # Copy the first image to test and up to 19 further images to train
    for i, file_path in enumerate(files[:MAX_PER_CLASS]):
        target_dir = test_class_dir if i < TEST_PER_CLASS else train_class_dir
        shutil.copy(file_path, target_dir)

print("Dataset organization complete!")

In [None]:
# Recursively archive the data/ folder (train/ and test/ splits) to data.zip in
# /content/drive/MyDrive (the path assumes Drive is mounted at /content/drive).
!zip -r /content/drive/MyDrive/data.zip data