# Library Imports

In [18]:
from duckduckgo_search import DDGS
from fastcore.all import *
from fastdownload import download_url
import requests
from urllib.parse import urlsplit

import matplotlib.pyplot as plt
import cv2
from sklearn.model_selection import train_test_split

from pathlib import Path
from time import sleep
import sys
import shutil
import warnings
warnings.filterwarnings("ignore")
# import socket
# try:
#     socket.setdefaulttimeout(1)
#     socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect(('1.1.1.1', 53))
# except socket.error as ex: raise Exception("STOP: No internet. Click '>|' in top right and set 'Internet' switch to on")

# Downloading Images

In [9]:
def search_images(term, max_images):
    """Search DuckDuckGo Images for ``term`` and return image URLs.

    Args:
        term: Search phrase passed to ``DDGS.images`` as ``keywords``.
        max_images: Maximum number of URLs to return. It is applied as a
            slice bound, so ``None`` returns every usable URL found.

    Returns:
        A fastcore ``L`` holding at most ``max_images`` non-empty image URLs,
        in the order the search returned them.
    """
    print(f"\nSearching for '{term}'")
    with DDGS() as ddgs:
        # Materialise the results while the DDGS context is still open.
        found_urls = [result.get("image") for result in ddgs.images(keywords=term)]

    # A result without an "image" value yields None, which cannot be downloaded.
    # Drop those *before* applying the cap so they do not use up the quota.
    usable_urls = [url for url in found_urls if url]
    return L(usable_urls[:max_images])

In [10]:
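# NOTE: earlier fastdownload-based variant, kept for reference only; it is superseded by the
# requests-based download_images defined after this commented-out block.
# def download_images(dest, term, max_images):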
#     urls=search_images(term, max_images=max_images)
#     for i,url in enumerate(urls):
#         print(f'\r{i+1}/{max_images} images downloading',end='',flush=True)
#         file_path = dest / f"{term.replace(' ', '_')}_{i+1}.jpg"
#         try:
#             download_url(url, file_path, show_progress=False)
#         except Exception as e:
#             print(f"\nFailed to download {url}: {e}")

def download_images(dest, term, max_images, timeout=10):
    """Search for ``term`` and save the resulting images into ``dest``.

    Files are named ``<term with spaces replaced by '_'>_<index><ext>``, where
    ``<index>`` is the 1-based position of the URL in the search results and
    ``<ext>`` is the suffix of the URL path (``.jpg`` when the path has none).
    A URL that cannot be fetched or saved is reported and skipped; the
    remaining URLs are still processed.

    Args:
        dest: Target directory (``str`` or ``Path``). It is not created here.
        term: Search phrase forwarded to ``search_images``.
        max_images: Maximum number of URLs requested from ``search_images``.
        timeout: Timeout in seconds passed to ``requests.get`` for each URL.
    """
    dest = Path(dest)  # accept plain strings as well as Path objects
    urls = search_images(term, max_images=max_images)
    total = len(urls)  # the search may return fewer URLs than max_images
    file_stem = term.replace(' ', '_')

    for i, url in enumerate(urls, start=1):
        print(f'\r{i}/{total} images downloading', end='', flush=True)
        if not url:
            # Nothing to fetch for an empty/missing URL.
            continue
        file_extension = Path(urlsplit(url).path).suffix or '.jpg'
        file_path = dest / f"{file_stem}_{i}{file_extension}"
        try:
            response = requests.get(url, timeout=timeout)
            # Without this check an HTTP error page (e.g. 403/404 HTML) would be
            # written to disk under an image file name.
            response.raise_for_status()
            file_path.write_bytes(response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            # OSError covers failures while writing the file; one bad URL or
            # file name should not abort the rest of the batch.
            print(f"\nFailed to download {url}: {e}")

## Downloading using Dictionary

In [17]:
# Search phrases per vehicle class; each key is also used as the class folder name.
vehicles = {
    'car': ['car photos', 'sports car', 'luxury car', 'classic car', 'car exterior'],
    'truck': ['truck photos', 'pickup truck', 'monster truck', 'delivery truck', 'heavy truck'],
    'bus': ['bus photos', 'school bus', 'city bus', 'coach bus', 'public transport bus'],
    'motorcycle': ['motorcycle photos', 'sports motorcycle', 'cruiser motorcycle', 'vintage motorcycle', 'motorbike'],
    'bicycle': ['bicycle photos', 'mountain bike', 'road bike', 'vintage bicycle', 'cycling bike']
}

# Root folder; one sub-folder per vehicle class is created underneath it.
path = Path('Vehicle New')

# Tunables for the download run, named so they are easy to find and adjust.
MAX_IMAGES_PER_TERM = 50        # cap on images requested per search phrase
DOWNLOAD_TIMEOUT_S = 20         # per-request timeout handed to download_images
PAUSE_BETWEEN_SEARCHES_S = 10   # pause after each search phrase (presumably to go easy on the search service)

for vehicle, terms in vehicles.items():
    # The class folder is the same for every phrase of a class, so create it once.
    dest = path / vehicle
    dest.mkdir(exist_ok=True, parents=True)
    for term in terms:
        download_images(dest, term, max_images=MAX_IMAGES_PER_TERM, timeout=DOWNLOAD_TIMEOUT_S)
        sleep(PAUSE_BETWEEN_SEARCHES_S)


Searching for 'truck photos'
50/50 images downloading
Searching for 'pickup truck'
3/50 images downloading
Failed to download https://media.ed.edmunds-media.com/chevrolet/silverado-1500/2023/oem/2023_chevrolet_silverado-1500_crew-cab-pickup_high-country_fq_oem_1_1280.jpg: HTTPSConnectionPool(host='media.ed.edmunds-media.com', port=443): Read timed out. (read timeout=20)
24/50 images downloading
Failed to download http://cdn.carbuzz.com/gallery-images/1600/703000/700/703778.jpg: HTTPConnectionPool(host='cdn.carbuzz.com', port=80): Max retries exceeded with url: /gallery-images/1600/703000/700/703778.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x30196da50>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
30/50 images downloading
Failed to download http://cdn.carbuzz.com/gallery-images/1600/690000/800/690890.jpg: HTTPConnectionPool(host='cdn.carbuzz.com', port=80): Max retries exceeded with url: /gallery-

KeyboardInterrupt: 

## Downloading using Array

In [None]:
# vehicles = ['Car', 'Truck', 'Motorcycle', 'Bicycle', 'Bus']
# lighting_conditions = [
#     'photo', 'daytime photo', 'afternoon photo', 'sun photo', 'sunny day photo', 
#     'bright sun photo', 'sunlight photo', 'noon photo', 'evening photo', 'dusk photo', 
#     'dawn photo', 'overcast photo', 'cloudy photo'
# ]
# path = Path('Vehicle')

# for vehicle in vehicles:
#     for condition in lighting_conditions:
#         dest = path / vehicle
#         dest.mkdir(exist_ok=True, parents=True)
#         download_images(dest, f'{vehicle} {condition}', max_images=64)
#         sleep(5)  # Pause between searches to avoid over-loading the server


Searching for 'Motorcycle dawn photo'
15/64 images downloading
Failed to download https://www.wallpapertip.com/wmimgs/251-2510439_wallpaper-motorcyclist-motorcycle-dawn-indonesia.jpg: <urlopen error [Errno 60] Operation timed out>
53/64 images downloading
Failed to download https://swmmotorcycles.com.au/wp-content/uploads/2019/12/DAWN_left_f.png: HTTP Error 403: Forbidden
====Error Body====
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="wi

# Move All Images to Class Folders from Subfolders

In [None]:
# def move_images_to_main_folder(main_folder):
#     main_folder = Path(main_folder)
    
#     for subfolder in main_folder.iterdir():
#         if subfolder.is_dir():
#             for sub_subfolder in subfolder.iterdir():
#                 if sub_subfolder.is_dir():
#                     for image_file in sub_subfolder.iterdir():
#                         if image_file.is_file():
#                             destination = subfolder / image_file.name
#                             shutil.move(str(image_file), str(destination))
#                     # Remove the now-empty sub_subfolder
#                     sub_subfolder.rmdir()

# # Define the main folder path
# main_folder_path = Path('Vehicle')

# # Move all images to their respective main folders
# move_images_to_main_folder(main_folder_path)

# print("All images have been moved to their respective main folders.")


All images have been moved to their respective main folders.


# Split Data to Folders

## Train, Test & Validation Split

In [None]:
# # Function to split dataset into train, validation, and test sets
# def split_dataset(dataset_dir, output_dir, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1):
#     dataset_dir = Path(dataset_dir)
#     output_dir = Path(output_dir)

#     classes = [d.name for d in dataset_dir.iterdir() if d.is_dir()]

#     for cls in classes:
#         images = list((dataset_dir / cls).glob('*.*'))
#         train, temp = train_test_split(images, test_size=val_ratio + test_ratio, random_state=42)
#         val, test = train_test_split(temp, test_size=test_ratio / (test_ratio + val_ratio), random_state=42)

#         # Function to copy images to respective directories
#         def copy_images(images, subset):
#             subset_dir = output_dir / subset / cls
#             subset_dir.mkdir(parents=True, exist_ok=True)
#             for img in images:
#                 shutil.copy(img, subset_dir / img.name)

#         copy_images(train, 'train')
#         copy_images(val, 'val')
#         copy_images(test, 'test')

## Train & Test Split

In [29]:
# # Function to split dataset into train and test sets
# def split_dataset(dataset_dir, output_dir, train_ratio=0.85, test_ratio=0.15):
#     dataset_dir = Path(dataset_dir)
#     output_dir = Path(output_dir)

#     classes = [d.name for d in dataset_dir.iterdir() if d.is_dir()]

#     for cls in classes:
#         images = list((dataset_dir / cls).glob('*.*'))
#         train, test = train_test_split(images, test_size=test_ratio, random_state=42)

#         # Function to copy images to respective directories
#         def copy_images(images, subset):
#             subset_dir = output_dir / subset / cls
#             subset_dir.mkdir(parents=True, exist_ok=True)
#             for img in images:
#                 shutil.copy(img, subset_dir / img.name)

#         copy_images(train, 'train')
#         copy_images(test, 'test')

## Run Split

In [30]:
# # Paths
# dataset_dir = 'Vehicle New'
# output_dir = 'Vehicle_split'
# split_dataset(dataset_dir, output_dir)