In [1]:
import os
import time
from io import BytesIO
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Root folder for the scraped dataset.
# Set the CORROSION_DATASET_DIR environment variable to store the dataset somewhere
# else; when it is unset the original location below is used unchanged.
base_path = os.environ.get(
    "CORROSION_DATASET_DIR",
    "D:/Work/Corrosion_Detection/Web_Scrapping/",
)

# One sub-folder per class label
corrosion_path = os.path.join(base_path, "Corrosion1")  # Folder for corrosion images
nocorrosion_path = os.path.join(base_path, "Nocorrosion1")  # Folder for no-corrosion images

# Create both folders up front; exist_ok=True makes re-running this cell harmless
os.makedirs(corrosion_path, exist_ok=True)
os.makedirs(nocorrosion_path, exist_ok=True)

In [3]:
def _collect_image_urls(search_query, scroll_rounds=10):
    """Load a Google Images results page in headless Chrome and return its image URLs.

    Returns the `src` of every <img> tag that is an http(s) URL, de-duplicated
    while preserving page order.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # To run in headless mode
    options.add_argument("--incognito")  # To run in incognito mode
    options.add_argument("--log-level=3")  # Suppress warnings
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # quote_plus escapes '&', '#', '+', non-ASCII text, etc. so the query
        # cannot corrupt the URL (spaces still become '+', as before).
        search_url = f"https://www.google.com/search?q={quote_plus(search_query)}&tbm=isch"
        driver.get(search_url)
        time.sleep(3)

        # Scroll down multiple times to load more images
        for _ in range(scroll_rounds):
            driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
            time.sleep(2)

        page_source = driver.page_source
    finally:
        # Always close the browser, even if navigation or scrolling failed;
        # otherwise a headless Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    candidate_urls = (img_tag.get("src") for img_tag in soup.find_all("img"))

    # Keep only real http(s) URLs (inline data: URIs cannot be fetched with
    # requests). dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(
        url for url in candidate_urls
        if url and url.startswith(("http://", "https://"))
    ))


def _download_images(image_urls, search_query, save_folder, num_images):
    """Download up to `num_images` JPEG/PNG images from `image_urls`; return how many were saved."""
    valid_images = 0
    for img_url in image_urls:
        if valid_images >= num_images:
            break  # Stop when we reach the required number of images

        try:
            response = requests.get(img_url, timeout=5)
            # Surface HTTP errors (404, 429, ...) as such instead of trying to
            # parse an error page as an image.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))

            # FILTER: Allow only JPG and PNG images (Pillow reports JPG as "JPEG")
            if image.format not in ("JPEG", "PNG"):
                print(f"‚ö†Ô∏è Skipped non-image file: {img_url}")
                continue

            # Files are written with a .jpg name, so Pillow encodes them as JPEG.
            # JPEG cannot store alpha channels or palettes, so convert such
            # images (typically PNGs) instead of letting save() raise.
            if image.mode not in ("RGB", "L", "CMYK"):
                image = image.convert("RGB")

            # Save the valid image
            image_path = os.path.join(save_folder, f"{search_query.replace(' ', '_')}_{valid_images+1}.jpg")
            image.save(image_path)
            valid_images += 1
            print(f"‚úÖ Saved: {image_path}")

        except Exception as e:
            # Deliberately broad: one bad URL or corrupt file must not abort
            # the whole best-effort scrape.
            print(f"‚ùå Failed to download {img_url}: {e}")

    return valid_images


def scrape_google_images(search_query, save_folder, num_images=50, scroll_rounds=10):
    """Scrapes Google Images for the specified query and saves images to the given folder.

    Args:
        search_query: Text to search for on Google Images.
        save_folder: Directory the images are written to (created if missing).
        num_images: Maximum number of images to save. Fewer are saved when the
            results page yields fewer usable image URLs.
        scroll_rounds: How many times to scroll to the bottom of the results
            page to trigger loading of more images.

    Files are named "<query with spaces as underscores>_<n>.jpg" with n starting
    at 1 on every call, so re-running with the same query and folder overwrites
    earlier files. Progress and failures are reported via print().
    """
    os.makedirs(save_folder, exist_ok=True)

    image_urls = _collect_image_urls(search_query, scroll_rounds)
    print(f"üîç Found {len(image_urls)} images for {search_query}. Downloading...")

    # Download and save images (No size filtering)
    valid_images = _download_images(image_urls, search_query, save_folder, num_images)
    print(f"üéØ {valid_images} images saved in '{save_folder}'.")

In [5]:
# Run the scraper once per class label. Each query's results go to that class's
# folder, capped at 1500 images, with no resolution filtering applied.
scrape_jobs = (
    ("Rusty surface of steel", corrosion_path),      # corroded steel examples
    ("Stainless steel surface", nocorrosion_path),   # clean steel examples
)

for query, target_folder in scrape_jobs:
    scrape_google_images(query, target_folder, num_images=1500)

üîç Found 700 images for Rusty surface of steel. Downloading...
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_1.jpg
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_2.jpg
‚ùå Failed to download https://fonts.gstatic.com/s/i/productlogos/googleg/v6/24px.svg: cannot identify image file <_io.BytesIO object at 0x00000208AEB203B0>
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_3.jpg
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_4.jpg
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_5.jpg
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_6.jpg
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_7.jpg
‚úÖ Saved: D:/Work/Corrosion_Detection/Web_Scrapping/Corrosion1\Rusty_surface_of_steel_8.jpg
‚úÖ Saved: D:/Work/Corrosion_Detec