In [None]:
!pip install Pillow

Defaulting to user installation because normal site-packages is not writeable
Collecting Pillow
  Downloading pillow-11.1.0-cp313-cp313-win_amd64.whl.metadata (9.3 kB)
Downloading pillow-11.1.0-cp313-cp313-win_amd64.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.6 MB ? eta -:--:--
   --------------- ------------------------ 1.0/2.6 MB 3.5 MB/s eta 0:00:01
   --------------------------- ------------ 1.8/2.6 MB 3.5 MB/s eta 0:00:01
   ---------------------------------------- 2.6/2.6 MB 3.4 MB/s eta 0:00:00
Installing collected packages: Pillow
Successfully installed Pillow-11.1.0



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import time
import requests
import pandas as pd
import csv
from PIL import Image
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import logging

# Set up logging: everything at INFO and above goes to a log file next to the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='image_collection.log'
)
logger = logging.getLogger()

# Dataset name; it also determines the output directory and the metadata/summary file names.
DATASET_NAME = "Image_MultiModalVerse"
base_dir = f"{DATASET_NAME}_dataset"
os.makedirs(base_dir, exist_ok=True)

# List of 20 categories as specified in the task
categories = [
    "cats", "dogs", "cars", "bicycles", "mountains", "beaches", "planes", "flowers", "buildings", "birds",
    "computers", "books", "trees", "fruits", "shoes", "clocks", "guitars", "robots", "bridges", "cakes"
]

# Set up Selenium WebDriver.
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH environment
# variable so the notebook is not tied to one machine; the previous hard-coded path
# remains the default.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\UJWAL\Downloads\chromedriver-win64 (1)\chromedriver-win64\chromedriver.exe",
)

# Create a Service object
service = Service(chrome_driver_path)

# Add options for better performance
options = Options()
options.add_argument("--headless")  # Run in headless mode for better performance
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid detection
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")

# Initialize the WebDriver with the service and options
driver = webdriver.Chrome(service=service, options=options)

# Create metadata CSV file with all required fields.
# Mode 'w' truncates any metadata left over from a previous run; rows are appended
# later, one per successfully saved image.
metadata_file = os.path.join(base_dir, f"{DATASET_NAME}_metadata.csv")
with open(metadata_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['category', 'filename', 'url', 'width', 'height', 'file_size_kb'])

# Function to scrape image URLs - improved version
def fetch_image_urls(query, max_images=50, max_retries=3):
    """Collect up to ``max_images`` image URLs for ``query`` from web image search.

    Each attempt uses a different search engine: Google first, Bing second, and
    Yahoo for every later attempt. The results page is loaded in the module-level
    Selenium ``driver``, scrolled to trigger lazy loading, and parsed with
    BeautifulSoup. An attempt that raises, or that yields no URLs at all, falls
    through to the next attempt.

    Args:
        query: Search phrase. It is URL-encoded before being placed in the search URL.
        max_images: Maximum number of URLs to return.
        max_retries: Maximum number of attempts (search engines) to try.

    Returns:
        A list of unique URLs starting with ``http``, at most ``max_images`` long.
        The list is empty when every attempt failed or found nothing.
    """
    # Imported locally so this function does not require a new file-level import.
    from urllib.parse import quote_plus

    # Encode spaces and reserved characters so multi-word queries form a valid URL.
    encoded_query = quote_plus(query)

    for retry in range(max_retries):
        try:
            # Use different search engines for diversity
            if retry == 0:
                search_url = f"https://www.google.com/search?q={encoded_query}&source=lnms&tbm=isch"
            elif retry == 1:
                search_url = f"https://www.bing.com/images/search?q={encoded_query}"
            else:
                # Yahoo serves image results from its own host; "tbm=isch" is a
                # Google-only parameter and does not select image search on Yahoo.
                search_url = f"https://images.search.yahoo.com/search/images?p={encoded_query}"

            driver.get(search_url)
            time.sleep(2)

            # Scroll down multiple times to load more images
            for _ in range(5):
                driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
                time.sleep(2)

            # Parse the rendered page with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Prefer the engine-specific thumbnail class; fall back to every <img>.
            if retry == 0:  # Google
                img_tags = soup.find_all("img", {"class": "rg_i"}) or soup.find_all("img")
            elif retry == 1:  # Bing
                img_tags = soup.find_all("img", {"class": "mimg"}) or soup.find_all("img")
            else:  # Yahoo
                img_tags = soup.find_all("img")

            # Extract image URLs, keeping first-seen order. `seen` gives O(1) duplicate checks.
            img_urls = []
            seen = set()
            for img in img_tags:
                # Lazy-loaded thumbnails often carry a "data:" placeholder in src and the
                # real URL in data-src, so try data-src whenever src is not an http URL.
                for attr in ("src", "data-src"):
                    candidate = img.get(attr)
                    if candidate and candidate.startswith('http'):
                        if candidate not in seen:
                            seen.add(candidate)
                            img_urls.append(candidate)
                        break

            logger.info(f"Found {len(img_urls)} images for {query}")

            # If we found enough images, return them
            if len(img_urls) >= max_images:
                return img_urls[:max_images]

            # If we didn't find enough, try clicking on thumbnails to get full-res URLs.
            # This step is best-effort: failures are logged and skipped, never fatal.
            try:
                thumbnail_elements = driver.find_elements(By.CSS_SELECTOR, "img.rg_i")
                for thumbnail in thumbnail_elements[:max_images]:
                    if len(img_urls) >= max_images:
                        break  # enough URLs collected; skip the remaining clicks
                    try:
                        thumbnail.click()
                        time.sleep(1)
                        expanded_images = driver.find_elements(By.CSS_SELECTOR, "img.r48jcc")
                        if expanded_images:
                            src = expanded_images[0].get_attribute('src')
                            if src and src.startswith('http') and src not in seen:
                                seen.add(src)
                                img_urls.append(src)
                    except Exception as click_error:
                        # `except Exception` (not a bare except) so Ctrl+C still interrupts.
                        logger.debug(f"Skipping thumbnail for {query}: {click_error}")
                        continue
            except Exception as expansion_error:
                logger.debug(f"Thumbnail expansion failed for {query}: {expansion_error}")

            logger.info(f"After thumbnail expansion: Found {len(img_urls)} images for {query}")

            if img_urls:
                return img_urls[:max_images]

            # Nothing usable from this engine: move on to the next one instead of
            # returning an empty list while other engines are still untried.
            logger.warning(f"No image URLs found for {query} on attempt {retry+1}")

        except Exception as e:
            logger.error(f"Error during URL fetching for {query}, attempt {retry+1}: {str(e)}")

    # If all retries failed, return empty list
    logger.error(f"Failed to fetch URLs for {query} after {max_retries} attempts")
    return []

# Function to download and validate an image
def download_and_validate_image(img_url, category, index, max_retries=3):
    """Download one image, verify that it decodes, save it as JPEG and record metadata.

    The image is written to ``<base_dir>/<category>/<category>_<index>.jpg`` and a row
    (category, filename, url, width, height, file size in KB) is appended to the
    metadata CSV.

    Args:
        img_url: URL of the image to download.
        category: Category name; used for the sub-directory and the file name prefix.
        index: Sequence number used in the file name (zero-padded to three digits).
        max_retries: Number of download attempts before giving up.

    Returns:
        True if the image was saved and its metadata row written, False otherwise.
        Errors are logged rather than raised.
    """
    filename = f"{category}_{index:03d}.jpg"
    category_dir = os.path.join(base_dir, category)
    file_path = os.path.join(category_dir, filename)

    # Make the function usable on its own, not only after the caller created the folder.
    os.makedirs(category_dir, exist_ok=True)

    # Set headers to mimic a browser (constant across attempts, so built once).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/"
    }

    for retry in range(max_retries):
        try:
            # Download the image
            response = requests.get(img_url, headers=headers, timeout=10)

            if response.status_code != 200:
                logger.warning(f"Failed to download {img_url}: Status code {response.status_code}")
                raise Exception(f"HTTP error: {response.status_code}")

            # Try to open the image to validate it
            img = Image.open(BytesIO(response.content))

            # The target file is always .jpg, and JPEG cannot store alpha or palette
            # modes (RGBA, LA, P, ...). Without this conversion such images make
            # save() raise, so valid PNG/GIF/WebP downloads were retried and dropped.
            if img.mode not in ("RGB", "L", "CMYK"):
                img = img.convert("RGB")

            # Save the image (format is inferred from the .jpg extension)
            img.save(file_path)

            # Get the dimensions and file size
            width, height = img.size
            file_size_kb = os.path.getsize(file_path) / 1024

            # Add to metadata CSV
            with open(metadata_file, 'a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow([category, filename, img_url, width, height, file_size_kb])

            logger.info(f"Successfully downloaded {category}/{filename}")
            return True

        except Exception as e:
            logger.warning(f"Error downloading {img_url} (attempt {retry+1}): {str(e)}")
            if retry == max_retries - 1:
                logger.error(f"Failed to download {img_url} after {max_retries} attempts")

    return False

# Function to download images for a category
def download_images_for_category(category, max_images=50):
    """Fetch candidate URLs for ``category`` and save up to ``max_images`` of them.

    Twice as many URLs as needed are requested so that failed downloads can be
    replaced by later candidates. Files are numbered by the count of images saved
    so far, which keeps the numbering gap-free.

    Returns:
        The number of images that were saved successfully.
    """
    target_dir = os.path.join(base_dir, category)
    os.makedirs(target_dir, exist_ok=True)

    logger.info(f"Starting download for category: {category}")

    # Over-fetch: some candidates are expected to fail.
    candidate_urls = fetch_image_urls(category, max_images=max_images*2)

    saved_count = 0
    for candidate_url in candidate_urls:
        # Stop as soon as the quota for this category is met.
        if not saved_count < max_images:
            break

        was_saved = download_and_validate_image(candidate_url, category, saved_count)
        saved_count += 1 if was_saved else 0

        # Brief pause between requests to lower the chance of being blocked.
        time.sleep(0.5)

    logger.info(f"Completed downloads for {category}: {saved_count}/{max_images} images")
    return saved_count

# Function to generate dataset summary
def generate_dataset_summary():
    """Compute summary statistics from the metadata CSV and write them to a text file.

    The summary holds the dataset name, total image count, number of categories,
    per-category image counts, and the mean width, height and file size (KB).
    It is written as ``key: value`` lines to ``<base_dir>/<DATASET_NAME>_summary.txt``.

    Returns:
        The summary as a dict, or None if anything failed (the error is logged).
    """
    try:
        # Read the metadata file
        df = pd.read_csv(metadata_file)

        # Calculate summary statistics
        category_counts = df['category'].value_counts()
        avg_dimensions = df[['width', 'height']].mean()
        avg_file_size = df['file_size_kb'].mean()

        summary = {
            'dataset_name': DATASET_NAME,
            'total_images': len(df),
            'categories': len(category_counts),
            'images_per_category': category_counts.to_dict(),
            'avg_width': avg_dimensions['width'],
            'avg_height': avg_dimensions['height'],
            'avg_file_size_kb': avg_file_size
        }

        # Save summary to file. The encoding is explicit so the output does not depend
        # on the platform default and matches the utf-8 used for the metadata CSV.
        summary_file = os.path.join(base_dir, f"{DATASET_NAME}_summary.txt")
        with open(summary_file, 'w', encoding='utf-8') as f:
            for key, value in summary.items():
                f.write(f"{key}: {value}\n")

        logger.info(f"Dataset summary saved to {summary_file}")
        return summary

    except Exception as e:
        logger.error(f"Error generating summary: {str(e)}")
        return None

# Main execution
# Target number of images per category; used for both the progress message and the
# download call so the two cannot drift apart.
IMAGES_PER_CATEGORY = 50

try:
    print(f"Starting dataset collection for {DATASET_NAME}...")
    print(f"Target: {len(categories)} categories with {IMAGES_PER_CATEGORY} images each")

    total_images = 0

    # Process each category
    for category in categories:
        num_images = download_images_for_category(category, max_images=IMAGES_PER_CATEGORY)
        total_images += num_images

    # Generate summary
    summary = generate_dataset_summary()
    if summary is None:
        # The failure is otherwise only visible in the log file.
        print("Warning: dataset summary could not be generated; see image_collection.log")

    print("\nDataset collection complete!")
    print(f"Total images collected: {total_images}")

    # Use case demonstration
    print("\nUse Case for MultiModalVerse Dataset:")
    print("1. Multi-class image classification model training")
    print("2. Image search and retrieval systems")
    print("3. Computer vision model evaluation across diverse categories")
    print("4. Transfer learning experiments with varied data domains")
    print("5. Data augmentation technique testing")

finally:
    # Close the WebDriver even if collection was interrupted or failed.
    driver.quit()

Starting dataset collection for MultiModalVerse...
Target: 20 categories with 50 images each

Dataset collection complete!
Total images collected: 1000

Use Case for MultiModalVerse Dataset:
1. Multi-class image classification model training
2. Image search and retrieval systems
3. Computer vision model evaluation across diverse categories
4. Transfer learning experiments with varied data domains
5. Data augmentation technique testing
