# Scrape human-made images

This code scrapes images from Google Images search results. It uses Selenium and ChromeDriver to automate visiting each search result page, scrolling down to load more images, and extracting the thumbnail image URLs. The images are then downloaded and saved into one folder per category.

In [1]:
!pip install selenium



In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
import urllib

# Base directory under which download_images() creates one sub-folder per
# category. NOTE(review): despite the "DRIVE" in the name this is a C:/ path,
# not a Google Drive mount — confirm this is the intended location.
HUMAN_DRIVE_FOLDER = 'C:/Users/thisi/Downloads/pr/HUMAN/'

# Google Images search-result URLs to scrape. Entry i is paired with
# categories[i] by the driver loop, so both lists must stay the same length
# and in the same order.
urls = [
    'https://www.google.com/search?q=people+outside&tbm=isch&ved=2ahUKEwid5LC33pr-AhX0HzQIHWLOAkMQ2-cCegQIABAA&oq=people+outside&gs_lcp=CgNpbWcQAzIHCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQ6BAgjECc6BwgAEIoFEENQrAVY6gpg9wtoAHAAeACAAW2IAeMFkgEDNy4xmAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=tZoxZN21H_S_0PEP4pyLmAQ&bih=609&biw=1280',
    'https://www.google.com/search?q=people+doing+things+real&tbm=isch&ved=2ahUKEwjQ-9ST95r-AhXhGTQIHWqpDgEQ2-cCegQIABAA&oq=people+doing+things+real&gs_lcp=CgNpbWcQAzIGCAAQCBAeOgQIIxAnOgcIABCKBRBDOgUIABCABDoGCAAQBRAeUIEDWNcHYKQJaABwAHgAgAGoAYgBsQWSAQMyLjSYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=obQxZN2MKuCA0PEP6tK6CA&bih=609&biw=1280',
    'https://www.google.com/search?sca_esv=562677350&sxsrf=AB5stBgnIirqqynWYsfi1kimDUdW0C1NRQ:1693897813044&q=real+portrait&tbm=isch&source=lnms&sa=X&ved=2ahUKEwiynrqL9ZKBAxUWT2wGHYjLAGcQ0pQJegQIDhAB&biw=1280&bih=595&dpr=1.5',
]

# Sub-folder name used for the images of the URL at the same index in `urls`.
categories = ['people_outside', 'people_doing_things', 'real_portraits']

# Seconds scroll_down() sleeps after each scroll (read there as a global).
delay = 3

def scroll_down(wd):
    """Jump to the bottom of the page, then pause for `delay` seconds.

    `wd` is the Selenium webdriver whose current page is scrolled; `delay`
    is read from module scope.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    wd.execute_script(jump_to_bottom)
    time.sleep(delay)

def download_images(url, category):
    """Download the thumbnails of one search-result page into a category folder.

    Opens `url` in a fresh Chrome session, then makes 51 passes over the page.
    On each pass it collects every element with class name ``Q4LuWd``,
    downloads each ``src`` not already in the module-level ``image_urls`` set,
    and scrolls to the bottom via ``scroll_down`` to trigger further loading.

    Args:
        url: Address of the search-result page to scrape.
        category: Name of the sub-folder of ``HUMAN_DRIVE_FOLDER`` that
            receives the files (created if missing).

    Files are named ``<n>.png`` where ``n`` is the size of ``image_urls``
    right after the URL was added, so numbering continues across categories.
    A URL whose download fails stays in ``image_urls`` and is not retried.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded; import it explicitly instead of relying on another package.
    import urllib.request
    from selenium.common.exceptions import StaleElementReferenceException

    # NOTE(review): "Q4LuWd" is presumably Google's thumbnail <img> class; it
    # is an obfuscated name that may change — verify if zero thumbnails show up.
    thumbnail_class = "Q4LuWd"
    max_passes = 51  # same number of passes as the original `skips <= 50` loop

    # Create the category folder if it doesn't exist yet
    category_folder = os.path.join(HUMAN_DRIVE_FOLDER, category)
    os.makedirs(category_folder, exist_ok=True)

    wd = webdriver.Chrome()
    try:
        # Inside the try so a failed navigation still closes the browser.
        wd.get(url)

        for _ in range(max_passes):
            thumbnails = wd.find_elements(By.CLASS_NAME, thumbnail_class)
            print(f'Found {len(thumbnails)} thumbnails for category {category}')  # Debugging statement
            time.sleep(3)

            for img in thumbnails:
                try:
                    src = img.get_attribute('src')
                except StaleElementReferenceException:
                    # The element was re-rendered since find_elements();
                    # the next pass will pick up its replacement.
                    continue
                if not src or src in image_urls:
                    continue

                image_urls.add(src)
                target = os.path.join(category_folder, f'{len(image_urls)}.png')
                try:
                    urllib.request.urlretrieve(src, target)
                except (OSError, ValueError) as err:
                    # URLError/HTTPError are OSError subclasses; ValueError
                    # covers malformed URLs. Best effort: report and move on.
                    print(f'Could not download image for category {category}: {err}')

            scroll_down(wd)
    finally:
        wd.quit()

# One set shared by every category, so a given image URL is saved only once.
image_urls = set()

# Scrape each search URL into the category folder at the same index.
for search, url in enumerate(urls):
    category = categories[search]
    download_images(url, category)


Found 50 thumbnails for category people_outside
Found 100 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails for category people_outside
Found 104 thumbnails 

# Scrape DALL-E images

This code scrapes images from the website dalle2.gallery, which hosts AI-generated images created by DALL-E. It automates visiting the website, scrolling down to load more images, and downloading each unique image until the specified maximum number of images (`max_images`) is reached. The downloaded images are saved with sequential numbers as filenames.

In [12]:
from multiprocessing.sharedctypes import Value
from selenium import webdriver
from selenium.webdriver.common.by import By
from PIL import Image
import time
import os
import urllib

# PATH = 'C:\\Users\\Karl Marie Yazigi\\Desktop\\chromedriver_win32\\chromedriver'

def scroll_down(wd):
    """
    Move the browser window to the end of the document via JavaScript so the
    page loads more content, then sleep for the module-level `delay` seconds.
    """
    to_page_end = "window.scrollTo(0, document.body.scrollHeight);"
    wd.execute_script(to_page_end)
    time.sleep(delay)

# `import urllib` alone does not guarantee the `request` submodule is loaded.
import urllib.request
from selenium.common.exceptions import StaleElementReferenceException

url = 'https://dalle2.gallery/#search-random'
DALLE = 'C:\\Users\\thisi\\Downloads\\pr\\DALL-E\\'

max_images = 50000   # stop once this many unique image URLs were collected
max_skips = 50       # stop after this many consecutive scrolls with no new URL
delay = 0            # extra seconds scroll_down() waits after each scroll
image_urls = set()   # every image URL seen so far (including failed downloads)
skips = 0            # consecutive passes that found nothing new
before = 0           # len(image_urls) at the start of the current pass

# urlretrieve() does not create missing directories.
os.makedirs(DALLE, exist_ok=True)

# Initialize the Chrome webdriver
wd = webdriver.Chrome()

try:
    # Visit the URL
    wd.get(url)

    while len(image_urls) < max_images and skips < max_skips:
        # Print the number of collected image URLs every 100 images
        if len(image_urls) % 100 == 0:
            print(len(image_urls))

        before = len(image_urls)

        # Find thumbnail elements on the webpage
        thumbnails = wd.find_elements(By.TAG_NAME, 'img')
        time.sleep(1)

        # Only look at elements past the ones already counted; at most
        # len(image_urls) elements have produced a URL so far, so this never
        # skips an unprocessed element as long as the page only appends.
        for img in thumbnails[len(image_urls):]:
            if len(image_urls) >= max_images:
                break

            try:
                src = img.get_attribute('src')
            except StaleElementReferenceException:
                # Element was re-rendered after find_elements(); skip it.
                continue

            # <img> tags without a usable src cannot be downloaded.
            if not src or src in image_urls:
                continue

            image_urls.add(src)
            try:
                # Download the image and save it with a sequential number as the filename
                urllib.request.urlretrieve(src, os.path.join(DALLE, f'{len(image_urls)}.png'))
            except (OSError, ValueError) as err:
                # URLError/HTTPError are OSError subclasses; ValueError covers
                # malformed URLs. Best effort: report and go on to the next image.
                print(f'Could not download {src}: {err}')

        # Count consecutive passes without progress so the loop ends when the
        # page stops yielding new images instead of spinning forever.
        skips = skips + 1 if len(image_urls) == before else 0

        # Scroll down to load more content
        scroll_down(wd)
finally:
    # Quit the webdriver even if the browser disconnects or the run is interrupted
    wd.quit()

0


WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=116.0.5845.141)
Stacktrace:
	GetHandleVerifier [0x00007FF7271252A2+57122]
	(No symbol) [0x00007FF72709EA92]
	(No symbol) [0x00007FF726F6E3AB]
	(No symbol) [0x00007FF726F5BA47]
	(No symbol) [0x00007FF726F5B6C0]
	(No symbol) [0x00007FF726F6FA71]
	(No symbol) [0x00007FF726FDE27F]
	(No symbol) [0x00007FF726FC6DB3]
	(No symbol) [0x00007FF726F9D2B1]
	(No symbol) [0x00007FF726F9E494]
	GetHandleVerifier [0x00007FF7273CEF82+2849794]
	GetHandleVerifier [0x00007FF727421D24+3189156]
	GetHandleVerifier [0x00007FF72741ACAF+3160367]
	GetHandleVerifier [0x00007FF7271B6D06+653702]
	(No symbol) [0x00007FF7270AA208]
	(No symbol) [0x00007FF7270A62C4]
	(No symbol) [0x00007FF7270A63F6]
	(No symbol) [0x00007FF7270967A3]
	BaseThreadInitThunk [0x00007FF81424257D+29]
	RtlUserThreadStart [0x00007FF81622AA68+40]
