In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import os
import requests
from urllib.parse import urlparse

# Setup headless Chrome.
# The ``Options.headless`` property was deprecated in Selenium 4.8 and removed
# in 4.13; on newer releases assigning to it merely creates an unused attribute
# and Chrome opens a visible window. Passing the command-line switch works
# regardless of the Selenium version.
options = Options()
options.add_argument("--headless=new")
# webdriver_manager fetches a chromedriver binary and returns its local path.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def get_all_product_links(base_url, total_pages=4):
    """Collect the product page URLs from the paginated shop listing.

    Visits ``{base_url}/page/1/`` through ``{base_url}/page/{total_pages}/``
    with the module-level ``driver`` and gathers the ``href`` of every anchor
    matching ``h3.wd-entities-title a``.

    Args:
        base_url: Listing root; a trailing slash is tolerated.
        total_pages: Number of listing pages to visit, starting at page 1.

    Returns:
        A list of unique product URLs in the order they were first seen.
    """
    # Strip a trailing slash so the URL never contains "//page/".
    root = base_url.rstrip('/')
    # A dict de-duplicates while keeping insertion order, so the result is
    # stable across runs (a set's iteration order is not).
    seen = {}
    for page in range(1, total_pages + 1):
        driver.get(f"{root}/page/{page}/")
        # Fixed pause to give the page time to render its product grid.
        time.sleep(3)
        # Find all product links in the product title
        for anchor in driver.find_elements(By.CSS_SELECTOR, 'h3.wd-entities-title a'):
            href = anchor.get_attribute('href')
            if href:
                seen[href] = None
    return list(seen)

def _largest_srcset_url(srcset):
    """Return the URL carrying the greatest ``w`` descriptor in *srcset*.

    Candidates without a valid ``<int>w`` descriptor count as width 0; on a
    tie the later candidate wins. Returns None when no candidate has a URL.
    """
    best_url = None
    best_width = -1
    for candidate in srcset.split(','):
        # split() without arguments collapses repeated whitespace, so stray
        # spaces never yield empty tokens.
        parts = candidate.split()
        if not parts:
            continue
        width = 0
        if len(parts) > 1 and parts[1].endswith('w'):
            try:
                width = int(parts[1][:-1])
            except ValueError:
                # Malformed descriptor (e.g. a bare "w"): treat as unknown.
                width = 0
        if width >= best_width:
            best_url, best_width = parts[0], width
    return best_url


def get_all_carousel_images(product_url):
    """Return the gallery image URLs found on a product page.

    Loads *product_url* with the module-level ``driver`` and collects, for
    each gallery ``<img>``, the first available of: its ``data-large_image``
    attribute, the widest ``srcset`` candidate, or its ``src``. URLs
    containing ``lazy.svg`` (lazy-load placeholders) are skipped.

    Args:
        product_url: URL of the product page to inspect.

    Returns:
        A list of unique image URLs in the order they were first seen, so
        that index-based filenames derived from it stay stable across runs.
    """
    driver.get(product_url)
    # Fixed pause to give the page time to render the gallery.
    time.sleep(3)
    # A dict de-duplicates while keeping insertion order.
    image_urls = {}

    # Main image (has data-large_image)
    for img in driver.find_elements(By.CSS_SELECTOR, 'img[data-large_image]'):
        url = img.get_attribute('data-large_image')
        if url and 'lazy.svg' not in url:
            image_urls[url] = None

    # Carousel images (thumbnails, may have srcset)
    for img in driver.find_elements(By.CSS_SELECTOR, '.wd-carousel-wrap img, .woocommerce-product-gallery__image img'):
        url = img.get_attribute('data-large_image')
        if not url:
            srcset = img.get_attribute('srcset')
            if srcset:
                # Get the largest image in srcset
                url = _largest_srcset_url(srcset)
            if not url:
                url = img.get_attribute('src')
        if url and 'lazy.svg' not in url:
            image_urls[url] = None
    return list(image_urls)


def download_image(url, folder, prefix):
    """Download *url* into *folder* unless the target file already exists.

    The file is named ``{prefix}_{basename of the URL path}``. Failed requests
    (connection errors, timeouts, non-2xx responses) are reported on stdout
    and skipped so that an error page is never saved as an image.

    Args:
        url: Image URL to fetch.
        folder: Existing directory to write into.
        prefix: String prepended to the file name.
    """
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(folder, f"{prefix}_{filename}")
    if os.path.exists(filepath):
        return
    try:
        # Without a timeout requests can block forever on a stalled server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return
    # The body is fully fetched before the file is opened, so a failed
    # request never leaves an empty or partial file behind.
    with open(filepath, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filepath}")


# Main
# Crawl the shop listing, then save every gallery image of each product into
# gavel_images/<product-slug>/, naming files <slug>_<index>_<original name>.
base_url = 'https://gavelparis.com/shop'
try:
    product_links = get_all_product_links(base_url, total_pages=4)
    for link in product_links:
        # The last path segment of the product URL doubles as the folder name.
        product_slug = link.rstrip('/').split('/')[-1]
        folder = os.path.join('gavel_images', product_slug)
        os.makedirs(folder, exist_ok=True)
        images = get_all_carousel_images(link)
        for idx, img_url in enumerate(images, start=1):
            download_image(img_url, folder, f"{product_slug}_{idx}")
finally:
    # Always shut the browser down, even when a page load or download raises;
    # otherwise the Chrome process is left running.
    driver.quit()

Downloaded gavel_images/womens-asymmetric-kimono-with-feathers/womens-asymmetric-kimono-with-feathers_1_DSC3953-scaled.jpg
Downloaded gavel_images/womens-asymmetric-kimono-with-feathers/womens-asymmetric-kimono-with-feathers_2_031A5101-scaled.jpg
Downloaded gavel_images/womens-asymmetric-kimono-with-feathers/womens-asymmetric-kimono-with-feathers_3_172-scaled.jpg
Downloaded gavel_images/womens-asymmetric-kimono-with-feathers/womens-asymmetric-kimono-with-feathers_4_DSC2784-scaled.jpg
Downloaded gavel_images/womens-asymmetric-kimono-with-feathers/womens-asymmetric-kimono-with-feathers_5_031A5096-scaled.jpg
Downloaded gavel_images/womens-asymmetric-kimono-with-feathers/womens-asymmetric-kimono-with-feathers_6_031A5112.jpg
Downloaded gavel_images/womens-fur-vest-silver-fox-nightshade-whisper/womens-fur-vest-silver-fox-nightshade-whisper_1_WhatsApp-Image-2024-10-28-at-23.26.05-2.jpeg
Downloaded gavel_images/womens-fur-vest-silver-fox-nightshade-whisper/womens-fur-vest-silver-fox-nightshade

In [2]:
# Notebook inspection cell: echo the list assigned from get_all_product_links.
product_links

[]