In [1]:
import multiprocessing
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import re # For extracting numbers
import random # For optional delays

# --- Configuration ---
TOTAL_PAGES = 256  # As per your requirement
PAGES_PER_WORKER = 16
BASE_URL_TEMPLATE = "https://www.wikigallery.org/wiki/Subjects/page-{page_num}"
# Optional: Specify ChromeDriver path if not in system PATH
# CHROMEDRIVER_PATH = "/path/to/your/chromedriver"
CHROMEDRIVER_PATH = None # Set to None if chromedriver is in PATH

def setup_driver(webdriver_path=None):
    """Sets up the Selenium WebDriver."""
    chrome_options = Options()
    # chrome_options.add_argument("--headless=new")  # Recommended for headless
    chrome_options.add_argument("--headless") # Older headless
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    # Disable images to speed up page loading (optional)
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
    # Or more robust way to disable images
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)


    if webdriver_path:
        service = Service(executable_path=webdriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
    else:
        driver = webdriver.Chrome(options=chrome_options)
    return driver

def extract_subjects_from_current_page(driver, current_page_url):
    """Extracts subject data from the currently loaded page in the driver."""
    subjects_data_on_page = []
    wait = WebDriverWait(driver, 15) # Reduced wait time for potentially faster pages
    try:
        # Subjects are in the first <div class="galleryb">
        subjects_container_xpath = "(//div[@class='galleryb'])[1]"
        subjects_container = wait.until(
            EC.visibility_of_element_located((By.XPATH, subjects_container_xpath))
        )
        
        subject_boxes = subjects_container.find_elements(By.XPATH, "./div[@class='gallerybox2']")
        
        for box in subject_boxes:
            try:
                img_element = box.find_element(By.CSS_SELECTOR, "img.dia")
                image_url = img_element.get_attribute("src")
                title = img_element.get_attribute("alt") # Alt usually has the clean title

                # Fallback for title if alt is empty (less likely for this site structure)
                if not title:
                    link_element = box.find_element(By.XPATH, ".//a[contains(@href, '/wiki/Subject_')]")
                    title = link_element.text.split('\n')[0].strip() # Get first line of text if complex

                box_text = box.text
                num_paintings_match = re.search(r'\((\d+)\s+paintings\)', box_text)
                num_paintings = int(num_paintings_match.group(1)) if num_paintings_match else "N/A"

                subjects_data_on_page.append({
                    "title": title.strip() if title else "N/A",
                    "image_url": image_url,
                    "number_of_paintings": num_paintings,
                    "source_page_url": current_page_url
                })
            except Exception as e_item:
                print(f"    Worker: Error processing a subject box on {current_page_url}: {e_item}")
        
        return subjects_data_on_page

    except TimeoutException:
        print(f"    Worker: Timed out waiting for subject boxes on {current_page_url}.")
    except Exception as e_page:
        print(f"    Worker: Error scraping page {current_page_url}: {e_page}")
    return [] # Return empty list on error for this page

def worker_task(page_numbers_to_scrape, base_url_template, driver_path=None):
    """
    Worker function for a multiprocessing Pool.
    Scrapes a list of specified page numbers.
    Each worker initializes and quits its own driver.
    """
    process_name = multiprocessing.current_process().name
    first_page = page_numbers_to_scrape[0]
    last_page = page_numbers_to_scrape[-1]
    print(f"[Worker {process_name}] Started. Pages: {first_page}-{last_page}")
    
    driver = None
    worker_results = []
    try:
        driver = setup_driver(webdriver_path=driver_path)
        for page_num in page_numbers_to_scrape:
            page_url = base_url_template.replace("{page_num}", str(page_num))
            # print(f"  [Worker {process_name}] Processing: {page_url}") # Can be very verbose
            
            try:
                driver.get(page_url)
                # Optional: Add a small random delay to be less aggressive
                # time.sleep(random.uniform(0.2, 0.8)) 
                
                subjects_on_page = extract_subjects_from_current_page(driver, page_url)
                if subjects_on_page:
                    worker_results.extend(subjects_on_page)
                # else:
                #     print(f"  [Worker {process_name}] No subjects found or error on page: {page_url}")
            except Exception as e_nav:
                print(f"  [Worker {process_name}] Error navigating/processing page {page_url}: {e_nav}")
                # Could implement retries for specific pages if needed

    except Exception as e_worker_init:
        print(f"[Worker {process_name}] Critical error during initialization or main loop: {e_worker_init}")
    finally:
        if driver:
            driver.quit()
            
    print(f"[Worker {process_name}] Finished. Pages: {first_page}-{last_page}. Found {len(worker_results)} items.")
    return worker_results

In [2]:
multiprocessing.freeze_support() 

start_time = time.time()

# Calculate number of workers based on total pages and pages per worker
num_workers = (TOTAL_PAGES + PAGES_PER_WORKER - 1) // PAGES_PER_WORKER # Ceiling division

# Cap workers to CPU count if desired, otherwise uses calculated based on pages
# num_workers = min(num_workers, multiprocessing.cpu_count()) 
# print(f"Using {num_workers} worker processes.")


tasks_for_pool = []
for i in range(num_workers):
    start_page_num = i * PAGES_PER_WORKER + 1
    end_page_num = min((i + 1) * PAGES_PER_WORKER, TOTAL_PAGES)
    
    # Create a list of page numbers for this worker
    pages_for_this_worker = list(range(start_page_num, end_page_num + 1))
    
    if pages_for_this_worker: # Ensure not empty
        tasks_for_pool.append(
            (pages_for_this_worker, BASE_URL_TEMPLATE, CHROMEDRIVER_PATH)
        )

if not tasks_for_pool:
    print("No tasks generated for workers. Check TOTAL_PAGES.")
else:
    print(f"Distributing {TOTAL_PAGES} pages among {len(tasks_for_pool)} worker batches.")
    
    all_scraped_data = []
    # Using a Pool to manage worker processes
    with multiprocessing.Pool(processes=num_workers) as pool:
        # starmap passes arguments from each tuple in tasks_for_pool to worker_task
        results_from_workers = pool.starmap(worker_task, tasks_for_pool)
    
    # Combine results from all workers
    for worker_result_list in results_from_workers:
        all_scraped_data.extend(worker_result_list)

    end_time = time.time()

    print("\n--- Parallel Scraping Complete ---")
    print(f"Total subjects scraped: {len(all_scraped_data)}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")

    # Print a sample of the scraped data
    if all_scraped_data:
        print("\nSample of scraped data (first 5 items):")
        for i, item in enumerate(all_scraped_data[:5]):
            print(f"\nItem {i+1}:")
            print(f"  Title: {item['title']}")
            print(f"  Image URL: {item['image_url']}")
            print(f"  Number of Paintings: {item['number_of_paintings']}")
            print(f"  Source Page: {item['source_page_url']}")
        if len(all_scraped_data) > 5:
            print(f"\n... and {len(all_scraped_data) - 5} more items.")
    else:
        print("No data was scraped.")

Distributing 256 pages among 16 worker batches.


Process SpawnPoolWorker-6:
Process SpawnPoolWorker-1:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-11:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/p_dev/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/p_dev/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/envs/p_dev/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/envs/p_dev/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'worker_task' on <module '__main__' (built-in)>
  File "/opt/anaconda3/envs/p_dev/lib/python3.11

KeyboardInterrupt: 