In [1]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

In [44]:
page = 6409  # Search-results page to scrape; change this to target a different page

# Headless Firefox: no browser window is opened while scraping
firefox_options = Options()
firefox_options.add_argument('--headless')

# Location of the Firefox executable. The default is the original Windows
# install path; set the FIREFOX_BINARY environment variable to use another
# location without editing the notebook.
firefox_binary_path = os.environ.get(
    "FIREFOX_BINARY", r"C:\Program Files\Mozilla Firefox\firefox.exe"
)
firefox_options.binary_location = firefox_binary_path

# Location of the GeckoDriver executable. Same idea: the GECKODRIVER_PATH
# environment variable overrides the original hard-coded path.
webdriver_service = Service(
    os.environ.get("GECKODRIVER_PATH", r'C:\Users\amals\geckodriver\geckodriver.exe')
)
# Function to download the image
def download_image(image_url, image_id, save_dir="images", timeout=30):
    """Download an image and store it as ``<save_dir>/<image_id>.jpg``.

    Args:
        image_url: URL of the image to fetch.
        image_id: Identifier used as the file name (without extension).
            Characters that are not allowed in file names (path separators,
            ``:``, ``*`` ...) are replaced with ``_``.
        save_dir: Directory that receives the file; created if it is missing.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole scrape.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: if the URL is invalid, the request fails
            or times out, or the server answers with an HTTP error status.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Keep the id usable as a single file name on both Windows and POSIX
    safe_name = "".join("_" if ch in '<>:"/\\|?*' else ch for ch in str(image_id))
    image_path = os.path.join(save_dir, f"{safe_name}.jpg")

    # The context manager releases the streamed connection in every case
    with requests.get(image_url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk so an error page is never saved as a .jpg
        response.raise_for_status()
        with open(image_path, 'wb') as img_file:
            for chunk in response.iter_content(1024):
                if chunk:  # skip keep-alive chunks
                    img_file.write(chunk)

    return image_path

# Collects one VQA record per (artifact, metadata field) scraped from this page
all_artifacts_data = []

# Maximum number of artifact cards to process from the listing page
max_artifacts = 32

# Absolute XPaths taken from the site's current layout; both must be updated
# if the structure of the artifact page changes.
table_xpath = "/html/body/div[2]/section[2]/div/div[2]/div[1]/table"
image_xpath = "//*[@id='maincontent']/section[1]/div/div[2]/div[2]/div[1]/a/img"

# Starts as None so the cleanup in `finally` is safe even if Firefox fails to launch
driver = None
try:
    # A single browser session serves the whole page instead of one launch per artifact
    driver = webdriver.Firefox(service=webdriver_service, options=firefox_options)

    # Step 1: Open the Penn Museum collection listing once
    driver.get(f"https://www.penn.museum/collections/search.php?images%5B0%5D=yes&submit_term=Submit%2BQuery&page={page}")

    # Wait for the page to load and ensure the collection page is visible
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.XPATH, "//h1[contains(text(), 'Collections')]"))
    )

    # Step 2: Read every detail-page URL up front. The card elements belong to
    # the listing document and cannot be used any more once the driver has
    # navigated to a detail page, so only plain URL strings are kept.
    card_divs = driver.find_elements(By.XPATH, "//div[@class='card']")
    artifact_urls = []
    for idx, card in enumerate(card_divs[:max_artifacts]):
        try:
            href = card.find_element(By.XPATH, ".//h2/a").get_attribute('href')
            artifact_urls.append((idx, href))
        except Exception as e:
            print(f"Error processing artifact {idx + 1}: {e}")

    for idx, artifact_url in artifact_urls:
        try:
            print(f"Artifact URL: {artifact_url}")  # Print the URL being used

            # Step 3: Open the detail page; the wait returns the table once it is present
            driver.get(artifact_url)
            table = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, table_xpath))
            )

            # Rows with exactly two cells are treated as key/value pairs
            artifact_data = {}
            for row in table.find_elements(By.TAG_NAME, "tr"):
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) == 2:
                    artifact_data[cells[0].text.strip()] = cells[1].text.strip()

            # Step 4: Locate the image. find_elements returns an empty list
            # instead of raising, so a missing image is detected explicitly.
            image_elements = driver.find_elements(By.XPATH, image_xpath)
            image_url = image_elements[0].get_attribute("src") if image_elements else None
            if not image_url:
                # A VQA record without an image is unusable, so skip the artifact
                # with a clear message rather than requesting an invalid URL.
                print(f"Error processing artifact {idx + 1}: no image found on {artifact_url}")
                continue

            # Use the object number as id; the fallback includes the page so
            # ids (and image files) from different pages cannot collide.
            image_id = artifact_data.get("Object Number") or f"unknown_id_{page}_{idx}"

            # Download the image and save it
            image_path = download_image(image_url, image_id)

            # Step 5: One question/answer record per metadata field
            all_artifacts_data.extend(
                {
                    "image_id": image_id,
                    "image_path": image_path,
                    "question": f"What is the {key.lower().replace('_', ' ')}?",
                    "answer": value,
                }
                for key, value in artifact_data.items()
            )

            print(f"Data saved for artifact {idx + 1}")

        except Exception as e:
            # Best effort: one broken artifact must not stop the rest of the page
            print(f"Error processing artifact {idx + 1}: {e}")

    # Report the slots for which the listing page had no card
    for idx in range(len(card_divs), max_artifacts):
        print(f"Error: No artifact found at index {idx + 1}.")

except Exception as e:
    print(f"Error processing page {page}: {e}")

finally:
    # Close the browser only if it was actually started
    if driver is not None:
        driver.quit()

# Step 6: Save the entire list of artifact data as a single JSON file
vqa_json = json.dumps(all_artifacts_data, indent=2)

json_file_path = f"all_artifacts_vqa_{page}.json"
with open(json_file_path, "w") as file:
    file.write(vqa_json)

print(f"All data saved as {json_file_path}")

Artifact URL: https://www.penn.museum/collections/object/84072
Data saved for artifact 1
Artifact URL: https://www.penn.museum/collections/object/84054
Data saved for artifact 2
Artifact URL: https://www.penn.museum/collections/object/84043
Data saved for artifact 3
Artifact URL: https://www.penn.museum/collections/object/84014
Data saved for artifact 4
Artifact URL: https://www.penn.museum/collections/object/84012
Data saved for artifact 5
Artifact URL: https://www.penn.museum/collections/object/84002
Data saved for artifact 6
Artifact URL: https://www.penn.museum/collections/object/83956
Data saved for artifact 7
Artifact URL: https://www.penn.museum/collections/object/83930
Data saved for artifact 8
Artifact URL: https://www.penn.museum/collections/object/83926
Data saved for artifact 9
Artifact URL: https://www.penn.museum/collections/object/83922
Data saved for artifact 10
Artifact URL: https://www.penn.museum/collections/object/83917
Data saved for artifact 11
Artifact URL: https

In [48]:
# Listing pages to scrape, walking backwards from 6408 down to 6350 (inclusive)
pages = list(range(6408, 6349, -1))

In [49]:
# Browser setup does not depend on the page, so it is done once before the loop
firefox_options = Options()
firefox_options.add_argument('--headless')  # Run without opening a browser window

# Executable locations default to the original Windows paths; the FIREFOX_BINARY
# and GECKODRIVER_PATH environment variables override them.
firefox_binary_path = os.environ.get(
    "FIREFOX_BINARY", r"C:\Program Files\Mozilla Firefox\firefox.exe"
)
firefox_options.binary_location = firefox_binary_path
webdriver_service = Service(
    os.environ.get("GECKODRIVER_PATH", r'C:\Users\amals\geckodriver\geckodriver.exe')
)


def download_image(image_url, image_id, save_dir="images", timeout=30):
    """Download an image and store it as ``<save_dir>/<image_id>.jpg``.

    Args:
        image_url: URL of the image to fetch.
        image_id: Identifier used as the file name (without extension).
            Characters that are not allowed in file names are replaced with ``_``.
        save_dir: Directory that receives the file; created if it is missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: if the URL is invalid, the request fails
            or times out, or the server answers with an HTTP error status.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Keep the id usable as a single file name on both Windows and POSIX
    safe_name = "".join("_" if ch in '<>:"/\\|?*' else ch for ch in str(image_id))
    image_path = os.path.join(save_dir, f"{safe_name}.jpg")

    with requests.get(image_url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk so an error page is never saved as a .jpg
        response.raise_for_status()
        with open(image_path, 'wb') as img_file:
            for chunk in response.iter_content(1024):
                if chunk:  # skip keep-alive chunks
                    img_file.write(chunk)

    return image_path


def _scrape_artifact(driver, artifact_url, page, idx):
    """Scrape one artifact detail page and return its VQA records.

    Args:
        driver: Running WebDriver used to load the page.
        artifact_url: URL of the artifact's detail page.
        page: Listing page number; only used to build a unique fallback id.
        idx: Zero-based position of the artifact on the listing page.

    Returns:
        A list of dicts with the keys ``image_id``, ``image_path``,
        ``question`` and ``answer`` - one per two-cell row of the table.

    Raises:
        ValueError: if the detail page shows no image.
        Exception: anything raised by Selenium or by ``download_image``.
    """
    driver.get(artifact_url)

    # The wait returns the located table element once it is present
    table = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/section[2]/div/div[2]/div[1]/table"))
    )

    # Rows with exactly two cells are treated as key/value pairs
    artifact_data = {}
    for row in table.find_elements(By.TAG_NAME, "tr"):
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) == 2:
            artifact_data[cells[0].text.strip()] = cells[1].text.strip()

    # find_elements returns an empty list instead of raising, so a missing
    # image is detected explicitly rather than by requesting an invalid URL.
    image_elements = driver.find_elements(By.XPATH, "//*[@id='maincontent']/section[1]/div/div[2]/div[2]/div[1]/a/img")
    image_url = image_elements[0].get_attribute("src") if image_elements else None
    if not image_url:
        raise ValueError(f"no image found on {artifact_url}")

    # The fallback id includes the page so ids (and image files) from
    # different pages cannot collide.
    image_id = artifact_data.get("Object Number") or f"unknown_id_{page}_{idx}"
    image_path = download_image(image_url, image_id)

    return [
        {
            "image_id": image_id,
            "image_path": image_path,
            "question": f"What is the {key.lower().replace('_', ' ')}?",
            "answer": value,
        }
        for key, value in artifact_data.items()
    ]


def _scrape_page(page, max_artifacts=32):
    """Scrape up to ``max_artifacts`` artifacts from one listing page.

    Uses the module-level ``webdriver_service`` and ``firefox_options``.
    Errors are printed and skipped so one bad artifact or page does not stop
    the run; the browser is always closed.

    Args:
        page: Listing page number to open.
        max_artifacts: Maximum number of artifact cards to process.

    Returns:
        The VQA records of every artifact that was scraped successfully.
    """
    records = []
    # Starts as None so the cleanup is safe even if Firefox fails to launch
    driver = None
    try:
        # A single browser session serves the whole page
        driver = webdriver.Firefox(service=webdriver_service, options=firefox_options)

        driver.get(f"https://www.penn.museum/collections/search.php?images%5B0%5D=yes&submit_term=Submit%2BQuery&page={page}")
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//h1[contains(text(), 'Collections')]"))
        )

        # Read every detail-page URL up front: the card elements belong to the
        # listing document and cannot be used once the driver navigates away.
        card_divs = driver.find_elements(By.XPATH, "//div[@class='card']")
        artifact_urls = []
        for idx, card in enumerate(card_divs[:max_artifacts]):
            try:
                href = card.find_element(By.XPATH, ".//h2/a").get_attribute('href')
                artifact_urls.append((idx, href))
            except Exception as e:
                print(f"Error processing artifact {idx + 1}: {e}")

        for idx, artifact_url in artifact_urls:
            print(f"Artifact URL: {artifact_url}")
            try:
                records.extend(_scrape_artifact(driver, artifact_url, page, idx))
                print(f"Data saved for artifact {idx + 1}")
            except Exception as e:
                print(f"Error processing artifact {idx + 1}: {e}")

        # Report the slots for which the listing page had no card
        for idx in range(len(card_divs), max_artifacts):
            print(f"Error: No artifact found at index {idx + 1}.")

    except Exception as e:
        print(f"Error processing page {page}: {e}")

    finally:
        # Close the browser only if it was actually started
        if driver is not None:
            driver.quit()

    return records


for page in pages:
    # Records of the current page only; each page gets its own JSON file
    all_artifacts_data = _scrape_page(page)

    vqa_json = json.dumps(all_artifacts_data, indent=2)

    json_file_path = f"all_artifacts_vqa_{page}.json"
    with open(json_file_path, "w") as file:
        file.write(vqa_json)

    print(f"All data saved as {json_file_path}")

Artifact URL: https://www.penn.museum/collections/object/84895
Data saved for artifact 1
Artifact URL: https://www.penn.museum/collections/object/84894
Data saved for artifact 2
Artifact URL: https://www.penn.museum/collections/object/84876
Data saved for artifact 3
Artifact URL: https://www.penn.museum/collections/object/84873
Data saved for artifact 4
Artifact URL: https://www.penn.museum/collections/object/84860
Data saved for artifact 5
Artifact URL: https://www.penn.museum/collections/object/84842
Data saved for artifact 6
Artifact URL: https://www.penn.museum/collections/object/84841
Data saved for artifact 7
Artifact URL: https://www.penn.museum/collections/object/84823
Data saved for artifact 8
Artifact URL: https://www.penn.museum/collections/object/84787
Data saved for artifact 9
Artifact URL: https://www.penn.museum/collections/object/84786
Data saved for artifact 10
Artifact URL: https://www.penn.museum/collections/object/84785
Data saved for artifact 11
Artifact URL: https

Data saved for artifact 27
Artifact URL: https://www.penn.museum/collections/object/85869
Data saved for artifact 28
Artifact URL: https://www.penn.museum/collections/object/85759
Data saved for artifact 29
Artifact URL: https://www.penn.museum/collections/object/85725
Data saved for artifact 30
Artifact URL: https://www.penn.museum/collections/object/85715
Data saved for artifact 31
Artifact URL: https://www.penn.museum/collections/object/85700
Data saved for artifact 32
All data saved as all_artifacts_vqa_6406.json
Artifact URL: https://www.penn.museum/collections/object/86676
Data saved for artifact 1
Artifact URL: https://www.penn.museum/collections/object/86653
Data saved for artifact 2
Artifact URL: https://www.penn.museum/collections/object/86619
Data saved for artifact 3
Artifact URL: https://www.penn.museum/collections/object/86615
Data saved for artifact 4
Artifact URL: https://www.penn.museum/collections/object/86614
Data saved for artifact 5
Artifact URL: https://www.penn.m

Data saved for artifact 21
Artifact URL: https://www.penn.museum/collections/object/87638
Data saved for artifact 22
Artifact URL: https://www.penn.museum/collections/object/87637
Data saved for artifact 23
Artifact URL: https://www.penn.museum/collections/object/87603
Data saved for artifact 24
Artifact URL: https://www.penn.museum/collections/object/87591
Data saved for artifact 25
Artifact URL: https://www.penn.museum/collections/object/87582
Data saved for artifact 26
Artifact URL: https://www.penn.museum/collections/object/87501
Data saved for artifact 27
Artifact URL: https://www.penn.museum/collections/object/87498
Data saved for artifact 28
Artifact URL: https://www.penn.museum/collections/object/87488
Data saved for artifact 29
Artifact URL: https://www.penn.museum/collections/object/87451
Data saved for artifact 30
Artifact URL: https://www.penn.museum/collections/object/87450
Data saved for artifact 31
Artifact URL: https://www.penn.museum/collections/object/87389
Data saved

Data saved for artifact 15
Artifact URL: https://www.penn.museum/collections/object/89600
Data saved for artifact 16
Artifact URL: https://www.penn.museum/collections/object/89598
Data saved for artifact 17
Artifact URL: https://www.penn.museum/collections/object/89595
Data saved for artifact 18
Artifact URL: https://www.penn.museum/collections/object/89590
Data saved for artifact 19
Artifact URL: https://www.penn.museum/collections/object/89586
Data saved for artifact 20
Artifact URL: https://www.penn.museum/collections/object/89558
Data saved for artifact 21
Artifact URL: https://www.penn.museum/collections/object/89557
Data saved for artifact 22
Artifact URL: https://www.penn.museum/collections/object/89517
Data saved for artifact 23
Artifact URL: https://www.penn.museum/collections/object/89507
Data saved for artifact 24
Artifact URL: https://www.penn.museum/collections/object/89482
Data saved for artifact 25
Artifact URL: https://www.penn.museum/collections/object/89478
Data saved

Data saved for artifact 9
Artifact URL: https://www.penn.museum/collections/object/91779
Data saved for artifact 10
Artifact URL: https://www.penn.museum/collections/object/91727
Data saved for artifact 11
Artifact URL: https://www.penn.museum/collections/object/91702
Data saved for artifact 12
Artifact URL: https://www.penn.museum/collections/object/91678
Data saved for artifact 13
Artifact URL: https://www.penn.museum/collections/object/91608
Data saved for artifact 14
Artifact URL: https://www.penn.museum/collections/object/91607
Data saved for artifact 15
Artifact URL: https://www.penn.museum/collections/object/91606
Data saved for artifact 16
Artifact URL: https://www.penn.museum/collections/object/91598
Data saved for artifact 17
Artifact URL: https://www.penn.museum/collections/object/91596
Data saved for artifact 18
Artifact URL: https://www.penn.museum/collections/object/91593
Data saved for artifact 19
Artifact URL: https://www.penn.museum/collections/object/91513
Data saved 

Data saved for artifact 3
Artifact URL: https://www.penn.museum/collections/object/94289
Data saved for artifact 4
Artifact URL: https://www.penn.museum/collections/object/94285
Data saved for artifact 5
Artifact URL: https://www.penn.museum/collections/object/94275
Data saved for artifact 6
Artifact URL: https://www.penn.museum/collections/object/94246
Data saved for artifact 7
Artifact URL: https://www.penn.museum/collections/object/94243
Data saved for artifact 8
Artifact URL: https://www.penn.museum/collections/object/94238
Data saved for artifact 9
Artifact URL: https://www.penn.museum/collections/object/94209
Data saved for artifact 10
Artifact URL: https://www.penn.museum/collections/object/94187
Data saved for artifact 11
Artifact URL: https://www.penn.museum/collections/object/94186
Data saved for artifact 12
Artifact URL: https://www.penn.museum/collections/object/94182
Data saved for artifact 13
Artifact URL: https://www.penn.museum/collections/object/94150
Data saved for ar

KeyboardInterrupt: 