In [10]:
pip install selenium webdriver-manager



In [11]:
# 📦 System dependencies for Chromium & ChromeDriver
!apt-get update
!apt install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/chromedriver
!pip install selenium

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com] [Connected to r2u.                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://pp

In [16]:
# Install required packages
!pip install selenium webdriver-manager pillow

# Import core Python modules
import os
import time
import logging
import shutil
import hashlib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from PIL import Image

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler()
    ]
)

# Ask for website URL at runtime
website_url = input("Enter the website URL: ")

# Configuration
CONFIG = {
    'url': website_url,
    'output_dir': "webpage_elements",
    'fullpage_dir': "full_page",
    'html_save_path': "saved_page.html",
    'min_container_width': 300,  # Minimum width in pixels
    'min_container_height': 150,  # Minimum height in pixels
    'container_selectors': {
        'image_containers': [
            '//figure[contains(@class, "mw-default-size")]',
            '//div[contains(@class, "thumb")]',
            '//div[contains(@class, "image-container")]'
        ],
        'tables': '//table[not(ancestor::div[contains(@class, "navbox")])]',
        'graphs': '//div[contains(@class, "chart") or contains(@class, "graph")]'
    },
    'exclude_selectors': [
        '.navbox',
        '.sidebar',
        '.mw-logo',
        '.vector-header-container'
    ],
    'scroll_padding': 50,
    'wait_timeout': 15
}

captured_hashes = set()

def initialize_environment():
    """Set up directory structure and clean previous runs"""
    try:
        if os.path.exists(CONFIG['output_dir']):
            shutil.rmtree(CONFIG['output_dir'])

        os.makedirs(CONFIG['output_dir'], exist_ok=True)
        os.makedirs(os.path.join(CONFIG['output_dir'], CONFIG['fullpage_dir']), exist_ok=True)

        logging.info("Environment initialized")

    except Exception as e:
        logging.error(f"Environment setup failed: {str(e)}")
        raise

def create_driver():
    """Configure and initialize headless Chrome for Colab"""
    try:
        # Colab-specific Chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--disable-gpu')

        # Use Colab-specific setup
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)

        logging.info("Chrome instance created")
        return driver

    except Exception as e:
        logging.error(f"Driver creation failed: {str(e)}")
        raise

def get_container_hash(element):
    """Create unique identifier for containers to prevent duplicates"""
    location = element.location
    size = element.size
    hash_string = f"{location['x']}_{location['y']}_{size['width']}_{size['height']}"
    return hashlib.md5(hash_string.encode()).hexdigest()

def is_valid_container(element):
    """Validate container meets size requirements and visibility"""
    try:
        size = element.size
        return all([
            element.is_displayed(),
            size['width'] >= CONFIG['min_container_width'],
            size['height'] >= CONFIG['min_container_height']
        ])
    except Exception:
        return False

def save_page_html(driver):
    """Save the HTML source code of the page"""
    try:
        with open(CONFIG['html_save_path'], 'w', encoding='utf-8') as file:
            file.write(driver.page_source)
        logging.info(f"Page HTML saved: {CONFIG['html_save_path']}")
    except Exception as e:
        logging.error(f"Saving HTML failed: {str(e)}")

def capture_full_page_screenshot(driver):
    """Capture full page screenshot by scrolling and stitching images"""
    try:
        total_width = driver.execute_script("return document.body.scrollWidth")
        total_height = driver.execute_script("return document.body.scrollHeight")
        viewport_height = driver.execute_script("return window.innerHeight")
        driver.set_window_size(total_width, viewport_height)

        timestamp = int(time.time())
        fullpage_path = os.path.join(CONFIG['output_dir'], CONFIG['fullpage_dir'], f"fullpage_{timestamp}.png")

        stitched_image = Image.new('RGB', (total_width, total_height))
        current_y = 0

        for y in range(0, total_height, viewport_height):
            driver.execute_script(f"window.scrollTo(0, {y})")
            time.sleep(0.5)
            temp_path = f"temp_screenshot_{y}.png"
            driver.save_screenshot(temp_path)
            screenshot = Image.open(temp_path)
            stitched_image.paste(screenshot, (0, current_y))
            current_y += screenshot.size[1]
            os.remove(temp_path)

        stitched_image.save(fullpage_path)
        logging.info(f"Full page screenshot saved: {fullpage_path}")
    except Exception as e:
        logging.error(f"Full page capture failed: {str(e)}")

def capture_container(driver, container, container_type):
    """Capture screenshot of validated container"""
    try:
        container_hash = get_container_hash(container)
        if container_hash in captured_hashes or not is_valid_container(container):
            return

        # Scroll to container
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", container)
        time.sleep(0.5)  # Allow for rendering

        # Create output filename
        timestamp = int(time.time())
        filename = f"{container_type}_{timestamp}_{container_hash[:6]}.png"
        output_path = os.path.join(CONFIG['output_dir'], filename)

        # Capture and save
        container.screenshot(output_path)
        captured_hashes.add(container_hash)
        logging.info(f"Captured {container_type} container: {filename}")

    except Exception as e:
        logging.warning(f"Container capture failed: {str(e)}")

def find_content_containers(driver):
    """Locate relevant content containers excluding navigation elements"""
    logging.info("Identifying content containers")

    containers = []

    # Find image containers
    for xpath in CONFIG['container_selectors']['image_containers']:
        containers.extend(driver.find_elements(By.XPATH, xpath))

    # Find tables and graphs
    for element_type in ['tables', 'graphs']:
        containers.extend(driver.find_elements(By.XPATH, CONFIG['container_selectors'][element_type]))

    # Filter out excluded elements
    filtered = []
    for container in containers:
        try:
            if not container.find_elements(By.XPATH, 'ancestor::*[{}]'.format(
                ' or '.join([f'contains(@class, "{cls}")' for cls in CONFIG['exclude_selectors']])
            )):
                filtered.append(container)
        except Exception:
            continue

    logging.info(f"Found {len(filtered)} valid containers")
    return filtered

def process_page(driver):
    """Main processing workflow"""
    try:
        # Save HTML content
        save_page_html(driver)

        # Capture full page first
        capture_full_page_screenshot(driver)

        # Find and process containers
        containers = find_content_containers(driver)
        for idx, container in enumerate(containers, 1):
            container_type = "image" if idx <= len(CONFIG['container_selectors']['image_containers']) else "content"
            capture_container(driver, container, container_type)

    except Exception as e:
        logging.error(f"Processing failed: {str(e)}")
        raise

def execute_workflow():
    """Main execution controller"""
    driver = None
    try:
        initialize_environment()
        driver = create_driver()

        logging.info(f"Loading: {CONFIG['url']}")
        driver.get(CONFIG['url'])

        WebDriverWait(driver, CONFIG['wait_timeout']).until(
            EC.presence_of_element_located((By.XPATH, "//body"))
        )
        process_page(driver)
        logging.info("Operation completed successfully")

    except Exception as e:
        logging.error(f"Workflow failed: {str(e)}")
    finally:
        if driver:
            driver.quit()
            logging.info("Browser instance closed")

if __name__ == "__main__":
    execute_workflow()

Enter the website URL: https://en.wikipedia.org/wiki/2024_United_States_presidential_election


In [18]:
# Install required packages
!pip install google-generativeai pillow pandas

import os
import glob
import json
import pandas as pd
from google import generativeai as genai
import PIL.Image

# Configuration
OUTPUT_DIR = "/content/webpage_elements"
CSV_OUTPUT_DIR = "/content/webpage_elements_csv"
CATEGORIES = ["table", "map", "chart", "graph", "other"]
API_KEY = "Replace_With_Your_API"  # Your Gemini API key

# Initialize the Gemini client - FIXED API INITIALIZATION
genai.configure(api_key=API_KEY)

# Create output directory for CSVs if it doesn't exist
os.makedirs(CSV_OUTPUT_DIR, exist_ok=True)

# Create directories for each category
for category in CATEGORIES:
    os.makedirs(os.path.join(CSV_OUTPUT_DIR, category), exist_ok=True)

def categorize_image(image_path):
    """Use Gemini to categorize the image type"""
    try:
        # Load the image
        image = PIL.Image.open(image_path)

        # Ask Gemini to categorize the image
        model = genai.GenerativeModel('gemini-2.0-flash')
        response = model.generate_content([
            "Categorize this image into exactly ONE of these categories: table, map, chart, graph, or other. " +
            "Respond with ONLY the category name in lowercase without any additional text.",
            image
        ])

        # Get the category
        category = response.text.strip().lower()

        # Validate the category
        if category not in CATEGORIES:
            print(f"Unexpected category '{category}' for {image_path}, defaulting to 'other'")
            category = "other"

        return category

    except Exception as e:
        print(f"Error categorizing {image_path}: {str(e)}")
        return "other"

def extract_table_to_csv(image_path, output_path):
    """Extract table content from image and save as CSV"""
    try:
        # Load the image
        image = PIL.Image.open(image_path)

        # Ask Gemini to extract the table content
        model = genai.GenerativeModel('gemini-2.0-flash')
        response = model.generate_content([
            "Extract the content of this table into a CSV format. " +
            "Provide ONLY the CSV data with comma as delimiter, with no additional text. " +
            "Each row should be on a new line. Include the header row.",
            image
        ])

        csv_content = response.text.strip()

        # Try to parse the CSV content to validate it
        try:
            # Split the raw text into lines
            lines = csv_content.split('\n')

            # Convert to a list of lists for pandas DataFrame
            data = [line.split(',') for line in lines]

            if len(data) > 0 and len(data[0]) > 0:
                # Create a DataFrame
                df = pd.DataFrame(data[1:], columns=data[0])

                # Save to CSV
                df.to_csv(output_path, index=False)
                print(f"Saved table as CSV: {output_path}")
                return True
            else:
                print(f"Failed to parse CSV content for {image_path}")
                return False

        except Exception as e:
            print(f"Error parsing CSV content: {str(e)}")
            # Save the raw text as a fallback
            with open(output_path, 'w') as f:
                f.write(csv_content)
            print(f"Saved raw table content as CSV: {output_path}")
            return True

    except Exception as e:
        print(f"Error extracting table from {image_path}: {str(e)}")
        return False

def process_all_images():
    """Process all images in the output directory"""
    # Get all image files
    image_files = glob.glob(os.path.join(OUTPUT_DIR, "*.png"))

    # Add images from fullpage directory
    image_files.extend(glob.glob(os.path.join(OUTPUT_DIR, "full_page", "*.png")))

    print(f"Found {len(image_files)} images to process")

    # Create a dictionary to store image categories
    image_categories = {}

    # Process each image
    for image_path in image_files:
        image_filename = os.path.basename(image_path)
        print(f"Processing: {image_filename}")

        # Categorize the image
        category = categorize_image(image_path)
        print(f"  Categorized as: {category}")

        # Add to categories dictionary
        image_categories[image_filename] = category

        # Create symlink or copy to category directory
        category_dir = os.path.join(CSV_OUTPUT_DIR, category)

        # Use copy instead of symlink for better compatibility
        import shutil
        dest_path = os.path.join(category_dir, image_filename)
        shutil.copy2(image_path, dest_path)
        print(f"  Copied to: {dest_path}")

        # If it's a table, extract to CSV
        if category == "table":
            csv_filename = os.path.splitext(image_filename)[0] + ".csv"
            csv_path = os.path.join(category_dir, csv_filename)
            extract_table_to_csv(image_path, csv_path)

    # Save the categories to a JSON file for reference
    with open(os.path.join(CSV_OUTPUT_DIR, "image_categories.json"), 'w') as f:
        json.dump(image_categories, f, indent=2)

    print(f"Processing complete. Found {len(image_categories)} images:")
    for category in CATEGORIES:
        count = list(image_categories.values()).count(category)
        print(f"  {category}: {count}")

if __name__ == "__main__":
    process_all_images()

Found 27 images to process
Processing: content_1744325976_154fc0.png
  Categorized as: table
  Copied to: /content/webpage_elements_csv/table/content_1744325976_154fc0.png
Saved table as CSV: /content/webpage_elements_csv/table/content_1744325976_154fc0.csv
Processing: content_1744325977_51fab5.png
  Categorized as: table
  Copied to: /content/webpage_elements_csv/table/content_1744325977_51fab5.png
Error parsing CSV content: 10 columns passed, passed data had 14 columns
Saved raw table content as CSV: /content/webpage_elements_csv/table/content_1744325977_51fab5.csv
Processing: content_1744325981_c3eb16.png
  Categorized as: table
  Copied to: /content/webpage_elements_csv/table/content_1744325981_c3eb16.png
Error parsing CSV content: 2 columns passed, passed data had 4 columns
Saved raw table content as CSV: /content/webpage_elements_csv/table/content_1744325981_c3eb16.csv
Processing: content_1744325973_9771f2.png
  Categorized as: table
  Copied to: /content/webpage_elements_csv/tab



Error extracting table from /content/webpage_elements/content_1744325972_975996.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
Processing: content_1744325962_28f1c7.png




Error categorizing /content/webpage_elements/content_1744325962_28f1c7.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325962_28f1c7.png
Processing: content_1744325980_b66443.png




Error categorizing /content/webpage_elements/content_1744325980_b66443.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325980_b66443.png
Processing: content_1744325967_4a256b.png




Error categorizing /content/webpage_elements/content_1744325967_4a256b.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325967_4a256b.png
Processing: content_1744325961_f21427.png




Error categorizing /content/webpage_elements/content_1744325961_f21427.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325961_f21427.png
Processing: content_1744325970_535de3.png




Error categorizing /content/webpage_elements/content_1744325970_535de3.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325970_535de3.png
Processing: content_1744325960_f9f348.png




Error categorizing /content/webpage_elements/content_1744325960_f9f348.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325960_f9f348.png
Processing: content_1744325968_f86b56.png




Error categorizing /content/webpage_elements/content_1744325968_f86b56.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325968_f86b56.png
Processing: content_1744325971_e6dd7c.png




Error categorizing /content/webpage_elements/content_1744325971_e6dd7c.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325971_e6dd7c.png
Processing: content_1744325979_64cb7a.png




Error categorizing /content/webpage_elements/content_1744325979_64cb7a.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325979_64cb7a.png
Processing: content_1744325965_d30022.png




Error categorizing /content/webpage_elements/content_1744325965_d30022.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325965_d30022.png
Processing: content_1744325961_7dbaae.png




Error categorizing /content/webpage_elements/content_1744325961_7dbaae.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325961_7dbaae.png
Processing: content_1744325966_ce7f71.png




Error categorizing /content/webpage_elements/content_1744325966_ce7f71.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325966_ce7f71.png
Processing: content_1744325976_0fec21.png




Error categorizing /content/webpage_elements/content_1744325976_0fec21.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325976_0fec21.png
Processing: content_1744325973_695e3a.png




Error categorizing /content/webpage_elements/content_1744325973_695e3a.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325973_695e3a.png
Processing: content_1744325963_5e63db.png




Error categorizing /content/webpage_elements/content_1744325963_5e63db.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325963_5e63db.png
Processing: content_1744325974_69e26f.png




Error categorizing /content/webpage_elements/content_1744325974_69e26f.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325974_69e26f.png
Processing: content_1744325978_a22297.png




Error categorizing /content/webpage_elements/content_1744325978_a22297.png: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/content_1744325978_a22297.png
Processing: fullpage_1744325897.png
  Categorized as: other
  Copied to: /content/webpage_elements_csv/other/fullpage_1744325897.png
Processing complete. Found 27 images:
  table: 7
  map: 2
  chart: 0
  graph: 0
  other: 18
