# Selenium vs Supacrawler: A Practical Comparison

This notebook provides a practical comparison between Selenium, a popular browser automation tool, and Supacrawler API, a cloud-based web scraping solution.

We'll compare these tools across several dimensions:
1. Setup and installation
2. Basic web scraping
3. Feature comparison


In [None]:
# Installation
# Uncomment and run these commands to install the required packages

# !pip install selenium webdriver-manager
# !pip install supacrawler


In [None]:
# Import libraries
import time
import os
import json
from datetime import datetime


## 1. Setup and Basic Usage

Let's compare how to set up and use Selenium and Supacrawler for a simple task: navigating to a webpage and getting its title.


In [None]:
# Selenium - Basic Setup
def selenium_basic(url):
    """Load ``url`` in headless Chrome and report its title and elapsed time.

    Args:
        url: Address of the page to open.

    Returns:
        dict with two keys:
            ``"title"``: the value of ``driver.title`` read after a fixed
            2-second wait.
            ``"setup_time"``: wall-clock seconds from just before the Chrome
            options are built until just after the browser has been closed.

    Raises:
        Any exception raised by Selenium or webdriver-manager is propagated
        unchanged. The browser is still closed before it propagates.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.options import Options

    start_time = time.time()

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")

    # Setup Chrome driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # From here on a browser process exists; the finally clause guarantees it
    # is shut down even when navigation or the title lookup fails.
    try:
        # Navigate to URL
        driver.get(url)

        # Wait for page to load (simple implementation)
        time.sleep(2)

        # Get page title
        title = driver.title
    finally:
        # Close browser
        driver.quit()

    # Measured after quit() so the figure includes browser teardown.
    setup_time = time.time() - start_time

    return {
        "title": title,
        "setup_time": setup_time
    }


In [None]:
# Supacrawler - Basic Setup
def supacrawler_basic(url):
    """Fetch ``url`` through the Supacrawler client and report title and timing.

    The API key is read from the ``SUPACRAWLER_API_KEY`` environment variable,
    falling back to the placeholder string ``'YOUR_API_KEY'``.

    Args:
        url: Address of the page to scrape.

    Returns:
        dict with ``"title"`` (``response.metadata.title``, or ``"No title"``
        when the response carries no truthy ``metadata``) and ``"setup_time"``
        (wall-clock seconds spent creating the client and making the request).
    """
    from supacrawler import SupacrawlerClient

    started_at = time.time()

    # Build the client, then issue a single scrape request with JS rendering.
    api_key = os.environ.get('SUPACRAWLER_API_KEY', 'YOUR_API_KEY')
    client = SupacrawlerClient(api_key=api_key)
    response = client.scrape(url=url, render_js=True)

    # Fall back to a fixed label when no metadata came back.
    if response.metadata:
        page_title = response.metadata.title
    else:
        page_title = "No title"

    elapsed = time.time() - started_at

    return {"title": page_title, "setup_time": elapsed}


In [None]:
# Run basic comparison
def compare_basic(url):
    """Run both basic-setup probes against ``url`` and print a short report.

    Each tool is attempted independently. If an attempt raises, the error is
    printed and a placeholder entry (``title`` of ``"Error"``, ``setup_time``
    of ``0``) is stored instead, so the returned mapping always contains both
    keys.

    Args:
        url: Address handed to ``selenium_basic`` and ``supacrawler_basic``.

    Returns:
        dict keyed by ``"selenium"`` and ``"supacrawler"``; each value is the
        probe's result dict or the placeholder described above.
    """
    wide_rule = "-" * 50
    narrow_rule = "-" * 30
    outcome = {}

    print(f"Testing basic setup for URL: {url}")
    print(wide_rule)

    # --- Selenium attempt -------------------------------------------------
    try:
        print("Running Selenium basic setup...")
        selenium_entry = outcome["selenium"] = selenium_basic(url)
        print(f"Title: {selenium_entry['title']}")
        print(f"Setup time: {selenium_entry['setup_time']:.2f} seconds")
    except Exception as err:
        print(f"Selenium error: {err}")
        outcome["selenium"] = {"title": "Error", "setup_time": 0}

    print(narrow_rule)

    # --- Supacrawler attempt ----------------------------------------------
    try:
        print("Running Supacrawler basic setup...")
        supacrawler_entry = outcome["supacrawler"] = supacrawler_basic(url)
        print(f"Title: {supacrawler_entry['title']}")
        print(f"Setup time: {supacrawler_entry['setup_time']:.2f} seconds")
    except Exception as err:
        print(f"Supacrawler error: {err}")
        outcome["supacrawler"] = {"title": "Error", "setup_time": 0}

    print(wide_rule)
    return outcome

# Uncomment to run the comparison with example.com
# basic_results = compare_basic("https://example.com")


In [None]:
# Visualize basic comparison
def plot_basic_times(results):
    """Draw a bar chart of each tool's ``setup_time`` and show it.

    Args:
        results: Mapping of tool name to a dict containing ``"setup_time"``,
            such as the value returned by ``compare_basic``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    tool_names = list(results)
    durations = [results[name]["setup_time"] for name in tool_names]

    plt.figure(figsize=(10, 6))
    rects = plt.bar(tool_names, durations)

    # Axis labels and chart title
    plt.xlabel('Tool')
    plt.ylabel('Setup Time (seconds)')
    plt.title('Browser Automation Tools: Setup Time Comparison')

    # Print each bar's value just above it, centred horizontally.
    for rect in rects:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width()/2.
        plt.text(centre_x, top + 0.1, f'{top:.2f}s', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Uncomment to visualize the results after running the comparison
# plot_basic_times(basic_results)


## 2. Web Scraping Comparison

Now let's compare how each tool handles extracting data from a webpage with JavaScript content.


In [None]:
# Sample JavaScript-heavy page URL.
# Passed to compare_js_scraping() in the commented-out example call below.
JS_HEAVY_URL = "https://quotes.toscrape.com/js/"


In [None]:
# Selenium - Scrape JavaScript Content
def selenium_scrape_js(url):
    """Scrape quote text and author pairs from ``url`` using headless Chrome.

    The page is loaded, given a fixed 3 seconds to run its JavaScript, and
    every element with class ``quote`` is read for its ``text`` and
    ``author`` child elements.

    Args:
        url: Address of the page to scrape.

    Returns:
        dict with three keys:
            ``"quotes"``: list of ``{"text": ..., "author": ...}`` dicts.
            ``"count"``: number of quotes found.
            ``"scrape_time"``: wall-clock seconds from just before the Chrome
            options are built until just after the browser has been closed.

    Raises:
        Any exception raised by Selenium or webdriver-manager is propagated
        unchanged (for example when a quote element lacks a ``text`` or
        ``author`` child). The browser is still closed before it propagates.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By

    start_time = time.time()

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")

    # Setup Chrome driver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # From here on a browser process exists; the finally clause guarantees it
    # is shut down even when navigation or element lookup fails.
    try:
        # Navigate to URL
        driver.get(url)

        # Wait for JavaScript to load content
        time.sleep(3)

        # Extract quotes data while the browser session is still open
        quotes = [
            {
                "text": quote_element.find_element(By.CLASS_NAME, "text").text,
                "author": quote_element.find_element(By.CLASS_NAME, "author").text,
            }
            for quote_element in driver.find_elements(By.CLASS_NAME, "quote")
        ]
    finally:
        # Close browser
        driver.quit()

    # Measured after quit() so the figure includes browser teardown.
    scrape_time = time.time() - start_time

    return {
        "quotes": quotes,
        "count": len(quotes),
        "scrape_time": scrape_time
    }


In [None]:
# Supacrawler - Scrape JavaScript Content
def supacrawler_scrape_js(url):
    """Scrape quote text and author pairs from ``url`` via the Supacrawler client.

    The page is requested with ``render_js=True`` and the quotes are pulled
    out of ``response.html`` with a regular expression. Captured fragments are
    HTML-unescaped so that entity references (for example ``&amp;`` or
    ``&#39;``) come back as plain characters, which keeps the strings
    comparable with text read from a live DOM.

    The API key is read from the ``SUPACRAWLER_API_KEY`` environment variable,
    falling back to the placeholder string ``'YOUR_API_KEY'``.

    Args:
        url: Address of the page to scrape.

    Returns:
        dict with three keys:
            ``"quotes"``: list of ``{"text": ..., "author": ...}`` dicts
            (empty when ``response.html`` is falsy or nothing matches).
            ``"count"``: number of quotes found.
            ``"scrape_time"``: wall-clock seconds spent creating the client,
            making the request and parsing the HTML.
    """
    from html import unescape
    import re

    from supacrawler import SupacrawlerClient

    start_time = time.time()

    # Initialize client
    client = SupacrawlerClient(api_key=os.environ.get('SUPACRAWLER_API_KEY', 'YOUR_API_KEY'))

    # Make API request with JavaScript rendering
    response = client.scrape(url=url, render_js=True)

    # Extract quotes data using the HTML content
    quotes = []

    if response.html:
        # Simple regex-based extraction (in a real scenario, you might use BeautifulSoup)
        quote_pattern = r'<div class="quote".*?<span class="text".*?>(.*?)</span>.*?<small class="author">(.*?)</small>'

        # re.DOTALL lets ".*?" span the newlines between the nested tags.
        for raw_text, raw_author in re.findall(quote_pattern, response.html, re.DOTALL):
            quotes.append({
                # Decode entities before stripping so that an escaped
                # non-breaking space at the edge is trimmed as whitespace too.
                "text": unescape(raw_text).strip(),
                "author": unescape(raw_author).strip()
            })

    scrape_time = time.time() - start_time

    return {
        "quotes": quotes,
        "count": len(quotes),
        "scrape_time": scrape_time
    }


In [None]:
# Run JavaScript scraping comparison
def compare_js_scraping(url):
    """Run both JavaScript-scraping probes against ``url`` and print a report.

    Each tool is attempted independently. If an attempt raises, the error is
    printed and a placeholder entry (no quotes, ``count`` of ``0``,
    ``scrape_time`` of ``0``) is stored instead, so the returned mapping
    always contains both keys.

    Args:
        url: Address handed to ``selenium_scrape_js`` and
            ``supacrawler_scrape_js``.

    Returns:
        dict keyed by ``"selenium"`` and ``"supacrawler"``; each value is the
        probe's result dict or the placeholder described above.
    """
    wide_rule = "-" * 50
    narrow_rule = "-" * 30
    outcome = {}

    print(f"Testing JavaScript scraping for URL: {url}")
    print(wide_rule)

    # --- Selenium attempt -------------------------------------------------
    try:
        print("Running Selenium JavaScript scraping...")
        selenium_entry = outcome["selenium"] = selenium_scrape_js(url)
        print(f"Quotes found: {selenium_entry['count']}")
        if selenium_entry['quotes']:
            # Show the first quote as a sample of what was extracted.
            print(f"First quote: \"{selenium_entry['quotes'][0]['text']}\" - {selenium_entry['quotes'][0]['author']}")
        print(f"Scrape time: {selenium_entry['scrape_time']:.2f} seconds")
    except Exception as err:
        print(f"Selenium error: {err}")
        outcome["selenium"] = {"quotes": [], "count": 0, "scrape_time": 0}

    print(narrow_rule)

    # --- Supacrawler attempt ----------------------------------------------
    try:
        print("Running Supacrawler JavaScript scraping...")
        supacrawler_entry = outcome["supacrawler"] = supacrawler_scrape_js(url)
        print(f"Quotes found: {supacrawler_entry['count']}")
        if supacrawler_entry['quotes']:
            # Show the first quote as a sample of what was extracted.
            print(f"First quote: \"{supacrawler_entry['quotes'][0]['text']}\" - {supacrawler_entry['quotes'][0]['author']}")
        print(f"Scrape time: {supacrawler_entry['scrape_time']:.2f} seconds")
    except Exception as err:
        print(f"Supacrawler error: {err}")
        outcome["supacrawler"] = {"quotes": [], "count": 0, "scrape_time": 0}

    print(wide_rule)
    return outcome

# Uncomment to run the comparison
# js_results = compare_js_scraping(JS_HEAVY_URL)


In [None]:
# Visualize JavaScript scraping comparison
def plot_js_comparison(results):
    """Show side-by-side bar charts of scrape time and quote count per tool.

    Args:
        results: Mapping of tool name to a dict containing ``"scrape_time"``
            and ``"count"``, such as the value returned by
            ``compare_js_scraping``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    tools = list(results)
    times = [results[tool]["scrape_time"] for tool in tools]
    counts = [results[tool]["count"] for tool in tools]

    def _label_bars(ax, bars, offset, render):
        """Write each bar's value ``offset`` units above its top, centred."""
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + offset,
                    render(height),
                    ha='center', va='bottom')

    # Set up the figure with two subplots
    _, (time_ax, count_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: elapsed time per tool
    time_bars = time_ax.bar(tools, times)
    time_ax.set_xlabel('Tool')
    time_ax.set_ylabel('Time (seconds)')
    time_ax.set_title('JavaScript Scraping Time')
    _label_bars(time_ax, time_bars, 0.1, lambda value: f'{value:.2f}s')

    # Right panel: number of quotes retrieved per tool
    count_bars = count_ax.bar(tools, counts)
    count_ax.set_xlabel('Tool')
    count_ax.set_ylabel('Quotes Count')
    count_ax.set_title('Quotes Retrieved')
    _label_bars(count_ax, count_bars, 0.5, lambda value: f'{int(value)}')

    plt.tight_layout()
    plt.show()

# Uncomment to visualize the results after running the comparison
# plot_js_comparison(js_results)


## 3. Feature Comparison

Let's compare the key features of Selenium and Supacrawler.


In [None]:
# Create comparison table
def create_comparison_table():
    """Build the Selenium vs. Supacrawler feature comparison as a DataFrame.

    Returns:
        pandas.DataFrame with the columns ``"Feature"``, ``"Selenium"`` and
        ``"Supacrawler API"`` and one row per compared feature.
    """
    import pandas as pd

    column_names = ("Feature", "Selenium", "Supacrawler API")

    # One tuple per feature: (feature, Selenium value, Supacrawler value).
    rows = [
        ("Languages", "Java, Python, C#, Ruby, JavaScript", "JavaScript, Python, REST API"),
        ("Browsers", "Chrome, Firefox, Edge, Safari", "Chrome (headless)"),
        ("Performance", "★★★☆☆", "★★★★★"),
        ("Reliability", "★★★☆☆", "★★★★★"),
        ("Setup Complexity", "★★★★☆", "★☆☆☆☆"),
        ("Maintenance", "High", "None"),
        ("Best For", "Legacy systems, enterprise", "Production scraping at scale"),
    ]

    # Pivot the row tuples into the column-oriented dict DataFrame expects.
    by_column = {
        name: [row[position] for row in rows]
        for position, name in enumerate(column_names)
    }

    return pd.DataFrame(by_column)

# Uncomment to display the comparison table
# create_comparison_table()


## Conclusion

Based on our comparison, here are the key takeaways:

1. **Selenium** is a well-established browser automation framework with broad browser support, but comes with higher maintenance overhead and slower performance.

2. **Supacrawler API** provides a simpler implementation with no infrastructure management, making it ideal for production scraping at scale with minimal development effort.

For most web scraping needs, Supacrawler API offers a better developer experience than Selenium, especially when you need to focus on extracting data rather than managing browser automation infrastructure.


## 3. Feature Comparison

Let's compare the key features of Selenium and Supacrawler.
