# Databricks Runtime Discovery and Analysis

This notebook demonstrates how to fetch, filter, and analyze Databricks runtime information using the `dbx_container` package.


## 1. Initialize the Runtime Scraper

Import required modules and set up the scraper with logging.


In [1]:
from dbx_container.data.scraper import RuntimeScraper
from dbx_container.utils.logging import get_logger

# Shared logger used by every cell in this notebook
logger = get_logger(__name__)

# Scraper configured with five workers.
# NOTE(review): verify_ssl=False presumably turns off TLS certificate
# verification for the scraper's requests - confirm this is intended before
# reusing the snippet outside of a demo.
scraper = RuntimeScraper(
    max_workers=5,
    verify_ssl=False,
)

logger.info("RuntimeScraper initialized - ready to fetch runtime information")

## 2. Fetch and Display All Runtimes

Fetch runtime information from the Databricks documentation and display it in a table.


In [2]:
# Announce the fetch, then ask the scraper for every supported runtime
logger.info("Fetching runtime information from Databricks documentation...")
runtimes = scraper.get_supported_runtimes()

runtime_count = len(runtimes)
logger.info(f"Successfully fetched {runtime_count} runtimes")

# Display the runtimes in a rich table; kept as the cell's final bare
# expression so the notebook still echoes the call's return value
scraper.display_runtimes()

Output()

Output()

True

## 3. Filter and Analyze Runtimes

Filter runtimes by specific criteria and analyze their properties.


In [4]:
import re


def _version_sort_key(version: str) -> tuple:
    """Return the numeric components of a version string as a tuple of ints.

    Sorting on the raw string is lexicographic, which ranks "9.1 LTS" above
    "16.4 LTS". Comparing integer tuples such as (16, 4) and (9, 1) gives the
    expected numeric ordering. A string with no digits maps to an empty tuple.
    """
    return tuple(int(part) for part in re.findall(r"\d+", version))


# Filter LTS runtimes only
lts_runtimes = [r for r in runtimes if "LTS" in r.version]
logger.info(f"Found {len(lts_runtimes)} LTS runtimes")

# Filter ML runtimes
ml_runtimes = [r for r in runtimes if r.is_ml]
logger.info(f"Found {len(ml_runtimes)} ML runtimes")

# Non-ML LTS runtimes, newest first (numeric version order, not string order).
# Later cells index into this list, so position 0 must be the newest release.
lts_versions = sorted(
    (r for r in lts_runtimes if not r.is_ml),
    key=lambda r: _version_sort_key(r.version),
    reverse=True,
)

# Report the environment of the newest LTS runtime, if any were found
if lts_versions:
    latest = lts_versions[0]
    env = latest.system_environment
    logger.info(f"\n[bold]Latest LTS Runtime:[/bold] {latest.version}")
    logger.info(f"  Release Date: {latest.release_date}")
    logger.info(f"  Python: {env.python_version}")
    logger.info(f"  Java: {env.java_version}")
    logger.info(f"  Scala: {env.scala_version}")
    logger.info(f"  OS: {env.operating_system}")

## 4. Analyze Runtime Dependencies

Examine the Python packages and Java libraries included in a runtime.


In [5]:
# Inspect the libraries bundled with the first entry of lts_versions
if lts_versions:
    target_runtime = lts_versions[0]
    logger.info(f"\n[bold]Analyzing dependencies for {target_runtime.version}[/bold]")

    # Python libraries (empty mapping when the runtime lists none)
    python_libs = target_runtime.included_libraries.get("python", {})
    python_total = len(python_libs)
    logger.info(f"\nTotal Python packages: {python_total}")

    # Report presence and version of a handful of well-known data science packages
    popular_packages = [
        "pandas",
        "numpy",
        "scikit-learn",
        "tensorflow",
        "torch",
        "pyspark",
        "matplotlib",
        "seaborn",
    ]
    logger.info("\n[bold]Popular packages:[/bold]")
    for name in popular_packages:
        if name not in python_libs:
            logger.info(f"  ✗ {name}: not included")
            continue
        logger.info(f"  ✓ {name}: {python_libs[name]}")

    # Java libraries (empty mapping when the runtime lists none)
    java_libs = target_runtime.included_libraries.get("java", {})
    logger.info(f"\nTotal Java libraries: {len(java_libs)}")

    # Preview the first ten Python packages in the mapping's own order
    logger.info("\n[bold]Sample Python packages (first 10):[/bold]")
    preview = list(python_libs.items())[:10]
    for name, pinned in preview:
        logger.info(f"  {name}=={pinned}")

    remaining = python_total - 10
    if remaining > 0:
        logger.info(f"  ... and {remaining} more packages")

## 5. Compare Runtime Versions

Compare different versions to see changes in packages and environment.


In [6]:
# Compare the first two entries of lts_versions (index 0 is treated as the
# newer runtime, index 1 as the older one)
if len(lts_versions) >= 2:
    newer = lts_versions[0]
    older = lts_versions[1]

    logger.info("\n[bold]Comparing Runtimes:[/bold]")
    logger.info(f"  Newer: {newer.version}")
    logger.info(f"  Older: {older.version}")

    # Compare versions
    logger.info("\n[bold]Python Version:[/bold]")
    logger.info(f"  {newer.version}: {newer.system_environment.python_version}")
    logger.info(f"  {older.version}: {older.system_environment.python_version}")

    logger.info("\n[bold]OS Version:[/bold]")
    logger.info(f"  {newer.version}: {newer.system_environment.operating_system}")
    logger.info(f"  {older.version}: {older.system_environment.operating_system}")

    # Package-name sets, built once and reused for both the counts and the diff
    newer_pkgs = set(newer.included_libraries.get("python", {}))
    older_pkgs = set(older.included_libraries.get("python", {}))

    # Compare package counts
    newer_pkg_count = len(newer_pkgs)
    older_pkg_count = len(older_pkgs)

    logger.info("\n[bold]Python Package Count:[/bold]")
    logger.info(f"  {newer.version}: {newer_pkg_count} packages")
    logger.info(f"  {older.version}: {older_pkg_count} packages")
    logger.info(f"  Difference: {newer_pkg_count - older_pkg_count:+d} packages")

    # Find packages unique to each version
    new_packages = newer_pkgs - older_pkgs
    removed_packages = older_pkgs - newer_pkgs

    # Sets have no defined order, so sort before slicing; otherwise the
    # "first 5" shown would differ from one run to the next.
    if new_packages:
        logger.info(f"\n[bold]New packages in {newer.version} (showing first 5):[/bold]")
        for pkg in sorted(new_packages)[:5]:
            logger.info(f"  + {pkg}")

    if removed_packages:
        logger.info(f"\n[bold]Removed packages from {older.version} (showing first 5):[/bold]")
        for pkg in sorted(removed_packages)[:5]:
            logger.info(f"  - {pkg}")
else:
    logger.warning("Not enough LTS runtimes to compare")

## 6. Generate Runtime Summary Report

Create a comprehensive summary of the runtime ecosystem.


In [7]:
from rich.panel import Panel
from rich.table import Table
from collections import Counter

import re


def _numeric_version_key(version) -> tuple:
    """Return the digit groups of *version* as a tuple of ints.

    Used as a sort key so that "3.10.12" is ordered after "3.9.5"; a plain
    string sort would place it first.
    """
    return tuple(int(part) for part in re.findall(r"\d+", str(version)))


# Generate comprehensive summary
logger.print("\n" + "=" * 60)
logger.print(Panel.fit("[bold cyan]Databricks Runtime Ecosystem Summary[/bold cyan]"))
logger.print("=" * 60 + "\n")

# Overall statistics
logger.info(f"[bold]Total Runtimes:[/bold] {len(runtimes)}")
logger.info(f"  • LTS Runtimes: {len(lts_runtimes)}")
logger.info(f"  • ML Runtimes: {len(ml_runtimes)}")

# Python version distribution (runtimes without a recorded version are skipped)
python_versions = Counter(
    r.system_environment.python_version for r in runtimes if r.system_environment.python_version
)
logger.info("\n[bold]Python Version Distribution:[/bold]")
# Numeric ordering first; the raw string breaks ties between equal keys
for version, count in sorted(
    python_versions.items(), key=lambda item: (_numeric_version_key(item[0]), str(item[0]))
):
    logger.info(f"  • Python {version}: {count} runtimes")

# OS version distribution (runtimes without a recorded OS are skipped)
os_versions = Counter(
    r.system_environment.operating_system for r in runtimes if r.system_environment.operating_system
)
logger.info("\n[bold]OS Version Distribution:[/bold]")
for version, count in sorted(os_versions.items()):
    logger.info(f"  • {version}: {count} runtimes")

# Create a detailed table for the first five entries of lts_versions
table = Table(title="\nLatest LTS Runtimes (Top 5)", show_header=True)
table.add_column("Version", style="cyan")
table.add_column("Python", style="green")
table.add_column("OS", style="yellow")
table.add_column("ML", style="blue")

for runtime in lts_versions[:5]:
    table.add_row(
        runtime.version,
        runtime.system_environment.python_version or "N/A",
        runtime.system_environment.operating_system or "N/A",
        "✓" if runtime.is_ml else "✗",
    )

logger.print(table)
logger.info("\n✅ Runtime analysis complete!")

## 7. Integration with Build Engine

See how runtime information can be used with the container build engine.


In [None]:
from pathlib import Path
from dbx_container.engine import RuntimeContainerEngine

# Set up the container build engine against the local data directory
logger.info("\nInitializing RuntimeContainerEngine...")

# NOTE(review): verify_ssl=False presumably disables TLS certificate
# verification - confirm this is intended outside of a demo.
engine = RuntimeContainerEngine(
    data_dir=Path("../data/"),
    max_workers=5,
    verify_ssl=False,
    latest_lts_count=3,
)

# List the first three entries of lts_versions, numbered from 1
logger.info("\n[bold]Runtimes selected for building (latest 3 LTS):[/bold]")
selected_lts = lts_versions[:3]
for position, candidate in enumerate(selected_lts, start=1):
    sys_env = candidate.system_environment
    logger.info(f"  {position}. {candidate.version}")
    logger.info(f"     Python: {sys_env.python_version}, OS: {sys_env.operating_system}")

# Describe each image type the engine exposes, with its parent image if any
logger.info("\n[bold]Available image types:[/bold]")
for image_name, spec in engine.image_types.items():
    parent = spec.get("depends_on", "None (base image)")
    logger.info(f"  • {image_name}: {spec['description']}")
    logger.info(f"    Depends on: {parent}")

logger.info("\n💡 Ready to generate Dockerfiles! See build.ipynb for examples.")