# Index comparison explorer

This notebook helps you explore Elasticsearch index comparison outputs produced by the index comparison CLI.

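1. Use the configuration cell to choose a captured analysis run (an `analysis-*` directory under `../data`) and inspect its index snapshots. Each run must contain exactly two index directories under `parquet/`.
2. Lazily scan the Parquet shards of each index with Polars and report the row and column counts.
3. Run the example queries (or add your own) to spot differences between the two indices, such as document IDs that exist in only one of them.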

In [None]:
from __future__ import annotations

from pathlib import Path

import ipywidgets as widgets
from IPython.display import clear_output, display

# Absolute location of the folder that holds the captured analysis runs.
DATA_ROOT = Path("../data").resolve()

# Collect every "analysis-*" sub-directory, then order the paths ascending.
analysis_dirs = [entry for entry in DATA_ROOT.glob("analysis-*") if entry.is_dir()]
analysis_dirs.sort()

# Nothing to explore without at least one captured run.
if len(analysis_dirs) == 0:
    raise FileNotFoundError(f"No analysis directories found in {DATA_ROOT}")

# Default selection: the first run in sorted order.
analysis_name = analysis_dirs[0].name


def update_state(selected: str) -> None:
    """Point the notebook's shared state at the analysis run named ``selected``.

    Resolves ``DATA_ROOT / selected / "parquet"``, checks that it contains exactly
    two index directories, then publishes the module-level names
    ``analysis_name``, ``analysis_path``, ``parquet_root``, ``index_dirs`` and
    ``index_names`` and prints a short summary (one line per index with the
    number of ``*.parquet`` files found directly inside it).

    The run is validated *before* any global is reassigned, so a rejected
    selection leaves the previously loaded state untouched instead of
    half-updated (new name/path combined with the old index directories).

    Args:
        selected: Directory name of the analysis run under ``DATA_ROOT``.

    Raises:
        FileNotFoundError: If the run has no ``parquet`` directory.
        ValueError: If the ``parquet`` directory does not contain exactly two
            sub-directories.
    """
    global analysis_name, analysis_path, parquet_root, index_dirs, index_names

    # Resolve and validate using locals only; globals are committed at the end.
    new_analysis_path = DATA_ROOT / selected
    new_parquet_root = new_analysis_path / "parquet"
    # is_dir() also rejects a plain file named "parquet", which would otherwise
    # fail later inside iterdir() with a less helpful error.
    if not new_parquet_root.is_dir():
        raise FileNotFoundError(f"No 'parquet' directory found in {new_analysis_path}")
    new_index_dirs = sorted(p for p in new_parquet_root.iterdir() if p.is_dir())
    if len(new_index_dirs) != 2:
        raise ValueError(
            f"Expected exactly two index directories, found {len(new_index_dirs)} in {new_parquet_root}"
        )

    # Validation passed: publish the new, mutually consistent state.
    analysis_name = selected
    analysis_path = new_analysis_path
    parquet_root = new_parquet_root
    index_dirs = new_index_dirs
    index_names = [p.name for p in index_dirs]

    print(f"Using analysis: {analysis_name}")
    for name, folder in zip(index_names, index_dirs):
        # Count shard files without materialising the full listing.
        shard_count = sum(1 for _ in folder.glob("*.parquet"))
        print(f"  - {name} ({shard_count} shard files)")


# Selector listing every discovered analysis run by directory name,
# pre-set to the current default selection.
dropdown = widgets.Dropdown(
    description="Analysis:",
    layout=widgets.Layout(width="60%"),
    options=[run_dir.name for run_dir in analysis_dirs],
    value=analysis_name,
)


def _on_change(change):
    """Dropdown observer: redraw the selector and reload state for the new value.

    ``change`` is the mapping handed to the observer. Events whose ``name`` is
    not ``"value"`` or whose ``new`` entry is ``None`` are ignored.
    """
    if change.get("name") != "value":
        return
    if change.get("new") is None:
        return

    # Replace the previous summary: clear the output, show the widget again,
    # then let update_state print the summary for the new selection.
    clear_output(wait=True)
    display(dropdown)
    update_state(change["new"])


# Call _on_change whenever the dropdown's "value" trait changes.
dropdown.observe(_on_change, names="value")
# Show the selector before update_state prints its summary.
display(dropdown)

# Initialise the shared state (paths, index names) for the default selection.
update_state(analysis_name)

In [None]:
import polars as pl
from IPython.display import display

# Set to True to also render the first rows of each index while loading.
SHOW_SAMPLES = False

# index name -> LazyFrame scanning all of that index's Parquet shards
index_lazy_frames = {}
# index name -> total number of rows (documents) across the shards
index_counts = {}

for name, folder in zip(index_names, index_dirs):
    shard_paths = sorted(folder.glob("*.parquet"))
    if not shard_paths:
        raise FileNotFoundError(f"No parquet shards found in {folder}")

    lazy_frame = pl.scan_parquet([str(path) for path in shard_paths])
    index_lazy_frames[name] = lazy_frame

    count_df = lazy_frame.select(pl.len().alias("row_count")).collect()
    index_counts[name] = int(count_df["row_count"][0])

    # Resolve the schema explicitly (same approach as the comparison cell);
    # LazyFrame.columns triggers a Polars performance warning.
    column_count = len(lazy_frame.collect_schema().names())
    print(
        f"{name}: {index_counts[name]:,} documents across {len(shard_paths)} shards "
        f"({column_count} columns)"
    )

    # Only read sample rows when they are actually going to be shown.
    if SHOW_SAMPLES:
        display(lazy_frame.head(3).collect())
        print("-")

In [None]:
import polars as pl
from IPython.display import display

def _row_count(lazy_frame: pl.LazyFrame) -> int:
    """Return the number of rows ``lazy_frame`` yields when collected."""
    return int(lazy_frame.select(pl.len()).collect().item())


# update_state guarantees exactly two indices; treat them as left/right.
left_name, right_name = index_names

left_cols_available = set(index_lazy_frames[left_name].collect_schema().names())
right_cols_available = set(index_lazy_frames[right_name].collect_schema().names())

# Every query below joins on "_id"; fail early with a message naming the
# offending index rather than deep inside a lazy query plan.
for checked_name, checked_cols in (
    (left_name, left_cols_available),
    (right_name, right_cols_available),
):
    if "_id" not in checked_cols:
        raise pl.exceptions.ColumnNotFoundError(
            f"Index {checked_name!r} has no '_id' column; cannot compare document IDs"
        )

# Distinct document IDs per index.
unique_left = index_lazy_frames[left_name].select(pl.col("_id")).unique()
unique_right = index_lazy_frames[right_name].select(pl.col("_id")).unique()

# IDs in both indices, and IDs exclusive to either side (anti joins).
common_ids = unique_left.join(unique_right, on="_id", how="inner")
only_left = unique_left.join(unique_right, on="_id", how="anti")
only_right = unique_right.join(unique_left, on="_id", how="anti")

common_count = _row_count(common_ids)
left_only_count = _row_count(only_left)
right_only_count = _row_count(only_right)

print(f"Common IDs: {common_count:,}")
print(f"Only in {left_name}: {left_only_count:,}")
print(f"Only in {right_name}: {right_only_count:,}")

if left_only_count:
    sample_left = only_left.limit(5).collect()
    print(f"\nSample IDs only in {left_name}:")
    display(sample_left)
else:
    print(f"\nNo IDs unique to {left_name}.")

if right_only_count:
    sample_right = only_right.limit(5).collect()
    print(f"\nSample IDs only in {right_name}:")
    display(sample_right)
else:
    print(f"\nNo IDs unique to {right_name}.")

if common_count:
    # Include "_source" only for the sides that actually have that column.
    left_cols = ["_id"] + (["_source"] if "_source" in left_cols_available else [])
    right_cols = ["_id"] + (["_source"] if "_source" in right_cols_available else [])
    common_sample = (
        index_lazy_frames[left_name].select(left_cols)
        .join(
            index_lazy_frames[right_name].select(right_cols),
            on="_id",
            how="inner",
            suffix="_right",  # right-hand columns that clash get this suffix
        )
        .limit(1)
        .collect()
    )
    print("\nExample document present in both indices:")
    display(common_sample)