# Create Vector Search Indexes

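This notebook:
1. Creates the Vector Search endpoint (if it does not already exist)
2. Creates a Delta Sync index for each agent's document table
3. Triggers a sync, waits for the indexes to come online, and runs a test similarity search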

**Prerequisites:** Run `01_parse_pdfs.ipynb` first to create Delta tables

In [None]:
%pip install databricks-sdk databricks-vectorsearch
# Restart the Python process right after the install so later cells import the new packages.
dbutils.library.restartPython()

In [None]:
# --- Fixed settings -------------------------------------------------------
# A single Vector Search endpoint is shared by every agent.
VS_ENDPOINT_NAME = "rag-vector-search"

# Databricks Foundation Model endpoint that computes the embeddings.
EMBEDDING_MODEL = "databricks-gte-large-en"

# One (source_table, index_name) pair per agent.
AGENTS = [
    ("agent_a_docs", "agent_a_vs_index"),
    ("agent_b_docs", "agent_b_vs_index"),
    ("agent_c_docs", "agent_c_vs_index"),
]

# --- Run-time parameters --------------------------------------------------
# Widgets let the same notebook run interactively or as a parameterised job.
dbutils.widgets.text("catalog", "your_catalog", "Unity Catalog")
dbutils.widgets.text("schema", "rag_agents", "Schema name")

CATALOG = dbutils.widgets.get("catalog")
SCHEMA = dbutils.widgets.get("schema")

# When a job id is present in the Spark conf, refuse to continue while the
# catalog is still the placeholder default.
if spark.conf.get("spark.databricks.job.id", None):
    if CATALOG == "your_catalog":
        raise ValueError("catalog parameter required for job execution")

In [None]:
from databricks.sdk import WorkspaceClient
from databricks.vector_search.client import VectorSearchClient

# Workspace-level SDK client; not referenced by any other cell visible in this notebook.
w = WorkspaceClient()
# Vector Search client used by every endpoint and index call in the cells below.
vsc = VectorSearchClient()

## Create Vector Search Endpoint

In [None]:
def _endpoint_state_for_log(endpoint):
    """Return a printable state for an endpoint description.

    Prefers ``endpoint_status.state`` and falls back to a top-level ``status``
    entry, so the log line is informative with either response layout.
    Anything that is not a dict is returned unchanged.
    """
    if not isinstance(endpoint, dict):
        return endpoint
    endpoint_status = endpoint.get("endpoint_status")
    if isinstance(endpoint_status, dict):
        return endpoint_status.get("state", endpoint_status)
    if endpoint_status is not None:
        return endpoint_status
    return endpoint.get("status")


def create_endpoint_if_not_exists(endpoint_name: str):
    """Return the Vector Search endpoint, creating it when the lookup fails.

    Args:
        endpoint_name: Name of the endpoint to look up or create.

    Returns:
        Whatever ``vsc.get_endpoint`` / ``vsc.create_endpoint`` returned.

    Raises:
        Exception: Propagated from ``vsc.create_endpoint`` when creation fails.
    """
    try:
        endpoint = vsc.get_endpoint(endpoint_name)
    except Exception as lookup_err:
        # Best-effort: any lookup failure is treated as "endpoint missing", but the
        # reason is printed so auth/network problems are not silently hidden.
        print(f"Endpoint '{endpoint_name}' lookup failed: {lookup_err}")
        print(f"Creating endpoint '{endpoint_name}'...")
        endpoint = vsc.create_endpoint(name=endpoint_name, endpoint_type="STANDARD")
        print(f"Endpoint created. Status: {_endpoint_state_for_log(endpoint)}")
        return endpoint

    # Logging happens outside the try so that a formatting problem can never
    # trigger a create call for an endpoint that already exists.
    print(f"Endpoint '{endpoint_name}' exists. Status: {_endpoint_state_for_log(endpoint)}")
    return endpoint


# Ensure the shared endpoint exists; as the cell's last expression, the returned endpoint is displayed.
create_endpoint_if_not_exists(VS_ENDPOINT_NAME)

## Create Delta Sync Indexes

In [None]:
def _index_status_for_log(index):
    """Return an index's status for log output, or a placeholder if it cannot be read.

    Index handles expose their metadata through ``describe()``; a plain dict is
    also accepted so the helper works with either kind of value.
    """
    if isinstance(index, dict):
        return index.get("status")
    try:
        return index.describe().get("status")
    except Exception as describe_err:
        # Status is only informational here; never let it abort the caller.
        return f"unavailable ({describe_err})"


def create_delta_sync_index(source_table: str, index_name: str):
    """Create a Delta Sync index for a table, or sync it if it already exists.

    Args:
        source_table: Table name (without catalog/schema) holding the documents.
        index_name: Index name (without catalog/schema) to create or reuse.

    Returns:
        The existing index handle when the lookup succeeds, otherwise the handle
        returned by ``vsc.create_delta_sync_index``.

    Raises:
        Exception: Propagated from ``vsc.create_delta_sync_index`` on failure.
    """
    source_table_full = f"{CATALOG}.{SCHEMA}.{source_table}"
    index_full_name = f"{CATALOG}.{SCHEMA}.{index_name}"

    print(f"Creating index: {index_full_name}")
    print(f"  Source table: {source_table_full}")

    # Only the lookup itself sits in the try: an error while logging or syncing
    # an existing index must not fall through to a second create attempt.
    try:
        existing = vsc.get_index(VS_ENDPOINT_NAME, index_full_name)
    except Exception as lookup_err:
        # Best-effort: a failed lookup is treated as "index missing", with the
        # reason printed so unrelated failures remain visible.
        print(f"  Index lookup failed ({lookup_err}); attempting to create it")
    else:
        print(f"  Index exists. Status: {_index_status_for_log(existing)}")
        try:
            existing.sync()
            print("  Sync triggered for existing index")
        except Exception as sync_err:
            print(f"  Could not trigger sync: {sync_err}")
        return existing

    # Create Delta Sync index with embedding model
    index = vsc.create_delta_sync_index(
        endpoint_name=VS_ENDPOINT_NAME,
        index_name=index_full_name,
        source_table_name=source_table_full,
        pipeline_type="TRIGGERED",  # Manual sync; use CONTINUOUS for auto-sync
        primary_key="doc_id",
        embedding_source_column="content",
        embedding_model_endpoint_name=EMBEDDING_MODEL,
    )

    print(f"  Index created. Initial status: {_index_status_for_log(index)}")
    return index

In [None]:
# Create (or re-sync) the index for every agent and collect a status per index.
results = {}
for source_table, index_name in AGENTS:
    try:
        index = create_delta_sync_index(source_table, index_name)
    except Exception as e:
        print(f"Error creating {index_name}: {e}")
        results[index_name] = f"Error: {e}"
        continue

    # The returned handle exposes its metadata via describe() rather than a
    # dict-style get(); reading the status is kept separate so that a failure
    # here is not misreported as a failed creation.
    try:
        index_info = index.describe() if hasattr(index, "describe") else index
        results[index_name] = index_info.get("status", "created")
    except Exception as e:
        results[index_name] = f"created (status unavailable: {e})"

print("\n=== Summary ===")
for idx_name, status in results.items():
    print(f"{idx_name}: {status}")

## Sync Indexes (if using TRIGGERED pipeline)

In [None]:
# Request a sync on each agent's index; failures are reported and the loop moves on.
for source_table, index_name in AGENTS:
    index_full_name = f"{CATALOG}.{SCHEMA}.{index_name}"
    try:
        agent_index = vsc.get_index(VS_ENDPOINT_NAME, index_full_name)
        agent_index.sync()
    except Exception as e:
        print(f"Could not sync {index_name}: {e}")
    else:
        print(f"Sync triggered for {index_name}")

## Check Index Status

In [None]:
import time


def _index_readiness(index):
    """Return ``(is_ready, state_label)`` for an index handle.

    The status is read from ``index.describe()`` when the handle offers it; a
    plain dict is accepted as-is. The ``ready`` flag is used when present,
    otherwise any state beginning with ``ONLINE`` counts as ready.
    """
    description = index.describe() if hasattr(index, "describe") else index
    status = description.get("status") or {}
    state = status.get("detailed_state") or status.get("state") or "UNKNOWN"
    ready = status.get("ready")
    if ready is None:
        ready = str(state).startswith("ONLINE")
    return bool(ready), state


def wait_for_index_ready(index_name: str, timeout_minutes: int = 30, poll_seconds: float = 30):
    """Poll an index until it reports ready or the timeout elapses.

    Args:
        index_name: Index name without catalog/schema.
        timeout_minutes: Maximum time to keep polling.
        poll_seconds: Pause between polls (never extends past the deadline).

    Returns:
        True once the index is ready, False if the timeout is reached first.
    """
    index_full_name = f"{CATALOG}.{SCHEMA}.{index_name}"
    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout_minutes * 60

    while time.monotonic() < deadline:
        try:
            index = vsc.get_index(VS_ENDPOINT_NAME, index_full_name)
            ready, state = _index_readiness(index)
        except Exception as e:
            # Transient lookup errors are reported and retried until the deadline.
            print(f"{index_name}: Error checking status - {e}")
        else:
            if ready:
                print(f"{index_name}: READY")
                return True
            print(f"{index_name}: {state}")

        # Cap the pause so the function never overshoots its timeout.
        time.sleep(max(0.0, min(poll_seconds, deadline - time.monotonic())))

    print(f"{index_name}: Timeout after {timeout_minutes} minutes")
    return False


# Wait on each agent index in turn, using a 5-minute timeout instead of the
# function's 30-minute default. The True/False result is not inspected here.
for _, index_name in AGENTS:
    wait_for_index_ready(index_name, timeout_minutes=5)

## Test Similarity Search

In [None]:
# Smoke-test retrieval against the first agent's index.
# NOTE(review): `results` and `index` reuse names bound by earlier cells; they
# are kept here so any later cell relying on them still sees the same values.
test_index = f"{CATALOG}.{SCHEMA}.{AGENTS[0][1]}"
test_query = "How do I get started?"

try:
    index = vsc.get_index(VS_ENDPOINT_NAME, test_index)
    # Assumes the source table has `content`, `source` and `chunk_id` columns - TODO confirm.
    results = index.similarity_search(
        query_text=test_query,
        columns=["content", "source", "chunk_id"],
        num_results=3,
    )
    print(f"Query: {test_query}\n")

    # `result` / `data_array` may be missing or null when nothing matches.
    rows = (results.get("result") or {}).get("data_array") or []
    if not rows:
        print("No results returned - the index may be empty or still syncing.")

    for i, row in enumerate(rows, start=1):
        # Assumes columns come back in the requested order with the score last - TODO confirm.
        content = "" if row[0] is None else str(row[0])
        print(f"Result {i}:")
        print(f"  Score: {row[-1]}")
        print(f"  Content: {content[:200]}...")
        print()
except Exception as e:
    print(f"Test failed: {e}")