In [None]:
import os
import warnings

# Read the OpenRouter key from the environment instead of embedding it in the
# notebook, so the secret never ends up in version control or shared outputs.
# The previous empty parentheses evaluated to an empty tuple, not a string.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not OPENROUTER_API_KEY:
    # Warn rather than prompt/raise so "Run All" still reaches the cell that
    # actually needs the key and fails there with the provider's own error.
    warnings.warn(
        "OPENROUTER_API_KEY is not set; export it in the environment that "
        "launches Jupyter before running the LLM cells.",
        stacklevel=2,
    )

In [3]:
import dspy
# Build the language-model client used by every DSPy module in this notebook
# and register it as the global default.
lm = dspy.LM(
    "openrouter/meta-llama/llama-4-scout",
    api_key=OPENROUTER_API_KEY,
    provider="groq",
    cache=False,  # always issue a fresh request instead of reusing a cached reply
)
dspy.configure(lm=lm)

In [4]:
# Smoke test: send a trivial prompt straight through the configured LM.
# The cell output is the list of completion strings returned by the call.
lm("hi")

  headers, stream = encode_request(


['Hi! How are you today? Is there something I can help you with, or would you like to chat?']

In [5]:
import os
import logging
from typing import List
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import MarkdownHeaderTextSplitter
import faiss
import numpy as np
import dspy

# ---------------------------------------------------------------------
# Logging configuration
# ---------------------------------------------------------------------
# Emit INFO-and-above records on the console stream, formatted as
# "timestamp | level | logger name | message".
logging.basicConfig(
    handlers=[logging.StreamHandler()],   # console only for notebook
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    level=logging.INFO,
)
# Named logger shared by the pipeline cells and the MarkdownRAG module.
logger = logging.getLogger("MarkdownRAG")

# ---------------------------------------------------------------------
# 1. Load multiple markdown files
# ---------------------------------------------------------------------
# Directory that holds the markdown knowledge base. Set the ATLAN_KB_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally developed against.
kb_dir = os.environ.get(
    "ATLAN_KB_DIR",
    "/Users/vedansh.kapoor/atlan_project/experiments/knowledge_base_resource/atlan_documentation",
)
md_file_paths = [
    os.path.join(kb_dir, "apps_connectors_data-warehouses_databricks_how-tos_set-up-databricks.md"),
]
logger.info("Loading %d markdown files...", len(md_file_paths))

all_chunks = []
# Split on the first three heading levels; strip_headers=False keeps the
# heading line inside each chunk's text.
headers_to_split_on = [("#", "Header1"), ("##", "Header2"), ("###", "Header3")]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)

for path in md_file_paths:
    logger.info("Reading %s", path)
    with open(path, "r", encoding="utf-8") as f:
        md_content = f.read()
    logger.info("File loaded: %s (size=%d chars)", path, len(md_content))
    chunks = splitter.split_text(md_content)
    # Store the source filename in metadata for traceability
    for ch in chunks:
        ch.metadata["source"] = os.path.basename(path)
    all_chunks.extend(chunks)

logger.info("Total chunks across all files: %d", len(all_chunks))

# Fail here with a clear message: with zero chunks the later embedding/index
# steps would break on the embedding array's shape instead.
if not all_chunks:
    raise ValueError(
        "No chunks were produced from the markdown files; check md_file_paths and file contents."
    )

# ---------------------------------------------------------------------
# 2. Load embedding model
# ---------------------------------------------------------------------
# Load the sentence-embedding model once. The same instance embeds the document
# chunks in this cell and is later passed to MarkdownRAG to embed queries.
embed_model_name = "BAAI/bge-large-en-v1.5"
logger.info("Loading embedding model: %s", embed_model_name)
embed_model = SentenceTransformer(embed_model_name)
logger.info("Embedding model loaded.")

# ---------------------------------------------------------------------
# 3. Prepare texts for embedding
# ---------------------------------------------------------------------
# Build one string per chunk: "<header path> | <source file> | <chunk text>".
# Only the heading levels actually present in a chunk's metadata are joined.
logger.info("Preparing texts for embedding...")
texts_to_embed = []
for position, doc in enumerate(all_chunks):
    meta = doc.metadata
    heading_path = " | ".join(
        meta[level] for level in ("Header1", "Header2", "Header3") if level in meta
    )
    origin = meta.get("source", "")
    combined = f"{heading_path} | {origin} | {doc.page_content}"
    texts_to_embed.append(combined)
    logger.debug("Chunk %d prepared (len=%d)", position, len(combined))

# ---------------------------------------------------------------------
# 4. Create embeddings
# ---------------------------------------------------------------------
# Encode all prepared texts in a single call; convert_to_numpy=True asks for a
# NumPy array, whose shape is logged here and read again when sizing the index.
logger.info("Creating embeddings for %d chunks...", len(texts_to_embed))
embeddings = embed_model.encode(texts_to_embed, convert_to_numpy=True)
logger.info("Embeddings shape: %s", embeddings.shape)

# ---------------------------------------------------------------------
# 5. Create and populate FAISS index
# ---------------------------------------------------------------------
# The second axis of the embeddings array is taken as the vector dimension.
embedding_dim = embeddings.shape[1]
logger.info("Creating FAISS index with dimension %d", embedding_dim)
# Flat L2 index sized to the embedding dimension.
faiss_index = faiss.IndexFlatL2(embedding_dim)
# Vectors are added in the same order as all_chunks; the retrieval code relies
# on the ids returned by search() being positions into that list.
faiss_index.add(embeddings)
logger.info("FAISS index populated with embeddings.")

# ---------------------------------------------------------------------
# 6. Keep chunk metadata
# ---------------------------------------------------------------------
# Alias (not a copy) of the chunk list: entry i is the chunk whose embedding
# was added to the FAISS index at row i.
metadata_store = all_chunks




  from .autonotebook import tqdm as notebook_tqdm
2025-09-13 23:08:19,904 | INFO | MarkdownRAG | Loading 1 markdown files...
2025-09-13 23:08:19,905 | INFO | MarkdownRAG | Reading /Users/vedansh.kapoor/atlan_project/experiments/knowledge_base_resource/atlan_documentation/apps_connectors_data-warehouses_databricks_how-tos_set-up-databricks.md
2025-09-13 23:08:19,906 | INFO | MarkdownRAG | File loaded: /Users/vedansh.kapoor/atlan_project/experiments/knowledge_base_resource/atlan_documentation/apps_connectors_data-warehouses_databricks_how-tos_set-up-databricks.md (size=39283 chars)
2025-09-13 23:08:19,909 | INFO | MarkdownRAG | Total chunks across all files: 27
2025-09-13 23:08:19,910 | INFO | MarkdownRAG | Loading embedding model: BAAI/bge-large-en-v1.5
2025-09-13 23:08:19,936 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device_name: mps
2025-09-13 23:08:19,936 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: BAAI/bge-large-en

In [6]:
import os

# mem0 configuration: one LLM entry and one local embedder entry, passed
# unchanged to Memory.from_config at the end of this cell.
config = {
    "llm": {
        # NOTE(review): the provider is "groq" while the model id and API key
        # are OpenRouter ones — confirm mem0 routes this combination as intended.
        "provider": "groq",
        "config": {
            "model": "openrouter/meta-llama/llama-4-scout",
            "api_key": OPENROUTER_API_KEY,   # your notebook variable
            
        }
    },
    "embedder": {
        "provider": "huggingface",
        "config": {
            # NOTE(review): no embedding dimension or vector store is configured
            # here — confirm mem0's default store matches this model's output size.
            "model": "BAAI/bge-large-en-v1.5"
            # no API key needed for local model
        }
    }
}

from mem0 import Memory
# Build the memory object from the dict above; the cell output shows this
# inserting a vector into the "mem0migrations" collection.
memory = Memory.from_config(config)


2025-09-13 23:08:49,891 | INFO | mem0.vector_stores.qdrant | Inserting 1 vectors into collection mem0migrations


In [16]:
# ---------------------------------------------------------------------
# 7. DSPy signature
# ---------------------------------------------------------------------
# Signature for the answer-generation step: two inputs (retrieved context and
# the user's question) and one output (the answer).
# NOTE: documented with comments rather than a class docstring on purpose —
# DSPy uses a Signature's docstring as the task instruction in the prompt, so
# adding one would change what is sent to the model.
class GenerateAnswer(dspy.Signature):
    # Text assembled from the retrieved chunks (see MarkdownRAG.forward).
    context = dspy.InputField(desc="Relevant markdown context extracted from chunks")
    # The question exactly as the caller passed it.
    question = dspy.InputField(desc="User question")
    # The only output field; callers read it as `.answer`.
    answer = dspy.OutputField(desc="Concise, accurate answer citing sources")

# ---------------------------------------------------------------------
# 8. DSPy RAG Module
# ---------------------------------------------------------------------
class MarkdownRAG(dspy.Module):
    """Retrieve markdown chunks from a FAISS index and answer a question with them.

    The module embeds the question, looks up the nearest chunk embeddings,
    concatenates the matching chunks (each tagged with its source file) into a
    context string, and passes that context plus the question to a
    chain-of-thought predictor built on ``GenerateAnswer``.
    """

    def __init__(self, faiss_index, metadata_store, embed_model, k: int = 3):
        """Store the retrieval components and build the answer generator.

        Args:
            faiss_index: Index whose ``search(query, k)`` returns
                ``(distances, ids)``; ids are positions into ``metadata_store``.
            metadata_store: Sequence of chunks exposing ``page_content`` and a
                ``metadata`` dict, in the same order the vectors were indexed.
            embed_model: Object whose ``encode(list_of_str, convert_to_numpy=True)``
                returns the query embeddings.
            k: Number of chunks to retrieve per question (default 3, the value
                previously hard-coded).

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        super().__init__()
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.faiss_index = faiss_index
        self.metadata_store = metadata_store
        self.embed_model = embed_model
        self.k = k
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        logger.info("MarkdownRAG module initialized for %d chunks.", len(metadata_store))

    def forward(self, question: str):
        """Answer ``question`` from the indexed chunks.

        Args:
            question: The user's question.

        Returns:
            ``dspy.Prediction`` with a single ``answer`` field.
        """
        logger.info("Received query: %s", question)
        query_emb = self.embed_model.encode([question], convert_to_numpy=True)
        distances, indices = self.faiss_index.search(query_emb, self.k)
        logger.debug("FAISS search distances: %s", distances)

        # FAISS pads the result with id -1 when the index holds fewer than k
        # vectors. Indexing the store with -1 would silently return the LAST
        # chunk as if it were a hit, so drop those placeholders.
        hit_ids = [int(i) for i in indices[0] if i >= 0]
        retrieved_chunks = [self.metadata_store[i] for i in hit_ids]
        logger.info("Retrieved %d chunks for the query.", len(retrieved_chunks))
        if not retrieved_chunks:
            logger.warning("No chunks retrieved; answering with empty context.")

        context_pieces = []
        for idx, chunk in enumerate(retrieved_chunks):
            source = chunk.metadata.get("source", "")
            logger.debug("Chunk %d source: %s", idx, source)
            # Tag each piece with its file so the model can cite it.
            context_pieces.append(f"{chunk.page_content}\n(Source: {source})")
        context = "\n---\n".join(context_pieces)

        result = self.generate_answer(context=context, question=question)
        logger.info("Generated answer for query.")
        return dspy.Prediction(answer=result.answer)

# ---------------------------------------------------------------------
# 9. Notebook-friendly usage
# ---------------------------------------------------------------------
# Wire the index, chunk list and embedding model built in the earlier cells
# into a ready-to-use RAG module.
rag = MarkdownRAG(faiss_index, metadata_store, embed_model)
logger.info("MarkdownRAG ready to answer questions from %d files.", len(md_file_paths))

# Example cell to query:
# Calling the module runs retrieval + generation; the result exposes `.answer`.
prediction = rag("how can i set up databricks with atlan's platform?")
print(prediction.answer)

2025-09-13 23:30:23,328 | INFO | MarkdownRAG | MarkdownRAG module initialized for 27 chunks.
2025-09-13 23:30:23,330 | INFO | MarkdownRAG | MarkdownRAG ready to answer questions from 1 files.
2025-09-13 23:30:23,402 | INFO | MarkdownRAG | Received query: how can i set up databricks with atlan's platform?
2025-09-13 23:30:24,550 | INFO | MarkdownRAG | Retrieved 3 chunks for the query.
2025-09-13 23:30:28,263 | INFO | MarkdownRAG | Generated answer for query.


To set up Databricks with Atlan's platform, follow these steps:
1. Choose an authentication method: 
   - Personal access token authentication
   - AWS service principal authentication
   - Azure service principal authentication

2. Select a cluster option:
   - **Interactive Cluster**: Ensure an all-purpose interactive cluster is configured with Autopilot options enabled and JDBC/ODBC tab fields populated (Server Hostname, Port, and HTTP Path).
   - **SQL Warehouse (formerly SQL endpoint)**: Ensure a SQL warehouse is configured and connection details are noted (Server hostname, Port, and HTTP path).

3. **Grant Permissions to Crawl Metadata**:
   - Ensure a Unity Catalog-enabled Databricks workspace.
   - Grant the BROWSE privilege to the user or service principal for the catalog you want to crawl in Atlan.

For detailed steps and specific configurations, refer to the [Atlan documentation for setting up Databricks](https://docs.atlan.com/apps/connectors/data-warehouses/databricks/how-