In [13]:
import os
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    Settings,
    load_index_from_storage
)
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.extractors import TitleExtractor
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

# Embedding model shared by indexing and querying (IBM Granite, multilingual)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="ibm-granite/granite-embedding-107m-multilingual"
)

# Markdown-aware node parser used when documents are turned into nodes.
# NOTE(review): the chunks in the recorded run start at section headings, which
# suggests this parser splits on Markdown headers; confirm that chunk_size and
# chunk_overlap are actually honoured by MarkdownNodeParser.
Settings.node_parser = MarkdownNodeParser(
    chunk_size=512,
    chunk_overlap=20
)

# Title extraction is currently disabled; the line is kept for reference only
# Settings.transformations = [TitleExtractor(nodes="ALL")]

# Location of the Markdown corpus. Set the RAG_DATA_DIR environment variable to
# point at your own copy; the original local path is only the fallback, so the
# notebook is no longer tied to a single machine.
DATA_DIR = (
    os.environ.get("RAG_DATA_DIR")
    or "/Users/alex/typetwo-public/rag/data/2025 Volume 68 (subset) Parsed"
)
# Directory used for the on-disk copy of the index
PERSIST_DIR = "./storage"

def build_index():
    """Build the FAISS-backed vector index from the Markdown files and persist it.

    Every ``.md`` file under ``DATA_DIR`` (searched recursively) is loaded,
    indexed into a flat inner-product FAISS index, and the resulting index is
    written to ``PERSIST_DIR`` so that ``load_existing_index`` can reuse it.

    Returns:
        The newly built ``VectorStoreIndex``.
    """
    # Load documents from the directory, filtering for Markdown files
    documents = SimpleDirectoryReader(
        DATA_DIR,
        required_exts=[".md"],  # Only process Markdown files
        recursive=True          # Process subdirectories if needed
    ).load_data()

    print(f"Loaded {len(documents)} Markdown documents")

    # Size the FAISS index from the configured embedding model instead of a
    # hard-coded number, so the two cannot drift apart when the model changes
    # (the recorded run of this notebook used 384 dimensions).
    dimension = len(Settings.embed_model.get_text_embedding("dimension probe"))

    # NOTE(review): inner product equals cosine similarity only when the
    # embeddings are L2-normalised; confirm the embedding model normalises.
    faiss_index = faiss.IndexFlatIP(dimension)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Create the index from the loaded documents
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
    )

    # Persist the index to disk; without this the message below would be untrue
    # and every query would have to rebuild the index from scratch
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")
    return index

def load_existing_index():
    """Load a previously persisted FAISS-backed index from ``PERSIST_DIR``.

    Returns:
        The loaded index, or ``None`` when ``PERSIST_DIR`` is missing, is not a
        directory, or is empty.
    """
    # An empty or non-directory path cannot hold a persisted index
    if not os.path.isdir(PERSIST_DIR) or not os.listdir(PERSIST_DIR):
        print(f"No existing index found at {PERSIST_DIR}")
        return None

    # The FAISS vector store has to be restored explicitly and handed to the
    # storage context so the loaded index is backed by the persisted FAISS data
    vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
        persist_dir=PERSIST_DIR,
    )
    index = load_index_from_storage(storage_context)
    print(f"Loaded existing index from {PERSIST_DIR}")
    return index

def retrieve_relevant_context(query, top_k=5):
    """Fetch the chunks most similar to ``query`` together with their metadata.

    The index comes from ``load_existing_index()``; when that returns ``None``
    it is created with ``build_index()`` instead.

    Args:
        query: Text to search for.
        top_k: Value passed to the retriever as ``similarity_top_k``.

    Returns:
        A list of dicts with the keys ``text``, ``score``, ``source``,
        ``title`` and ``heading``, one per retrieved node.
    """
    index = load_existing_index()
    if index is None:
        index = build_index()

    # Build the retriever and run the query in one step
    hits = index.as_retriever(similarity_top_k=top_k).retrieve(query)

    print(f"Retrieved {len(hits)} relevant Markdown chunks for query: '{query}'")

    # NOTE(review): "title" and "heading" fall back to "" when absent, and the
    # recorded run printed neither field; confirm these are the metadata keys
    # the parser actually emits.
    return [
        {
            "text": hit.text,
            "score": getattr(hit, "score", None),
            "source": hit.metadata.get("file_name", "Unknown"),
            "title": hit.metadata.get("title", ""),
            "heading": hit.metadata.get("heading", ""),
        }
        for hit in hits
    ]

def display_retrieved_contexts(results):
    """Print every retrieved chunk with its source, optional metadata and a preview.

    Args:
        results: Iterable of dicts with the keys ``source``, ``title``,
            ``heading``, ``score`` and ``text``.

    Title and section lines are printed only when non-empty, the score only
    when it is not ``None``, and the text is cut to 300 characters followed by
    ``...`` when it is longer than that.
    """
    rule = "=" * 30
    preview_limit = 300

    for position, hit in enumerate(results, start=1):
        print(f"\n===== Context {position} =====")
        print(f"Source: {hit['source']}")

        title = hit['title']
        if title:
            print(f"Title: {title}")

        heading = hit['heading']
        if heading:
            print(f"Section: {heading}")

        score = hit['score']
        if score is not None:
            print(f"Relevance Score: {score:.4f}")

        print("\nExcerpt:")
        excerpt = hit['text']
        if len(excerpt) > preview_limit:
            excerpt = excerpt[:preview_limit] + "..."
        print(excerpt)
        print(rule)


In [16]:
# Example question; only the three best-matching chunks are requested
query = "I am trying to optimise potency and pk of a part of my molecule that has methoxybenzene."
results = retrieve_relevant_context(query=query, top_k=3)

# Show each hit with its source, score and a short excerpt
display_retrieved_contexts(results=results)

No existing index found at ./storage
Loaded 2 Markdown documents
Index built and saved to ./storage
Retrieved 3 relevant Markdown chunks for query: 'I am trying to optimise potency and pk of a part of my molecule that has methoxybenzene.'

===== Context 1 =====
Source: 0095-0107.md
Relevance Score: 0.6720

Excerpt:
## ■ DISCUSSION

Malaria drug development comes with substantial challenges, as seen by the very few new antimalarial drugs that have been approved in the last decades. Currently, the core dependency on artemisinin-based combination therapies is a major concern, as they are encountering declining ef...

===== Context 2 =====
Source: 0095-0107.md
Relevance Score: 0.6684

Excerpt:
## ■ RESULTS

Synthesis of 2 (TKK130). The synthesis of 2 (TKK130) followed the procedure recently reported by Knaab et al. 8 To accomplish the bioisosteric optimization, the essential amidine moiety in 2 (TKK130) was introduced in the last reaction step by reacting the 3-hydroxypropanenitrile 3 wit.