In [None]:
from llama_index.core import Document
import json

# Recursively flatten nested metadata dictionaries/lists into key: value lines
def flatten_metadata(obj, prefix=""):
    """Flatten nested metadata into a list of ``path: value`` strings.

    Dictionary keys and ``[index]`` list positions are chained into a path in
    which every segment is followed by a dot. Any value that is neither a dict
    nor a list ends the recursion and is rendered as ``path: value`` after the
    last character of the accumulated prefix (the trailing dot) is removed.

    Example: ``{"a": {"b": [1]}}`` yields ``["a.b.[0]: 1"]``.

    Args:
        obj: A dict, a list, or a leaf value.
        prefix: Path accumulated so far, ending in a dot; top-level callers
            leave it at the default.

    Returns:
        One string per leaf value, in traversal order. Empty dicts and empty
        lists contribute no lines.
    """
    if isinstance(obj, dict):
        children = (
            ((f"{prefix}{name}" if prefix else name) + ".", value)
            for name, value in obj.items()
        )
    elif isinstance(obj, list):
        children = (
            (f"{prefix}[{position}].", element)
            for position, element in enumerate(obj)
        )
    else:
        # Leaf value: drop the trailing dot left by the enclosing container.
        return [f"{prefix[:-1]}: {obj}"]

    flat = []
    for child_prefix, child in children:
        flat += flatten_metadata(child, prefix=child_prefix)
    return flat

# Convert a list of metadata entries into a list of  Document objects
def convert_metadata_to_documents(metadata_list):
    """Build one ``Document`` per metadata entry.

    For each entry the full metadata is flattened to plain text (one
    ``path: value`` line per leaf) and stored as the document text. The
    workflow name and the names of its producers are kept as document
    metadata.

    Args:
        metadata_list: Iterable of dict entries (parsed JSON-LD records).

    Returns:
        A list of ``Document`` objects in the same order as the input.
        Each document's metadata has two keys:
          * ``"team_names"``: producer names joined with ``", "`` (empty
            string when there are none);
          * ``"name"``: the entry's ``name`` or ``"unknown"``.
    """
    documents = []
    for entry in metadata_list:
        # Flatten the entire metadata entry to plain text.
        text = "\n".join(flatten_metadata(entry))

        # Use @id, or fall back to url / name as the document ID.
        doc_id = entry.get("@id", entry.get("url", entry.get("name", "unknown")))

        # JSON-LD allows a property to hold either a single object or an
        # array of objects. Normalise both (and an explicit null) to a list so
        # a lone producer dict is not iterated key-by-key and silently lost.
        producers = entry.get("producer")
        if producers is None:
            producers = []
        elif isinstance(producers, dict):
            producers = [producers]

        # Keep only producers that are objects with a non-empty name.
        team_names = [
            producer["name"]
            for producer in producers
            if isinstance(producer, dict) and producer.get("name")
        ]

        documents.append(Document(
            text=text,
            doc_id=doc_id,
            metadata={
                "team_names": ", ".join(team_names),   # Team names as one string
                "name": entry.get("name", "unknown"),  # Workflow name
            },
        ))
    return documents

# === Example usage ===
# Load the JSON-LD dump from the current working directory.
with open(
    "workflows-bioschemas-dump (1).jsonld",
    mode="r",
    encoding="utf-8",
) as dump_file:
    metadata = json.load(dump_file)

# Build one Document per entry of the loaded metadata.
documents = convert_metadata_to_documents(metadata)

# Quick inspection: show the metadata of the document at index 3 (the 4th one).
print(documents[3].metadata)

{'team_names': 'GalaxyProject SARS-CoV-2', 'name': 'Genomics - Assembly of the genome sequence'}


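This project includes two alternative implementations of the document retrieval system. Run only one of them: both options assign the same `retriever` variable, so whichever option you execute last is the one the later cells will use. Choose based on your use case and how much complexity you need: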

## Option 1: MultiVectorRetriever

In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.storage import InMemoryStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid

# === Step 1: Create parent documents from original workflow metadata ===
# One parent per workflow: the full flattened text plus a freshly generated
# UUID ("doc_id"), the workflow name and the contributing team names.
parent_docs = [
    Document(
        page_content=workflow.text,
        metadata={
            "doc_id": str(uuid.uuid4()),
            "name": workflow.metadata["name"],
            "team": workflow.metadata["team_names"],
        },
    )
    for workflow in documents
]

# === Step 2: Split parent documents into smaller chunks (child documents) ===
# The children are what gets embedded; each one only carries the parent's
# doc_id so the full parent can be looked up after the similarity search.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=0,
)

child_docs = [
    Document(
        page_content=piece,
        metadata={"doc_id": parent.metadata["doc_id"]},
    )
    for parent in parent_docs
    for piece in text_splitter.split_text(parent.page_content)
]

# === Step 3: Build vectorstore (Chroma) and docstore (InMemory key-value store) ===
# Embedding model used to encode the child chunks.
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Index the child chunks in a Chroma collection stored under persist_directory.
# NOTE(review): parent ids are new UUIDs on every run while this directory is
# reused, so re-running this cell presumably leaves older chunks whose parents
# are absent from the new docstore - confirm, or clear the directory first.
vectorstore = Chroma.from_documents(
    child_docs,
    embedding=embedding,
    collection_name="workflow-rag",
    persist_directory="./chroma_store_multi_vecto_metadata_500",
)
vectorstore.persist()

# Parents live in an in-memory key-value store, keyed by their doc_id.
docstore = create_kv_docstore(InMemoryStore())
docstore.mset([(parent.metadata["doc_id"], parent) for parent in parent_docs])

# === Step 4: Create the MultiVectorRetriever ===
# Similarity search runs over the child chunks in the vectorstore; the
# "doc_id" found in each hit's metadata is then used to fetch the parent
# document from the docstore.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key="doc_id",
    search_kwargs={"k": 10},  # number of chunks requested from the vectorstore
)

## Option 2: Vectorstore retriever

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document as LCDocument
import json

# === Step 1: Convert LlamaIndex documents into LangChain Documents ===
# The flattened text becomes page_content; the LlamaIndex id, team names and
# workflow name are carried over as metadata.
docs_list = [
    LCDocument(
        page_content=llama_doc.text,
        metadata={
            "source": llama_doc.id_,
            "team": llama_doc.metadata.get("team_names", ""),
            "name": llama_doc.metadata.get("name", ""),
        },
    )
    for llama_doc in documents
]

# === Step 2: Split long documents into chunks ===
# Token-based splitting (tiktoken encoder): at most 1500 per chunk, no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1500,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(docs_list)

# === Step 3: Embed and store document chunks in Chroma vectorstore ===
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Index the chunks in the "rag-chroma" collection stored under ./chroma_store.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=embedding,
    persist_directory="./chroma_store",
)

# === Step 4: Create retriever from Chroma vectorstore ===
# Exposes the vectorstore through the retriever interface used by later cells.
retriever = vectorstore.as_retriever()

  embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from langchain_community.chat_models import ChatOllama
# Chat model served through Ollama, with temperature pinned to 0.
# NOTE(review): the rewritten question is later passed through
# clean_rewritten_question(), which strips <think>...</think> blocks - presumably
# because this model can emit its reasoning inline; re-check if the model changes.
llm = ChatOllama(model="deepseek-r1:latest", temperature=0)
# Alternative models (keep exactly one assignment active):
# llm = ChatOllama(model="gemma3:4b", temperature=0)
# llm = ChatOllama(model="llama3.1:8b", temperature=0)

In [15]:
# Conversation memory shared by the question-answering cells.
chat_history = []


def format_history(chat_history):
    """Render chat history as ``Q:`` / ``A:`` text for use in a prompt.

    Two entry shapes are accepted, and may be mixed:

    * ``{"question": ..., "answer": ...}`` - one complete turn, rendered as
      ``"Q: <question>\\nA: <answer>"``;
    * ``{"role": ..., "content": ...}`` - a single message, which is the shape
      ``ask_question_with_rewriting`` appends. Role ``"ai"`` (or
      ``"assistant"``) is rendered as ``A:``, any other role as ``Q:``. An
      answer directly following a question message is attached to it so the
      pair forms one turn.

    Args:
        chat_history: List of dict entries in either shape.

    Returns:
        The turns joined by blank lines; ``""`` for an empty history.

    Raises:
        KeyError: If an entry matches neither shape.
    """
    turns = []
    question_open = False  # True right after a question-only message
    for entry in chat_history:
        if "question" in entry or "role" not in entry:
            # Complete turn (or malformed entry, which raises KeyError here).
            turns.append(f"Q: {entry['question']}\nA: {entry['answer']}")
            question_open = False
            continue

        is_answer = entry["role"] in ("ai", "assistant")
        line = f"{'A' if is_answer else 'Q'}: {entry['content']}"
        if is_answer and question_open:
            turns[-1] += "\n" + line
        else:
            turns.append(line)
        question_open = not is_answer
    return "\n\n".join(turns)

In [None]:
from langchain.chains import create_history_aware_retriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_models import ChatOllama

# === Prompt template for rewriting follow-up questions ===
# Instructs the LLM to turn the user's latest question into a fully self-contained one,
# using earlier conversation messages for context, and to return only the rewritten
# question with no explanation. The template takes two inputs: "chat_history" and "input".
rewriting_prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "Your task is to rewrite the user's latest question as a self-contained question. "
     "Use the chat history to understand references, but only return the rewritten question itself. "
     "**Do NOT explain, do NOT include 'Let's think' or 'Thoughts' or any additional commentary. "
     "Only output the final rewritten question as a plain sentence.**"),
    
    # Slot where the earlier conversation messages are injected
    # (filled from the "chat_history" input).
    MessagesPlaceholder(variable_name="chat_history"),
    
    # The latest user message (the question to be rewritten), from the "input" value.
    ("human", "{input}")
])

# === Question rewriting chain ===
# Pipes the rewriting prompt into the LLM and parses the model output
# into a plain string.
question_rewrite_chain = rewriting_prompt | llm | StrOutputParser()

In [None]:
# === format_history ===
# Formats past chat history into a readable string to be used in prompts for LLMs.
def format_history(chat_history):
    """Render chat history as ``Q:`` / ``A:`` text for use in a prompt.

    Two entry shapes are accepted, and may be mixed:

    * ``{"question": ..., "answer": ...}`` - one complete turn, rendered as
      ``"Q: <question>\\nA: <answer>"``;
    * ``{"role": ..., "content": ...}`` - a single message, which is the shape
      ``ask_question_with_rewriting`` appends. Role ``"ai"`` (or
      ``"assistant"``) is rendered as ``A:``, any other role as ``Q:``. An
      answer directly following a question message is attached to it so the
      pair forms one turn.

    Args:
        chat_history: List of dict entries in either shape.

    Returns:
        The turns joined by blank lines; ``""`` for an empty history.

    Raises:
        KeyError: If an entry matches neither shape.
    """
    turns = []
    question_open = False  # True right after a question-only message
    for entry in chat_history:
        if "question" in entry or "role" not in entry:
            # Complete turn (or malformed entry, which raises KeyError here).
            turns.append(f"Q: {entry['question']}\nA: {entry['answer']}")
            question_open = False
            continue

        is_answer = entry["role"] in ("ai", "assistant")
        line = f"{'A' if is_answer else 'Q'}: {entry['content']}"
        if is_answer and question_open:
            turns[-1] += "\n" + line
        else:
            turns.append(line)
        question_open = not is_answer
    return "\n\n".join(turns)
# === format_docs ===
# Combines retrieved documents into one context string for the LLM prompt.
def format_docs(docs):
    """Join documents into a single string of metadata/content sections.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.

    Returns:
        One ``"[Metadata]: ...\\n[Content]: ..."`` section per document,
        separated by blank lines; ``""`` when ``docs`` is empty.
    """
    sections = []
    for document in docs:
        sections.append(
            f"[Metadata]: {document.metadata}\n[Content]: {document.page_content}"
        )
    return "\n\n".join(sections)

In [None]:
import re

def clean_rewritten_question(text):
    """
    Clean the rewritten question returned by an LLM by removing any
    unneeded <think>...</think> reasoning.

    This is useful when the model fails to follow instructions and outputs
    internal reasoning wrapped in <think>...</think> instead of returning
    only the plain question.

    Steps:
    1. Remove every complete <think>...</think> block (tags included,
       across line breaks).
    2. If a closing </think> remains without its opening tag, keep only the
       text after the last one; what precedes it is leftover reasoning.
    3. If an opening <think> remains without a closing tag, keep only the
       text after the last one.

    Args:
        text (str): The raw text output from the model.

    Returns:
        str: The cleaned text with surrounding whitespace stripped.
    """
    # Remove complete <think>...</think> blocks if present.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    # Orphan closing tag: the opening tag never appeared in the output, so
    # the regex above could not match and the reasoning is still in front.
    if "</think>" in cleaned:
        cleaned = cleaned.rsplit("</think>", 1)[-1].strip()

    # Orphan opening tag: the model started a block but never closed it.
    if "<think>" in cleaned:
        cleaned = cleaned.rsplit("<think>", 1)[-1].strip()

    return cleaned

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

import os

# Step 1: Create a reranker using Cohere's API.
# It re-ranks the retrieved chunks and keeps the top N most relevant ones.
# "rerank-v3.5" can be replaced with another supported Cohere re-ranking model.
# The API key is read from the COHERE_API_KEY environment variable so that no
# secret has to be written into the notebook; when the variable is unset an
# empty string is passed, as before.
reranker = CohereRerank(
    top_n=3,              # Return the top 3 most relevant chunks
    model="rerank-v3.5",  # Cohere re-ranking model
    cohere_api_key=os.environ.get("COHERE_API_KEY", ""),
)

# Step 2: Wrap the base retriever with a compression layer.
# The ContextualCompressionRetriever first calls the base retriever, then
# passes its results through the reranker above.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker,  # Apply the Cohere reranker on top
    base_retriever=retriever,  # Whichever `retriever` was built above (Option 1 or Option 2)
)

  reranker = CohereRerank(


In [None]:
# === Main function to handle user question with context-aware rewriting and RAG answering ===
def ask_question_with_rewriting(question, chat_history, *, max_history_turns=3):
    """Answer ``question`` using question rewriting followed by RAG.

    The question is rewritten into a self-contained form using the most
    recent conversation turns, the rewritten question is used to retrieve
    documents through ``compression_retriever``, and the LLM answers from the
    retrieved context. Progress is printed along the way.

    Args:
        question: The user's latest (possibly context-dependent) question.
        chat_history: Mutable list of ``{"role": ..., "content": ...}``
            messages. It is extended in place with the new human/ai pair.
        max_history_turns: How many of the most recent question/answer turns
            are given to the rewriting step (default 3).

    Returns:
        The model's answer with surrounding whitespace stripped.
    """
    # Step 1: Rewrite the follow-up question into a self-contained one.
    # Every turn is stored as two entries (human + ai), so N turns are the
    # last 2 * N entries. The explicit guard is needed because a slice
    # starting at -0 would return the whole list instead of nothing.
    if max_history_turns > 0:
        recent_history = chat_history[-2 * max_history_turns:]
    else:
        recent_history = []

    rewritten_question = question_rewrite_chain.invoke({
        "chat_history": recent_history,
        "input": question
    })
    rewritten_question = clean_rewritten_question(rewritten_question)  # Remove unwanted tags like <think>
    print(f"📝 Rewritten question: {rewritten_question}")

    # Step 2: Retrieve relevant documents with the rewritten question.
    # `invoke` is the Runnable entry point of retrievers; it replaces the
    # deprecated `get_relevant_documents`.
    docs = compression_retriever.invoke(rewritten_question)
    print("🔍 Retrieved workflow names:", [doc.metadata.get("name", "") for doc in docs])
    print("🔍 Retrieved workflow:", docs)

    # Step 3: Build the RAG prompt that receives the retrieved context.
    prompt = ChatPromptTemplate.from_template(
        "You are an assistant for question-answering tasks, and an expert in WorkflowHub. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, say that you don't know. "
        "Keep the answer concise."
        "\n\n"
        "Context:\n"
        "{context}"
        "\n\n"
        "Question:\n"
        "{question}"
    )

    # Final RAG chain: prompt -> LLM -> plain-string output.
    rag_chain = prompt | llm | StrOutputParser()

    # Step 4: Generate the answer from the formatted context.
    response = rag_chain.invoke({
        "context": format_docs(docs),        # Retrieved docs as one readable string
        "question": rewritten_question,      # The rewritten, self-contained question
    })

    # Step 5: Record the original question and the raw answer for the next round.
    chat_history.append({"role": "human", "content": question})
    chat_history.append({"role": "ai", "content": response})

    print("💬 Answer:\n", response.strip())
    return response.strip()

In [11]:
# Start from an empty conversation so the next question has no prior context.
chat_history=[]

In [None]:
# Ask for workflows similar to a named workflow; the call prints its progress
# and appends the question/answer pair to chat_history.
response = ask_question_with_rewriting("Given the workflow Genomic variants - SNPs and INDELs detection using SAMTools which other workflows are similar?", chat_history)

In [21]:
# Reset the conversation again so the next question is not affected by the previous one.
chat_history=[]

In [22]:
# Same request as before with a slightly different wording of the question.
response = ask_question_with_rewriting("Given the workflow Genomic variants - SNPs and INDELs detection using SAMTools which other workflows are similar, give me the workflow?", chat_history)

📝 Rewritten question: What are other genomic variant detection workflows similar to SNP and INDELs identification using SAMTools?
🔍 Retrieved workflow names: ['Genomic variants - SNPs and INDELs detection using SAMTools.', 'Genomic variants - SNPs and INDELs detection using SAMTools.', 'Genomic variants - SNPs and INDELs detection using VARSCAN2.']
🔍 Retrieved workflow : [Document(metadata={'name': 'Genomic variants - SNPs and INDELs detection using SAMTools.', 'source': 'https://workflowhub.eu/workflows/34', 'team': 'CWL workflow SARS-CoV-2', 'relevance_score': 0.8808076}, page_content='@context: https://schema.org\n@type.[0]: SoftwareSourceCode\n@type.[1]: ComputationalWorkflow\ndct:conformsTo: https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/\n@id: https://workflowhub.eu/workflows/34\ndescription: \r\nAuthor: AMBARISH KUMAR er.ambarish@gmail.com; ambari73_sit@jnu.ac.in\r\n\r\nThis is a proposed standard operating procedure for genomic variant detection using SAMTool