# RAG system with embedding-model retrieval and Verbatim answering

## Imports

In [36]:
import os
import json
import logging
import sys
import gzip
import os
import tarfile
import xml.etree.ElementTree as ET
import re
import random

from llama_index.core import (
    VectorStoreIndex,
    Settings,
    Document,
    StorageContext,
    load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_classic.chains import create_retrieval_chain
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document as LangChainDocument
from langchain_core.prompts import PromptTemplate
from typing import List, Any
import gradio as gr

from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain

from langchain_openai import ChatOpenAI
from typing import Any, Dict, List, Tuple

from pathlib import Path

from verbatim_rag import VerbatimRAG, VerbatimIndex
from verbatim_rag.schema import DocumentSchema
from verbatim_rag.ingestion import DocumentProcessor
from verbatim_rag.chunker_providers import SimpleChunkerProvider
from verbatim_rag.embedding_providers import SentenceTransformersProvider
from verbatim_rag.vector_stores import LocalMilvusStore
from verbatim_rag.providers import RAGProvider
from verbatim_rag import verbatim_query

from rich.console import Console

## Configs

In [3]:
# --- 1. CONFIGURATION & IMPORTS ---
# Route log records of level INFO and above to stdout so they show up in the
# notebook cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): basicConfig above may already have attached a stdout handler to
# the root logger; this adds another one, which could print each record twice.
# Confirm the second handler is intended.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Quietly (-q) install langchain and langchain-community, upgrading them (-U) if already present
!pip install -qU langchain langchain-community

## Paths and APIs settings

In [38]:
# --- Run parameters ---

# Tar archive with the corpus, resolved against the current working directory.
TAR_PATH = r"Schweiz.tar"

# API key (left empty in this notebook).
api_key = ""

# Directory in which the vector index is persisted between runs.
PERSIST_DIR = "./storage_llm_100"

# Parent of the current working directory; the data files live below it.
ROOT_DIR = Path.cwd().parent

# Question set read by the pipeline and the file its answers are written to.
input_questions_path = ROOT_DIR.joinpath("data", "questions_20.json")
output_answers_path = ROOT_DIR.joinpath("data", "verbatim_20_100.json")

# Maximum number of documents taken from the archive.
DOC_LIMIT = 100

# Namespace-qualified tag used to locate the Sofa element inside each XMI file.
SOFA_NAMESPACE = "{http:///uima/cas.ecore}Sofa"

## Text extraction and model downloading

In [39]:
# In-memory store for Q&A interactions and the JSON file it is dumped to.
CHAT_LOG = []
LOG_FILE = "chat_history_llm.json"


def save_log_to_disk():
    """Dump the whole CHAT_LOG list to LOG_FILE as indented, non-ASCII-escaped JSON."""
    with open(LOG_FILE, "w", encoding="utf-8") as log_handle:
        log_handle.write(json.dumps(CHAT_LOG, ensure_ascii=False, indent=2))
    print(f"Saved {len(CHAT_LOG)} interactions to {LOG_FILE}")

# Register the embedding model and the chunking strategy on the global
# LlamaIndex Settings object.
print("Loading Embedding Model")
Settings.embed_model = HuggingFaceEmbedding(
    model_name="intfloat/multilingual-e5-large",
)
Settings.node_parser = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=100,
)


# Load Data
def _read_sofa_text(tar, member, sofa_tag):
    """Return the ``sofaString`` attribute of one archive member, or None if it has none."""
    handle = tar.extractfile(member)
    if handle is None:
        # extractfile() yields None for members without file content.
        return None
    with handle:
        payload = handle.read()

    if member.name.lower().endswith(".gz"):
        payload = gzip.decompress(payload)

    sofa = ET.fromstring(payload).find(f".//{sofa_tag}")
    if sofa is None:
        return None
    return sofa.get("sofaString")


def extract_raw_text(tar_path, doc_limit, sofa_tag=None):
    """
    Collect the raw text of the XMI files stored in a tar archive.

    Members whose lower-cased name ends in ``.xmi`` or ``.xmi.gz`` are read
    (gzip members are decompressed first), parsed with ElementTree, and the
    ``sofaString`` attribute of the first matching Sofa element is taken as the
    document text. Line breaks are replaced by spaces and the text is stripped.

    A member that cannot be decompressed or parsed is reported and skipped, so
    a single damaged file no longer discards the documents already collected.
    Members whose cleaned text is empty are skipped as well.

    Args:
        tar_path: Path of the tar archive (any compression ``tarfile`` detects).
        doc_limit: Stop once this many documents have been collected.
        sofa_tag: Namespace-qualified tag of the Sofa element. Defaults to the
            module-level ``SOFA_NAMESPACE``.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts. The id is the member's
        base name with every ``.xmi`` removed (a ``.gz`` suffix is kept).
        An empty list is returned when the archive itself cannot be read.
    """
    import zlib  # local import: only needed to recognise corrupt gzip payloads

    if sofa_tag is None:
        sofa_tag = SOFA_NAMESPACE

    docs = []

    try:
        with tarfile.open(tar_path, "r:*") as tar:
            # Iterate lazily instead of tar.getmembers() so that reaching
            # doc_limit does not require scanning the whole archive first.
            for member in tar:
                if not member.name.lower().endswith((".xmi", ".xmi.gz")):
                    continue

                try:
                    text = _read_sofa_text(tar, member, sofa_tag)
                except (ET.ParseError, OSError, EOFError, zlib.error) as e:
                    # Best effort: drop only the damaged member.
                    print(f"Skipping {member.name}: {e}")
                    continue

                if text:
                    # clean up unwanted characters before preprocessing
                    text = text.replace('\r\n', ' ').replace('\n', ' ').strip()
                if text:
                    docs.append({
                        "id": os.path.basename(member.name).replace(".xmi", ""),
                        "text": text
                    })

                # Stop after reaching the defined limit
                if len(docs) >= doc_limit:
                    print(f"Reached document limit of {doc_limit}")
                    break

    except tarfile.TarError as e:
        print(f"Error reading tar file: {e}")
        return []
    except Exception as e:
        # Archive-level failure (e.g. missing file): keep the original
        # never-raise contract so callers can simply test for an empty result.
        print(f"An unexpected error occurred during extraction: {e}")
        return []

    print(f"Successfully extracted raw text from {len(docs)} documents.")
    return docs

# Pull the raw texts out of the archive; abort if nothing could be read.
raw_data = extract_raw_text(TAR_PATH, DOC_LIMIT)
if not raw_data:
    print("ERROR: No documents were found! Check your TAR_PATH and XML Namespace.")
    sys.exit(1)


# Wrap every extracted entry in a Document; the id is stored both as the
# document id and under the "id_" metadata key.
print("Converting dictionaries to Documents")
documents = [
    Document(
        text=entry["text"],
        id_=entry["id"],
        metadata={"id_": entry["id"]},
    )
    for entry in raw_data
]


# Show which document ids were loaded.
document_ids = [document.id_ for document in documents]

print(document_ids)



2026-01-15 21:14:16,042 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large


Loading Embedding Model
Reached document limit of 100
Successfully extracted raw text from 100 documents.
Converting dictionaries to Documents
['20150914.gz', '20060918.gz', '20111220.gz', '20200302.gz', '20050308.gz', '20201217.gz', '20080918.gz', '20050317.gz', '20050927.gz', '20090320.gz', '20140305.gz', '20170502.gz', '20030605.gz', '20030925.gz', '20080312.gz', '20210923.gz', '20190320.gz', '20050920.gz', '20161208.gz', '20161216.gz', '20180308.gz', '20101208.gz', '20000324.gz', '20020311.gz', '20140926.gz', '20210609.gz', '20170302.gz', '20080305.gz', '20030304.gz', '20010620.gz', '20060620.gz', '20210602.gz', '20210316.gz', '20091126.gz', '20040507.gz', '20070612.gz', '20150611.gz', '20010614.gz', '20200618.gz', '20080529.gz', '20021128.gz', '20120615.gz', '20120227.gz', '20140312.gz', '20180911.gz', '20210616.gz', '20181127.gz', '20150603.gz', '20110926.gz', '20100928.gz', '20040923.gz', '20020919.gz', '20121211.gz', '20011002.gz', '20011204.gz', '20211129.gz', '20081203.gz', '

## Indexing

In [40]:
# Indexing: build and persist the index on the first run, reuse the persisted
# copy whenever PERSIST_DIR already exists.
if not os.path.exists(PERSIST_DIR):
    print("Creating new index")
    index = VectorStoreIndex.from_documents(documents, show_progress=True)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    print("Loading index from storage")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)


# Adapter Class
class LlamaIndexToLangChainRetriever(BaseRetriever):
    """LangChain retriever that forwards every query to a wrapped LlamaIndex retriever."""

    # Wrapped retriever; only its ``retrieve(query)`` method is called here.
    llama_retriever: Any

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[LangChainDocument]:
        """Retrieve nodes for ``query`` and convert each one to a LangChain document.

        The node content becomes ``page_content`` and the node metadata is
        passed through unchanged. ``run_manager`` is accepted but not used.
        """
        return [
            LangChainDocument(page_content=hit.get_content(), metadata=hit.metadata)
            for hit in self.llama_retriever.retrieve(query)
        ]

Loading index from storage


2026-01-15 21:15:55,997 - INFO - Loading all indices.


In [41]:
# LlamaIndex retriever returning the 5 most similar chunks per query ...
raw_retriever = index.as_retriever(
    similarity_top_k=5,
)
# ... wrapped so that it can be used through the LangChain retriever interface.
llama_retriever = LlamaIndexToLangChainRetriever(
    llama_retriever=raw_retriever,
)

## LangChain - Verbatim adapter

The following code was adapted from https://huggingface.co/blog/adaamko/verbatimrag

In [43]:
class LangChainRAGProvider(RAGProvider):
    """RAG provider that answers retrieval requests through a LangChain retriever."""

    def __init__(self, langchain_retriever):
        """Store the retriever whose ``invoke`` method performs the lookup."""
        self.retriever = langchain_retriever

    def retrieve(
        self, question: str, k: int = 5, filter = None
    ) -> List[Dict[str, Any]]:
        """Return at most ``k`` retrieved documents as Verbatim-format dicts.

        ``k`` only truncates what the retriever returns, and ``filter`` is
        accepted but not used. Each dict carries the document content, its
        "title" and "source" metadata entries (empty string when missing) and
        the full metadata mapping.
        """
        # Use LangChain's retrieval
        hits = self.retriever.invoke(question)

        # Convert to Verbatim format
        return [
            {
                "content": hit.page_content,
                "title": hit.metadata.get("title", ""),
                "source": hit.metadata.get("source", ""),
                "metadata": hit.metadata,
            }
            for hit in hits[:k]
        ]


# Smoke test: run a single question through Verbatim on top of the existing
# LangChain retriever and print the answer.
provider = LangChainRAGProvider(llama_retriever)
response = verbatim_query(
    provider,
    "Was war die Antwort von Andreas Babler auf die Frage, ob der Mars bald besiedelt wird?",
    k=5,
)

print(response.answer)

Extracting spans (batch mode)...


2026-01-15 21:16:10,748 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


## Response

Based on the available documents:

No relevant information found in the provided documents.

---
*These excerpts are taken verbatim from the source documents to ensure accuracy.*


## Questions pipeline

In [45]:
def load_questions(input_path: str) -> List[Dict[str, Any]]:
    """Load the question list from a JSON file and validate its structure.

    Args:
        input_path: Location of the JSON file (path-like objects work too).

    Returns:
        The parsed list; every element is a dict with a string "question".

    Raises:
        FileNotFoundError: If ``input_path`` is not an existing file.
        ValueError: If the top-level JSON value is not a list, or if an
            element is not a dict with a string under "question".
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError("Input JSON must be a list.")

    for i, item in enumerate(data):
        # Check the container type first: a bare `"question" in item` raises
        # TypeError for numbers and does substring matching on strings.
        if not isinstance(item, dict) or not isinstance(item.get("question"), str):
            raise ValueError(f"Invalid question at index {i}")

    return data

In [46]:
def save_output(output_path: str, data: List[Dict[str, Any]]) -> None:
    """Write ``data`` to ``output_path`` as indented UTF-8 JSON.

    Missing parent directories are created. A path without a directory
    component is written to the current working directory.

    Args:
        output_path: Destination file (path-like objects work too).
        data: JSON-serialisable list of result records.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [47]:
def ask_verbatim_question(
    provider,
    question_text: str,
    k: int = 5,
    snippet_len: int = 300
) -> Tuple[str, List[Any], List[Dict[str, Any]]]:
    """Ask one question through Verbatim and collect the answer with its sources.

    Args:
        provider: RAG provider handed to ``verbatim_query``.
        question_text: The question to ask.
        k: Number of documents requested from the provider.
        snippet_len: Maximum number of characters kept per context snippet.

    Returns:
        A tuple ``(answer_text, source_ids, context_snippets)``:
        the stripped answer (empty string when the response has none), the
        de-duplicated "id_" metadata values in first-seen order, and one
        ``{"source_id", "snippet"}`` dict per response document that has an id.
    """
    response = verbatim_query(provider, question_text, k=k)

    # Answer
    answer_text = getattr(response, "answer", "")
    if answer_text is None:
        # Avoid rendering a missing answer as the literal string "None".
        answer_text = ""
    elif not isinstance(answer_text, str):
        answer_text = str(answer_text)
    answer_text = answer_text.strip()

    # Sources
    source_ids = []
    context_snippets = []

    for doc in getattr(response, "documents", []) or []:
        # Documents without a metadata dict simply have no source id; this is
        # checked explicitly instead of swallowing every exception.
        metadata = getattr(doc, "metadata", None)
        source_id = metadata.get("id_") if isinstance(metadata, dict) else None
        if not source_id:
            continue

        # NOTE(review): the Verbatim-format dicts built in this file use the key
        # "content", so fall back to a `content` attribute when `page_content`
        # is absent or empty; confirm against the verbatim_rag response model.
        text = getattr(doc, "page_content", "") or getattr(doc, "content", "") or ""

        source_ids.append(source_id)
        context_snippets.append({
            "source_id": source_id,
            "snippet": text[:snippet_len].replace("\n", " ").strip()
        })

    # Drop duplicate ids while keeping first-seen order.
    source_ids = list(dict.fromkeys(source_ids))

    return answer_text, source_ids, context_snippets

In [48]:
def process_questions_file_verbatim(
    provider,
    input_path: str,
    output_path: str,
    k: int = 5
):
    """Answer every question of a JSON file and write the enriched records out.

    Each loaded item gets the keys "answer", "answer_source_id" and
    "answer_context_snippets". When answering a question raises, the answer
    becomes ``"Error: <message>"`` with empty sources and processing continues.
    All items are written to ``output_path`` once the loop has finished.
    """
    items = load_questions(input_path)
    total = len(items)

    for position, item in enumerate(items, start=1):
        try:
            answer, source_ids, snippets = ask_verbatim_question(
                provider=provider,
                question_text=item["question"].strip(),
                k=k,
            )
        except Exception as exc:
            # Best effort per question: record the failure and keep going.
            answer, source_ids, snippets = f"Error: {exc!s}", [], []

        item["answer"] = answer
        item["answer_source_id"] = source_ids
        item["answer_context_snippets"] = snippets

        print(f"[{position}/{total}] {item.get('id', 'NA')} done")

    save_output(output_path, items)
    print(f"Saved results to {output_path}")

In [49]:
# Run the full question set through Verbatim and store the answers.
process_questions_file_verbatim(
    provider,
    input_questions_path,
    output_answers_path,
    k=5,
)

Extracting spans (batch mode)...


2026-01-15 21:16:45,758 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:16:47,776 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[1/20] 1 done
Extracting spans (batch mode)...


2026-01-15 21:16:49,893 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[2/20] 2 done
Extracting spans (batch mode)...


2026-01-15 21:16:56,804 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:16:58,461 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[3/20] 3 done
Extracting spans (batch mode)...


2026-01-15 21:17:02,763 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:17:04,436 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[4/20] 4 done
Extracting spans (batch mode)...


2026-01-15 21:17:10,229 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:17:11,142 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[5/20] 5 done
Extracting spans (batch mode)...


2026-01-15 21:17:14,690 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:17:15,802 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[6/20] 6 done
Extracting spans (batch mode)...


2026-01-15 21:17:21,144 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:17:22,588 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[7/20] 7 done
Extracting spans (batch mode)...


2026-01-15 21:17:27,349 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:17:28,562 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[8/20] 8 done
Extracting spans (batch mode)...


2026-01-15 21:17:31,584 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:17:32,701 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[9/20] 9 done
Extracting spans (batch mode)...


2026-01-15 21:17:37,375 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:17:38,610 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[10/20] 10 done
Extracting spans (batch mode)...


2026-01-15 21:17:44,671 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:17:46,024 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[11/20] 11 done
Extracting spans (batch mode)...


2026-01-15 21:17:52,928 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:17:54,948 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[12/20] 12 done
Extracting spans (batch mode)...


2026-01-15 21:17:58,352 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:17:59,594 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[13/20] 13 done
Extracting spans (batch mode)...


2026-01-15 21:18:08,435 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:18:11,252 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[14/20] 14 done
Extracting spans (batch mode)...


2026-01-15 21:18:15,228 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:18:16,148 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[15/20] 15 done
Extracting spans (batch mode)...


2026-01-15 21:18:27,418 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:18:30,485 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[16/20] 16 done
Extracting spans (batch mode)...


2026-01-15 21:18:35,131 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:18:36,112 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[17/20] 17 done
Extracting spans (batch mode)...


2026-01-15 21:18:45,483 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




2026-01-15 21:18:46,687 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[18/20] 18 done
Extracting spans (batch mode)...


2026-01-15 21:18:51,331 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-15 21:18:53,109 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[19/20] 19 done
Extracting spans (batch mode)...


2026-01-15 21:18:55,633 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[20/20] 20 done
Saved results to D:\Study\NLP\tuw-ds-ws2025-nlp-g25-t13-main\data\verbatim_20_100.json
