In [1]:
import os
import json
import logging
import sys
import gzip
import os
import tarfile
import xml.etree.ElementTree as ET
import re
import random

from llama_index.core import (
    VectorStoreIndex,
    Settings,
    Document,
    StorageContext,
    load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_classic.chains import create_retrieval_chain
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document as LangChainDocument
from langchain_core.prompts import PromptTemplate
from typing import List, Any
import gradio as gr

from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain

from langchain_openai import ChatOpenAI
from typing import Any, Dict, List, Tuple

In [2]:
from verbatim_rag import VerbatimRAG, VerbatimIndex
from verbatim_rag.schema import DocumentSchema
from verbatim_rag.ingestion import DocumentProcessor
from verbatim_rag.chunker_providers import SimpleChunkerProvider
from verbatim_rag.embedding_providers import SentenceTransformersProvider
from verbatim_rag.vector_stores import LocalMilvusStore


from rich.console import Console
from pathlib import Path

In [3]:
# Smoke test: wrap one protocol transcript in a DocumentSchema and print its
# generated id, its title and the length of its content.

txt_path = Path("./data/Schweiz/txt/20111220.txt")
text = txt_path.read_text(encoding="utf-8")

doc = DocumentSchema.model_validate(
    dict(
        title="Staatspolitische Kommission – Abstimmung Volksinitiativen",
        doc_type="parliament_protocol",
        content=text,
        content_type="txt",
        source=str(txt_path),
        metadata=dict(source_id="20111220", language="de", country="CH"),
    )
)

print(doc.id, doc.title, len(doc.content))

c3b742a0-8d2b-4e72-9aae-4a29365f0c85 Staatspolitische Kommission – Abstimmung Volksinitiativen 144032


In [4]:
def load_txt_documents(txt_dir: str, language: str = "de", country: str = "CH") -> list:
    """Load every ``*.txt`` file in a directory as a ``DocumentSchema``.

    Files are read as UTF-8 and processed in sorted path order, so the
    position of each document in the returned list is the same on every
    machine and on every re-run.

    Args:
        txt_dir: Directory that is searched (non-recursively) for ``*.txt`` files.
        language: Value stored under ``metadata["language"]`` for every document.
        country: Value stored under ``metadata["country"]`` for every document.

    Returns:
        A list with one validated document per file. The file stem is used
        both as the title and as ``metadata["source_id"]``. The list is empty
        when no file matches.
    """
    docs = []

    # Path.glob yields entries in arbitrary filesystem order; sorting keeps
    # docs[0], docs[1], ... stable for the cells that index into the result.
    for path in sorted(Path(txt_dir).glob("*.txt")):
        text = path.read_text(encoding="utf-8")

        doc = DocumentSchema.model_validate({
            "title": path.stem,
            "doc_type": "parliament_protocol",
            "content": text,
            "content_type": "txt",
            "source": str(path),
            "metadata": {
                "source_id": path.stem,
                "language": language,
                "country": country
            }
        })

        docs.append(doc)

    return docs

In [5]:
# Load the whole corpus directory, then report how many documents were found
# and the title and character count of the first one.
docs = load_txt_documents("./data/Schweiz/txt")
print(len(docs))
first_doc = docs[0]
print(first_doc.title, len(first_doc.content))

10
20050317 227883


In [6]:
console = Console()

# Preview the first 1000 characters of up to three documents.
# The transcripts are raw text, so Rich markup and emoji-code processing are
# switched off: otherwise bracketed passages such as "[...]" could be
# swallowed as style tags (or raise MarkupError) and ":name:" sequences could
# be replaced, which would misrepresent the source text.
for preview_doc in docs[:3]:
    console.print(preview_doc.content[:1000], markup=False, emoji=False)

In [7]:
# Notebook-level chunker instance: read by chunk_all_documents below and
# passed to VerbatimIndex as its chunker_provider further down.
chunker = SimpleChunkerProvider()

def chunk_all_documents(docs, chunker_provider=None):
    """Chunk every document and flatten the result into one list of records.

    Args:
        docs: Iterable of documents exposing ``id``, ``title``, ``source``
            and ``content`` attributes.
        chunker_provider: Object whose ``chunk(text)`` method returns an
            iterable of 2-tuples. When omitted, the notebook-level ``chunker``
            is used, so existing ``chunk_all_documents(docs)`` calls behave
            as before.

    Returns:
        A list of dicts, one per chunk, in document order and then chunk
        order. Each dict has the keys ``doc_id``, ``title``, ``source``,
        ``chunk_index`` (0-based position within its document), ``text``
        (first tuple element) and ``chunk_tag`` (second tuple element).

    NOTE(review): in the recorded notebook output the value printed for
    ``chunk_tag`` is a long text passage rather than a short label; confirm
    what the chunker's second tuple element actually represents.
    """
    # Resolve the fallback lazily so the global is only needed when no
    # provider is passed in explicitly.
    provider = chunker if chunker_provider is None else chunker_provider

    return [
        {
            "doc_id": doc.id,
            "title": doc.title,
            "source": doc.source,
            "chunk_index": i,
            "text": chunk_text,
            "chunk_tag": chunk_tag,
        }
        for doc in docs
        for i, (chunk_text, chunk_tag) in enumerate(provider.chunk(doc.content))
    ]

#chunks = chunker.chunk(paper.content)

#console.print(f"Chunk 1: {chunks[1][0]}")
#console.print(f"Chunk 2: {chunks[2][0]}")

In [8]:
# Chunk the whole corpus, report the total, then show the title, the tag and
# the first 20 characters of the text of the very first chunk.
all_chunks = chunk_all_documents(docs)
print("Total chunks:", len(all_chunks))

first_chunk = all_chunks[0]
print(first_chunk["title"])
print(first_chunk["chunk_tag"])
print(first_chunk["text"][:20])

Total chunks: 2568
20050317
Gesamtabstimmung - Vote sur l'ensemble  Für Annahme des Entwurfes .... 93 Stimmen Dagegen .... 61 Stimmen   Abschreibung - Classement  Antrag des Bundesrates Abschreiben der parlamentarischen Vorstösse gemäss Brief an die eidgenössischen Räte Proposition du Conseil fédéral Classer les interventions parlementaires selon lettre aux Chambres fédérales  Angenommen - Adopté
Frau Teuscher, ich hätte eigentlich zwei Fragen an Sie. Sie haben ja auch zwei Fragen miteinander behandelt, die Offroadfahrzeuge und die Pl
Gesamtabstimmung - V


In [9]:
# Re-chunk the first document on its own. The result is not read in this
# cell (the previews below come from all_chunks); the assignment is kept in
# case later cells rely on `chunks`.
chunks = chunker.chunk(docs[0].content)

# Show the first two chunk records. Chunk text is raw transcript content, so
# Rich markup and emoji-code processing are disabled to print it unaltered
# and to avoid MarkupError on stray "[/...]" sequences.
for chunk_number, chunk_record in enumerate(all_chunks[:2], start=1):
    console.print(f"Chunk {chunk_number}: {chunk_record}", markup=False, emoji=False)

In [10]:
from verbatim_rag.embedding_providers import SentenceTransformersProvider
from verbatim_rag.vector_stores import LocalMilvusStore

# Dense embedding model, configured to run on the CPU.
dense_provider = SentenceTransformersProvider(
    model_name="ibm-granite/granite-embedding-107m-multilingual",
    device="cpu",
)

# Local Milvus collection stored at db_path, dense vectors only.
# NOTE(review): the recorded run of this cell ended with
# ConnectionConfigException ("milvus-lite is required for local database
# connections"); install it first with `pip install pymilvus[milvus_lite]`.
# NOTE(review): nlist=16384 is large next to the 2568 chunks reported
# earlier in this notebook; confirm this is the intended index setting.
embedding_dim = dense_provider.get_dimension()
vector_store = LocalMilvusStore(
    db_path="./rag_verbatim.db",
    collection_name="rag_verbatim",
    dense_dim=embedding_dim,
    enable_dense=True,
    enable_sparse=False,
    nlist=16384,
)

# The index uses the same chunker instance as the manual chunking cells above.
index = VerbatimIndex(
    vector_store=vector_store,
    dense_provider=dense_provider,
    chunker_provider=chunker,
)

# NOTE(review): the store is backed by a file on disk, so re-running this
# cell presumably adds the same documents again; verify before re-executing.
index.add_documents(docs)

2025-12-19 00:27:59,334 - INFO - Load pretrained SentenceTransformer: ibm-granite/granite-embedding-107m-multilingual
2025-12-19 00:28:03,696 - INFO - Loaded SentenceTransformers model: ibm-granite/granite-embedding-107m-multilingual


ConnectionConfigException: <ConnectionConfigException: (code=1, message=milvus-lite is required for local database connections. Please install it with: pip install pymilvus[milvus_lite])>