In [None]:
import gzip
import json
import logging
import os
import random
import sys
import tarfile
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any, Dict, List, Tuple

import gradio as gr
import tiktoken
from pydantic import Field

from llama_index.core import VectorStoreIndex, Settings, Document, StorageContext, load_index_from_storage
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.postprocessor.sbert_rerank import SentenceTransformerRerank

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document as LangChainDocument
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain_ollama import ChatOllama

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Route log records to stdout at INFO level so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Raise the threshold to WARNING for the two chattiest loggers.
for quiet_logger_name in ("httpx", "llama_index.embeddings.ollama"):
    logging.getLogger(quiet_logger_name).setLevel(logging.WARNING)

In [None]:
# Run configuration
# Tar archive with the source documents; empty by default and must be set
# before the extraction from TAR can find anything.
TAR_PATH = r""
# Directory the vector index is persisted to and reloaded from.
PERSIST_DIR = "./storage_local_llama"
# Upper bound on the number of documents taken from the archive.
DOC_LIMIT = 100
# Qualified tag name of the XMI element whose "sofaString" attribute is read.
SOFA_NAMESPACE = "{http:///uima/cas.ecore}Sofa"

# Project layout: everything lives under <parent of the working directory>/data.
ROOT_DIR = Path.cwd().parent
_data_root = ROOT_DIR / "data"
DATA_DIR = _data_root / "documents_ollama"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Batch evaluation: questions in, answered questions out.
input_questions_path = _data_root / "questions_20.json"
output_answers_path = _data_root / "ollama_local_20.json"



In [4]:
# Storage for Q&A: an in-memory list plus the JSON file it is dumped to.
LOG_FILE = "chat_history_llama.json"
CHAT_LOG = []


# Utility functions
def save_log_to_disk():
    """Overwrite LOG_FILE with the current CHAT_LOG as indented, non-ASCII-preserving JSON."""
    with open(LOG_FILE, "w", encoding="utf-8") as log_handle:
        json.dump(
            CHAT_LOG,
            log_handle,
            ensure_ascii=False,
            indent=2,
        )



# Setup Models
# The embedding model and the llama_index-side LLM are registered on the
# global llama_index Settings object.
Settings.embed_model = OllamaEmbedding(
    model_name="nomic-embed-text",
    embed_batch_size=10,
    request_timeout=300.0
)
Settings.llm = Ollama(model="llama3.2:3b", request_timeout=120.0)

# Setup SentenceWindow
Settings.node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Best-effort pydantic model rebuild. A failure here is not fatal, but it is
# reported instead of being silently dropped so a later ChatOllama error can be
# traced back to it.
try:
    ChatOllama.model_rebuild()
except Exception as rebuild_error:
    logging.getLogger(__name__).warning(
        "ChatOllama.model_rebuild() failed, continuing anyway: %s", rebuild_error
    )

# Main Chat LLM (LangChain side); temperature=0 is passed to the model.
llm = ChatOllama(model="llama3.2:3b", temperature=0)

# Initialize the tokenizer
# NOTE(review): cl100k_base is an OpenAI encoding, so token counts derived from
# it are presumably only an approximation for llama3.2 — confirm if exact
# counts matter.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Load Data
def extract_raw_text(tar_path, doc_limit):
    """Extract the plain text of up to ``doc_limit`` XMI documents from a tar archive.

    Every archive member whose lower-cased name ends in ``.xmi`` or ``.xmi.gz``
    is parsed as XML. The ``sofaString`` attribute of the first element matching
    ``SOFA_NAMESPACE`` is used as the document text, with line breaks replaced
    by spaces. Members without such an element or with an empty ``sofaString``
    are skipped.

    Args:
        tar_path: Path of the tar archive (opened with mode ``"r:*"``).
        doc_limit: Maximum number of documents to return; ``0`` returns none.

    Returns:
        A tuple ``(docs, total_tokens)``. ``docs`` is a list of
        ``{"id": ..., "text": ...}`` dicts and ``total_tokens`` is the sum of
        ``len(tokenizer.encode(text))`` over those documents. ``([], 0)`` is
        returned when the archive does not exist or any error occurs while
        reading it (the error is printed, not raised).
    """
    docs = []
    total_tokens = 0  # Summed token count over all returned documents

    if not os.path.exists(tar_path):
        print(f"Error: Tar file not found at {tar_path}")
        return [], 0

    try:
        with tarfile.open(tar_path, "r:*") as tar:
            # Iterate the archive lazily: getmembers() would scan every header
            # up front even though we stop as soon as the limit is reached.
            for m in tar:
                # Check the limit before doing any work so doc_limit=0 yields
                # no documents at all.
                if len(docs) >= doc_limit:
                    break

                name = m.name.lower()
                if not name.endswith((".xmi", ".xmi.gz")):
                    continue

                f = tar.extractfile(m)
                if f is None:  # not a regular file (e.g. a directory entry)
                    continue
                with f:  # release the member's file object promptly
                    data = f.read()
                if name.endswith(".gz"):
                    data = gzip.decompress(data)

                root = ET.fromstring(data)
                sofa = root.find(f".//{SOFA_NAMESPACE}")
                if sofa is None:
                    continue

                text = sofa.get("sofaString")
                if not text:
                    continue
                text = text.replace('\r\n', ' ').replace('\n', ' ').strip()

                total_tokens += len(tokenizer.encode(text))

                docs.append({
                    # Only ".xmi" is removed, so "20000324.xmi.gz" becomes
                    # "20000324.gz"; the cached documents use IDs of this form.
                    "id": os.path.basename(m.name).replace(".xmi", ""),
                    "text": text
                })
    except Exception as e:
        print(f"Extraction error: {e}")
        return [], 0

    return docs, total_tokens

# Reuse the per-document JSON cache in DATA_DIR when it has entries; otherwise
# extract from the tar archive and fill the cache.
# The listing is taken once and sorted, because directory listing order is
# unspecified and the document order should be the same on every run.
cached_json_paths = (
    sorted(p for p in DATA_DIR.iterdir() if p.name.endswith(".json"))
    if DATA_DIR.is_dir()
    else []
)

if cached_json_paths:
    print(f"Loading documents from {DATA_DIR}...")
    documents = []
    for json_path in cached_json_paths:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Rebuild the Document with the ID and metadata stored in the cache file
        documents.append(
            Document(
                text=data["text"],
                id_=data["metadata"]["id_"],
                metadata=data["metadata"],
            )
        )
else:
    print("Data folder empty. Running extraction from TAR")
    # 1. Extract raw texts (and their token total) from the archive
    raw_data, total_raw_tokens = extract_raw_text(TAR_PATH, DOC_LIMIT)
    print(f"Extracted {len(raw_data)} documents ({total_raw_tokens} tokens)")
    if not raw_data:
        print(f"Warning: no documents were extracted from {TAR_PATH!r}; check TAR_PATH.")

    # 2. Create Document objects; the ID is duplicated into the metadata under "id_"
    documents = [
        Document(text=e["text"], id_=e["id"], metadata={"id_": e["id"]})
        for e in raw_data
    ]

    # 3. Save one JSON file per document so text and metadata stay linked
    os.makedirs(DATA_DIR, exist_ok=True)
    for doc in documents:
        with open(DATA_DIR / f"{doc.id_}.json", 'w', encoding='utf-8') as f:
            json.dump({"text": doc.text, "metadata": doc.metadata}, f)

document_ids = [doc.id_ for doc in documents]
print(document_ids)




Loading documents from c:\Users\ssick\OneDrive\Master_TU\NLP\nlp_project\tuw-ds-ws2025-nlp-g25-t13\data\documents_ollama...
['20000324.gz', '20010614.gz', '20010620.gz', '20011002.gz', '20011204.gz', '20020305.gz', '20020307.gz', '20020311.gz', '20020606.gz', '20020919.gz', '20021128.gz', '20030304.gz', '20030605.gz', '20030925.gz', '20040311.gz', '20040504.gz', '20040507.gz', '20040602.gz', '20040615.gz', '20040923.gz', '20040928.gz', '20050308.gz', '20050317.gz', '20050613.gz', '20050920.gz', '20050927.gz', '20060620.gz', '20060918.gz', '20070305.gz', '20070612.gz', '20080305.gz', '20080312.gz', '20080318.gz', '20080529.gz', '20080918.gz', '20081203.gz', '20090320.gz', '20091126.gz', '20091210.gz', '20100310.gz', '20100601.gz', '20100609.gz', '20100610.gz', '20100923.gz', '20100928.gz', '20101208.gz', '20101217.gz', '20110307.gz', '20110316.gz', '20110531.gz', '20110926.gz', '20111220.gz', '20120227.gz', '20120503.gz', '20120615.gz', '20121210.gz', '20121211.gz', '20130321.gz', '2013

In [5]:
# Indexing
# The index is built once and persisted; later runs reload it from PERSIST_DIR.
if not os.path.exists(PERSIST_DIR):
    print("Creating new Sentence Window index")
    # Create one node per sentence and store 3 before/after neighbors in metadata
    node_parser = SentenceWindowNodeParser.from_defaults(
        original_text_metadata_key="original_text",
        window_metadata_key="window",
        window_size=3,
    )
    nodes = node_parser.get_nodes_from_documents(documents)

    # Build the index and write it to disk for the next run
    index = VectorStoreIndex(nodes, show_progress=True)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    print(f"Load local index from {PERSIST_DIR}")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)



Load local index from ./storage_local_llama
INFO:llama_index.core.indices.loading:Loading all indices.


In [None]:
class LlamaIndexToLangChainRetriever(BaseRetriever):
    """LangChain retriever that delegates to a llama_index retriever and postprocessors.

    Both fields are declared with ``exclude=True``.
    """

    # Object whose ``retrieve(query)`` method supplies the initial candidates.
    llama_retriever: Any = Field(exclude=True)

    # Applied in list order; each must offer ``postprocess_nodes(nodes, query_str=...)``.
    postprocessors: List[Any] = Field(exclude=True)

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[LangChainDocument]:
        """Retrieve, postprocess and convert the results to LangChain documents.

        Args:
            query: The query string handed to the retriever and every postprocessor.
            run_manager: Accepted for interface compatibility; not used here.

        Returns:
            One ``LangChainDocument`` per surviving node, carrying the node's
            content as ``page_content`` and the node's metadata.
        """
        # Initial retrieval
        current_nodes = self.llama_retriever.retrieve(query)

        # Postprocessing: each stage receives the previous stage's output
        for stage in self.postprocessors:
            current_nodes = stage.postprocess_nodes(current_nodes, query_str=query)

        langchain_docs: List[LangChainDocument] = []
        for node in current_nodes:
            langchain_docs.append(
                LangChainDocument(page_content=node.get_content(), metadata=node.metadata)
            )
        return langchain_docs

# Postprocessing stages, applied in this order after the vector retrieval:
#   1. rerank the candidates and keep the top 5
#   2. replace each node's text with its "window" metadata entry
# NOTE(review): "local_reranker" is presumably a local model folder; the
# download cell further down writes to "mein_lokaler_reranker" instead —
# confirm which folder/model is intended.
# NOTE(review): trust_remote_code=True lets the model repository execute its
# own code on load — confirm this model actually needs it.
reranker = SentenceTransformerRerank(
    top_n=5,
    model="local_reranker",
    cross_encoder_kwargs={"trust_remote_code": True},
)
metadata_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

# First stage: the 15 most similar nodes from the vector index.
raw_retriever = index.as_retriever(similarity_top_k=15)

# LangChain-facing retriever combining both stages.
llama_retriever = LlamaIndexToLangChainRetriever(
    postprocessors=[reranker, metadata_postprocessor],
    llama_retriever=raw_retriever,
)

# Create prompt
# {context} receives the rendered snippets and {input} the user question.
custom_template = """
Du bist ein präziser Analyst für Schweizer Parlamentsprotokolle. 
Deine Aufgabe ist es, Fragen ausschließlich auf Basis der unten stehenden Textauszüge (KONTEXT) zu beantworten.

### DEINE REGELN:
1. QUELLEN-TREUE: Antworte nur mit Informationen, die explizit im KONTEXT stehen. Wenn die Information fehlt, antworte: "Information nicht im Dokument enthalten."
2. DYNAMISCHE ZITATION: Füge nach jeder Information die spezifische Source-ID des Snippets ein, aus dem die Info stammt. Nutze NIEMALS eine ID, die nicht im aktuellen Kontext aufgeführt ist. Format: [ID: date.gz].
3. VOLLSTÄNDIGKEIT: Lies ALLE Snippets, bevor du entscheidest, dass eine Information fehlt. Achte auf Synonyme und inhaltliche Zusammenhänge.
4. ZAHLEN: Extrahiere alle Statistiken, Daten und Geldbeträge mit maximaler Präzision.

### DEIN ARBEITSPROZESS (Interner Monolog):
Bevor du die finale Antwort schreibst, führe intern diese Schritte aus:
1. RELEVANZ-CHECK: Welche Snippets enthalten Begriffe aus der Frage oder thematisch verwandte Aspekte?
2. EXTRAKTION: Welche konkreten Fakten und Source-IDs gehören zusammen?
3. VERIFIKATION: Entspricht die Antwort exakt dem Wortlaut/Sinn der Quelle?

### KONTEXT:
{context}

### FRAGE: 
{input}

### ANTWORT (Strukturierte Fakten):
"""

# How a single retrieved snippet is rendered inside {context}.
# The prompt requires the model to cite source IDs "listed in the context", but
# the stuff-documents chain renders only page_content unless a document_prompt
# is supplied, so no ID would ever be visible to the model. Prefix each snippet
# with its ID in the same "[ID: ...]" format the prompt asks for.
# Requires every retrieved document to carry an "id_" metadata entry.
DOCUMENT_PROMPT = PromptTemplate.from_template("[ID: {id_}]\n{page_content}")

PROMPT = ChatPromptTemplate.from_template(custom_template)
document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=PROMPT,
    document_prompt=DOCUMENT_PROMPT,
)
qa_chain = create_retrieval_chain(retriever=llama_retriever, combine_docs_chain=document_chain)


In [17]:
from huggingface_hub import snapshot_download
import os

# Hugging Face repository of the cross-encoder to fetch.
model_id = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Local folder the snapshot is written to.
# NOTE(review): the reranker in the retriever cell loads model="local_reranker",
# not this folder — confirm whether this download is still the model in use.
target_dir = "mein_lokaler_reranker"

# Download into the local folder
snapshot_download(repo_id=model_id, local_dir=target_dir)

Fetching 23 files: 100%|██████████| 23/23 [01:04<00:00,  2.79s/it]


'C:\\Users\\ssick\\OneDrive\\Master_TU\\NLP\\nlp_project\\tuw-ds-ws2025-nlp-g25-t13\\src\\mein_lokaler_reranker'

In [19]:
def load_questions(input_path: str) -> List[Dict[str, Any]]:
    """Read the questions JSON file and validate its structure.

    Args:
        input_path: Location of a UTF-8 JSON file holding a list of objects,
            each with a non-blank string under ``"question"``.

    Returns:
        The parsed list, unchanged.

    Raises:
        FileNotFoundError: ``input_path`` is not an existing file.
        ValueError: The top level is not a list, or an item is not an object,
            lacks ``"question"``, or has a non-string / blank ``"question"``.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    with open(input_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    if not isinstance(entries, list):
        raise ValueError("Input JSON must be a list of objects (array).")

    # Checks run in a fixed order per item so the first problem found is reported.
    for position, entry in enumerate(entries):
        if not isinstance(entry, dict):
            raise ValueError(f"Item at index {position} is not an object.")
        if "question" not in entry:
            raise ValueError(f"Item at index {position} has no 'question' field.")
        question = entry["question"]
        if not (isinstance(question, str) and question.strip()):
            raise ValueError(f"Item at index {position} has empty/invalid 'question' field.")

    return entries


def save_output(output_path: str, data: List[Dict[str, Any]]) -> None:
    """Write ``data`` to ``output_path`` as indented UTF-8 JSON.

    The directory containing ``output_path`` is created first if it is missing.
    Non-ASCII characters are written as-is rather than escaped.
    """
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(
            data,
            sink,
            ensure_ascii=False,
            indent=2,
        )


def ask_rag_only_question(qa_chain, question_text: str, snippet_len: int = 300):
    """Ask the chain one question and collect the answer with its sources.

    Args:
        qa_chain: Object with an ``invoke`` method; called with
            ``{"input": question_text}`` and expected to return a mapping.
        question_text: The question to ask.
        snippet_len: Maximum number of characters kept from each document's
            ``page_content`` for the snippet.

    Returns:
        ``(answer_text, source_ids, context_snippets)``:
        the stripped ``"answer"`` value (converted with ``str`` if needed),
        the distinct source IDs in first-seen order, and one
        ``{"source_id", "snippet"}`` dict per document that has a non-empty
        string under ``metadata["id_"]``.
    """
    result = qa_chain.invoke({"input": question_text})

    raw_answer = result.get("answer", "")
    answer_text = (raw_answer if isinstance(raw_answer, str) else str(raw_answer)).strip()

    # Documents are looked up under "context" first, then under "documents".
    retrieved = result.get("context", None)
    if retrieved is None:
        retrieved = result.get("documents", [])

    context_snippets = []
    for entry in retrieved:
        try:
            source_id = entry.metadata.get("id_")
        except Exception:
            source_id = None

        body = getattr(entry, "page_content", "") or ""

        # Documents without a usable ID are left out entirely.
        if not (isinstance(source_id, str) and source_id):
            continue

        context_snippets.append(
            {
                "source_id": source_id,
                "snippet": body[:snippet_len].replace("\n", " ").strip(),
            }
        )

    # dict keys keep insertion order, which removes duplicates but preserves order.
    source_ids = list(dict.fromkeys(item["source_id"] for item in context_snippets))
    return answer_text, source_ids, context_snippets


def process_questions_file(qa_chain, input_path: str, output_path: str) -> None:
    """Answer every question in ``input_path`` and write the results to ``output_path``.

    Each loaded item is extended in place with ``"answer"``,
    ``"answer_source_id"`` and ``"answer_context_snippets"``. If answering a
    question raises, the answer becomes ``"Error: <message>"`` with empty
    source and snippet lists, and processing continues. Results are written
    once, after all questions have been handled.
    """
    items = load_questions(input_path)

    for idx, item in enumerate(items, start=1):
        question_text = item["question"].strip()

        try:
            answer, sources, snippets = ask_rag_only_question(qa_chain, question_text)
        except Exception as e:
            # A failing question must not abort the batch; record the error instead.
            answer, sources, snippets = f"Error: {str(e)}", [], []

        item.update(
            {
                "answer": answer,
                "answer_source_id": sources,
                "answer_context_snippets": snippets,
            }
        )

        print(f"[{idx}/{len(items)}] id={item.get('id', 'NA')} done")

    save_output(output_path, items)
    print(f"Saved results to {output_path}")



# Run the batch: answer every question in input_questions_path with qa_chain
# and write the extended items to output_answers_path.
process_questions_file(qa_chain, input_questions_path, output_answers_path)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


[1/20] id=1 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s]


[2/20] id=2 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]


[3/20] id=3 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]


[4/20] id=4 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.91it/s]


[5/20] id=5 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.55it/s]


[6/20] id=6 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.18it/s]


[7/20] id=7 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


[8/20] id=8 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.53it/s]


[9/20] id=9 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.14it/s]


[10/20] id=10 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]


[11/20] id=11 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]


[12/20] id=12 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]


[13/20] id=13 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.92it/s]


[14/20] id=14 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.54it/s]


[15/20] id=15 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


[16/20] id=16 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.46it/s]


[17/20] id=17 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.83it/s]


[18/20] id=18 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s]


[19/20] id=19 done


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]


[20/20] id=20 done
Saved results to c:\Users\ssick\OneDrive\Master_TU\NLP\nlp_project\tuw-ds-ws2025-nlp-g25-t13\data\ollama_local_20.json
