### Import libraries and set up model config

In [None]:
import os
import json
from dotenv import load_dotenv

# Import the project modules. If they cannot be imported from the current
# working directory (e.g. the notebook was started from a sub-folder), move one
# directory up and retry once.
try:
    from src.config import Source, ChunkerConf, LLMConf, KnowledgeGraphConfig, EmbedderConf
    from src.ingestion.local_ingestor import LocalIngestor
    from src.ingestion.cleaner import Cleaner
    from src.ingestion.chunker import Chunker
    from src.ingestion.embedder import ChunkEmbedder
    from src.agents.ontology_explorer import OntologyExplorer
except ImportError:
    # Only import failures trigger the fallback; a bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and unrelated errors before changing
    # the working directory.
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), "..")))
    from src.config import Source, ChunkerConf, LLMConf, KnowledgeGraphConfig, EmbedderConf
    from src.ingestion.local_ingestor import LocalIngestor
    from src.ingestion.cleaner import Cleaner
    from src.ingestion.chunker import Chunker
    from src.ingestion.embedder import ChunkEmbedder
    from src.agents.ontology_explorer import OntologyExplorer

load_dotenv('config.env', override=True)

In [67]:
# Model configurations
# All settings are looked up in the process environment; a variable that is
# not set yields None.

# Knowledge graph connection settings
kg_config = KnowledgeGraphConfig(
    index_name="vectors",
    uri=os.environ.get("NEO4J_URI"),
    user=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
)

# Chunker settings
# NOTE(review): environment values are strings, so chunk_size / chunk_overlap
# are passed as str (or None) — confirm ChunkerConf converts them to int.
chunker_conf = ChunkerConf(
    chunk_overlap=os.environ.get("ONTOLOGY_OVERLAP"),
    chunk_size=os.environ.get("ONTOLOGY_CHUNKSIZE"),
    type="recursive",
)

# LLM settings (Azure OpenAI)
llm_conf = LLMConf(
    type="azure-openai",
    temperature=0,
    model=os.environ.get("AZURE_OPENAI_LLM_MODEL_NAME"),
    deployment=os.environ.get("AZURE_OPENAI_LLM_DEPLOYMENT_NAME"),
    endpoint=os.environ.get("AZURE_OPENAI_LLM_ENDPOINT"),
    api_version=os.environ.get("AZURE_OPENAI_LLM_VERSION"),
    api_key=os.environ.get("AZURE_OPENAI_LLM_API_KEY"),
)

# Embedding model settings (Azure OpenAI)
# NOTE(review): the key variable here is AZURE_OPENAI_API_KEY, while the other
# embedding settings use the AZURE_OPENAI_EMBEDDING_* prefix — confirm this is
# the intended variable name.
embedder_conf = EmbedderConf(
    type="azure-openai",
    model=os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_NAME"),
    deployment=os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
    endpoint=os.environ.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    api_version=os.environ.get("AZURE_OPENAI_EMBEDDING_VERSION"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
)

### Loading, chunking, embedding
Loading, cleaning, chunking and embedding a financial compliance document (Finance Rules Handbook v3.1)

In [None]:
# Ingest every document found in the local "raw_data" folder
source = Source(folder="raw_data")
ingestor = LocalIngestor(source=source)
docs = ingestor.batch_ingest()

# Run the cleaner over the ingested documents
cleaner = Cleaner()
docs_cleaned = cleaner.clean_documents(docs)

# Trim the text of the first document only: drop everything before character
# offset rem_start and, with rem_end = -1, the final character.
# Intended to cut front/back matter such as Contents, Appendix, References.
rem_start = 5477
rem_end = -1
docs_cleaned[0].source = docs_cleaned[0].source[rem_start:rem_end]

In [71]:
# Chunking: split every cleaned document into chunks using chunker_conf
chunker = Chunker(conf=chunker_conf)
docs_chunks = chunker.chunk_documents(docs_cleaned)

# Report the chunk count per chunked document. Iterate over docs_chunks itself
# (not range(len(docs))) so the loop bound always matches the list being read.
for i, doc in enumerate(docs_chunks):
    print(f"Number of chunks in doc {i}: {len(doc.chunks)}")

2026-01-14 10:57:00,215 - src.ingestion.chunker - INFO - DOcument finance-rules-v3.1-apr-2024.pdf has been chunked into 173 chunks.


Number of chunks in doc 0: 173


In [72]:
# NOTE(review): the embedding step below is commented out, so `docs_embeddings`
# is not defined by this cell — confirm nothing later relies on it.
# # Embedding
# embedder=ChunkEmbedder(conf=embedder_conf)
# docs_embeddings=embedder.embed_documents_chunks(docs_chunks)

### Ontology Exploration
Exploring the most suitable ontologies for the documents above.

In [None]:
# Free-text description of the document domain; passed to the OntologyExplorer
# below as `domain_description`.
DOMAIN_DESCRIPTION="""
The domain focuses on the governance, management, and operational control of public sector finance within the UK Parliament, as codified in the Finance Rules Handbook V3.1.
It covers:

- Governance & Accountability: Structures, roles, and responsibilities for financial oversight, including committees, delegated authorities, and compliance frameworks.
- Budgeting & Planning: Annual and medium-term financial planning cycles, Estimates, budget setting, forecasting, and reporting mechanisms.
- Procurement & Expenditure: Rules and procedures for purchasing goods, services, and works; contract management; authorisation limits; and payment methods.
- Risk, Fraud & Internal Control: Policies for risk management, prevention and detection of fraud, loss management, and internal audit processes.
- Asset & Data Management: Safeguarding, recording, and disposal of assets; inventory control; insurance; and data security protocols.
- External Engagement: Procedures for engaging consultants, interims, and agency staff, including IR35 compliance and assurance requirements.
- Income, Debtors & Special Payments: Management of income streams, debtor processes, overpayments, losses, write-offs, and special payments.
- Transparency & Reporting: Requirements for financial disclosures, publication of expenditure, and annual reporting to Parliament and the public.

This domain provides a comprehensive framework for ensuring financial integrity, value for money, transparency, and accountability in the management of public resources within a parliamentary context.
"""

# LLM-driven ontology generation
# The explorer is built from the LLM configuration and the domain description.
ontology_explorer=OntologyExplorer(
    llm_conf, 
    domain_description=DOMAIN_DESCRIPTION
)
# NOTE(review): pct_chunks=0.1 and chunks_limit=1 presumably restrict how many
# chunks are sampled for the LLM — confirm their exact meaning against
# OntologyExplorer.find_suitable_ontology.
ontology=ontology_explorer.find_suitable_ontology(docs=docs_chunks, pct_chunks=0.1, chunks_limit=1)

In [None]:
# Check ontology
# model_dump() returns a plain Python dict, so its fields must be read by key;
# attribute access (ont.allowed_labels) would raise AttributeError.
ont = ontology.model_dump()
nodes = ont["allowed_labels"]
edges = ont["allowed_relations"]
label_desc = ont["labels_descriptions"]
print(f"Allowed nodes: {nodes}")
print(f"Allowed edges: {edges}")
print(f"Node descriptions: {label_desc}")

In [None]:
# Save ontology
# Resolve <base>/assets/ontology.json, where <base> is the directory of this
# file when __file__ exists, otherwise the current working directory.
try:
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "assets", "ontology.json"))
except NameError:
    # __file__ is not defined when running inside a notebook kernel.
    path = os.path.abspath(os.path.join(os.getcwd(), "assets", "ontology.json"))

# Create the target folder if it does not exist yet, so open() cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Serialise the ontology (as a dict) straight into the file.
with open(path, "w", encoding="utf-8") as f:
    json.dump(ontology.model_dump(), f)

In [None]:
import os
from openai import AzureOpenAI

# Azure OpenAI LLM settings, read from the environment.
AZURE_OPENAI_LLM_API_KEY = os.getenv("AZURE_OPENAI_LLM_API_KEY")
AZURE_OPENAI_LLM_ENDPOINT = os.getenv("AZURE_OPENAI_LLM_ENDPOINT")
AZURE_OPENAI_LLM_MODEL_NAME = os.getenv("AZURE_OPENAI_LLM_MODEL_NAME")  # read but not used in this cell
AZURE_OPENAI_LLM_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT_NAME")
AZURE_OPENAI_LLM_VERSION = os.getenv("AZURE_OPENAI_LLM_VERSION")

# Client bound to the configured endpoint and API version.
client = AzureOpenAI(
    api_key=AZURE_OPENAI_LLM_API_KEY,
    azure_endpoint=AZURE_OPENAI_LLM_ENDPOINT,
    api_version=AZURE_OPENAI_LLM_VERSION,
)

# Send one fixed system + user prompt to the deployment and print the reply.
response = client.chat.completions.create(
    model=AZURE_OPENAI_LLM_DEPLOYMENT_NAME,
    max_completion_tokens=16384,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "I am going to Paris, what should I see?"},
    ],
)

print(response.choices[0].message.content)