In [None]:
import os
import json
from dotenv import load_dotenv


from src.config import Source, ChunkerConf, LLMConf

from src.ingestion.local_ingestor import LocalIngestor
from src.ingestion.chunker import Chunker
from src.ingestion.cleaner import Cleaner

from src.agents.ontology_explorer import OntologyExplorer

# Load the settings read below via os.getenv() from the local env file.
# load_dotenv() does not raise when the file is missing or empty; it just
# returns False, and every later os.getenv() would then silently yield None.
# Surface that early instead of failing deep inside the LLM client.
import warnings

env = load_dotenv('config.env')
if not env:
    warnings.warn(
        "config.env was not found or defines no variables; "
        "settings read with os.getenv() may be None.",
        stacklevel=2,
    )

### Config
Set up the chunker and LLM configuration, reading the Azure OpenAI deployment settings from environment variables. The Neo4j graph database and embedding model configurations are currently commented out.

In [None]:
# kg_config = KnowledgeGraphConfig(
#     uri=os.getenv("NEO4J_URI"),
#     user=os.getenv("NEO4J_USERNAME"),
#     password=os.getenv("NEO4J_PASSWORD"),
#     index_name="vectors"
# )

# Chunking strategy and sizes (units of chunk_size / chunk_overlap are defined
# by Chunker - TODO confirm whether characters or tokens).
chunker_conf = ChunkerConf(type="recursive", chunk_size=1000, chunk_overlap=100)

# Azure OpenAI chat model settings. Fixed values are passed literally; the
# deployment-specific ones are looked up from the environment, field by field.
llm_conf = LLMConf(
    type="azure-openai",
    temperature=0,
    **{
        field: os.getenv(env_var)
        for field, env_var in {
            "model": "AZURE_OPENAI_LLM_MODEL_NAME",
            "deployment": "AZURE_OPENAI_LLM_DEPLOYMENT_NAME",
            "api_key": "AZURE_OPENAI_API_KEY",
            "endpoint": "AZURE_OPENAI_LLM_ENDPOINT",
            "api_version": "AZURE_OPENAI_LLM_VERSION",
        }.items()
    },
)

# embedder_conf = EmbedderConf(
#     model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME"),
#     type="azure-openai",
#     deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
#     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
#     endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
#     api_version=os.getenv("AZURE_OPENAI_EMBEDDING_VERSION"),
# )

### Loading, chunking, embedding
Loading, cleaning and chunking a financial compliance document (Finance Rules Handbook v3.1). The embedding step is currently commented out.

In [None]:
# Source data folder
source = Source(folder="raw_data")

# Load from local folder
ingestor = LocalIngestor(source=source)
docs = ingestor.batch_ingest()

# Clean docs
cleaner = Cleaner()
docs_cleaned = cleaner.clean_documents(docs)

# Chunking
chunker = Chunker(conf=chunker_conf)
docs_chunks = chunker.chunk_documents(docs_cleaned)

# Report per-document chunk counts. Iterate over the chunked documents
# themselves rather than range(len(docs)), so the report stays correct even if
# cleaning/chunking returns a different number of documents than were ingested.
for i, doc_chunks in enumerate(docs_chunks):
    print(f"Number of chunks in doc {i}: {len(doc_chunks.chunks)}")

# # Embedding
# embedder=ChunkEmbedder(conf=embedder_conf)
# docs_embeddings=embedder.embed_documents_chunks(docs_chunks)

### Ontology Exploration
Exploring the most suitable ontology for the documents above.

In [None]:
# Natural-language description of the document domain. It is passed verbatim to
# OntologyExplorer as `domain_description`, so edits here change the LLM input.
DOMAIN_DESCRIPTION="""
The domain focuses on the governance, management, and operational control of public sector finance within the UK Parliament, as codified in the Finance Rules Handbook V3.1.
It covers:

- Governance & Accountability: Structures, roles, and responsibilities for financial oversight, including committees, delegated authorities, and compliance frameworks.
- Budgeting & Planning: Annual and medium-term financial planning cycles, Estimates, budget setting, forecasting, and reporting mechanisms.
- Procurement & Expenditure: Rules and procedures for purchasing goods, services, and works; contract management; authorisation limits; and payment methods.
- Risk, Fraud & Internal Control: Policies for risk management, prevention and detection of fraud, loss management, and internal audit processes.
- Asset & Data Management: Safeguarding, recording, and disposal of assets; inventory control; insurance; and data security protocols.
- External Engagement: Procedures for engaging consultants, interims, and agency staff, including IR35 compliance and assurance requirements.
- Income, Debtors & Special Payments: Management of income streams, debtor processes, overpayments, losses, write-offs, and special payments.
- Transparency & Reporting: Requirements for financial disclosures, publication of expenditure, and annual reporting to Parliament and the public.

This domain provides a comprehensive framework for ensuring financial integrity, value for money, transparency, and accountability in the management of public resources within a parliamentary context.
"""

# LLM-driven ontology generation: the explorer is configured with the LLM
# settings and the domain description.
ontology_explorer = OntologyExplorer(llm_conf, domain_description=DOMAIN_DESCRIPTION)

# pct_chunks / chunks_limit are forwarded as-is; presumably they bound how many
# chunks are sampled for the LLM - TODO confirm against OntologyExplorer.
ontology = ontology_explorer.find_suitable_ontology(
    docs=docs_chunks,
    pct_chunks=0.5,
    chunks_limit=50,
)

# Cell output: the dumped ontology (the same structure that is later
# serialised to JSON when saving).
ontology.model_dump()

In [None]:
# Save ontology
# Resolve <base>/assets/ontology.json. When this code runs as a script,
# <base> is the directory containing the file; inside a notebook kernel
# __file__ is undefined (NameError), so fall back to the working directory.
# Only NameError is caught so that unrelated failures are not hidden.
try:
    base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    base_dir = os.getcwd()

path = os.path.join(base_dir, "assets", "ontology.json")

# Create the assets folder on first run instead of failing inside open().
os.makedirs(os.path.dirname(path), exist_ok=True)

with open(path, "w", encoding="utf-8") as f:
    json.dump(ontology.model_dump(), f)