### Import libraries and set up various config

In [None]:
import os
import json
from dotenv import load_dotenv

# from typing import List, Optional, Dict
# from langchain_core.load.serializable import Serializable
# from pydantic import BaseModel, Field

from src.graph.graph_model import Ontology
from src.graph.knowledge_graph import KnowledgeGraph
from src.ingestion.local_ingestor import LocalIngestor
from src.ingestion.cleaner import Cleaner
from src.ingestion.chunker import Chunker
from src.ingestion.embedder import ChunkEmbedder
from src.config import Source, ChunkerConf, LLMConf, KnowledgeGraphConfig, EmbedderConf
from src.ingestion.graph_miner import GraphMiner

# Load settings from config.env into the process environment; values in the
# file take precedence over variables that are already set (override=True).
env = load_dotenv(dotenv_path="config.env", override=True)

In [None]:
# Every value below is read from the process environment. A variable that is
# not set yields None, which is passed to the config object unchanged.

# The LLM and embedder configurations use the same Azure OpenAI API key.
azure_openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY")

# Neo4j connection settings and the index name passed to the knowledge graph.
kg_config = KnowledgeGraphConfig(
    uri=os.environ.get("NEO4J_URI"),
    user=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
    index_name="vector",
)

# Chunking settings: "recursive" chunker, chunk size 1000, overlap 100.
chunker_conf = ChunkerConf(type="recursive", chunk_size=1000, chunk_overlap=100)

# Azure OpenAI chat model settings (temperature fixed at 0).
llm_conf = LLMConf(
    type="azure-openai",
    model=os.environ.get("AZURE_OPENAI_LLM_MODEL_NAME"),
    deployment=os.environ.get("AZURE_OPENAI_LLM_DEPLOYMENT_NAME"),
    endpoint=os.environ.get("AZURE_OPENAI_LLM_ENDPOINT"),
    api_version=os.environ.get("AZURE_OPENAI_LLM_VERSION"),
    api_key=azure_openai_api_key,
    temperature=0,
)

# Azure OpenAI embedding model settings.
embedder_conf = EmbedderConf(
    type="azure-openai",
    model=os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_NAME"),
    deployment=os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
    endpoint=os.environ.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    api_version=os.environ.get("AZURE_OPENAI_EMBEDDING_VERSION"),
    api_key=azure_openai_api_key,
)

### Loading, chunking, embedding

In [None]:
# Pipeline: ingest -> clean -> chunk -> embed. Each stage consumes the output
# of the previous one; intermediate results stay bound to their own names.

# 1. Ingest every document found in the local "raw_data" folder.
source = Source(folder="raw_data")
ingestor = LocalIngestor(source=source)
docs = ingestor.batch_ingest()

# 2. Clean the ingested documents.
cleaner = Cleaner()
docs_cleaned = cleaner.clean_documents(docs)

# 3. Split the cleaned documents into chunks using the chunker configuration.
#    (To inspect the result, print len(docs_chunks[i].chunks) per document.)
chunker = Chunker(conf=chunker_conf)
docs_chunks = chunker.chunk_documents(docs_cleaned)

# 4. Embed the chunks of every document.
embedder = ChunkEmbedder(conf=embedder_conf)
docs_embeddings = embedder.embed_documents_chunks(docs_chunks)

### Load existing ontology

In [None]:
# Locate assets/ontology.json. `__file__` is not defined when this code runs
# as a notebook cell, so only that NameError triggers the fallback to the
# current working directory.
try:
    base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    base_dir = os.getcwd()
path = os.path.join(base_dir, "assets", "ontology.json")

# Read the ontology definition. A missing file is reported with its full path
# and stops the cell here, rather than printing a message and then failing
# later on an undefined (or stale, from a previous run) `ont_json`.
# Malformed JSON is left to propagate as json.JSONDecodeError so it is not
# mistaken for a missing file.
try:
    with open(path, "r", encoding="utf-8") as f:
        ont_json = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Ontology does not exist: {path}") from err

# Build the ontology from the three keys the JSON file must provide; a missing
# key raises KeyError naming that key.
ontology = Ontology(
    allowed_labels=ont_json["allowed_labels"],
    labels_descriptions=ont_json["labels_descriptions"],
    allowed_relations=ont_json["allowed_relations"],
)

### Generate graph

In [None]:
# Build the miner from the LLM configuration and the loaded ontology, then
# extract graph nodes and edges from the chunks of the first embedded document
# only (index 0 of docs_embeddings).
graph_miner = GraphMiner(conf=llm_conf, ontology=ontology)
graph_components = graph_miner.mine_graph_from_doc_chunks(
    docs_embeddings[0]
)

In [None]:
# Index of the mined chunk to inspect. The same index drives both the label
# and the lookup, so changing it shows the matching chunk instead of always
# printing chunk 0 under a different label.
i = 0
print(f"Nodes identified in chunk {i}: {graph_components.chunks[i].nodes}")
print(f"Relationships identified in chunk {i}: {graph_components.chunks[i].relationships}")

### Insert into Neo4j graph instance

In [None]:
# Open the knowledge graph with the Neo4j settings and the embeddings model
# held by the chunk embedder.
knowledge_graph = KnowledgeGraph(conf=kg_config, embeddings_model=embedder.embeddings)

# Sanity-check the connection through the underlying (private) driver:
# first connectivity, then authentication.
neo4j_driver = knowledge_graph._driver
neo4j_driver.verify_connectivity()
neo4j_driver.verify_authentication()

In [None]:
# Report the current state of the graph, reading each attribute immediately
# before it is printed.
# NOTE(review): the "nodes" line is fed from `number_of_labels` - confirm that
# this attribute really counts nodes rather than distinct labels.
node_count = knowledge_graph.number_of_labels
print(f"Number of nodes: {node_count}")

edge_count = knowledge_graph.number_of_relationships
print(f"Number of edges: {edge_count}")

indexer_name = knowledge_graph.index_name
print(f"Name of indexer: {indexer_name}")

In [None]:
# Persist the mined document (its chunks with their nodes and relationships)
# into the knowledge graph.
knowledge_graph.store_chunks_for_doc(doc=graph_components)