In [1]:
!which python

/Users/tareksanger/Library/Caches/pypoetry/virtualenvs/lumis-tzzr_5k5-py3.12/bin/python


In [2]:
from lumis.config import config
import os
import pprint

pp = pprint.PrettyPrinter(indent=4)

os.environ["OPENAI_API_KEY"] = config.openai_api_key

In [3]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
import logging
import sys
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output


# define LLM

# from llama_index.embeddings.openai import OpenAIEmbedding

llm = OpenAI(temperature=0, model="gpt-4-turbo")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")


Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

In [4]:
os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula"  # default is "nebula"
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"

space_name = "stantec"

edge_types, rel_prop_names = ["relationship"], [
    "relationship"
]  # default, could be omit if create from an empty kg
tags = ["entity"]

In [5]:
def delete_space(address: str, space_name: str):
    from nebula3.gclient.net import ConnectionPool
    from nebula3.Config import Config

    # Configuration for the connection
    config = Config()
    config.max_connection_pool_size = 10
    # Initialize connection pool
    connection_pool = ConnectionPool()
    if not connection_pool.init([address.split(":")], config):
        pp.pprint("Failed to initialize connection pool.")
        return

    # Connect to the server
    session = connection_pool.get_session(
        os.environ["NEBULA_USER"], os.environ["NEBULA_PASSWORD"]
    )

    try:
        # Drop the space
        session.execute(f"DROP SPACE IF EXISTS {space_name}")

    finally:
        session.release()
        connection_pool.close()

In [6]:
delete_space(os.environ["NEBULA_ADDRESS"], space_name)

INFO:nebula3.logger:Get connection to ('127.0.0.1', '9669')


In [7]:
def create_namespace_if_not_exists(
    address: str, namespace_name: str, vid_type="FIXED_STRING", vid_size=256
):
    from nebula3.gclient.net import ConnectionPool
    from nebula3.Config import Config

    # Configuration for the connection
    config = Config()
    config.max_connection_pool_size = 10
    # Initialize connection pool
    connection_pool = ConnectionPool()
    if not connection_pool.init([address], config):
        pp.pprint("Failed to initialize connection pool.")
        return

    # Connect to the server
    session = connection_pool.get_session(
        os.environ["NEBULA_USER"], os.environ["NEBULA_PASSWORD"]
    )
    try:
        # Determine the vid_type specification
        if vid_type == "FIXED_STRING":
            vid_spec = f"fixed_string({vid_size})"
        else:
            vid_spec = "int64"
        create_space_command = f"CREATE SPACE IF NOT EXISTS {namespace_name}(vid_type={vid_spec}, partition_num=1, replica_factor=1)"
        create_space_result = session.execute(create_space_command)
        if create_space_result.is_succeeded():
            pp.pprint(
                [
                    f"Namespace '{namespace_name}' created successfully.",
                    create_space_result.comment(),
                ]
            )
        else:
            pp.pprint(["Failed to create namespace.", create_space_result.error_msg()])

    finally:
        session.release()
        connection_pool.close()

In [8]:
create_namespace_if_not_exists(os.environ["NEBULA_ADDRESS"], space_name)

INFO:nebula3.logger:Get connection to ('127.0.0.1', '9669')
["Namespace 'stantec' created successfully.", '']


In [9]:
from llama_index.core import StorageContext
from llama_index.graph_stores.nebula import NebulaGraphStore

graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)

# Looks like we might be able to add more storage methods to the storage context
storage_context = StorageContext.from_defaults(graph_store=graph_store)

RuntimeError: Failed to get session, cannot set the session space to stantec error: -1005 SpaceNotFound: SpaceName `stantec`

In [None]:
graph_store.execute("CREATE TAG IF NOT EXISTS entity(name string)")
graph_store.execute("CREATE EDGE IF NOT EXISTS relationship(relationship string)")
graph_store.execute("CREATE TAG INDEX entity_index ON entity(name(256))")
# CREATE TAG entity(name string);
# CREATE EDGE relationship(relationship string);
# CREATE TAG INDEX entity_index ON entity(name(256));

ResultSet(None)

In [None]:
from llama_index.core import Document
from llama_index.core.utils import globals_helper
from nltk.stem.porter import PorterStemmer


def process_document(document: Document):

    words = document.text.split(" ")
    words = [word for word in words]
    document.text = " ".join(words)
    return document

In [None]:
from llama_index.readers.wikipedia import WikipediaReader

loader = WikipediaReader()

documents = loader.load_data(
    pages=["Stantec"],
    auto_suggest=True,
)
# documents = [process_document(document) for document in documents]

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# https://docs.llamaindex.ai/en/stable/module_guides/indexing/metadata_extraction/
# https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/

# pipeline = IngestionPipeline(
#   storage_context=storage_context,
#   settings=Settings,
#   transformations=[

#   ])

In [None]:
from llama_index.core import KnowledgeGraphIndex
from llama_index.core.node_parser import SentenceSplitter

# Use pipelines for more control over data processing

index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=100,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
    retriever_mode="embedding",
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:openai._base_client:Retrying request to /chat/completions in 0.858618 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:openai._base_client:Retrying request to /chat/completions in 0.798494 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://a

In [None]:
query_engine = index.as_query_engine(
    include_text=True,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=5,
)

In [None]:
response = query_engine.query(
    "Can you tell me what resources Stantec has it could utilize for future innovation?"
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: 04b51dfa-a192-4530-bc0a-f6fa7202b41f: === Growth ===
Stantec has 28,000 employees and 400 locations on six continen...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: c180a377-af56-47b0-a1f3-53a53464e305: Between 2008 and 2011, gross revenue increased from $1.4 billion to $1.7 bill...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: 98ef6c4b-3002-4a6d-8b6c-3b4e16991bc3: Staff numbers neared 900 and the firm went public on the Toronto Stock Exchan...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [None]:
import pprint

pprint.pprint(response.response)

('Stantec has a robust set of resources that could be utilized for future '
 'innovation, including a global presence with 400 locations across six '
 'continents, a diverse workforce of 28,000 employees, and a broad range of '
 'services in various sectors such as environmental sciences, landscape '
 "architecture, and project economics. Additionally, the company's history of "
 'acquiring over 130 firms since 1994, including significant acquisitions like '
 'MWH Global, Inc., provides a wealth of expertise and capabilities in '
 "infrastructure and engineering. Stantec's involvement in major projects like "
 'the Panama Canal Expansion and the Stantec Tower also demonstrates its '
 'capacity to handle large-scale and complex projects, which could further '
 'drive innovation. The leadership under CEO Gord Johnston, who has extensive '
 'experience in design and project management, along with his qualifications '
 'as a certified project management professional and an Envision '
 'Sus

Query an already built GraphRAG

https://docs.llamaindex.ai/en/stable/examples/query_engine/knowledge_graph_rag_query_engine/
