In [2]:
import weave
weave.init("graphrag-poc")

Logged in as Weights & Biases user: vedmanivaidya.
View Weave data at https://wandb.ai/vedmanivaidya/graphrag-poc/weave


<weave.trace.weave_client.WeaveClient at 0x1b4f11a0ad0>

In [3]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from dotenv import load_dotenv
import os

load_dotenv()
openai_llm = AzureOpenAI(
    model=os.getenv("DEPLOYMENT_NAME"),
    deployment_name=os.getenv("DEPLOYMENT_NAME"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("OPENAI_API_VERSION"),
    temperature=0.0,
)
openai_embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    # deployment_name="my-custom-embedding",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("OPENAI_API_VERSION"),
)
embeddings = openai_embed_model.get_text_embedding("Hello, world!")
len(embeddings)

1536

In [4]:
from llama_index.core import Settings

Settings.llm = openai_llm
Settings.embed_model = openai_embed_model

In [5]:
import nest_asyncio

nest_asyncio.apply()

In [6]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    "D:\projects\graphrag-poc\data\selection 1"
).load_data()

In [7]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Note: used to be `Neo4jPGStore`
graph_store = Neo4jPropertyGraphStore(
    username="neo4j",
    password="12345678",
    url="bolt://localhost:7687",
    database="graphrag-openai-custom-extractor-prompt",
)

In [8]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=50,
)
nodes = splitter.get_nodes_from_documents(documents)
len(nodes)
# create a list of lists each of 10 nodes
nodes_list = [nodes[i : i + 10] for i in range(0, len(nodes), 10)]
print(len(nodes_list))

8


In [9]:
prompt_template = (
    "You are a knowledge graph expert specializing in extracting structured information from text about startup founders, companies, and entrepreneurship guidance, particularly from Paul Graham's writings."
    "\nYour task is to extract up to {max_knowledge_triplets} knowledge triplets from the provided text. "
    "A knowledge triplet consists of (head, relation, tail) along with their types and properties."
    "\n\nCONTEXT AWARENESS:"
    "\n- Focus on startup-related entities: founders, companies, investors, concepts"
    "\n- Identify key entrepreneurship principles and advice"
    "\n- Capture relationships between people, organizations, and ideas"
    "\n- Extract time-sensitive information when available (founded_date, funding_rounds, etc.)"
    "\n---------------------\n"
    "INITIAL ONTOLOGY:\n"
    "Entity Types: {allowed_entity_types}\n"
    "Entity Properties: {allowed_entity_properties}\n"
    "Relation Types: {allowed_relation_types}\n"
    "Relation Properties: {allowed_relation_properties}\n"
    "\n"
    "Use these types as a starting point, but introduce new types if necessary based on the context.\n"
    "If the Entity Properties, Relation Properties contain property 'description', generate a context aware detailed description, which will have some uniqe non generic information addition"
    "\n"
    "GUIDELINES:\n"
    "- Output in JSON format: [{{'head': '', 'head_type': '', 'head_props': {{...}}, 'relation': '', 'relation_props': {{...}}, 'tail': '', 'tail_type': '', 'tail_props': {{...}}}}]\n"
    "- Use the most complete form for entities (e.g., 'United States of America' instead of 'USA') but where its ambiguous, use the entity as it is\n"
    "- Keep entities concise\n"
    "- While writing description property for entities and relations keep context in mind and just dont write the description of the entity or relation, but the description of the entity or relation in the context of the text\n"
    "- Ensure the knowledge graph is coherent and easily understandable\n"
    "- While extracting relation, use singular form of the relation. Use EXPAND instead of EXPANDS or EXPECT instead of EXPECTS\n"
    "- The goal is to make relations as generics as possible, so that there are less duplicate relations in the graph, which have same meaning\n"
    "- If there are two names present in the text treat them as separate entities. For example Jessica Livingston and Robert Morris then they are two separate entities Jessica Livingston, Robert Morris\n"
    "- Focus on startup-specific metrics and relationships (funding rounds, valuations, mentor relationships)\n"
    "- Capture temporal aspects of relationships when mentioned (founding dates, acquisition dates)\n"
    "- Include relevant contextual properties (industry sector, technology stack, market focus)\n"
    "---------------------\n"
    "EXAMPLE:\n"
    "Text: Tim Cook, CEO of Apple Inc., announced the new Apple Watch that monitors heart health. "
    "UC Berkeley researchers studied the benefits of apples.\n"
    "Output:\n"
    "[{{'head': 'Tim Cook', 'head_type': 'PERSON', 'head_props': {{'description': 'Technology executive who made the product announcement for Apple Watch, demonstrating leadership in health-focused technology initiatives'}}, 'relation': 'CEO_OF', 'relation_props': {{'description': 'Executive leadership role involving product announcements and strategic health technology initiatives'}}, 'tail': 'Apple Inc.', 'tail_type': 'COMPANY', 'tail_props': {{'description': 'Technology company expanding into health monitoring through wearable devices'}}}},\n"
    " {{'head': 'Apple Inc.', 'head_type': 'COMPANY', 'head_props': {{'description': 'Company developing health-focused consumer technology products under Tim Cook's leadership'}}, 'relation': 'PRODUCE', 'relation_props': {{'description': 'Strategic initiative to enter health monitoring market through consumer devices'}}, 'tail': 'Apple Watch', 'tail_type': 'PRODUCT', 'tail_props': {{'description': 'Health-focused smartwatch representing Apple's expansion into medical monitoring technology'}}}},\n"
    " {{'head': 'Apple Watch', 'head_type': 'PRODUCT', 'head_props': {{'description': 'Wearable device specifically designed to track and monitor user health metrics'}}, 'relation': 'MONITOR', 'relation_props': {{'description': 'Continuous health monitoring capability focusing on cardiac metrics'}}, 'tail': 'heart health', 'tail_type': 'HEALTH_METRIC', 'tail_props': {{'description': 'Critical health metric monitored through Apple Watch's advanced sensors'}}}},\n"
    " {{'head': 'UC Berkeley', 'head_type': 'UNIVERSITY', 'head_props': {{'description': 'Academic institution conducting research on nutritional benefits and health impacts'}}, 'relation': 'STUDY', 'relation_props': {{'description': 'Academic research focusing on health benefits and nutritional value analysis'}}, 'tail': 'benefits of apples', 'tail_type': 'RESEARCH_TOPIC', 'tail_props': {{'description': 'Scientific investigation into the health advantages and nutritional properties of apples'}}}}]\n"
    "---------------------\n"
    "Text: {text}\n"
    "Output:\n"
)

In [21]:
from llama_index.core.indices.property_graph import DynamicLLMPathExtractor

dyn_llm_path_extractor = DynamicLLMPathExtractor(
    llm=openai_llm,
    allowed_entity_props=["description"],
    allowed_relation_props=["description"],
    extract_prompt=prompt_template,
)

In [22]:
from llama_index.core import PropertyGraphIndex
# from llama_index.core.indices.property_graph import SimpleLLMPathExtractor

index = PropertyGraphIndex(
    nodes_list[0],
    embed_model=openai_embed_model,
    kg_extractors=[dyn_llm_path_extractor],
    property_graph_store=graph_store,
    show_progress=True,
)

# index = PropertyGraphIndex.from_existing(
#     property_graph_store=graph_store,
#     llm=llm,
#     embed_model=embed_model,
#     kg_extractors=[dyn_llm_path_extractor],
#     show_progress=True,
# )


Extracting and inferring knowledge graph from text:   0%|          | 0/10 [00:00<?, ?it/s]

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:46<00:00, 10.62s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Generating embeddings: 100%|██████████| 17/17 [00:01<00:00,  9.49it/s]


In [46]:
from llama_index.core.indices.property_graph import VectorContextRetriever

vector_retriever = VectorContextRetriever(
    index.property_graph_store,
    # only needed when the graph store doesn't support vector queries
    # vector_store=index.vector_store,
    embed_model=openai_embed_model,
    # include source chunk text with retrieved paths
    include_text=True,
    # the number of nodes to fetch
    similarity_top_k=10,
    # the depth of relations to follow after node retrieval
    path_depth=1,
    # can provide any other kwargs for the VectorStoreQuery class
)


In [25]:
from llama_index.core.response_synthesizers import TreeSummarize

tree_summarize = TreeSummarize(verbose=False)

In [35]:
query_engine = index.as_query_engine(
    include_text=True,
    # sub_retrievers=[vector_retriever],
    similarity_top_k=10,
    path_depth=2,
    response_synthesizer=tree_summarize,
)
response = query_engine.query(
    "Provide all the information available related to Viaweb. Answer in 400 words."
)
print(response)

Viaweb, founded by Robert Morris and another individual (age 29 and 30 respectively at the time of founding), was a startup that eventually became Anomaly.  The founders, despite being relatively older, maintained a frugal lifestyle similar to that of 23-year-olds, possessing minimal assets. This lack of financial burden proved advantageous, allowing for flexibility and a cost-conscious approach to business.  Their pricing strategy reflected this mindset; they offered services at a significantly lower price point than competitors, a daring move that ultimately contributed to their success.  This low-cost approach mirrored the strategy of Apple II, which also benefited from its affordability.

Viaweb successfully raised $2.5 million in funding from angel investors without ever agreeing to vesting. This was unusual for such a large funding amount, as vesting is typically the norm.  The founders' inexperience led to their initial aversion to vesting, a decision that ultimately proved bene

# Insert data in batches of 10 in property graph


In [23]:
index._insert_nodes(nodes_list[1])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [02:30<00:00, 15.04s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
Generating embeddings: 100%|██████████| 15/15 [00:01<00:00,  9.81it/s]


[TextNode(id_='37b09d24-7bc8-4ad6-b025-ab8700ef4826', embedding=[0.0024725589901208878, -0.0019366942578926682, 0.025776643306016922, -0.024811742827296257, -0.01437702588737011, 0.01255060639232397, -0.02239948883652687, -0.0033082321751862764, -0.006292533595114946, -0.01534192729741335, 0.024163879454135895, 0.02703101374208927, 0.004031908232718706, 0.0032927249558269978, 0.01363956555724144, 0.012123293243348598, 0.029360560700297356, -0.014583790674805641, -0.0004111598536837846, -0.024039821699261665, -0.02603854425251484, -0.014611358754336834, -0.010682833380997181, 0.011227313429117203, -0.019973453134298325, 0.017685258761048317, 0.016830632463097572, -0.013239821419119835, 0.006723292637616396, 0.013543075881898403, 0.018622590228915215, 0.011055009439587593, 0.009800638072192669, 0.010324441827833652, -0.03539808467030525, -0.01618276908993721, 0.0033633694984018803, -0.0011664964258670807, 0.016127632930874825, 0.0029308870434761047, 0.023157626390457153, 0.01117906812578

In [19]:
index.insert_nodes(nodes_list[2])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:37<00:00,  9.72s/it]
Generating embeddings: 100%|██████████| 1/1 [00:02<00:00,  2.41s/it]
Generating embeddings: 100%|██████████| 15/15 [00:07<00:00,  2.10it/s]


In [20]:
index.insert_nodes(nodes_list[3])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:19<00:00,  7.99s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]
Generating embeddings: 100%|██████████| 13/13 [00:05<00:00,  2.18it/s]


In [50]:
index.insert_nodes(nodes_list[4])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [04:05<00:00, 24.60s/it]


Error during extraction: Request timed out.


Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.90s/it]
Generating embeddings: 100%|██████████| 13/13 [00:01<00:00,  8.46it/s]


In [51]:
index.insert_nodes(nodes_list[5])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [02:18<00:00, 13.85s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.86s/it]
Generating embeddings: 100%|██████████| 12/12 [00:01<00:00,  8.38it/s]


In [52]:
index.insert_nodes(nodes_list[6])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:30<00:00,  9.06s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]
Generating embeddings: 100%|██████████| 11/11 [00:01<00:00,  7.69it/s]


In [61]:
_nodes = []
for lst in nodes_list:
    for node in lst:
        if node.id_ == "1f6abf7d-ff59-44c7-b6df-830a5d4fa6b1":
            _nodes.append(node)
_nodes

[TextNode(id_='1f6abf7d-ff59-44c7-b6df-830a5d4fa6b1', embedding=[0.015514697879552841, -0.008058945648372173, 0.012239216826856136, -0.05302491411566734, -0.01233741082251072, -0.004187284968793392, -0.006978808436542749, -0.007490821182727814, -0.012793312780559063, -0.02174302004277706, 0.01286345161497593, 0.02344037964940071, -0.0030966270714998245, 0.0009942171163856983, 0.010289358906447887, 0.0033649078104645014, 0.04146604612469673, -0.00658953795209527, -0.009524845518171787, -0.007006864063441753, -0.03605133295059204, -0.012323383241891861, -0.020761078223586082, 0.002212878316640854, -0.018895385786890984, 0.017899416387081146, 0.018825247883796692, 0.003692806698381901, 0.005130651406943798, -0.0013282530708238482, 0.008676166646182537, -0.0002917334786616266, -0.009630054235458374, 0.004997387994080782, -0.033610500395298004, -0.006070510949939489, -0.004134681075811386, 0.017282195389270782, 0.019372330978512764, -0.014616921544075012, 0.025530515238642693, 0.02429607324

In [69]:
index.insert_nodes(_nodes)

Extracting and inferring knowledge graph from text: 100%|██████████| 1/1 [00:57<00:00, 57.20s/it]
Generating embeddings: 0it [00:00, ?it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


In [63]:
index.insert_nodes(nodes_list[7])

Extracting and inferring knowledge graph from text: 100%|██████████| 1/1 [00:31<00:00, 31.57s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.16it/s]


In [64]:
index.insert_nodes(_nodes)

Extracting and inferring knowledge graph from text: 100%|██████████| 1/1 [00:40<00:00, 40.27s/it]
Generating embeddings: 0it [00:00, ?it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


In [87]:
retriever = index.as_retriever(
    include_text=True,
    # sub_retrievers=[vector_retriever],
    similarity_top_k=10,
    path_depth=1,
    # response_synthesizer=tree_summarize,
    hybrid_search=True,
    limit=50,
    include_properties=True
)
res = retriever.retrieve("Tell me all about yahoo")

[NodeWithScore(node=TextNode(id_='73897872-3170-4d1e-ad01-49523d4e04d0', embedding=None, metadata={'file_path': 'D:\\projects\\graphrag-poc\\data\\selection 1\\Hiring is Obsolete (HIVE).txt', 'file_name': 'Hiring is Obsolete (HIVE).txt', 'file_type': 'text/plain', 'file_size': 26756, 'creation_date': '2024-11-26', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0a281480-c379-42da-9f0f-679b8e70841a', node_type='4', metadata={'file_path': 'D:\\projects\\graphrag-poc\\data\\selection 1\\Hiring is Obsolete (HIVE).txt', 'file_name': 'Hiring is Obsolete (HIVE).txt', 'file_type': 'text/plain', 'file_size': 26756, 'creation_date': '2024-11-26', 'last_modified_date': '2024-11-20'

In [91]:
res[0]


NodeWithScore(node=TextNode(id_='73897872-3170-4d1e-ad01-49523d4e04d0', embedding=None, metadata={'file_path': 'D:\\projects\\graphrag-poc\\data\\selection 1\\Hiring is Obsolete (HIVE).txt', 'file_name': 'Hiring is Obsolete (HIVE).txt', 'file_type': 'text/plain', 'file_size': 26756, 'creation_date': '2024-11-26', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0a281480-c379-42da-9f0f-679b8e70841a', node_type='4', metadata={'file_path': 'D:\\projects\\graphrag-poc\\data\\selection 1\\Hiring is Obsolete (HIVE).txt', 'file_name': 'Hiring is Obsolete (HIVE).txt', 'file_type': 'text/plain', 'file_size': 26756, 'creation_date': '2024-11-26', 'last_modified_date': '2024-11-20'}

In [76]:
from llama_index.core.indices.property_graph import (
    LLMSynonymRetriever,
    VectorContextRetriever
)

vector_retriever = VectorContextRetriever(
     index.property_graph_store,
     embed_model=embed_model,
     similarity_top_k=4,
     path_depth=1,
     include_text=True
)

In [85]:
query_engine = index.as_query_engine(
    include_text=True,
    sub_retrievers=[vector_retriever],
    include_properties=True
)
response = query_engine.query(
    "Provide all the information available related to Viaweb. Answer in 400 words."
)
print(response)

The context provided does not contain any information related to Viaweb.


In [None]:
print(repr(response))

In [71]:
nodes = retriever.retrieve("Tell me all about yahoo")