In [1]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from dotenv import load_dotenv
import os

load_dotenv()

from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding


In [17]:
llm = Gemini(
    model="models/gemini-1.5-flash-latest",
    temperature=0.0,
)

In [18]:
# embed_model = AzureOpenAIEmbedding(
#     model="text-embedding-ada-002",
#     # deployment_name="my-custom-embedding",
#     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
#     azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
#     api_version=os.getenv("OPENAI_API_VERSION"),
# )
embed_model = GeminiEmbedding()
embeddings = embed_model.get_text_embedding("Hello, world!")

In [19]:
len(embeddings)

768

In [20]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [21]:
import nest_asyncio

nest_asyncio.apply()

In [22]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    "D:\projects\graphrag-poc\data\selection 1"
).load_data()

In [23]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(documents)
len(nodes)
# create a list of lists each of 10 nodes
nodes_list = [nodes[i : i + 10] for i in range(0, len(nodes), 10)]
print(len(nodes_list))

7


In [24]:
DYNAMIC_EXTRACT_PROPS_TMPL = (
    "The Text I will provide you is from Paul Grahms data. It is about founders, startups and to guide students."
    "Extract up to {max_knowledge_triplets} knowledge triplets from the given text. "
    "Each triplet should be in the form of (head, relation, tail) with their respective types and properties.\n"
    "While extracting the triplets keep in mind the context of the data, so foucs on key phrases, names of companies or people, critical concepts or some specific phenomenon, using this extracted information to build a property graph"
    "---------------------\n"
    "INITIAL ONTOLOGY:\n"
    "Entity Types: {allowed_entity_types}\n"
    "Entity Properties: {allowed_entity_properties}\n"
    "Relation Types: {allowed_relation_types}\n"
    "Relation Properties: {allowed_relation_properties}\n"
    "\n"
    "Use these types as a starting point, but introduce new types if necessary based on the context.\n"
    "If the Entity Properties, Relation Properties contain property 'description', generate a context aware detailed description, which will have some uniqe non generic information addition"
    "\n"
    "GUIDELINES:\n"
    "- Output in JSON format: [{{'head': '', 'head_type': '', 'head_props': {{...}}, 'relation': '', 'relation_props': {{...}}, 'tail': '', 'tail_type': '', 'tail_props': {{...}}}}]\n"
    "- Use the most complete form for entities (e.g., 'United States of America' instead of 'USA') but where its ambiguous, use the entity as it is\n"
    "- Keep entities concise\n"
    "- While writing description for entities and relations keep context in mind and just dont write the description of the entity or relation, but the description of the entity or relation in the context of the text\n"
    "- Ensure the knowledge graph is coherent and easily understandable\n"
    "- While extracting relation, use singular form of the relation. Use EXPAND instead of EXPANDS or EXPECT instead of EXPECTS\n"
    "- The goal is to make relations as generics as possible, so that there are less duplicate relations in the graph, which have same meaning\n"
    "- If there are two names present in the text treat them as separate entities. For example Jessica Livingston and Robert Morris then they are two separate entities Jessica Livingston, Robert Morris\n"
    "---------------------\n"
    "EXAMPLE:\n"
    "Text: Tim Cook, CEO of Apple Inc., announced the new Apple Watch that monitors heart health. "
    "UC Berkeley researchers studied the benefits of apples.\n"
    "Output:\n"
    "[{{'head': 'Tim Cook', 'head_type': 'PERSON', 'head_props': {{'prop1': 'val', ...}}, 'relation': 'CEO_OF', 'relation_props': {{'prop1': 'val', ...}}, 'tail': 'Apple Inc.', 'tail_type': 'COMPANY', 'tail_props': {{'prop1': 'val', ...}}}},\n"
    " {{'head': 'Apple Inc.', 'head_type': 'COMPANY', 'head_props': {{'prop1': 'val', ...}}, 'relation': 'PRODUCES', 'relation_props': {{'prop1': 'val', ...}}, 'tail': 'Apple Watch', 'tail_type': 'PRODUCT', 'tail_props': {{'prop1': 'val', ...}}}},\n"
    " {{'head': 'Apple Watch', 'head_type': 'PRODUCT', 'head_props': {{'prop1': 'val', ...}}, 'relation': 'MONITORS', 'relation_props': {{'prop1': 'val', ...}}, 'tail': 'heart health', 'tail_type': 'HEALTH_METRIC', 'tail_props': {{'prop1': 'val', ...}}}},\n"
    " {{'head': 'UC Berkeley', 'head_type': 'UNIVERSITY', 'head_props': {{'prop1': 'val', ...}}, 'relation': 'STUDIES', 'relation_props': {{'prop1': 'val', ...}}, 'tail': 'benefits of apples', 'tail_type': 'RESEARCH_TOPIC', 'tail_props': {{'prop1': 'val', ...}}}}]\n"
    "---------------------\n"
    "Text: {text}\n"
    "Output:\n"
)

In [25]:
from llama_index.core.indices.property_graph import DynamicLLMPathExtractor

dyn_llm_path_extractor = DynamicLLMPathExtractor(
    llm=llm,
    allowed_entity_props=["description"],
    allowed_relation_props=["description"],
    max_triplets_per_chunk=20,
    extract_prompt=DYNAMIC_EXTRACT_PROPS_TMPL,
)

In [26]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

dyn_llm_path_extractor_graphstore = Neo4jPropertyGraphStore(
    username="neo4j",
    password="12345678",
    url="bolt://localhost:7687",
    database="gemini-graphrag-2",
)

In [38]:
from llama_index.core import PropertyGraphIndex
# from llama_index.core.indices.property_graph import SimpleLLMPathExtractor

index = PropertyGraphIndex(
    nodes_list[0],
    embed_model=embed_model,
    kg_extractors=[dyn_llm_path_extractor],
    property_graph_store=dyn_llm_path_extractor_graphstore,
    show_progress=True,
)

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:49<00:00, 10.92s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Generating embeddings: 100%|██████████| 20/20 [00:01<00:00, 15.10it/s]


In [39]:
index._insert_nodes(nodes_list[1])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:44<00:00, 10.49s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Generating embeddings: 100%|██████████| 12/12 [00:01<00:00,  7.96it/s]


[TextNode(id_='e0da2668-7218-4d64-a10b-4ff429341758', embedding=[0.016896322, -0.04348157, -0.049052905, -0.011544441, 0.058382783, -0.005177583, -0.014994866, 0.003281688, 0.03322994, 0.025783613, 0.04153282, 0.015322062, -0.05003815, -0.018901804, 0.04449902, -0.035513714, 0.0059574163, 0.015325581, -0.011737793, -0.04146679, -0.017107988, 0.016775083, -0.0059961723, -0.03128142, -0.015596254, 0.037463322, 0.05432311, -0.080541514, -0.016760191, 0.024911005, -0.051147852, 0.039301097, -0.035613798, -0.0027679328, -0.021885745, -0.000989414, -0.004323101, -0.0067633977, 0.015445196, 0.0058474815, -0.014131949, -0.020861173, -0.009358527, 0.024507003, 0.019734712, 0.010008172, 0.004156611, 0.05279857, 0.043534998, -0.039742187, 0.00066099263, 0.019891877, 0.051824667, 0.003957737, 0.023269001, 0.0043046894, 0.052662, -0.051258426, -0.018617606, 0.027547676, 0.004161279, 0.0007795545, 0.0121578, -0.009389171, -0.012984681, -0.07004128, -0.07630766, 0.022363644, 0.064806536, 0.030534811,

In [40]:
index.insert_nodes(nodes_list[2])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:56<00:00, 11.64s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Generating embeddings: 100%|██████████| 14/14 [00:02<00:00,  6.22it/s]


In [41]:
index.insert_nodes(nodes_list[3])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:49<00:00, 10.93s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  7.86it/s]


In [42]:
index.insert_nodes(nodes_list[4])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:44<00:00, 10.45s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Generating embeddings: 100%|██████████| 14/14 [00:01<00:00,  9.20it/s]


In [43]:
index.insert_nodes(nodes_list[5])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:53<00:00, 11.38s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Generating embeddings: 100%|██████████| 13/13 [00:01<00:00, 10.62it/s]


In [44]:
index.insert_nodes(nodes_list[6])

Extracting and inferring knowledge graph from text: 100%|██████████| 10/10 [01:41<00:00, 10.15s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Generating embeddings: 100%|██████████| 11/11 [00:01<00:00,  6.19it/s]


In [37]:
query_engine = index.as_query_engine()
query_engine.query("Tell me about startups")

Response(response='Startups are judged by their users and may be limited by market size.  They work on technology, thrive in startup hubs, and are designed for growth.  They are characterized by low costs, blurred work-life boundaries, and frequent failures.  Startups pose financial risks, require funding and intense work, and involve dealing with challenges. They hire people, launch during YC, and may transition to VC firms.  Their outcomes range from wealth to failure.  They are not designed for adversity, lack office hours, and often grow out of schools.  Many are located in the Bay Area, Boston, or Seattle, and may fail within a year.  They need someone focused on customer needs and a large market to succeed by creating wealth.  They offer solutions to financial problems and may affect academic career prospects.  They provide time efficiency.  Successful startups are usually started by multiple people and may receive acquisition offers; they may also experience near-financial ruin 

In [26]:
dyn_llm_path_extractor_graphstore.structured_query("""
CREATE VECTOR INDEX entity IF NOT EXISTS
FOR (m:`__Entity__`)`
ON m.embedding
OPTIONS {indexConfig: {
 `vector.dimensions`: 1536,
 `vector.similarity_function`: 'cosine'
}}
""")

[]

In [27]:
similarity_threshold = 0.9
word_edit_distance = 5
data = dyn_llm_path_extractor_graphstore.structured_query("""
MATCH (e:__Entity__)
CALL {
  WITH e
  CALL db.index.vector.queryNodes('entity', 10, e.embedding)
  YIELD node, score
  WITH node, score
  WHERE score > toFLoat($cutoff)
      AND (toLower(node.name) CONTAINS toLower(e.name) OR toLower(e.name) CONTAINS toLower(node.name)
           OR apoc.text.distance(toLower(node.name), toLower(e.name)) < $distance)
      AND labels(e) = labels(node)
  WITH node, score
  ORDER BY node.name
  RETURN collect(node) AS nodes
}
WITH distinct nodes
WHERE size(nodes) > 1
WITH collect([n in nodes | n.name]) AS results
UNWIND range(0, size(results)-1, 1) as index
WITH results, index, results[index] as result
WITH apoc.coll.sort(reduce(acc = result, index2 IN range(0, size(results)-1, 1) |
        CASE WHEN index <> index2 AND
            size(apoc.coll.intersection(acc, results[index2])) > 0
            THEN apoc.coll.union(acc, results[index2])
            ELSE acc
        END
)) as combinedResult
WITH distinct(combinedResult) as combinedResult
// extra filtering
WITH collect(combinedResult) as allCombinedResults
UNWIND range(0, size(allCombinedResults)-1, 1) as combinedResultIndex
WITH allCombinedResults[combinedResultIndex] as combinedResult, combinedResultIndex, allCombinedResults
WHERE NOT any(x IN range(0,size(allCombinedResults)-1,1) 
    WHERE x <> combinedResultIndex
    AND apoc.coll.containsAll(allCombinedResults[x], combinedResult)
)
RETURN combinedResult  
""", param_map={'cutoff': similarity_threshold, 'distance': word_edit_distance})
for row in data:
    print(row)



{'combinedResult': ['Undergrads', 'Undergraduates', 'undergraduates']}
{'combinedResult': ['Starting a Startup', 'Starting a Startup at 30', 'Starting a startup', 'starting startups', 'starting startups while in college']}
{'combinedResult': ['Startup founders', 'Successful Startup Founders']}
{'combinedResult': ['Jessica Livingston', 'Jessica Livingston and Robert Morris', 'Robert Morris']}
{'combinedResult': ['Steve Wozniak', 'Woz']}
{'combinedResult': ['Kiko', 'Xerox']}
{'combinedResult': ['Startups', 'Successful startups', 'due to startups']}
{'combinedResult': ['Good Colleges', 'college', 'colleges']}
{'combinedResult': ['Students', 'college students', 'grad students', 'high school students']}
{'combinedResult': ['Cofounders', 'Founders', 'Startup founder', 'Stripe founders', 'Twitch founders', 'VCs, Founders, Independent Member', 'aspiring startup founders', 'cofounder', 'founders', 'potential cofounders', 'startup founders', 'successful startup founders']}
{'combinedResult': ['2