In [1]:
%load_ext autoreload
%autoreload 2

## Spin up a nebula graph instance

In the command line, type in:
```
mkdir nebula-docker-compose
cd nebula-docker-compose
curl --output docker-compose.yaml https://raw.githubusercontent.com/vesoft-inc/nebula-docker-compose/master/docker-compose-lite.yaml
docker compose up
```

In [2]:
# load NebulaGraph Jupyter extension to enable %ngql magic
%load_ext ngql
# connect to NebulaGraph service
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula
# create a graph space(think of a Database Instance) named: llamaindex_nebula_property_graph
%ngql CREATE SPACE IF NOT EXISTS llamaindex_nebula_property_graph(vid_type=FIXED_STRING(256));

[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


In [3]:
# use the graph space, which is similar to "use database" in MySQL
# The space was created in async way, so we need to wait for a while before using it, retry it if failed
%ngql USE llamaindex_nebula_property_graph;

In [4]:
import os
from dotenv import load_dotenv, find_dotenv
import nest_asyncio
import warnings

_ = load_dotenv(find_dotenv())
nest_asyncio.apply()
warnings.filterwarnings('ignore')

In [5]:
from llama_parse import LlamaParse

docs = LlamaParse(result_type="text").load_data("../../resume/resume.pdf")

Started parsing the file under job_id 86ebdee5-60dc-42a9-9130-c91e00ecee66


## Setup Nebula graph store

In [6]:
from llama_index.graph_stores.nebula import (
    NebulaPropertyGraphStore
)

graph_store = NebulaPropertyGraphStore(
    space="llamaindex_nebula_property_graph", overwrite=True
)

## Setup Qdrant Vector Store
In the terminal, type:
```
docker run -p 6333:6333 -p 6334:6334 \
    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
    qdrant/qdrant
```

In [7]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient

client = QdrantClient(host="localhost", port=6333)
aclient = AsyncQdrantClient(host="localhost", port=6333)

# delete collection if it exists
if client.collection_exists("Titus_Resume"):
    client.delete_collection("Titus_Resume")

vector_store = QdrantVectorStore(
    "Titus_Resume",
    client = client,
    aclient = aclient,
    fastembed_sparse_model="Qdrant/bm42-all-minilm-l6-v2-attentions",
)

Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

## Construct Property Graph

In [8]:
from llama_index.core import Settings, PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI("gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding()

In [9]:
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor
)

kg_extractors=[
    ImplicitPathExtractor(),
    SimpleLLMPathExtractor(
        llm=Settings.llm,
        num_workers=4,
        max_paths_per_chunk=10,
    ),
]

In [10]:
index = PropertyGraphIndex.from_documents(
    docs,
    kg_extractors = kg_extractors,
    property_graph_store=graph_store,
    vector_store = vector_store,
    embed_kg_nodes = True,
    show_progress = True,
)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting implicit paths: 100%|██████████| 3/3 [00:00<00:00, 55676.60it/s]
Extracting paths from text: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it]


In [11]:
%ngql SHOW TAGS

Unnamed: 0,Name
0,Chunk__
1,Entity__
2,Node__
3,Props__


In [12]:
%ngql SHOW EDGES

Unnamed: 0,Name
0,Relation__
1,__meta__node_label__
2,__meta__rel_label__


In [25]:
%ngql MATCH p=(v:Entity__)-[r]->(t:Entity__) RETURN v.Entity__.name AS src, r.label AS relation, t.Entity__.name AS dest

Unnamed: 0,src,relation,dest
0,Vader,Used for,Sentiment analysis
1,Lim hsien yong,Developed,Algorithms for optical character recognition
2,Lim hsien yong,Prototyped,Instance segmentation model
3,Lim hsien yong,Built,Knowledge graph web application
4,Lim hsien yong,Developed,Llm workshops
5,Lim hsien yong,Graduated from,National university of singapore
6,Lim hsien yong,Awarded,Nus undergraduate scholarship
7,Lim hsien yong,Is,Senior data scientist
8,Lim hsien yong,Graduated from,Singapore management university
9,Lim hsien yong,Awarded,Smu ai talent development grant


In [14]:
%ngql MATCH p=(v:Entity__)-[r]->(t:Entity__) RETURN p LIMIT 2;

Unnamed: 0,p
0,"(""Vader"" :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""88a14516-a093-435a-84b5-cf60eb416591""} :Node__{label: ""entity""} :Entity__{name: ""Vader""})-[:Relation__@0{label: ""Used for"", file_path: __NULL__, file_name: __NULL__, file_type: __NULL__, file_size: __NULL__, _node_type: __NULL__, creation_date: __NULL__, document_id: __NULL__, last_modified_date: __NULL__, doc_id: __NULL__, _node_content: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""88a14516-a093-435a-84b5-cf60eb416591""}]->(""Sentiment analysis"" :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""88a14516-a093-435a-84b5-cf60eb416591""} :Node__{label: ""entity""} :Entity__{name: ""Sentiment analysis""})"
1,"(""Lim hsien yong"" :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""29458f57-b4e4-43b3-b487-96df2e18bfb0""} :Node__{label: ""entity""} :Entity__{name: ""Lim hsien yong""})-[:Relation__@0{label: ""Developed"", file_path: __NULL__, file_name: __NULL__, file_type: __NULL__, file_size: __NULL__, _node_type: __NULL__, creation_date: __NULL__, document_id: __NULL__, last_modified_date: __NULL__, doc_id: __NULL__, _node_content: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""29458f57-b4e4-43b3-b487-96df2e18bfb0""}]->(""Algorithms for optical character recognition"" :Props__{_node_content: __NULL__, _node_type: __NULL__, creation_date: __NULL__, doc_id: __NULL__, document_id: __NULL__, file_name: __NULL__, file_path: __NULL__, file_size: __NULL__, file_type: __NULL__, last_modified_date: __NULL__, ref_doc_id: __NULL__, triplet_source_id: ""29458f57-b4e4-43b3-b487-96df2e18bfb0""} :Node__{label: ""entity""} :Entity__{name: ""Algorithms for optical character recognition""})"


In [15]:
%ng_draw

[1;3;38;2;249;93;106m[ERROR]: No valid %ngql query result available. 
Please execute a valid query before using %ng_draw. 
Or pass a query as an argument to %ng_draw or %%ng_draw(multiline).[0m


''

In [16]:
from IPython.display import display, Markdown

### Test property graph index

In [17]:
query_engine = index.as_query_engine(include_text=True)

response = query_engine.query("What is Titus' current vocation?")

display(Markdown(str(response)))

Titus is currently working as a Senior Data Scientist at Illumina in Singapore.

## Define Knowledge Graph Retriever

In [18]:
from llama_index.core.indices.property_graph import (
    VectorContextRetriever
)

kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    similarity_top_k=2,
    path_depth=1,
    # include_text=False,
    include_text=True,
)

In [27]:
kg_retriever.retrieve("What is Hsien Yong's current job title?")

[]

## Define Vector Retriever

In [21]:
from llama_index.core import VectorStoreIndex

vector_index = VectorStoreIndex.from_vector_store(
    index.vector_store
)
vector_retriever = vector_index.as_retriever(similarity_top_k=2, sparse_top_k=4)

## Define custom retriever

In [22]:
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore
from typing import List

class FusionRetriever(BaseRetriever):
    """Custom retriever that performs both KG vector search and direct vector search."""
    
    def __init__(self, kg_retriever, vector_retriever):
        self._kg_retriever = kg_retriever
        self._vector_retriever = vector_retriever

    def _retrieve(self, query_bundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""
        kg_nodes = self._kg_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        unique_nodes = {n.node_id: n for n in kg_nodes}
        unique_nodes.update({n.node_id: n for n in vector_nodes})
        return list(unique_nodes.values())

In [23]:
fusion_retriever = FusionRetriever(kg_retriever, vector_retriever)