In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [2]:
# Directory holding the indexer's parquet artifacts. Set the GRAPHRAG_INPUT_DIR
# environment variable to point the notebook at a different indexing run; the
# fallback is the path this notebook was originally executed against.
INPUT_DIR = os.environ.get(
    "GRAPHRAG_INPUT_DIR",
    "/home/tjustin/ragtest/output/20240806-152635/artifacts",
)
# LanceDB location used for the entity description embedding store.
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# File stems of the parquet tables read from INPUT_DIR ("<stem>.parquet").
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"  # only referenced by commented-out code below
TEXT_UNIT_TABLE = "create_final_text_units"
# Community level passed to read_indexer_entities and read_indexer_reports.
COMMUNITY_LEVEL = 2

In [3]:
# Load the node/entity table from the indexer artifacts; later cells feed it to
# the graph widget, read_indexer_entities and read_indexer_reports.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")

# Last expression of the cell: shows the dataframe as the cell output.
entity_df

Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,top_level_node_id,x,y
0,0,GEORGE GOH,PERSON,"George Goh, a businessman and the founder of H...","226999648403d7b861f157424ef9d0dd,38b9800c055f3...",1,17,0,b45241d70f0e43fca764df95b2b81f77,17,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,HARVEY NORMAN OSSIA,ORGANIZATION,Harvey Norman Ossia is a company founded by Ge...,"b82fff22533b0da06c9aac9c29dd6dd5,c8876e6c55219...",1,3,1,4119fd06010c494caa07f439b333f4c5,3,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,SINGAPORE,LOCATION,"Singapore is a country where various events, d...","181424ccca876c684d29b92130e202e5,2a76cde17fc6d...",10,19,2,d3835bf3dda84ead99deadbeac5d0d7d,19,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,THARMAN SHANMUGARATNAM,PERSON,Tharman Shanmugaratnam is a former Senior Mini...,"226999648403d7b861f157424ef9d0dd,295c7a9c9182e...",7,19,3,077d2820ae1845bcbb1803379a3d1eae,19,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,INDEPENDENT CANDIDATE,GROUP,"The ""INDEPENDENT CANDIDATE"" is an individual r...","0d9dfafad762b3663c21595e561fbd5e,c8876e6c55219...",1,2,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,2,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,2,ENTITIES,GROUP,Entities identified in the text document,33aba9b9b3839b41e12291338e195321,,1,471,1f7b02bf486e4f42b23e9cb1a63207f3,1,,1f7b02bf486e4f42b23e9cb1a63207f3,0,0
1424,2,RELATIONSHIPS,GROUP,Relationships among the identified entities,33aba9b9b3839b41e12291338e195321,,1,472,e744c118ae7f4638a01d060bbaedd6e9,1,,e744c118ae7f4638a01d060bbaedd6e9,0,0
1425,2,TIMESTAMP,ATTRIBUTE,Timestamp indicating the date and time of the ...,33aba9b9b3839b41e12291338e195321,,1,473,e1c1080c717d437996def1a41772d179,1,,e1c1080c717d437996def1a41772d179,0,0
1426,2,CNA_3734106_IMG3,ENTITY,Specific identifier within the text document,33aba9b9b3839b41e12291338e195321,,1,474,63fba9a7c47a4f14ac0bee6bc90d0fea,1,,63fba9a7c47a4f14ac0bee6bc90d0fea,0,0


#### Read entity embeddings

In [4]:
# Load the entity table that carries the `description_embedding` column; it is
# combined with entity_df by read_indexer_entities in the setup cell.
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

# Last expression of the cell: shows the dataframe as the cell output.
entity_embedding_df

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,GEORGE GOH,PERSON,"George Goh, a businessman and the founder of H...",0,,"[226999648403d7b861f157424ef9d0dd, 38b9800c055...","[-0.017695769667625427, 0.017364203929901123, ..."
1,4119fd06010c494caa07f439b333f4c5,HARVEY NORMAN OSSIA,ORGANIZATION,Harvey Norman Ossia is a company founded by Ge...,1,,"[b82fff22533b0da06c9aac9c29dd6dd5, c8876e6c552...","[-0.009658072143793106, 0.036502767354249954, ..."
2,d3835bf3dda84ead99deadbeac5d0d7d,SINGAPORE,LOCATION,"Singapore is a country where various events, d...",2,,"[181424ccca876c684d29b92130e202e5, 2a76cde17fc...","[-0.0038641251157969236, 0.0012783686397597194..."
3,077d2820ae1845bcbb1803379a3d1eae,THARMAN SHANMUGARATNAM,PERSON,Tharman Shanmugaratnam is a former Senior Mini...,3,,"[226999648403d7b861f157424ef9d0dd, 295c7a9c918...","[-0.020518304780125618, -0.021766113117337227,..."
4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,INDEPENDENT CANDIDATE,GROUP,"The ""INDEPENDENT CANDIDATE"" is an individual r...",4,,"[0d9dfafad762b3663c21595e561fbd5e, c8876e6c552...","[0.009030026383697987, -0.021186936646699905, ..."
...,...,...,...,...,...,...,...,...
233,1f7b02bf486e4f42b23e9cb1a63207f3,ENTITIES,GROUP,Entities identified in the text document,471,,[33aba9b9b3839b41e12291338e195321],"[-0.018409516662359238, 0.04041050374507904, 0..."
234,e744c118ae7f4638a01d060bbaedd6e9,RELATIONSHIPS,GROUP,Relationships among the identified entities,472,,[33aba9b9b3839b41e12291338e195321],"[-0.005274868570268154, -0.013643065467476845,..."
235,e1c1080c717d437996def1a41772d179,TIMESTAMP,ATTRIBUTE,Timestamp indicating the date and time of the ...,473,,[33aba9b9b3839b41e12291338e195321],"[-0.018489323556423187, 0.02154754474759102, 0..."
236,63fba9a7c47a4f14ac0bee6bc90d0fea,CNA_3734106_IMG3,ENTITY,Specific identifier within the text document,474,,[33aba9b9b3839b41e12291338e195321],"[0.009851847775280476, 0.024521438404917717, 0..."


#### Read relationships

In [5]:
# Load the relationship table and convert it into Relationship objects.
# relationship_df feeds the graph widget; relationships feeds the context builder.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

# Last expression of the cell: shows the list of Relationship objects.
relationships

[Relationship(id='1cce5cebf437428eb1a60dffbdfa603f', short_id='0', source='GEORGE GOH', target='HARVEY NORMAN OSSIA', weight=1.0, description='George Goh is the founder of Harvey Norman Ossia', description_embedding=None, text_unit_ids=['c8876e6c55219846d1bd69e2de1dbba6'], document_ids=None, attributes={'rank': 20}),
 Relationship(id='dc94039d6643460ca3c66150b9087129', short_id='1', source='GEORGE GOH', target='SINGAPORE', weight=1.0, description='George Goh is running for the Singapore presidency', description_embedding=None, text_unit_ids=['c8876e6c55219846d1bd69e2de1dbba6'], document_ids=None, attributes={'rank': 36}),
 Relationship(id='f197d75f159943f8a3ff441199790bc7', short_id='2', source='GEORGE GOH', target='INDEPENDENT CANDIDATE', weight=1.0, description='George Goh is contesting as an independent candidate', description_embedding=None, text_unit_ids=['c8876e6c55219846d1bd69e2de1dbba6'], document_ids=None, attributes={'rank': 19}),
 Relationship(id='4d8890c699684c9381105b03b0b

In [6]:
# %pip install yfiles_jupyter_graphs --quiet
from yfiles_jupyter_graphs import GraphWidget


# converts the entities dataframe to a list of dicts for yfiles-jupyter-graphs
def convert_entities_to_dicts(df):
    """Turn the entities dataframe into node dicts for yfiles-jupyter-graphs.

    Each node is ``{"id": <title>, "properties": <row as dict>}``. When several
    rows share the same ``title`` only the first one is kept; nodes are returned
    in order of first appearance.
    """
    unique_nodes = {}
    for _, entity in df.iterrows():
        title = entity["title"]
        if title in unique_nodes:
            # A node for this title already exists; later duplicates are ignored.
            continue
        unique_nodes[title] = {"id": title, "properties": entity.to_dict()}
    return [*unique_nodes.values()]


# converts the relationships dataframe to a list of dicts for yfiles-jupyter-graphs
def convert_relationships_to_dicts(df):
    """Turn the relationships dataframe into edge dicts for yfiles-jupyter-graphs.

    Every row yields one edge ``{"start": <source>, "end": <target>,
    "properties": <row as dict>}``; rows are emitted in dataframe order and
    duplicates are kept.
    """
    return [
        {
            "start": relation["source"],
            "end": relation["target"],
            "properties": relation.to_dict(),
        }
        for _, relation in df.iterrows()
    ]


# Build the graph widget from the full entity and relationship tables.
# `w` stays at module level: the mapping cell below and
# edge_to_source_community read it.
w = GraphWidget()
w.directed = True
w.nodes = convert_entities_to_dicts(entity_df)
w.edges = convert_relationships_to_dicts(relationship_df)

In [7]:
# Show the title on the node.
# NOTE(review): presumably the string names the key in each node's "properties"
# dict (built from the dataframe row) — confirm against the widget docs.
w.node_label_mapping = "title"


# map community to a color
def community_to_color(community):
    """Map a community to a color.

    Args:
        community: Community identifier; anything ``int()`` accepts (an int or
            a numeric string). Missing values (``None``, NaN) are allowed.

    Returns:
        A color name from a fixed seven-entry palette, selected by
        ``int(community) % 7``, or ``"lightgray"`` when the community is
        missing or cannot be converted to an integer.
    """
    colors = [
        "crimson",
        "darkorange",
        "indigo",
        "cornflowerblue",
        "cyan",
        "teal",
        "green",
    ]
    try:
        index = int(community)
    except (TypeError, ValueError, OverflowError):
        # None raises TypeError, NaN or non-numeric text raises ValueError and
        # infinity raises OverflowError. Missing values loaded through pandas
        # arrive as NaN rather than None, so all of them get the neutral color.
        return "lightgray"
    return colors[index % len(colors)]


def edge_to_source_community(edge, nodes=None):
    """Get the community of the source node of an edge.

    Args:
        edge: Edge dict whose ``"start"`` value is the title of the source node.
        nodes: Optional list of node dicts to search. Defaults to the nodes of
            the module-level widget ``w``.

    Returns:
        The ``"community"`` property of the first node whose title equals
        ``edge["start"]``, or ``None`` when no such node exists or the node has
        no community.
    """
    if nodes is None:
        nodes = w.nodes
    source_node = next(
        (entry for entry in nodes if entry["properties"]["title"] == edge["start"]),
        None,
    )
    if source_node is None:
        # The edge points at a title that is not among the nodes; report
        # "no community" instead of failing on a None subscript.
        return None
    return source_node["properties"].get("community")


# Color each node by its own community and each edge by the community of its
# source node.
w.node_color_mapping = lambda node: community_to_color(node["properties"]["community"])
w.edge_color_mapping = lambda edge: community_to_color(edge_to_source_community(edge))
# map size data to a reasonable factor
# (linear in "size": 0.5 at size 0, 2.0 at size 20)
w.node_scale_factor_mapping = lambda node: 0.5 + node["properties"]["size"] * 1.5 / 20
# use weight for edge thickness
w.edge_thickness_factor_mapping = "weight"

In [8]:
# Use the circular layout for this visualization. For larger graphs, the default organic layout is often preferable.
w.circular_layout()

In [9]:
# display(w)

### Visualizing the result context of `graphrag` queries



In [10]:
# setup (see also ../../local_search.ipynb)
# Builds everything the local search needs, in dependency order:
# knowledge-model objects -> vector store -> LLM / embedder -> context builder -> search engine.

# Entities combine the node table with the embedding table at COMMUNITY_LEVEL.
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Vector store for entity description embeddings, kept in LanceDB at LANCEDB_URI.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
# The return value is not used again in this notebook; the context builder
# below receives description_embedding_store directly.
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)
# Covariates (claims) are disabled in this run; uncomment these lines and the
# `covariates=` argument below to include them.
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
# claims = read_indexer_covariates(covariate_df)
# covariates = {"claims": claims}
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# NOTE(review): the key is an empty string here — presumably removed before
# sharing. Prefer reading it from the environment over editing it into the notebook.
api_key = ""
# NOTE(review): "llama3" is used with OpenaiApiType.OpenAI and no api_base is
# passed to ChatOpenAI — confirm which endpoint this is expected to reach.
llm_model = "llama3"
embedding_model = "text-embedding-3-small"

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# The same encoder is shared by the context builder and the search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

# Passed to LocalSearch as context_builder_params.
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

# Passed to LocalSearch as llm_params.
llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

## Run local search on sample queries

In [11]:
# Run one query through the local search engine. The top-level `await` works
# because the notebook cell runs inside an already running event loop.
result = await search_engine.asearch("Who are the candidates for the 2023 presidential elections")
# Print only the generated answer; the context it was built from is inspected below.
print(result.response)

In the upcoming 2023 Presidential Election in Singapore, several candidates have emerged to compete for the position of President. Among the notable candidates are Mr. Ng Kok Song, Mr. Tan Kin Lian, Mr. Tharman Shanmugaratnam, and Mr. George Goh [Data: Entities (105, 15, 17, 48)]. Each candidate brings a unique background and perspective to the election, shaping the dynamics of the political landscape leading up to the election.

Mr. Ng Kok Song, a former GIC investment officer, has actively participated in the election campaign, emphasizing his commitment to divesting his stake in Avanda Investment Management to ensure his independence if elected [Data: Relationships (324, 316, 320); Sources (38)]. On the other hand, Mr. Tan Kin Lian has faced scrutiny over his social media posts, sparking debates on the acceptability of his behavior as a presidential candidate [Data: Sources (5, 14, 89)]. Additionally, Mr. Tharman Shanmugaratnam, a former Senior Minister, has positioned himself as a 

## Inspecting the context data used to generate the response

In [12]:
# First rows of the entities table that was part of the search context.
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,71,PRESIDENTIAL CANDIDATES,Presidential candidates are individuals runnin...,2,True
1,92,CANDIDATES,"The ""Candidates"" are individuals who are activ...",7,True
2,33,PE 2023,PE 2023 refers to the upcoming Presidential El...,8,True
3,461,PRESIDENTIAL FORUM,Presidential forum is an event where candidate...,1,True
4,101,APPLICANT,Applicant refers to individuals applying for c...,5,True


In [13]:
# First rows of the relationships table that was part of the search context.
result.context_data["relationships"].head()
# print(len(result.context_data["relationships"]))

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,86,MR GOH,NG KOK SONG,Both Mr Goh and Ng Kok Song are candidates and...,2.0,59,2,True
1,167,PE 2023,NG KOK SONG,Ng Kok Song is one of the candidates for the P...,2.0,41,2,True
2,82,MR GOH,MR TAN,Mr Goh and Mr Tan are both candidates and pres...,2.0,40,1,True
3,165,PE 2023,CANDIDATES,Candidates are participating in the Presidenti...,2.0,15,1,True
4,162,CERTIFICATE OF ELIGIBILITY,NG KOK SONG,Ng Kok Song mentions his application for a cer...,1.0,49,5,True


In [14]:
# First rows of the community reports table that was part of the search context.
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,15,Mr. Ng and the Presidential Election Community,# Mr. Ng and the Presidential Election Communi...


In [15]:
# First rows of the sources table (`id` and `text` columns) from the search context.
result.context_data["sources"].head()

Unnamed: 0,id,text
0,84,"have time to argue about that"" and that he wo..."
1,10,cent of the voters.\n\nHer contacts on Linked...
2,17,"applications on Thursday evening, ELD said th..."
3,21,5526_img1\n
4,15,_img1\n


## Visualizing the result context as graph

In [16]:
"""
Helper function to visualize the result context with `yfiles-jupyter-graphs`.

The dataframes are converted into supported nodes and relationships lists and then passed to yfiles-jupyter-graphs.
Additionally, some values are mapped to visualization properties.
"""
def show_graph(result):
    """Display the entities and relationships of a search result as a graph.

    Args:
        result: Search result whose ``context_data`` contains an "entities" and
            a "relationships" dataframe.

    Raises:
        ValueError: If ``context_data`` lacks "entities" or "relationships".
    """
    from yfiles_jupyter_graphs import GraphWidget

    context = result.context_data
    if not ("entities" in context and "relationships" in context):
        msg = "The passed results do not contain 'entities' or 'relationships'"
        raise ValueError(msg)

    def convert_entities_to_dicts(df):
        """Build one node dict per distinct "entity" value (first row wins)."""
        unique_nodes = {}
        for _, entity in df.iterrows():
            name = entity["entity"]
            if name in unique_nodes:
                # Already have a node for this entity; skip the duplicate row.
                continue
            unique_nodes[name] = {"id": name, "properties": entity.to_dict()}
        return [*unique_nodes.values()]

    def convert_relationships_to_dicts(df):
        """Build one edge dict per relationship row, from "source" to "target"."""
        return [
            {
                "start": relation["source"],
                "end": relation["target"],
                "properties": relation.to_dict(),
            }
            for _, relation in df.iterrows()
        ]

    graph = GraphWidget()
    # Feed the converted context tables into the widget.
    graph.nodes = convert_entities_to_dicts(context["entities"])
    graph.edges = convert_relationships_to_dicts(context["relationships"])
    graph.directed = True
    # Label nodes with the entity name.
    graph.node_label_mapping = "entity"
    # Edge thickness follows the relationship weight.
    graph.edge_thickness_factor_mapping = "weight"
    display(graph)


# Render the context of the search result as a graph widget.
show_graph(result)

GraphWidget(layout=Layout(height='700px', width='100%'))