In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [2]:
INPUT_DIR = "/home/tjustin/ragtest/output/20240806-152635/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [3]:
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")

entity_df

Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,ISWARAN,PERSON,"Iswaran, also known as S Iswaran, is a governm...","0cb5774a229508e0d84bbfce11de09f6,12960edd561d1...",18,18,0,b45241d70f0e43fca764df95b2b81f77,18,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,SINGAPORE,LOCATION,Singapore is a country in Southeast Asia known...,"003814c7c2f12401d9706d441d0d78b8,0078cd13d70e1...",2,611,1,4119fd06010c494caa07f439b333f4c5,611,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,PRIME MINISTER LEE,PERSON,Lee Hsien Loong is the Prime Minister of Singa...,"466696ce9a54965230df4c5b61560fb8,5d4ff31938b3a...",39,2,2,d3835bf3dda84ead99deadbeac5d0d7d,2,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,TRANSPORT MINISTER,POSITION,"The Transport Minister, held by Iswaran, is a ...","0cb5774a229508e0d84bbfce11de09f6,466696ce9a549...",18,1,3,077d2820ae1845bcbb1803379a3d1eae,1,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,"S$8,500",MONEY,"""S$8,500"" refers to the reduced pay that Minis...","466696ce9a54965230df4c5b61560fb8,92d27d8aef2b9...",18,1,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,1,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83885,4,FIXIE-RELATED SALES,,,2f94dbbdc8787a2c83dc611a0c9be630,,1,16773,799ad1424d854025905279f3781b53b5,1,,,799ad1424d854025905279f3781b53b5,0,0
83886,4,ROAD BIKES,PRODUCT,Road bikes are a type of bike that customers h...,fee73eba470e6edec8b7cb3363199217,,1,16774,4be6b48107f848fe901659703b2dd8e3,1,,,4be6b48107f848fe901659703b2dd8e3,0,0
83887,4,YEE QING XIANG,PERSON,Yee Qing Xiang is the founder of Ascent Bikes,fee73eba470e6edec8b7cb3363199217,,1,16775,ec11badacafd451aa68fa03b79218d0c,1,,,ec11badacafd451aa68fa03b79218d0c,0,0
83888,4,CYCLING INTEREST GROUPS,GROUP,Cycling interest groups are organizations that...,fee73eba470e6edec8b7cb3363199217,,1,16776,e350ee63f47245d5b4ca2d98627b7c8c,1,,,e350ee63f47245d5b4ca2d98627b7c8c,0,0


#### Read entities - embedding 

In [4]:
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entity_embedding_df

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,ISWARAN,PERSON,"Iswaran, also known as S Iswaran, is a governm...",0,,"[0cb5774a229508e0d84bbfce11de09f6, 12960edd561...","[-0.034168317914009094, -0.01597527228295803, ..."
1,4119fd06010c494caa07f439b333f4c5,SINGAPORE,LOCATION,Singapore is a country in Southeast Asia known...,1,,"[003814c7c2f12401d9706d441d0d78b8, 0078cd13d70...","[0.008972381241619587, 0.02507088892161846, 0...."
2,d3835bf3dda84ead99deadbeac5d0d7d,PRIME MINISTER LEE,PERSON,Lee Hsien Loong is the Prime Minister of Singa...,2,,"[466696ce9a54965230df4c5b61560fb8, 5d4ff31938b...","[0.03169342130422592, -0.053404662758111954, 0..."
3,077d2820ae1845bcbb1803379a3d1eae,TRANSPORT MINISTER,POSITION,"The Transport Minister, held by Iswaran, is a ...",3,,"[0cb5774a229508e0d84bbfce11de09f6, 466696ce9a5...","[-0.021919820457696915, -0.020537102594971657,..."
4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,"S$8,500",MONEY,"""S$8,500"" refers to the reduced pay that Minis...",4,,"[466696ce9a54965230df4c5b61560fb8, 92d27d8aef2...","[-0.013535933569073677, 0.00680833263322711, 0..."
...,...,...,...,...,...,...,...,...
241,799ad1424d854025905279f3781b53b5,FIXIE-RELATED SALES,,,16773,,[2f94dbbdc8787a2c83dc611a0c9be630],"[-0.006175657268613577, 0.005057513248175383, ..."
242,4be6b48107f848fe901659703b2dd8e3,ROAD BIKES,PRODUCT,Road bikes are a type of bike that customers h...,16774,,[fee73eba470e6edec8b7cb3363199217],"[0.051137614995241165, 0.008133879862725735, -..."
243,ec11badacafd451aa68fa03b79218d0c,YEE QING XIANG,PERSON,Yee Qing Xiang is the founder of Ascent Bikes,16775,,[fee73eba470e6edec8b7cb3363199217],"[0.0434858463704586, -0.015871243551373482, -0..."
244,e350ee63f47245d5b4ca2d98627b7c8c,CYCLING INTEREST GROUPS,GROUP,Cycling interest groups are organizations that...,16776,,[fee73eba470e6edec8b7cb3363199217],"[-0.0134590370580554, 0.005320436786860228, 0...."


#### Read relationships

In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

relationships

[Relationship(id='0fe3d521c2e0460ba18afb321887707f', short_id='0', source='ISWARAN', target='TRANSPORT MINISTER', weight=2.0, description='Iswaran, also known as the Transport Minister, holds the position in Singapore. Despite being interdicted from duty, Iswaran continues to serve as the Transport Minister in Singapore.', description_embedding=None, text_unit_ids=['0cb5774a229508e0d84bbfce11de09f6', '466696ce9a54965230df4c5b61560fb8'], document_ids=None, attributes={'rank': 19}),
 Relationship(id='b33568f6b23f47cf8915d50e8cd71297', short_id='1', source='ISWARAN', target='CPIB', weight=8.0, description='The Corrupt Practices Investigation Bureau (CPIB) is conducting an investigation into Transport Minister S. Iswaran in a corruption case. Iswaran is actively assisting with the investigations related to this case, which also involves billionaire Ong Beng Seng.', description_embedding=None, text_unit_ids=['0cb5774a229508e0d84bbfce11de09f6', '12960edd561d130a4ca8b45ef2e5e5c4', '1ae4d1e68b

In [6]:
# %pip install yfiles_jupyter_graphs --quiet
from yfiles_jupyter_graphs import GraphWidget


# converts the entities dataframe to a list of dicts for yfiles-jupyter-graphs
def convert_entities_to_dicts(df):
    """Convert the entities dataframe to a list of dicts for yfiles-jupyter-graphs."""
    nodes_dict = {}
    for _, row in df.iterrows():
        # Create a dictionary for each row and collect unique nodes
        node_id = row["title"]
        if node_id not in nodes_dict:
            nodes_dict[node_id] = {
                "id": node_id,
                "properties": row.to_dict(),
            }
    return list(nodes_dict.values())


# converts the relationships dataframe to a list of dicts for yfiles-jupyter-graphs
def convert_relationships_to_dicts(df):
    """Convert the relationships dataframe to a list of dicts for yfiles-jupyter-graphs."""
    relationships = []
    for _, row in df.iterrows():
        # Create a dictionary for each row
        relationships.append({
            "start": row["source"],
            "end": row["target"],
            "properties": row.to_dict(),
        })
    return relationships


w = GraphWidget()
w.directed = True
w.nodes = convert_entities_to_dicts(entity_df)
w.edges = convert_relationships_to_dicts(relationship_df)

In [7]:
# show title on the node
w.node_label_mapping = "title"


# map community to a color
def community_to_color(community):
    """Map a community to a color."""
    colors = [
        "crimson",
        "darkorange",
        "indigo",
        "cornflowerblue",
        "cyan",
        "teal",
        "green",
    ]
    return (
        colors[int(community) % len(colors)] if community is not None else "lightgray"
    )


def edge_to_source_community(edge):
    """Get the community of the source node of an edge."""
    source_node = next(
        (entry for entry in w.nodes if entry["properties"]["title"] == edge["start"]),
        None,
    )
    source_node_community = source_node["properties"]["community"]
    return source_node_community if source_node_community is not None else None


w.node_color_mapping = lambda node: community_to_color(node["properties"]["community"])
w.edge_color_mapping = lambda edge: community_to_color(edge_to_source_community(edge))
# map size data to a reasonable factor
w.node_scale_factor_mapping = lambda node: 0.5 + node["properties"]["size"] * 1.5 / 20
# use weight for edge thickness
w.edge_thickness_factor_mapping = "weight"

In [8]:
# # Use the circular layout for this visualization. For larger graphs, the default organic layout is often preferrable.
w.circular_layout()

In [9]:
# display(w)

### 
Visualizing the result context of `graphrag` queries



In [10]:
# setup (see also ../../local_search.ipynb)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
# claims = read_indexer_covariates(covariate_df)
# covariates = {"claims": claims}
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

api_key = "sk-proj-mTeFxdXu5jqzt27AzGbwT3BlbkFJKaWpXqAhdo7lbF9BQsFy"
llm_model = "gpt-3.5-turbo"
embedding_model = "text-embedding-3-small"

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


## Run local search on sample queries

In [11]:
result = await search_engine.asearch("Who are the candidates for the 2023 presidential elections")
print(result.response)

In the upcoming 2023 Presidential Election in Singapore, several notable candidates are vying for the position of President. Among them are Mr. Ng Kok Song, a former chief investment officer of the sovereign wealth fund GIC, Mr. Tharman Shanmugaratnam, a former senior minister, and Mr. Tan Kin Lian, the former chief executive officer of NTUC Income insurance cooperative [Data: Sources (763); Relationships (1637, 6161, 4858)]. These candidates bring diverse backgrounds and experiences to the electoral landscape, each emphasizing their contributions to Singapore and their approach to exercising custodial powers if elected as President [Data: Sources (763); Relationships (509, 2781, 6213, 6206, 1886, +more)].

The candidates are actively engaging with voters through various platforms, including social media, interviews, and campaign events, to present their visions for the country and connect with the electorate [Data: Sources (763); Relationships (3613, 6256, 6258, 6265, +more)]. Their a

## Inspecting the context data used to generate the response

In [12]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,4522,THREE ASPIRING CANDIDATES,The three aspiring candidates are individuals ...,2,True
1,328,PRESIDENTIAL ELECTION 2023,Presidential Election 2023 is a specific elect...,2,True
2,10614,2023 PRESIDENTIAL ELECTION,The 2023 Presidential Election is a significan...,6,True
3,7389,NOMINATIONS,Nominations have been made for the presidentia...,1,True
4,13654,PRESIDENTIAL ASPIRANTS,Presidential aspirants are individuals running...,2,True


In [13]:
result.context_data["relationships"].head()
# print(len(result.context_data["relationships"]))

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,2346,ELD,PRESIDENTIAL ELECTION CANDIDATES,Presidential election candidates work with ELD...,1.0,66,3,True
1,2340,ELD,PE2023,ELD revised the Registers of Electors for the ...,1.0,62,3,True
2,2305,ELD,PRESIDENTIAL ELECTION 2023,ELD revised the Registers of Electors for the ...,1.0,61,3,True
3,1637,NG KOK SONG,2023 PRESIDENTIAL ELECTION,Ng Kok Song is a candidate for the 2023 Presid...,1.0,153,2,True
4,1611,NG KOK SONG,PRESIDENTIAL ASPIRANTS,Ng Kok Song is one of the individuals running ...,1.0,149,2,True


In [14]:
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,2175,Presidential Candidates in Singapore,# Presidential Candidates in Singapore\n\nThe ...
1,763,Singapore Presidential Election Candidates,# Singapore Presidential Election Candidates\n...


In [15]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,694,"Dr Walid said that so far, the three aspiring..."
1,40,the previous election.\n\nPrior to every elec...
2,2909,"noon, about 52 per cent, or 1,406,182 Singapo..."
3,2618,"adopt a positive, constructive, and open-mind..."
4,1853,"minister, chief justice, attorney-general or ..."


## Visualizing the result context as graph

In [16]:
"""
Helper function to visualize the result context with `yfiles-jupyter-graphs`.

The dataframes are converted into supported nodes and relationships lists and then passed to yfiles-jupyter-graphs.
Additionally, some values are mapped to visualization properties.
"""
def show_graph(result):
    """Visualize the result context with yfiles-jupyter-graphs."""
    from yfiles_jupyter_graphs import GraphWidget

    if (
        "entities" not in result.context_data
        or "relationships" not in result.context_data
    ):
        msg = "The passed results do not contain 'entities' or 'relationships'"
        raise ValueError(msg)

    # converts the entities dataframe to a list of dicts for yfiles-jupyter-graphs
    def convert_entities_to_dicts(df):
        """Convert the entities dataframe to a list of dicts for yfiles-jupyter-graphs."""
        nodes_dict = {}
        for _, row in df.iterrows():
            # Create a dictionary for each row and collect unique nodes
            node_id = row["entity"]
            if node_id not in nodes_dict:
                nodes_dict[node_id] = {
                    "id": node_id,
                    "properties": row.to_dict(),
                }
        return list(nodes_dict.values())

    # converts the relationships dataframe to a list of dicts for yfiles-jupyter-graphs
    def convert_relationships_to_dicts(df):
        """Convert the relationships dataframe to a list of dicts for yfiles-jupyter-graphs."""
        relationships = []
        for _, row in df.iterrows():
            # Create a dictionary for each row
            relationships.append({
                "start": row["source"],
                "end": row["target"],
                "properties": row.to_dict(),
            })
        return relationships

    w = GraphWidget()
    # use the converted data to visualize the graph
    w.nodes = convert_entities_to_dicts(result.context_data["entities"])
    w.edges = convert_relationships_to_dicts(result.context_data["relationships"])
    w.directed = True
    # show title on the node
    w.node_label_mapping = "entity"
    # use weight for edge thickness
    w.edge_thickness_factor_mapping = "weight"
    display(w)


show_graph(result)

GraphWidget(layout=Layout(height='700px', width='100%'))

In [17]:
import pandas as pd

# Another New Entity for "SINGAPORE'S PRESIDENTIAL ELECTION" with different details
another_new_entity_singapore_presidential = pd.DataFrame({
    'id': "453",  # Different ID to distinguish from the first entity
    'entity': ['THARMAN'],
    'description': ['Singapore\'s Presidential Hopeful and ex-PAP member.'],
    'number of relationships': [2],  # Initially no relationships, can be updated later
    'in_context': [True]
})

another_new_entity_singapore_presidential2 = pd.DataFrame({
    'id': "413",  # Different ID to distinguish from the first entity
    'entity': ['MONETARY AUTHORITY OF SINGAPORE'],
    'description': ['The Monetary Authority of Singapore or (MAS) is the central bank and financial regulatory authority of Singapore'],
    'number of relationships': [0],  # Initially no relationships, can be updated later
    'in_context': [True]
})

another_new_entity_singapore_presidential3 = pd.DataFrame({
    'id': "467",  # Different ID to distinguish from the first entity
    'entity': ['PEOPLE ACTION PARTY'],
    'description': ['The People\'s Action Party (PAP) is a political party of the centre-right in Singapore'],
    'number of relationships': [0],  # Initially no relationships, can be updated later
    'in_context': [True]
})


# New Relationship for the separate "SINGAPORE'S PRESIDENTIAL ELECTION" linked to "2023 PRESIDENTIAL ELECTION"
new_relationship_another_singapore_to_2023_1 = pd.DataFrame({
    'id': "358",  # A new ID for this relationship
    'source': ['THARMAN'],  # Source entity
    'target': ['NOMINATIONS'],  # Target entity
    'description': ['Tharman is a presidential candidate in the 2023 Presidential Elections'],
    'weight': [1.0],
    'links': [129],  # A new hypothetical link id
    'in_context': [True]
})

new_relationship_another_singapore_to_2023_2 = pd.DataFrame({
    'id': "561",  # A new ID for this relationship
    'source': ['MONETARY AUTHORITY OF SINGAPORE'],  # Source entity
    'target': ['THARMAN'],  # Target entity
    'description': ['Tharman was the chairman of MAS from 2011-2023'],
    'weight': [1.0],
    'links': [129],  
    'in_context': [True]
})


new_relationship_another_singapore_to_2023_3 = pd.DataFrame({
    'id': "261",  # A new ID for this relationship
    'source': ['PAP'],  # Source entity
    'target': ['THARMAN'],  # Target entity
    'description': ['Tharman former member of the governing People\'s Action Party (PAP), he was the Member of Parliament (MP) representing Jurong GRC between 2001 and 2023.'],
    'weight': [1.0],
    'links': [512],  
    'in_context': [True]
})


new_relationship_another_singapore_to_2023_4 = pd.DataFrame({
    'id': "321",  # A new ID for this relationship
    'source': ['PRESIDENTIAL CANDIDATES'],  # Source entity
    'target': ['PRESIDENTIAL ELECTION'],  # Target entity
    'description': ['Tharman is a presidential candidate in the 2023 Presidential Elections'],
    'weight': [1.0],
    'links': [875],  
    'in_context': [True]
})


result.context_data["relationships"] = pd.concat([
    result.context_data["relationships"], 
    new_relationship_another_singapore_to_2023_1,
    new_relationship_another_singapore_to_2023_2,
    new_relationship_another_singapore_to_2023_3,
    new_relationship_another_singapore_to_2023_4
], ignore_index=True)


result.context_data["entities"] = pd.concat([
    result.context_data["entities"], 
    another_new_entity_singapore_presidential,
    another_new_entity_singapore_presidential2,
    another_new_entity_singapore_presidential3,
    ], ignore_index=True)


show_graph(result)

Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)


GraphWidget(layout=Layout(height='730px', width='100%'))