In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## Local Search Example

The local search method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g., "What are the healing properties of chamomile?").

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [2]:
# Directory holding the artifacts of the indexing run queried in this notebook.
INPUT_DIR = "../ragtest/output/20240917-211927/artifacts"
# LanceDB location, derived from the artifacts directory.
LANCEDB_URI = INPUT_DIR + "/lancedb"

# Parquet table names (file stems, without the ".parquet" suffix).
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"

# Community level handed to the entity and report readers.
COMMUNITY_LEVEL = 2

#### Read entities

In [3]:
# Read the nodes table (community and degree data) and the entities table
# (passed to the reader as the embedding source).
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

# Convert both tables into entity objects at the configured community level.
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Load description embeddings into a LanceDB vector store opened at LANCEDB_URI.
# NOTE(review): this was previously described as "in-memory", but connect() is
# given a filesystem path under INPUT_DIR — presumably the store is persisted
# on disk; confirm against the LanceDBVectorStore implementation.
# To connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

# The count printed here is the row count of the raw nodes table, not len(entities).
print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 19


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,top_level_node_id,x,y
0,0,华中科技大学信息安全202101班,ORGANIZATION,This is a class at Huazhong University of Scie...,f768d495eef9bb0f53d3c3243e87ba72,5,10,0,b45241d70f0e43fca764df95b2b81f77,10,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,梅松,PERSON,"Mei Song is a lecturer at the university, teac...",f768d495eef9bb0f53d3c3243e87ba72,3,2,1,4119fd06010c494caa07f439b333f4c5,2,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,肖凌,PERSON,"Xiao Ling is an associate professor, teaching ...",f768d495eef9bb0f53d3c3243e87ba72,7,2,2,d3835bf3dda84ead99deadbeac5d0d7d,2,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,袁斌,PERSON,"Yuan Bin is an associate professor, teaching ""...",f768d495eef9bb0f53d3c3243e87ba72,0,2,3,077d2820ae1845bcbb1803379a3d1eae,2,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,周潘,PERSON,"Zhou Pan is a professor, teaching ""Online Mach...",f768d495eef9bb0f53d3c3243e87ba72,1,2,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,2,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


#### Read relationships

In [4]:
# Read the relationships table and convert it into relationship objects.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

# Report the number of rows in the raw table and preview the first few.
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 20


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,华中科技大学信息安全202101班,梅松,8.0,"Mei Song teaches ""Network Security Programming...",[f768d495eef9bb0f53d3c3243e87ba72],04dbbb2283b845baaeac0eaf0c34c9da,0,10,2,12
1,华中科技大学信息安全202101班,肖凌,7.0,"Xiao Ling teaches ""Network Security Course Des...",[f768d495eef9bb0f53d3c3243e87ba72],1943f245ee4243bdbfbd2fd619ae824a,1,10,2,12
2,华中科技大学信息安全202101班,袁斌,7.0,"Yuan Bin teaches ""Cloud Computing and Virtuali...",[f768d495eef9bb0f53d3c3243e87ba72],273daeec8cad41e6b3e450447db58ee7,2,10,2,12
3,华中科技大学信息安全202101班,周潘,8.0,"Zhou Pan teaches ""Online Machine Learning"" for...",[f768d495eef9bb0f53d3c3243e87ba72],e69dc259edb944ea9ea41264b9fcfe59,3,10,2,12
4,华中科技大学信息安全202101班,文明,7.0,"Wen Ming teaches ""Program Analysis and Securit...",[f768d495eef9bb0f53d3c3243e87ba72],e2f5735c7d714423a2c4f61ca2644626,4,10,2,12


In [5]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_path = f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet"

if os.path.exists(covariate_path):
    # Claims were extracted during indexing: load them and expose them under
    # the "claims" key expected by the context builder cell below.
    covariate_df = pd.read_parquet(covariate_path)
    claims = read_indexer_covariates(covariate_df)
    print(f"Claim records: {len(claims)}")
    covariates = {"claims": claims}
else:
    # Covariate extraction is optional, so the table may legitimately be absent.
    # Fall back to covariates=None, which is what the context builder cell asks
    # for when covariates were not produced during indexing, instead of failing
    # here with a file-not-found error.
    covariate_df = None
    claims = []
    covariates = None
    print(f"No covariate table at {covariate_path}; continuing without claims.")

Claim records: 9


#### Read community reports

In [6]:
# Read the community reports table; the reader also receives the nodes table
# and the community level.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

# The count printed here is the row count of the raw table, not len(reports).
print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 8


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,0,# HUST Information Security Class and Yuan Bin...,0,3.5,HUST Information Security Class and Yuan Bin's...,The impact severity rating is moderate due to ...,This community is centered around the relation...,[{'explanation': 'Yuan Bin is an associate pro...,"{\n ""title"": ""HUST Information Security Cla...",d50c169d-d9b3-4957-a695-b769091fc6a8
1,1,# Zhou Pan and Online Machine Learning Communi...,0,3.5,Zhou Pan and Online Machine Learning Community,The impact severity rating is moderate due to ...,"This community is centered around Zhou Pan, a ...",[{'explanation': 'Zhou Pan is a central figure...,"{\n ""title"": ""Zhou Pan and Online Machine L...",33421b98-e4ea-42fd-8b5a-62550f7b134a
2,2,# Wen Ming and Program Analysis & Security Com...,0,3.5,Wen Ming and Program Analysis & Security Commu...,The impact severity rating is moderate due to ...,"This community is centered around Wen Ming, an...",[{'explanation': 'Wen Ming is a key entity in ...,"{\n ""title"": ""Wen Ming and Program Analysis...",ab3d70b5-4c34-4c59-bcd4-faea02ccecf7
3,3,# HUST Information Security Community\n\nThis ...,0,4.5,HUST Information Security Community,The impact severity rating is moderate-to-high...,"This community is centered around Mei Song, a ...",[{'explanation': 'Mei Song is a key figure in ...,"{\n ""title"": ""HUST Information Security Com...",e853749f-4cb8-4bf9-9037-ecbe5ccb8cf1
4,4,# Wang Haoyu and Mobile Application Security C...,0,4.5,Wang Haoyu and Mobile Application Security Com...,The impact severity rating is moderate due to ...,"This community is centered around Wang Haoyu, ...",[{'explanation': 'Wang Haoyu is a key entity i...,"{\n ""title"": ""Wang Haoyu and Mobile Applica...",a55eb659-5fb9-4261-af1b-9afad7b39f6e


#### Read text units

In [7]:
# Read the text units table and convert it into text unit objects.
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# Report the number of rows in the raw table and preview the first few.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 1


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,f768d495eef9bb0f53d3c3243e87ba72,华中科技大学信息安全202101班2024-2025年度第一学期课表\n\n课程信息|序号|...,475,[41865280f7f0e857db22b9364823469b],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[04dbbb2283b845baaeac0eaf0c34c9da, 1943f245ee4...","[1a684884-3219-449d-9226-f33237e0ed47, d7ac26d..."


In [8]:
# Credentials must never be stored in the notebook: read the API key from the
# environment and fail early with an actionable message when it is missing.
# The key that was previously committed in this cell must be treated as
# compromised and rotated with the provider.
api_key = os.environ.get("GRAPHRAG_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GRAPHRAG_API_KEY environment variable to your API key "
        "before running this notebook."
    )

# Model names and endpoint can be overridden through the environment; the
# defaults are the values this notebook was originally run with.
llm_model = os.environ.get("GRAPHRAG_LLM_MODEL", "glm-4")
embedding_model = os.environ.get("GRAPHRAG_EMBEDDING_MODEL", "embedding-3")
api_base = os.environ.get("GRAPHRAG_API_BASE", "https://open.bigmodel.cn/api/paas/v4/")

# Chat model used to generate answers and follow-up questions.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_base=api_base,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# Tokenizer used for context-window budgeting.
# NOTE(review): cl100k_base is an OpenAI encoding; token counts for a
# non-OpenAI model are presumably only approximate — confirm if budgets are tight.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model used to embed queries; it shares the endpoint and key with
# the chat model.
text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [9]:
# Assemble the mixed context builder from the objects loaded above: community
# reports, text units, entities, relationships, covariates, the entity
# description vector store, the query embedder and the tokenizer.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [10]:
# Context-building options, passed below as `context_builder_params`.
# text_unit_prop + community_prop should be <= 1; the remaining proportion of
# the context window is dedicated to entities and relationships.
local_context_params = dict(
    # proportion of context window dedicated to related text units
    text_unit_prop=0.5,
    # proportion of context window dedicated to community reports
    community_prop=0.1,
    # maximum number of turns to include in the conversation history
    conversation_history_max_turns=5,
    # if True, only include user queries in the conversation history
    conversation_history_user_turns_only=True,
    # number of related entities to retrieve from the entity description embedding store
    top_k_mapped_entities=10,
    # controls the number of out-of-network relationships pulled into the context window
    top_k_relationships=10,
    # if True, include the entity rank in the entity table (default entity rank = node degree)
    include_entity_rank=True,
    # if True, include the relationship weight in the context window
    include_relationship_weight=True,
    # if True, include the community rank in the context window
    include_community_rank=False,
    # if True, return dataframes of all candidate entity/relationship/covariate
    # records that could be relevant; their "in_context" column indicates
    # whether each record actually made it into the context window
    return_candidate_context=False,
    # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # maximum number of tokens for the context window; change this based on your
    # model's token limit (with an 8k limit, a good setting could be 5000)
    max_tokens=12_000,
)

# Generation options for the chat model, passed below as `llm_params`.
llm_params = dict(
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    max_tokens=2_000,
    temperature=0.0,
)

In [11]:
# Build the local search engine from the chat model, the context builder, the
# tokenizer and the two parameter dictionaries defined above.
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [12]:
# Sample query in Chinese ("Tell me about teacher Xiao Ling"); prints the generated answer.
result = await search_engine.asearch("告诉我有关肖凌老师的信息")
print(result.response)

肖凌老师是华中科技大学的一名副教授，他主要负责教授“网络安全课程设计”这一课程。根据数据，肖凌老师在教育领域具有一定的影响力，他的教学活动与华中科技大学信息安全202101班紧密相关，该班级专注于信息安全的课程学习[Data: Entities (2), Relationships (1)]。

在“网络安全课程设计”方面，肖凌老师的教学可能会对学生的网络信息安全知识和技能产生深远的影响。他的课程被视为信息安全和网络防护领域中的重要组成部分，有助于学生掌握网络安全的原理和实践[Data: Entities (11), Relationships (11)]。

此外，肖凌老师的教学不仅局限于课堂内部，他所传授的知识和技能有可能超越课堂，对更广泛的学术和专业网络产生积极的影响，从而提高整体的网络安全意识和实践水平[Data: Entities (2), Relationships (11)]。

综上所述，肖凌老师作为一位教育者在网络安全领域的贡献，可能会对未来的网络安全格局产生重要影响，为培养新一代网络安全专业人员做出贡献[Data: Entities (2), Relationships (1)]。


In [13]:
# Sample query in English; `result` is overwritten and is what the
# context-inspection cells below read from.
question = "Tell me about Dr. WenMing"
result = await search_engine.asearch(question)
print(result.response)

Dr. Wen Ming is an associate professor at Huazhong University of Science and Technology (HUST). According to the available data, he is involved in teaching the course "Program Analysis and Security." This course is part of the curriculum for the class "Huazhong University of Science and Technology Information Security 202101."

Dr. Wen Ming's role in the educational network at HUST suggests his expertise in program analysis and its applications in security. His contributions to the field are likely to shape the understanding and technical capabilities of the students enrolled in his course [Data: Entities (5), Relationships (4)].

Given the importance of program analysis in cybersecurity, Dr. Wen Ming's teachings could have a significant impact on the students' ability to identify and mitigate potential security vulnerabilities in software programs. His course may also enhance the broader cybersecurity education program at HUST by providing a solid foundation in this critical area [Dat

#### Inspecting the context data used to generate the response

In [14]:
# Preview the entity records in the context data of the most recent search.
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,5,文明,"Wen Ming is an associate professor, teaching ""...",2,True
1,4,周潘,"Zhou Pan is a professor, teaching ""Online Mach...",2,True
2,3,袁斌,"Yuan Bin is an associate professor, teaching ""...",2,True
3,10,王浩宇,"Wang Haoyu is a professor, teaching ""Mobile Ap...",2,True
4,2,肖凌,"Xiao Ling is an associate professor, teaching ...",2,True


In [15]:
# Preview the relationship records in the context data of the most recent search.
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,0,华中科技大学信息安全202101班,梅松,"Mei Song teaches ""Network Security Programming...",8.0,12,10,True
1,1,华中科技大学信息安全202101班,肖凌,"Xiao Ling teaches ""Network Security Course Des...",7.0,12,10,True
2,2,华中科技大学信息安全202101班,袁斌,"Yuan Bin teaches ""Cloud Computing and Virtuali...",7.0,12,10,True
3,3,华中科技大学信息安全202101班,周潘,"Zhou Pan teaches ""Online Machine Learning"" for...",8.0,12,10,True
4,4,华中科技大学信息安全202101班,文明,"Wen Ming teaches ""Program Analysis and Securit...",7.0,12,10,True


In [16]:
# Preview the community report records in the context data of the most recent search.
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,5,HUST Cyberspace Security Comprehensive Practic...,# HUST Cyberspace Security Comprehensive Pract...
1,4,Wang Haoyu and Mobile Application Security Com...,# Wang Haoyu and Mobile Application Security C...


In [17]:
# Preview the source text records in the context data of the most recent search.
result.context_data["sources"].head()

Unnamed: 0,id,text
0,0,华中科技大学信息安全202101班2024-2025年度第一学期课表\n\n课程信息|序号|...


In [18]:
# The "claims" entry is optional, so only print a preview when it is present.
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

Empty DataFrame
Columns: [in_context]
Index: []


### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [19]:
# Build the question generator with the same chat model, context builder,
# tokenizer and parameter dictionaries as the search engine.
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [20]:
# Previous user queries (the same two asked above) used as the question history.
question_history = [
    "告诉我有关肖凌老师的信息",
    "Tell me about Dr. WenMing",
]
# Request 5 follow-up questions.
# NOTE(review): context_data=None presumably makes the generator build its own
# context through the context builder — confirm against LocalQuestionGen.
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What courses does Dr. Wen Ming teach in addition to "Program Analysis and Security"?', "- How does Dr. Wen Ming's teaching contribute to the overall cybersecurity curriculum at HUST?", '- Can you provide more information on Dr. Wen Ming\'s role in the "Information Security" class of 202101?', "- What is the significance of Dr. Wen Ming's research and how does it impact his teaching?", "- How do students perceive Dr. Wen Ming's teaching style and expertise in the field of cybersecurity?"]
