In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## Local Search Example

The local search method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g., "What are the healing properties of chamomile?").

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [3]:
# Directory holding the parquet tables read by the cells below.
INPUT_DIR = "./inputs/artifacts"
# LanceDB location the entity description embedding store connects to.
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Value passed to the entity and community-report readers.
COMMUNITY_LEVEL = 2

# Parquet table names (file stem, without the ".parquet" suffix), in load order.
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
TEXT_UNIT_TABLE = "create_final_text_units"

#### Read entities

In [4]:
# The nodes table supplies community and degree data; it is passed together
# with the entities table to build the entity objects.
entity_path = f"{INPUT_DIR}/{ENTITY_TABLE}.parquet"
entity_embedding_path = f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet"
entity_df = pd.read_parquet(entity_path)
entity_embedding_df = pd.read_parquet(entity_embedding_path)

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Connect a LanceDB vector store at LANCEDB_URI and load the entities'
# description embeddings into it.
# To connect to a remote db instead, specify url and port values.
description_embedding_store = LanceDBVectorStore(collection_name="entity_description_embeddings")
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities,
    vectorstore=description_embedding_store,
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 224


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,信息安全202101班,ACADEMIC_GROUP,A specific class focusing on information secur...,2708e6738dd5195e194421771a53b40a,0,2,0,b45241d70f0e43fca764df95b2b81f77,2,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,2024-2025年度第一学期,TIME_PERIOD,"The entity referred to as ""2024-2025年度第一学期"" co...","2708e6738dd5195e194421771a53b40a,3616bb0177409...",0,2,1,4119fd06010c494caa07f439b333f4c5,2,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,课程表,DOCUMENT,The 课程表 is a comprehensive document that outli...,"2708e6738dd5195e194421771a53b40a,3616bb0177409...",0,9,2,d3835bf3dda84ead99deadbeac5d0d7d,9,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,课程信息,DATA,课程信息提供了关于每门课程的详尽资料，包括课程编号、名称、学时/学分以及教师姓名。此外，还包...,"2708e6738dd5195e194421771a53b40a,3a7935cac78be...",8,9,3,077d2820ae1845bcbb1803379a3d1eae,9,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,网络安全程序设计,COURSE,"The course titled ""网络安全程序设计"" is a focused stud...","12ba9e47870f8c249262dc1f538a6e75,2442adc660a5a...",2,9,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,9,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


#### Read relationships

In [5]:
# Load the relationships table and convert its rows into relationship objects.
relationship_path = f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet"
relationship_df = pd.read_parquet(relationship_path)
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 134


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,信息安全202101班,华中科技大学,9.0,The Information Security class of 202101 is pa...,[2708e6738dd5195e194421771a53b40a],482027a59f32484c9c44fd700615c1b6,0,2,12,14
1,信息安全202101班,课程表,8.0,The course schedule is created for the specifi...,[2708e6738dd5195e194421771a53b40a],de837ff3d626451282ff6ac77a82216d,1,2,9,11
2,2024-2025年度第一学期,网络空间安全202102班,8.0,The class follows the schedule of the first se...,[e2fe1db2a6d20a59173d46f91579cf14],460295fed3ae4cd39f9f274cec9c2506,2,2,3,5
3,2024-2025年度第一学期,网络空间安全202103班,8.0,The class schedule is for the first semester o...,[ed61588fe075862adce9021e6448eacd],553b285bba60460ab1ed8341ae61282b,3,2,2,4
4,课程表,课程信息,7.0,The course information is detailed within the ...,[2708e6738dd5195e194421771a53b40a],cec95bf17e7e4c939b56c9c6f402a29f,4,9,9,18


In [6]:
# NOTE: covariates are turned off by default because they generally need
# prompt tuning to be valuable; see the GRAPHRAG_CLAIM_* settings.
covariate_path = f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet"
covariate_df = pd.read_parquet(covariate_path)
claims = read_indexer_covariates(covariate_df)

# The context builder receives covariates as a mapping keyed by covariate kind.
covariates = {"claims": claims}

print(f"Claim records: {len(claims)}")

Claim records: 91


#### Read community reports

In [7]:
# Load the community reports table; the reader also takes the nodes dataframe
# and the configured community level.
report_path = f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet"
report_df = pd.read_parquet(report_path)
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 19


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,10,# HuaZhong University of Science and Technolog...,1,8.5,HuaZhong University of Science and Technology'...,The rating reflects the community's strong ali...,The community is structured around the Network...,[{'explanation': 'The network security class o...,"{\n ""title"": ""HuaZhong University of Scienc...",d6e25ee8-5db6-423d-ac6d-392aa8785a7f
1,11,# HuaZhong University of Science and Technolog...,1,8.0,HuaZhong University of Science and Technology'...,The rating is high due to the comprehensive ra...,The community is centered around HuaZhong Univ...,[{'explanation': 'HuaZhong University of Scien...,"{\n ""title"": ""HuaZhong University of Scienc...",d60750ba-8191-48ca-aad1-cdde6b649e77
2,12,# HuaZhong University of Science and Technolog...,1,8.5,HuaZhong University of Science and Technology'...,The rating reflects the community's strong ali...,This community is structured around the compre...,[{'explanation': 'The 课程表 provides a diverse r...,"{\n ""title"": ""HuaZhong University of Scienc...",77985104-003d-4c28-9628-7d468931b1e2
3,13,# Huazhong University of Science and Technolog...,1,8.0,Huazhong University of Science and Technology'...,The rating is high due to the specialized focu...,The community is centered around the Cybersecu...,[{'explanation': 'Classrooms B221-3 and B105 a...,"{\n ""title"": ""Huazhong University of Scienc...",4a60f2ef-bd91-4815-af15-b9c53e36d7f5
4,14,# HuaZhong University of Science and Technolog...,1,8.0,HuaZhong University of Science and Technology'...,The rating is high due to the community's clea...,The community is centered around HuaZhong Univ...,[{'explanation': 'The 网络空间安全（本硕博）202101班 class...,"{\n ""title"": ""HuaZhong University of Scienc...",b0141076-121f-46c7-9be3-0aa8b3098c7c


#### Read text units

In [8]:
# Load the text unit table and convert its rows into text unit objects.
text_unit_path = f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet"
text_unit_df = pd.read_parquet(text_unit_path)
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 19


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,2708e6738dd5195e194421771a53b40a,华中科技大学信息安全202101班2024-2025年度第一学期课表|Unnamed: 1|...,1200,[d2118a27ade083d9ba57d266aaf1dc5f],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[482027a59f32484c9c44fd700615c1b6, de837ff3d62...","[ccc984e7-bfc2-49bf-8da6-43ae2a35e659, 618eaa9..."
1,ca786eb823b82150e93843485960fd81,安基地B105\n 云计算与虚拟化 10-15周 网安基地C103| | | | | | |...,1200,[d2118a27ade083d9ba57d266aaf1dc5f],"[4119fd06010c494caa07f439b333f4c5, 077d2820ae1...","[cccfa151fedc4b218a8d96adc7dceabe, ce54725672a...","[b01e5e81-382c-44d5-b9eb-096f889df3fc, 3f0654e..."
2,d902ebdeec39f407689ddaa744ebbfc7,程序分析与安全| | |24/1.5|文明（副教授）| |专选课| | | | | | | ...,1200,[d2118a27ade083d9ba57d266aaf1dc5f],"[3671ea0dd4e84c1a9b02c5ab2c8f4bac, 19a7f254a5d...","[43544b99c3b04b059546198a0ae6366d, 422433aa458...","[2e11b407-9200-471d-b06c-1e98c240a920, 7eaaca5..."
3,9728077c193f63b769747613d565b1c9,| | | | | | | | \n\n |第2节|在线机器学习 3-8周 网安基地B21...,1200,[d2118a27ade083d9ba57d266aaf1dc5f],"[4119fd06010c494caa07f439b333f4c5, 3671ea0dd4e...","[8870cf2b5df64d2cab5820f67e29b9f1, cd130938a28...","[f70098dd-55d2-453d-b9cc-249afaa70760, fa9cca6..."
4,7e5bb2dda016cb1317a4bc9bfc328428,| | | | | | | \n\n |第3节|云计算与虚拟化 10-15周 网安基地C1...,1200,[d2118a27ade083d9ba57d266aaf1dc5f],"[4119fd06010c494caa07f439b333f4c5, d3835bf3dda...","[599164aead034bc19446efacc77554d2, 43544b99c3b...","[7c6243a2-208d-427e-afec-d4bc7879f6ed, 0feb8c7..."


In [9]:
# Read the API key from the environment so that no secret is stored in the
# notebook source or its saved outputs. Export GRAPHRAG_API_KEY before
# starting the kernel.
api_key = os.environ.get("GRAPHRAG_API_KEY")
if not api_key:
    raise RuntimeError(
        "GRAPHRAG_API_KEY is not set; export your API key before running this cell."
    )

llm_model = "glm-4"
embedding_model = "embedding-3"
# The chat model and the embedding model are served from the same endpoint.
api_base = "https://open.bigmodel.cn/api/paas/v4/"

# Chat model used to generate answers and candidate questions.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_base=api_base,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# Tokenizer handed to the context builder and search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model handed to the context builder as its text embedder.
text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [10]:
# Assemble the mixed local-search context from the loaded knowledge-model
# collections, the entity embedding store, and the embedding/tokenizer helpers.
context_builder = LocalSearchMixedContext(
    # knowledge-model collections
    entities=entities,
    relationships=relationships,
    text_units=text_units,
    community_reports=reports,
    # set this to None if covariates were not produced during indexing
    covariates=covariates,
    # entity embedding lookup
    entity_text_embeddings=description_embedding_store,
    # use EntityVectorStoreKey.TITLE if the vectorstore uses entity titles as ids
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # embedding and tokenization helpers
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [11]:
# Parameters forwarded to the context builder. text_unit_prop + community_prop
# should be <= 1; the remaining proportion of the context window is dedicated
# to entities and relationships.
local_context_params = dict(
    # proportion of the context window dedicated to related text units
    text_unit_prop=0.5,
    # proportion of the context window dedicated to community reports
    community_prop=0.1,
    # maximum number of turns to include in the conversation history
    conversation_history_max_turns=5,
    # if True, only include user queries in the conversation history
    conversation_history_user_turns_only=True,
    # number of related entities to retrieve from the entity description embedding store
    top_k_mapped_entities=10,
    # controls the number of out-of-network relationships pulled into the context window
    top_k_relationships=10,
    # if True, include the entity rank in the entity table (default entity rank = node degree)
    include_entity_rank=True,
    # if True, include the relationship weight in the context window
    include_relationship_weight=True,
    # if True, include the community rank in the context window
    include_community_rank=False,
    # if True, return a set of dataframes containing all candidate
    # entity/relationship/covariate records that could be relevant; not all of
    # them end up in the context window, and their "in_context" column
    # indicates whether a record was included
    return_candidate_context=False,
    # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity titles as ids
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # maximum number of tokens for the context window; change this based on
    # your model's token limit (for a model with an 8k limit, 5000 could be a good setting)
    max_tokens=12_000,
)

# Generation settings passed to the LLM.
llm_params = dict(
    # change this based on your model's token limit
    # (for a model with an 8k limit, a good setting could be 1000-1500)
    max_tokens=2_000,
    temperature=0.0,
)

In [12]:
# Wire the LLM, context builder and their parameter sets into the local search engine.
search_engine = LocalSearch(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
    # free-form text describing the response type and format; can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="multiple paragraphs",
)

### Run local search on sample queries

In [13]:
# Run one local search query (roughly: "Tell me about the network security
# course design") and print the generated answer text.
# The top-level `await` works because the notebook cell runs inside an event loop.
result = await search_engine.asearch("告诉我有关网络安全课程设计的信息")
print(result.response)

华中科技大学提供的网络安全课程设计是一门专注于网络安全教育课程设计的课程。这门课程涵盖了网络安全教育的各个方面，包括理论知识和实践技能。课程内容旨在帮助学生深入理解网络安全课程设计的理论和实践要点，为学生提供网络安全教育领域的专业知识。

课程设计包括网络安全课程设计、网络安全程序设计等，这些课程都强调了理论与实践的结合。例如，网络安全课程设计课程由张云鹤副研究员（自然科学）教授，而网络安全程序设计课程则由梅松讲师（高校）教授。这些课程不仅涵盖了网络安全教育的理论知识，还包括了实践技能的培养，如程序设计、云计算与虚拟化、在线机器学习等。

此外，华中科技大学还提供了网络安全综合实践（V）课程，由陈凯教授、李冰倩和王美珍副教授共同教授。这门课程进一步强化了学生的实践能力，为他们提供了更全面的网络安全教育。

总的来说，华中科技大学的网络安全课程设计课程为学生提供了全面的网络安全教育，不仅注重理论知识的传授，还注重实践技能的培养，为学生未来的职业生涯奠定了坚实的基础。


In [None]:
# NOTE(review): this cell has no execution count, and its question does not
# appear to relate to the course-schedule data loaded in this notebook —
# presumably left over from the upstream sample; confirm and replace or remove.
# Running it rebinds `result`, so any later cell that inspects
# `result.context_data` would describe this query rather than the previous one.
question = "Tell me about Dr. Jordan Hayes"
result = await search_engine.asearch(question)
print(result.response)

#### Inspecting the context data used to generate the response

In [14]:
# Preview the first rows of the entity records stored in the search result's context data.
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,95,网络空间安全课程,"Courses focused on cybersecurity, including th...",2,True
1,83,网络安全程序设计 3-8周,A specific session of the network security pro...,0,True
2,5,网络安全课程设计,"The course titled ""网络安全课程设计"" is a focused prog...",21,True
3,64,NETWORK SECURITY CURRICULUM DESIGN,A course focusing on the design of network sec...,2,True
4,101,网络安全,,1,True


In [15]:
# Preview the first rows of the relationship records stored in the search result's context data.
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,41,网络安全课程设计,网络空间安全202101班,This course is part of the curriculum for the ...,7.0,32,2,True
1,35,网络安全课程设计,梅松（讲师）,The network security course is taught by Mei Song,7.0,22,1,True
2,44,网络安全课程设计,网络安全,"The course ""网络安全课程设计"" is part of the academic ...",8.0,22,1,True
3,25,网络安全程序设计,网络空间安全202101班,This course is offered to the network security...,7.0,20,2,True
4,114,华中科技大学,网络空间安全202101班,The network security class of 202101 is part o...,9.0,23,9,True


In [16]:
# Preview the first rows of the community report records stored in the search result's context data.
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,11,HuaZhong University of Science and Technology'...,# HuaZhong University of Science and Technolog...
1,5,HuaZhong University of Science and Technology'...,# HuaZhong University of Science and Technolog...
2,11,HuaZhong University of Science and Technology'...,# HuaZhong University of Science and Technolog...
3,5,HuaZhong University of Science and Technology'...,# HuaZhong University of Science and Technolog...


In [17]:
# Preview the first rows of the source text records stored in the search result's context data.
result.context_data["sources"].head()

Unnamed: 0,id,text
0,11,| | | | | | | | | | | |2|3-4节\n\n |第4节|程序分析与安全...
1,9,|2w/1|陈凯（教授），李冰倩，王美珍（副教授）| | | | | | | | | \n...
2,10,周 网安基地B213| | | | | | | | | | | | | \n\n | |网络...
3,14,称| | | | | | | | | \n\n |1|网络安全程序设计| | |32/2|梅...


In [18]:
# The claims table is optional in the context data, so check for it before previewing.
context_tables = result.context_data
if "claims" in context_tables:
    claims_preview = context_tables["claims"].head()
    print(claims_preview)

Empty DataFrame
Columns: [in_context]
Index: []


### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [19]:
# The question generator reuses the same LLM, context builder, tokenizer and
# parameter sets as the local search engine.
question_generator = LocalQuestionGen(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
)

In [20]:
question_history = [
    "Tell me about Agent Mercer",
    "What happens in Dulce military base?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What is the primary function of the Dulce military base?', '- Who operates the Dulce military base?', '- What types of operations are conducted at the Dulce military base?', '- Is there any public information available about the Dulce military base?', '- Are there any reported incidents or controversies associated with the Dulce military base?']
