In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

import pandas as pd
import tiktoken
import json

from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

## Global Search example

The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses for questions that require an understanding of the dataset as a whole (e.g., "What are the most significant values of the herbs mentioned in this notebook?").

### LLM setup

In [None]:
# Chat model used for the search; the same name selects the tiktoken encoding below.
llm_model = 'gpt-4o-mini'

# The .env file is looked up in the _MRP directory next to the current working
# directory (i.e. <parent of cwd>/_MRP/.env). It is loaded only when present.
env_path = Path.cwd().parent / '_MRP' / '.env'
print(f"Looking for .env file at: {env_path.absolute()}")
if env_path.exists():
    print("Found .env file, loading environment variables...")
    load_dotenv(env_path)

# Stop right away when no key is available, whether from .env or the environment.
api_key = os.getenv("GRAPHRAG_API_KEY")
if not api_key:
    raise ValueError(
        "GRAPHRAG_API_KEY environment variable is not set. "
        "Please set it to your OpenAI API key before running this notebook."
    )

# Language-model configuration and the chat model registered under "global_search".
config = LanguageModelConfig(
    type=ModelType.OpenAIChat,
    model=llm_model,
    api_key=api_key,
    max_retries=20,
)
model = ModelManager().get_or_create_chat_model(
    name="global_search",
    model_type=ModelType.OpenAIChat,
    config=config,
)

# Token encoder matching the chat model.
token_encoder = tiktoken.encoding_for_model(llm_model)

### Load community reports as context for global search

- Load all community reports in the `community_reports` table from GraphRAG, to be used as context data for global search.
- Load entities from the `entities` table from GraphRAG, to be used for calculating community weights for context ranking. Note that this is optional: if no entities are provided, community weights are not calculated and only the rank attribute in the community reports table is used for context ranking.
- Load all communities in the `communities` table from GraphRAG, to be used to reconstruct the community graph hierarchy for dynamic community selection.

In [4]:
# Directory holding the parquet files generated by the indexing pipeline.
INPUT_DIR: str = "../_MRP/output"

# Table names are relative to INPUT_DIR and are written without the ".parquet" suffix.
# NOTE(review): the community tables are pinned to the "default" / "swap_40%"
# variant; presumably they must match run_type / run_iter in the
# "Run Global Query" section. Confirm when switching runs.
ENTITY_TABLE: str = "entities"
COMMUNITY_TABLE: str = "default/swap/communities_default_swap_40%"
COMMUNITY_REPORT_TABLE: str = "default/swap/community_reports_default_swap_40%"

# Level in the Leiden community hierarchy from which community reports are loaded.
# A higher value uses reports from more fine-grained communities, at a higher
# computation cost.
COMMUNITY_LEVEL: int = 2

In [5]:
# Read the three index tables (communities, entities, community reports) from INPUT_DIR.
community_df, entity_df, report_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (COMMUNITY_TABLE, ENTITY_TABLE, COMMUNITY_REPORT_TABLE)
)

# Pass the raw tables through the graphrag indexer adapters; reports and entities
# are read for the configured COMMUNITY_LEVEL.
communities = read_indexer_communities(community_df, report_df)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Compare the raw report count with what is left for the chosen level.
total_report_count = len(report_df)
level_report_count = len(reports)
print(f"Total report count: {total_report_count}")
print(f"Report count after filtering by community level {COMMUNITY_LEVEL}: {level_report_count}")

# Preview of the raw report table.
report_df.head()

Total report count: 284
Report count after filtering by community level 2: 277


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,c0e868a25174484992f4ccae19f684fb,277,277,3,163,[],COVID-19 Pandemic and Public Authority in Canada,The community centers around the COVID-19 pand...,# COVID-19 Pandemic and Public Authority in Ca...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'The COVID-19 pandemic repres...,"{\n ""title"": ""COVID-19 Pandemic and Public ...",2025-08-15,2
1,ebc4bedf105c4cc3a5e4efa85541915a,278,278,3,163,[],Canada's Taxation and Regulatory Framework,This report provides an overview of Canada's t...,# Canada's Taxation and Regulatory Framework\n...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Canada operates under a comp...,"{\n ""title"": ""Canada's Taxation and Regulat...",2025-08-15,48
2,fdb6b806b70d489a9c1dfa5273f88a6b,279,279,3,163,[],SIFT and Municipal Authorities in Canada,"The community consists of SIFT, a type of enti...",# SIFT and Municipal Authorities in Canada\n\n...,6.5,The impact severity rating is moderate to high...,[{'explanation': 'SIFT refers to a specific ty...,"{\n ""title"": ""SIFT and Municipal Authoritie...",2025-08-15,2
3,3c54b738b7d7497db4a5563e84e7714f,280,280,3,205,[],Canadian Corporate Taxation and Financial Mana...,This community encompasses key entities involv...,# Canadian Corporate Taxation and Financial Ma...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'The Treasury Board is a cruc...,"{\n ""title"": ""Canadian Corporate Taxation a...",2025-08-15,7
4,0678149d80634d9386b124b36649d011,281,281,3,205,[],Canadian Taxation and Corporate Structures,This community encompasses key entities involv...,# Canadian Taxation and Corporate Structures\n...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'Income tax laws are critical...,"{\n ""title"": ""Canadian Taxation and Corpora...",2025-08-15,6


#### Build global context based on community reports

In [6]:
# Context builder for global search, fed with the reports, communities and
# entities loaded from the index tables.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    community_reports=reports,
    communities=communities,
    # Optional: leave at None if community weights should not be used for ranking.
    entities=entities,
)

#### Instantiate global search

In [7]:
# Options handed to GlobalSearch as `context_builder_params`.
context_builder_params = dict(
    # True -> use the short community summaries; False -> use the full community reports.
    use_community_summary=True,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    # Adjust to the token limit of your model (for a model with an 8k limit,
    # 5000 could be a good setting).
    max_tokens=12_000,
    context_name="Reports",
)

# LLM parameters handed to GlobalSearch as `map_llm_params`.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format=dict(type="json_object"),
)

# LLM parameters handed to GlobalSearch as `reduce_llm_params`.
reduce_llm_params = dict(
    # Adjust to the token limit of your model (for a model with an 8k limit,
    # 1000-1500 could be a good setting).
    max_tokens=2000,
    temperature=0.0,
)

In [8]:
search_engine = GlobalSearch(
    # Model, tokenizer and context source.
    model=model,
    token_encoder=token_encoder,
    context_builder=context_builder,
    context_builder_params=context_builder_params,
    # Adjust to the token limit of your model (for a model with an 8k limit,
    # 5000 could be a good setting).
    max_data_tokens=12_000,
    # Parameters for the map and reduce LLM calls.
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    concurrent_coroutines=32,
    # Set to False if your LLM model does not support JSON mode.
    json_mode=True,
    # True adds an instruction encouraging the LLM to incorporate general
    # knowledge in the response; this may increase hallucinations but could be
    # useful in some use cases.
    allow_general_knowledge=False,
    # Free-form text describing the response type and format, e.g. prioritized
    # list, single paragraph, multiple paragraphs, multiple-page report.
    response_type="multiple paragraphs",
)

### Question list: this cell only needs to be run once for all runs

In [9]:
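# Build the query file for use in evaluation.py.
# The code below is left commented out because the file only has to be generated once; uncomment it to regenerate the file.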

# questions = pd.read_csv('questions.csv', encoding='utf-8')
# # questions = questions[:3] # Test run with 3 questions
# query_filename = "../_MRP_eval/query_file.txt"
# with open(query_filename, 'w', encoding='utf-8') as f:
#     for index, row in questions.iterrows():
#         # Get the question number (starting from 1) and the question text
#         question_number = index + 1
#         question_text = row['Question']
#         # Output query file with proper format
#         f.write(f"- Question {question_number}: {question_text}\n")


### Import questions

In [10]:
# Read the question file and keep only the text of the "Question" column as a list.
questions = pd.read_csv('questions.csv', encoding='utf-8')["Question"].to_list()
# For a test run with 3 questions, uncomment the next line:
# questions = questions[:3]

In [11]:
# Sanity check: number of questions that the query loop will iterate over.
len(questions)

100

### Run Global Query

In [12]:
# Clustering method that was used: set to "default" or "ECCD".
# It selects the sub-directory and prefix of the result file.
run_type = "default"
# Label of the run iteration you are on; it becomes part of the result file name.
run_iter = "swap_40%"

In [13]:
def save_results(filename: str, results: list, error_msg: str = None) -> None:
    """Write ``results`` to ``filename`` as indented UTF-8 JSON.

    Args:
        filename: Path of the JSON file to create or overwrite.
        results: JSON-serialisable list of result records.
        error_msg: Optional description of a failure. When it is non-empty it
            is printed before the file is written, and the function exits with
            status 1 after the file has been written.

    Raises:
        SystemExit: With code 1, after saving, when ``error_msg`` is non-empty.
    """
    has_error = bool(error_msg)

    print("Saving results...")
    if has_error:
        print(f"Error: {error_msg}")

    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)

    if not has_error:
        return
    sys.exit(1)

if run_type == "default":
    output_filename = f'../_MRP_eval/default/result_file_default_{run_iter}.json'
elif run_type == "ECCD":
    output_filename = f'../_MRP_eval/ECCD/result_file_eccd_{run_iter}.json'
else:
    raise ValueError("Invalid run_type. Use 'default' or 'ECCD'.")

# Check if output directory exists
output_dir = os.path.dirname(output_filename)
if not os.path.exists(output_dir):
    print(f"Error: Output directory does not exist: {output_dir}")
    sys.exit(1)

result_lst = []
for i, question in enumerate(questions, start=1):
    try:
        if question is None or str(question).strip() == '':
            save_results(output_filename, result_lst, f"Search query cannot be empty for Question #{i}")
            
        query = str(question)
        result = await search_engine.search(query)
        result_lst.append({"result": result.response})
    except ValueError as e:
        save_results(output_filename, result_lst, f"Error converting to string for Question #{i}: {e}")
    except Exception as e:
        save_results(output_filename, result_lst, f"Unexpected error processing Question #{i}: {e}")

# Save final results
save_results(output_filename, result_lst)

Saving results...


### Optional

In [12]:
# Inspect the data used to build the context for the LLM responses.
# `result` is the value last assigned by the query loop, i.e. the search result
# of the last question that was processed.
result.context_data["reports"]

Unnamed: 0,id,title,occurrence weight,summary,rank
0,2,Taxpayer Community in Canada,1.000000,The taxpayer community in Canada encompasses i...,8.5
1,5,Canadian Financial and Governance Entities,0.556701,This community encompasses key entities involv...,7.5
2,16,Taxpayer and Partnership Community,0.416495,The community is centered around the relations...,7.5
3,246,Taxation and Partnership Framework,0.294845,The community encompasses various entities rel...,7.5
4,10,Employee Benefit and Disability Savings Community,0.280412,This community encompasses various entities re...,7.5
...,...,...,...,...,...
272,261,Payment and Payer Community,0.004124,The community centers around the concepts of P...,4.0
273,91,Listed Terrorist Entities and Legal Compliance,0.002062,This community focuses on the implications of ...,8.5
274,160,2010 Paralympic Winter Games and International...,0.002062,The community centers around the 2010 Paralymp...,7.5
275,211,Canadian Lease Provisions Community,0.002062,The community is centered around Subsection 16...,6.5


In [13]:
# Report how many LLM calls and tokens the last search used.
usage_summary = (
    f"LLM calls: {result.llm_calls}. "
    f"Prompt tokens: {result.prompt_tokens}. "
    f"Output tokens: {result.output_tokens}."
)
print(usage_summary)

LLM calls: 4. Prompt tokens: 26205. Output tokens: 1275.
