# GraphRAG per query globali

In [21]:
import os
import pandas as pd
import tiktoken
import os
from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
from IPython.display import Markdown, display
import time

## Selezione dei file utili

In [22]:
# Folder holding the parquet artifacts of the indexing run used by this notebook.
file_path = '../../output/20240925-154939/artifacts'

# isdir() (rather than exists()) keeps os.listdir() from raising
# NotADirectoryError when the path points at a regular file.
if not os.path.isdir(file_path) or not os.listdir(file_path):
    print("The specified path is empty or does not exist.")
else:
    print("The path exists and is not empty.")

The path exists and is not empty.


In [23]:
# Root folder of the indexing artifacts; every table below is read from it.
INPUT_DIR = file_path

# Base names (without the ".parquet" suffix) of the tables this notebook loads.
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"

# Defined for completeness; none of the visible cells reads these.
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

In [24]:
# Load the four index tables in one pass, in the same order as before.
# Mind the naming: entity_df holds the *nodes* table (ENTITY_TABLE), while
# nodes_df holds the *entities* table (ENTITY_EMBEDDING_TABLE).
entity_df, nodes_df, relationship_df, df_report = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (
        ENTITY_TABLE,
        ENTITY_EMBEDDING_TABLE,
        RELATIONSHIP_TABLE,
        COMMUNITY_REPORT_TABLE,
    )
)

In [25]:
# Tokenizer handed to both the context builder and the search engine below.
# NOTE(review): cl100k_base is an OpenAI encoding while the configured LLM is a
# Llama model, so token counts are presumably only approximate — confirm.
token_encoder = tiktoken.get_encoding("cl100k_base")

In [26]:
# INPUT_DIR and the *_TABLE names are already set in the earlier configuration
# cell with these exact values, so they are not re-assigned here.

# Community level passed to read_indexer_reports / read_indexer_entities below.
COMMUNITY_LEVEL = 4

In [27]:
# Load every table this cell needs *before* using it, so the cell no longer
# relies on an `entity_df` left over from an earlier cell.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")

# Community reports and entities at the chosen community level; both are
# handed to the context builder further down.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

In [28]:
from dotenv import load_dotenv

# Populate the process environment (presumably from a local .env file) so that
# GRAPHRAG_API_KEY can be read from os.environ in the next cell.
load_dotenv()

True

## Definizione del LLM

In [29]:
# Raises KeyError if GRAPHRAG_API_KEY is not present in the environment.
api_key = os.environ["GRAPHRAG_API_KEY"]
# Model identifier sent with every request.
llm_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# NOTE(review): hard-coded private address; presumably an OpenAI-compatible
# server hosting the model above — confirm it is reachable before running.
api_base = "http://172.18.21.132:8000/v1"

In [30]:
# Chat client passed to GlobalSearch below.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # plain OpenAI-style API; presumably the alternative is Azure OpenAI — confirm
    api_base=api_base,  # send requests to the custom endpoint configured above
    max_retries=20,  # presumably the retry budget for failed requests — confirm in graphrag
)

## Configurazione context builder

In [31]:
# Context builder for global search, fed with the community reports and
# entities loaded above and the shared tokenizer.
context_builder = GlobalCommunityContext(
    community_reports=reports,
    entities=entities,  # per the GraphRAG example, used for community weights in ranking — confirm
    token_encoder=token_encoder,
)

In [32]:
# Options forwarded to the context builder through GlobalSearch
# (context_builder_params=...). Meanings marked "per the GraphRAG example" come
# from GraphRAG's global-search notebook; confirm against the installed version.
context_builder_params = {
    "use_community_summary": True,  # per the GraphRAG example: True = short community summaries, False = full reports
    "shuffle_data": True,
    "include_community_rank": True,
    "min_community_rank": 0,
    "community_rank_name": "rank",
    "include_community_weight": True,
    "community_weight_name": "occurrence weight",
    "normalize_community_weight": True,
    "max_tokens": 6000,  # same value as max_data_tokens given to GlobalSearch; size to the model's context limit
    "context_name": "Reports",
}

# LLM options for the map step; a JSON object response is requested.
map_llm_params = {
    "max_tokens": 1500,
    "temperature": 0.0,
    "response_format": {"type": "json_object"},
}

# LLM options for the reduce step (no response_format is set here).
reduce_llm_params = {
    "max_tokens": 1500,  # presumably caps the length of the final answer — confirm
    "temperature": 0.0,
}

## Definizione Search engine

In [33]:
# Global search engine wired to the LLM, context builder and parameter
# dictionaries defined above. Comments marked "per the GraphRAG example" are
# taken from GraphRAG's global-search notebook; confirm for the installed version.
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=6000,  # matches "max_tokens" in context_builder_params
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # per the GraphRAG example: True encourages general knowledge, at the risk of hallucinations
    json_mode=True,  # per the GraphRAG example: set to False if the model does not support JSON mode
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="Single page",  # per the GraphRAG example: free-form description of the desired answer format
)

## Funzione per esecuzione query

In [34]:
import asyncio
from IPython.display import display, Markdown

In [35]:
async def ask_question(question):
    """Run one global-search query and return the answer text.

    Best-effort by design: any exception raised while searching or while
    reading the response is reported on stdout and replaced by a fixed
    error string, so a single failing question does not abort a batch.

    Args:
        question: The question passed to ``search_engine.asearch``.

    Returns:
        The ``response`` attribute of the search result, or
        ``"Error: Unable to retrieve answer."`` on failure.
    """
    try:
        # Attribute access stays inside the try so a malformed result is
        # handled the same way as a failed search.
        return (await search_engine.asearch(question)).response
    except Exception as err:
        print(f"Error processing question: {question}\nException: {err}")
        return "Error: Unable to retrieve answer."

## Estrazione delle domande globali

In [36]:
import json

# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's default locale encoding.
with open('../../DatasetCreation/Global_questions.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [37]:
# Keep only the question text of every entry under the 'questions' key.
questions_list = [entry['question'] for entry in data['questions']]

# Print the questions to verify
for number, text in enumerate(questions_list, start=1):
    print(f"Question {number}: {text}")

Question 1: What are the main topics covered by the data in the set of time-series papers?
Question 2: How does RestAD leverage both statistical methods and machine learning to achieve robust anomaly detection in noisy time-series data?
Question 3: What are the key features and benefits of RestAD in anomaly detection for time-series data?
Question 4: What are the key features and benefits of RestAD in anomaly detection for time-series data?
Question 5: How does TimeLLM differ from other models in time-series forecasting?
Question 6: How does AnomalyBERT work?
Question 7: How does TimeGPT approach time-series forecasting?
Question 8: What types of real-world applications can benefit from models like TimeLLM, RestAD, TimeGPT, AnomalyBERT, LagLLama and the other models described?
Question 9: What distinguishes LagLLama in its approach to time-series analysis?
Question 10: How do models like AnomalyBERT handle non-stationary data, and why is this important?
Question 11: What are the main t

## Risposte del modello alle domande globali

In [38]:
import nest_asyncio

In [39]:
async def process_all_questions(questions):
    """Answer all questions concurrently.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list of answers in the same order as ``questions``.
    """
    # One ask_question coroutine per question, all awaited together.
    return await asyncio.gather(*(ask_question(item) for item in questions))

In [40]:
start_time = time.time()

answers = await process_all_questions(questions_list)

end_time = time.time()
total_time_global = end_time - start_time

In [41]:
# Pair every question with its answer (same order as questions_list).
model_answers = [
    {"question": asked, "answer": reply}
    for asked, reply in zip(questions_list, answers)
]

## Salvataggio risposte in JSON

In [42]:
# Wrap the question/answer pairs under a single "questions" key.
output_data = {"questions": model_answers}

# Write the responses next to the notebook as indented JSON.
with open('GraphRAG_responses.json', 'w') as responses_fh:
    json.dump(output_data, responses_fh, indent=4)

## Salvataggio tempi

In [43]:
import csv

# Header row followed by one row for the global-question run. The question
# count is taken from the list that was actually processed, so it cannot
# drift from the data the way a hard-coded number would.
time_data = [
    ["Type of question", "Time (seconds)", "Number of questions"],
    ["Global", total_time_global, len(questions_list)]
]

output_file = 'gr_global.csv'
# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(time_data)

print(f"Tempi salvati in {output_file}")

Tempi salvati in gr_global.csv
