# GraphRAG per query locali

In [156]:
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
import os
import pandas as pd
import tiktoken
import time

## Selezione dei file utili

In [157]:
# Directory expected to contain the artifacts of one indexing run.
file_path = '../../output/20241001-101207/artifacts'

# Sanity check only: a message is printed in either case and execution
# continues regardless of the outcome.
if os.path.exists(file_path) and os.listdir(file_path):
    print("The path exists and is not empty.")
else:
    print("The specified path is empty or does not exist.")

The path exists and is not empty.


In [158]:
# All index artifacts are read from the directory chosen in the previous cell.
INPUT_DIR = file_path
# Location handed to the LanceDB vector store: a "lancedb" folder inside INPUT_DIR.
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" suffix) of the artifact tables.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
# NOTE(review): COVARIATE_TABLE is never read in the visible code (the context
# builder receives covariates=None) — confirm covariates are intentionally unused.
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
# Passed to read_indexer_reports and read_indexer_entities when loading the index.
COMMUNITY_LEVEL = 4

In [159]:
# Load the raw artifact tables as DataFrames.
# Mind the naming: entity_df holds ENTITY_TABLE ("create_final_nodes") while
# nodes_df holds ENTITY_EMBEDDING_TABLE ("create_final_entities").
# NOTE(review): in the visible code only entity_df from this cell is consumed
# (by read_indexer_reports) before the loading cell re-reads it; relationship_df
# is re-read before use, and nodes_df / df_report are not referenced anywhere
# else — confirm whether these extra reads are still needed.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
nodes_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
df_report = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")

In [160]:
# Tokenizer later passed to the context builder and the search engine.
# NOTE(review): "cl100k_base" is a tiktoken encoding, while the LLM configured
# further down is a Llama model — token counts are presumably approximate; confirm
# this is acceptable for the configured token budgets.
token_encoder = tiktoken.get_encoding("cl100k_base")

In [161]:
# Community reports. Note that entity_df used here is still the frame loaded
# by the earlier cell; it is only re-read from disk a few statements below.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

# Entities: built from the ENTITY_TABLE frame together with the
# ENTITY_EMBEDDING_TABLE frame, at the same COMMUNITY_LEVEL as the reports.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Relationships between entities.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

# Text units.
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

In [162]:
# `entities` was already built by the loading cell from the same dataframes and
# the same COMMUNITY_LEVEL, so it is reused here rather than recomputed.

# Store the entity description embeddings in a LanceDB collection. The store is
# connected at LANCEDB_URI (a "lancedb" folder under INPUT_DIR), i.e. it is
# given a filesystem location rather than being a purely in-memory store.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

In [163]:
from dotenv import load_dotenv

load_dotenv()

True

## Definizione del LLM

In [164]:
# The key comes from the environment (populated by load_dotenv() in the
# previous cell); indexing os.environ raises KeyError if the variable is unset.
api_key = os.environ["GRAPHRAG_API_KEY"]

# Chat model name and the base URL of the server it is requested from.
# NOTE: api_base is rebound by the embedder cell to a different endpoint.
llm_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
api_base = "http://172.18.21.132:8000/v1"

# Chat client used by the search engine to generate answers.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,
    api_base=api_base,
    max_retries=20,
)

# Same encoding as the token_encoder created in the earlier cell (re-created here).
token_encoder = tiktoken.get_encoding("cl100k_base")

## Definizione dell'Embedder

In [165]:
# Name of the embedding model requested from the embedding endpoint.
embedding_model = "nextfire/paraphrase-multilingual-minilm:l12-v2"
# Rebinds the module-level api_base that the LLM cell pointed at the chat
# server; re-running the LLM cell afterwards switches the variable back.
# NOTE(review): this URL already ends in /api/embeddings while the client is
# configured with OpenaiApiType.OpenAI — confirm the client does not append its
# own path to api_base.
api_base = "http://localhost:11434/api/embeddings"

# Embedding client used by the context builder; it reuses the api_key read in
# the LLM cell, and the model name doubles as the deployment name.
text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20
)

## Configurazione context builder

In [166]:
# Context builder for local search, wired to everything loaded above: reports,
# text units, entities, relationships, the description embedding store, the
# embedder and the tokenizer.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # No covariates are supplied; the covariates table is never loaded in this notebook.
    covariates=None,
    entity_text_embeddings=description_embedding_store,
    # Same key type as "embedding_vectorstore_key" in local_context_params.
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [167]:
# Settings handed to LocalSearch as context_builder_params.
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 20,
    "top_k_relationships": 20,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    # Same key type that is passed to LocalSearchMixedContext directly.
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # NOTE(review): presumably the token budget for the assembled context; together
    # with the 1000-token limit in llm_params it has to fit the served model's
    # context window — confirm against the server configuration.
    "max_tokens": 6000,
}

# Settings handed to LocalSearch as llm_params.
llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
}

## Definizione Search engine

In [168]:
# Local search engine combining the chat client, the context builder and the
# two parameter dictionaries defined above. ask_question() calls its asearch().
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="Single page",
)

## Funzione per esecuzione di query

In [169]:
import asyncio
from IPython.display import display, Markdown

In [170]:
async def ask_question(question):
    """Run one local-search query and return the text of its response.

    Best-effort: any ``Exception`` raised while searching or while reading the
    response is reported on stdout and replaced by a fixed placeholder string,
    so a single failing question never aborts a batch.

    Args:
        question: The query passed to ``search_engine.asearch``.

    Returns:
        The ``response`` attribute of the search result, or the placeholder
        ``"Error: Unable to retrieve answer."`` on failure.
    """
    try:
        # Both the search call and the attribute access stay inside the try,
        # so a malformed result is handled like a failed search.
        return (await search_engine.asearch(question)).response
    except Exception as exc:
        print(f"Error processing question: {question}\nException: {exc}")
    return "Error: Unable to retrieve answer."

## Estrazione domande locali

In [171]:
import json
def extract_questions_from_json(file_path):
    """Return the questions stored in a single JSON file.

    The file is read as UTF-8. When the parsed content has an ``'examples'``
    entry, the ``'question'`` value of each example is collected in order;
    otherwise the result is empty.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list with one question per example (possibly empty).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Files without an 'examples' entry contribute no questions.
    if 'examples' not in payload:
        return []
    return [item['question'] for item in payload['examples']]

In [172]:
def extract_all_questions_from_directory(directory_path):
    """Collect the questions of every ``.json`` file directly inside a directory.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the order of the returned questions
    (and of anything saved from them) could differ between runs or machines.

    Args:
        directory_path: Directory to scan. Only its direct entries whose name
            ends with ``.json`` are read; it is not scanned recursively.

    Returns:
        A flat list with the questions of all matching files, file by file.
    """
    all_questions = []

    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith(".json"):
            file_path = os.path.join(directory_path, filename)
            all_questions.extend(extract_questions_from_json(file_path))

    return all_questions

In [173]:
# Directory scanned for *.json files containing the local questions.
data_path = '../DatasetCreation/Local'
# Flat list of every question found there; this is the batch sent to the model.
all_questions = extract_all_questions_from_directory(data_path)

In [174]:
# Report how many questions were extracted (the message is in Italian).
print(f"Numero totale di domande estratte: {len(all_questions)}")

Numero totale di domande estratte: 45


## Risposte del modello alle domande locali

In [175]:
import nest_asyncio

In [176]:
async def process_all_questions(questions):
    """Answer all questions concurrently.

    One ``ask_question`` coroutine is created per question and all of them are
    awaited together with ``asyncio.gather``.

    Args:
        questions: Iterable of questions to ask.

    Returns:
        The list produced by ``asyncio.gather``: one answer per question, in
        the order the questions were given.
    """
    pending = [ask_question(item) for item in questions]
    return await asyncio.gather(*pending)

In [177]:
# Time the whole batch of local questions.
# NOTE: the top-level `await` relies on the notebook's running event loop; it
# is not valid in a plain Python script.
# All questions are awaited together through asyncio.gather, so this is the
# elapsed time of the concurrent batch, not a sum of per-question latencies.
# Failed questions are included as well, because ask_question returns a
# placeholder answer instead of raising.
start_time = time.time()
answers = await process_all_questions(all_questions)

end_time = time.time()
total_time_local = end_time - start_time



In [178]:
# Pair every question with the answer obtained for it. `answers` is in the
# same order as `all_questions`, so a positional zip is sufficient. Building
# the list with a comprehension also avoids leaving the loop variables behind
# in the notebook namespace.
model_answers = [
    {"question": asked, "answer": replied}
    for asked, replied in zip(all_questions, answers)
]

## Salvataggio risposte in JSON

In [179]:
# Wrap the question/answer records under a single top-level "questions" key.
output_data = {
    "questions": model_answers
}

# Write with an explicit UTF-8 encoding (matching how the question files are
# read) and without ASCII escaping, so accented characters in questions and
# answers stay readable in the file instead of becoming \uXXXX sequences.
with open('GraphRAG_local_responses.json', 'w', encoding='utf-8') as outfile:
    json.dump(output_data, outfile, indent=4, ensure_ascii=False)

## Salvataggio tempi

In [180]:
import csv
# One header row plus one data row for the local-question run. The question
# count is taken from the batch that was actually timed instead of being
# hard-coded, so it cannot drift from the real number of questions.
time_data = [
    ["Type of question", "Time (seconds)", "Number of questions"],
    ["Local", total_time_local, len(all_questions)],
]

output_file = 'gr_local.csv'
# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(time_data)

print(f"Tempi salvati in {output_file}")

Tempi salvati in gr_local.csv
