# Read the data from book.txt, split it into chunks, and create the docs

In [1]:
import nest_asyncio
import asyncio
nest_asyncio.apply()
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# Load the book as a single llama-index Document.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open("./book.txt", encoding="utf-8") as f:
    doc = f.read()
document = Document(text=doc)

# Initialize the splitter.
# SentenceSplitter measures chunk_size / chunk_overlap in tokens, not characters.
splitter = SentenceSplitter(
    chunk_size=1024,    # Maximum number of tokens per chunk
    chunk_overlap=20,   # Number of tokens shared between consecutive chunks
)

# Split the document into chunk nodes (each node is a chunk, not a single sentence).
nodes = splitter.get_nodes_from_documents([document])

# Plain-text passages, one per chunk; this is what gets indexed later.
docs = [node.text for node in nodes]

# Initialize HippoRAG and create the indexing

In [2]:
from hipporag import HippoRAG

# Settings handed to HippoRAG: an output directory name plus the names of the
# LLM and embedding models to use.
save_dir = 'hipporag_books'
llm_model_name = 'gpt-3.5-turbo'
embedding_model_name = 'text-embedding-3-small'

# Gather the constructor arguments in one place, then start the instance.
hipporag_kwargs = dict(
    save_dir=save_dir,
    llm_model_name=llm_model_name,
    embedding_model_name=embedding_model_name,
)
hipporag = HippoRAG(**hipporag_kwargs)

# Index the chunked passages prepared in the previous cell.
hipporag.index(docs=docs)

2025-04-23 00:40:17,654	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
90it [00:00, 53092.46it/s]
90it [00:00, 2483469.47it/s]


# Test a sample query

In [3]:
# Sample question used as a smoke test of the index.
# The literal opens with exactly three quotes: a fourth one would become a
# stray '"' at the start of the text sent to the model.
query = """How does Dickens establish Scrooge's character through 
environmental imagery rather than direct description? 
Make sure the answer does not exceed 300 characters."""

# rag_qa takes a list of queries; [0][0] selects the result for our single
# query and .answer is the generated answer text.
print(hipporag.rag_qa(queries=[query])[0][0].answer)

Retrieving: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 38.53it/s]
Collecting QA prompts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 760.53it/s]
QA Reading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 583.03it/s]
Extraction Answers from LLM Response: 1it [00:00, 1084.92it/s]

Dickens uses environmental imagery like fog, frost, and darkness to reflect Scrooge's cold and bitter personality, indirectly portraying his lack of warmth and compassion.





# Load the golden QnA data generated by Claude 3.7 Sonnet

In [4]:
import pandas as pd

# Read the golden question/answer pairs into a DataFrame.
df = pd.read_json("golden_data.json")

# Preview the first five rows (head() defaults to 5).
df.head()

Unnamed: 0,reference_question,reference_answer
0,What literary device does Dickens use in the o...,"Repetition (""Marley was dead"") and paradox (""d..."
1,What is the symbolic significance of Scrooge k...,It symbolizes Scrooge's inability to let go of...
2,How does Dickens establish Scrooge's character...,"Through cold imagery: he ""iced his office,"" ca..."
3,What is the thematic purpose of the contrast b...,It juxtaposes institutional cruelty with famil...
4,What narrative technique does Dickens use when...,"Contradictory descriptors (""like a child; yet ..."


# Let's call HippoRAG to get the answers

In [5]:
import tqdm

# Pull the golden questions and answers out of the DataFrame as plain lists.
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()

# Ask HippoRAG each golden question in turn and keep only the answer text.
graphrag_answers = []
for question in tqdm.tqdm(reference_questions):
    qa_result = hipporag.rag_qa(queries=[question])
    graphrag_answers.append(qa_result[0][0].answer)

# Store the answers next to the golden data and persist them for the scoring step.
df["hippographrag_answer"] = graphrag_answers
df.to_json("result_hippographrag.json")

  0%|                                                                                                                                                                                          | 0/98 [00:00<?, ?it/s]
Retrieving:   0%|                                                                                                                                                                               | 0/1 [00:00<?, ?it/s][A
Retrieving: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.23it/s][A

Collecting QA prompts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 874.91it/s][A

QA Reading:   0%|                                                                                                                

# Define the evaluation code using GPT-4

In [6]:
import openai

# Create the OpenAI client used by the evaluation helper below.
# NOTE(review): no API key is passed here, so it is presumably read from the
# environment (e.g. OPENAI_API_KEY) — confirm before running.
openai_client = openai.OpenAI()  # <-- create a client

def evaluate_with_llm(question, golden, prediction, model="gpt-4", client=None):
    """Ask an LLM judge to score a model answer against the golden answer.

    Args:
        question: The question that was asked.
        golden: The reference ("golden") answer.
        prediction: The answer produced by the system under evaluation.
        model: Name of the chat model used as the judge (e.g. "gpt-4" or
            "gpt-3.5-turbo").
        client: Object exposing ``chat.completions.create``. Defaults to the
            notebook-level ``openai_client``.

    Returns:
        str: The judge's reply with surrounding whitespace removed. The prompt
        asks for a bare integer from 1 to 5, but the reply is not validated.
        An empty string is returned when the reply has no text content.
    """
    if client is None:
        client = openai_client

    prompt = f"""
    Question: {question}
    Golden Answer: {golden}
    Model Answer: {prediction}

    Evaluate the model answer against the golden answer. 
    Respond with a score between 1 (poor) and 5 (perfect) based on accuracy, relevance, and completeness.
    Reply with only the integer score and nothing else.
    """

    response = client.chat.completions.create(
        model=model,
        # Keep the judge as repeatable as possible across runs.
        temperature=0,
        messages=[
            {"role": "system", "content": "You are an expert evaluator."},
            {"role": "user", "content": prompt}
        ]
    )

    # content may be None; normalise to a stripped string so downstream
    # numeric parsing of the scores is not tripped up by whitespace.
    result_text = response.choices[0].message.content
    return (result_text or "").strip()


# Call the Evaluation method for all the golden examples and store the scores

In [7]:
from tqdm import tqdm

# Reload the answers saved earlier so this cell does not need them recomputed.
df = pd.read_json("result_hippographrag.json")
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()
graphrag_answers = df["hippographrag_answer"].tolist()
eval_scores = list()

# zip() has no length, so tqdm is given total= explicitly; without it the bar
# can only show a raw counter instead of a percentage and ETA.
qa_triples = zip(reference_questions, reference_answers, graphrag_answers)
for reference_question, reference_answer, graphrag_answer in tqdm(qa_triples, total=len(reference_questions)):
    eval_scores.append(evaluate_with_llm(reference_question, reference_answer, graphrag_answer))

# Attach the judge's scores and persist them for the summary step.
df["gpt4_score"] = eval_scores
df.to_json("result_hippographrag_score.json")

98it [01:07,  1.45it/s]


# Mean score for all the examples in the golden dataset

In [8]:
# Reload the scored results from disk and report the average judge score.
df = pd.read_json("result_hippographrag_score.json")
mean_score = df["gpt4_score"].mean()
mean_score

3.7971938775510203