# Read the data from book.txt, chunk and create docs

In [1]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# Load or create your document

with open("./book.txt") as f:
    doc = f.read() 
text = doc


document = Document(text=text)

# Initialize the splitter
splitter = SentenceSplitter(
    chunk_size=1024,    # Maximum number of characters per chunk
    chunk_overlap=20,   # Number of characters overlapping between chunks
)

# Parse the document into sentence-level nodes
nodes = splitter.get_nodes_from_documents([document])

docs = list()
# Each node contains a sentence
for node in nodes:
    docs.append(node.text)
display(docs[0])

'A CHRISTMAS CAROL\n\nCHARACTERS\n\nBob Cratchit, clerk to Ebenezer Scrooge.\nPeter Cratchit, a son of the preceding.\nTim Cratchit ("Tiny Tim"), a cripple, youngest son of Bob Cratchit.\nMr. Fezziwig, a kind-hearted, jovial old merchant.\nFred, Scrooge\'s nephew.\nGhost of Christmas Past, a phantom showing things past.\nGhost of Christmas Present, a spirit of a kind, generous,\nand hearty nature.\nGhost of Christmas Yet to Come, an apparition showing the shadows\nof things which yet may happen.\nGhost of Jacob Marley, a spectre of Scrooge\'s former partner in business.\nJoe, a marine-store dealer and receiver of stolen goods.\nEbenezer Scrooge, a grasping, covetous old man, the surviving partner\nof the firm of Scrooge and Marley.\nMr. Topper, a bachelor.\nDick Wilkins, a fellow apprentice of Scrooge\'s.\n\nBelle, a comely matron, an old sweetheart of Scrooge\'s.\nCaroline, wife of one of Scrooge\'s debtors.\nMrs. Cratchit, wife of Bob Cratchit.\nBelinda and Martha Cratchit, daughters

# Initialize HippoRAG and create the indexing

In [2]:
from hipporag import HippoRAG

save_dir = 'hipporag_books'# Define save directory for HippoRAG objects (each LLM/Embedding model combination will create a new subdirectory)
llm_model_name = 'gpt-3.5-turbo' # Any OpenAI model name
embedding_model_name = 'text-embedding-3-small'# Embedding model name (NV-Embed, GritLM or Contriever for now)

#Startup a HippoRAG instance
hipporag = HippoRAG(save_dir=save_dir, 
                    llm_model_name=llm_model_name,
                    embedding_model_name=embedding_model_name) 

#Run indexing
hipporag.index(docs=docs)

2025-04-20 11:15:52,985	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
NER: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:00<00:00, 85.25it/s, total_prompt_tokens=40205, total_completion_tokens=1403, num_cache_hit=38]
Extracting triples: 100%|█████████████████████████████████████████████████████████████████████████████████| 38/38 [00:20<00:00,  1.87it/s, total_prompt_tokens=52397, total_completion_tokens=13939, num_cache_hit=14]
Batch Encoding: 336it [00:33,  9.96it/s]                                                                                                                                                                              
Batch Encoding: 1168it [01:35, 12.26it/s]                                                                                                                

{'num_phrase_nodes': 844, 'num_passage_nodes': 90, 'num_total_nodes': 934, 'num_extracted_triples': 1167, 'num_triples_with_passage_node': 1304, 'num_synonymy_triples': 1597, 'num_total_triples': 4068}





# Load the golden QnA data generated by Claude 3.7 Sonnet

In [4]:
import pandas as pd

df = pd.read_json("golden_data.json")

df.head(5)

Unnamed: 0,reference_question,reference_answer
0,What literary device does Dickens use in the o...,"Repetition (""Marley was dead"") and paradox (""d..."
1,What is the symbolic significance of Scrooge k...,It symbolizes Scrooge's inability to let go of...
2,How does Dickens establish Scrooge's character...,"Through cold imagery: he ""iced his office,"" ca..."
3,What is the thematic purpose of the contrast b...,It juxtaposes institutional cruelty with famil...
4,What narrative technique does Dickens use when...,"Contradictory descriptors (""like a child; yet ..."


# Lets call graphrag to get the answers

In [21]:
import tqdm

reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()
graphrag_answers = list()

for i in tqdm.tqdm(range(len(reference_questions))):
    graphrag_answer = hipporag.rag_qa(queries=[reference_questions[i]])
    graphrag_answers.append(graphrag_answer[0][0].answer)

df["hippographrag_answer"] = graphrag_answers
df.to_json("result_hippographrag.json")

  0%|                                                                                                                                                                                          | 0/98 [00:00<?, ?it/s]

Retrieving: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 35.06it/s][A[A


Collecting QA prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1042.84it/s][A[A


QA Reading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 307.41it/s][A[A


Extraction Answers from LLM Response: 1it [00:00, 1074.36it/s]
  1%|█▊                                             

# Define the evaluation code using GPT-4

In [22]:
import openai

# First, set the API key
openai_client = openai.OpenAI()  # <-- create a client

def evaluate_with_llm(question, golden, prediction):
    prompt = f"""
    Question: {question}
    Golden Answer: {golden}
    Model Answer: {prediction}

    Evaluate the model answer against the golden answer. 
    Respond with a score between 1 (poor) and 5 (perfect) based on accuracy, relevance, and completeness.
    """

    response = openai_client.chat.completions.create(
        model="gpt-4",  # or "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": "You are an expert evaluator."},
            {"role": "user", "content": prompt}
        ]
    )
    
    result_text = response.choices[0].message.content
    return result_text


# Call the Evaluation method for all the golden examples and store the scores

In [23]:
from tqdm import tqdm

df = pd.read_json("result_hippographrag.json")
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()
graphrag_answers = df["hippographrag_answer"].tolist()
eval_scores = list()

for reference_question,reference_answer,graphrag_answer in tqdm(zip(reference_questions,reference_answers,graphrag_answers)):
    eval_scores.append(evaluate_with_llm(reference_question,reference_answer,graphrag_answer))

df["gpt4_score"] = eval_scores
df.to_json("result_hippographrag_score.json")

98it [01:17,  1.26it/s]


# Mean score for all the examples in the golden dataset

In [24]:
df = pd.read_json("result_hippographrag_score.json")
df["gpt4_score"].mean()

4.078061224489796