# Read the text from book.txt, split it into chunks, and create the docs

In [3]:
import nest_asyncio
import asyncio

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the Jupyter kernel already runs a
# loop and the libraries used below drive their own — confirm.
nest_asyncio.apply()

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# Load the raw book text. The encoding is pinned so the read does not depend
# on the platform's default locale encoding.
with open("./book.txt", encoding="utf-8") as f:
    text = f.read()

# Wrap the whole book in a single LlamaIndex Document so it can be chunked.
document = Document(text=text)

# Initialize the splitter
splitter = SentenceSplitter(
    chunk_size=1024,    # Maximum number of tokens per chunk
    chunk_overlap=20,   # Number of tokens shared by consecutive chunks
)

# Split the document into chunk nodes. Each node holds a multi-sentence chunk
# of the book (see the first chunk displayed below), not a single sentence.
nodes = splitter.get_nodes_from_documents([document])

# Keep only the plain text of every chunk; these strings are what gets indexed.
docs = [node.text for node in nodes]
display(docs[0])

'A CHRISTMAS CAROL\n\nCHARACTERS\n\nBob Cratchit, clerk to Ebenezer Scrooge.\nPeter Cratchit, a son of the preceding.\nTim Cratchit ("Tiny Tim"), a cripple, youngest son of Bob Cratchit.\nMr. Fezziwig, a kind-hearted, jovial old merchant.\nFred, Scrooge\'s nephew.\nGhost of Christmas Past, a phantom showing things past.\nGhost of Christmas Present, a spirit of a kind, generous,\nand hearty nature.\nGhost of Christmas Yet to Come, an apparition showing the shadows\nof things which yet may happen.\nGhost of Jacob Marley, a spectre of Scrooge\'s former partner in business.\nJoe, a marine-store dealer and receiver of stolen goods.\nEbenezer Scrooge, a grasping, covetous old man, the surviving partner\nof the firm of Scrooge and Marley.\nMr. Topper, a bachelor.\nDick Wilkins, a fellow apprentice of Scrooge\'s.\n\nBelle, a comely matron, an old sweetheart of Scrooge\'s.\nCaroline, wife of one of Scrooge\'s debtors.\nMrs. Cratchit, wife of Bob Cratchit.\nBelinda and Martha Cratchit, daughters

# Initialize Fast GraphRAG and build the index

In [4]:
from typing import List

import instructor
from dotenv import load_dotenv

from fast_graphrag import GraphRAG
from fast_graphrag._llm import OpenAIEmbeddingService, OpenAILLMService


# Load variables from a local .env file (if one exists) into the process
# environment. No API key is passed explicitly to the services below, so
# credentials are expected to come from the environment.
load_dotenv()

# Prompt telling Fast GraphRAG what kind of knowledge to extract from the text.
DOMAIN = "Analyze this story and identify the characters. Focus on how they interact with each other, the locations they explore, and their relationships."

# Example questions, passed to GraphRAG as a newline-joined string.
QUERIES = [
    "What is the significance of Christmas Eve in A Christmas Carol?",
    "How does the setting of Victorian London contribute to the story's themes?",
    "Describe the chain of events that leads to Scrooge's transformation.",
    "How does Dickens use the different spirits (Past, Present, and Future) to guide Scrooge?",
    "Why does Dickens choose to divide the story into \"staves\" rather than chapters?"
]

# Entity categories passed to GraphRAG for graph construction.
ENTITY_TYPES = ["Character", "Animal", "Place", "Object", "Activity", "Event"]


save_dir = 'fastgraphrag_books'  # Working directory where the Fast GraphRAG objects are saved
llm_model_name = 'gpt-3.5-turbo'  # Any OpenAI model name
embedding_model_name = 'text-embedding-3-small'  # Embedding model name

fast_grag = GraphRAG(
    working_dir=save_dir,
    domain=DOMAIN,
    example_queries="\n".join(QUERIES),
    entity_types=ENTITY_TYPES,
    config=GraphRAG.Config(
        llm_service=OpenAILLMService(
            model=llm_model_name,
        ),
        embedding_service=OpenAIEmbeddingService(
            model=embedding_model_name,
        ),
    ),
)


# Index every chunk produced by the splitter. The loop variable is named
# `chunk` so it does not overwrite any notebook-level `doc` variable.
for chunk in docs:
    fast_grag.insert(chunk)

# Smoke-test the freshly built index with one broad question.
print(fast_grag.query("What are the top 5 themes in the data?").response)

Extracting data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.30s/it]
Building [done]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.39it/s]
Extracting data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.28s/it]
Building [done]: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  3.15it/s]
Extracting data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

The top 5 themes in the data are Christmas spirit, transformation, memories of the past, family relationships, and redemption.


# Load the golden QnA data generated by Claude 3.7 Sonnet

In [5]:
import pandas as pd

# Golden question/answer pairs used as the evaluation reference.
df = pd.read_json("golden_data.json")

# Preview the first rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,reference_question,reference_answer
0,What literary device does Dickens use in the o...,"Repetition (""Marley was dead"") and paradox (""d..."
1,What is the symbolic significance of Scrooge k...,It symbolizes Scrooge's inability to let go of...
2,How does Dickens establish Scrooge's character...,"Through cold imagery: he ""iced his office,"" ca..."
3,What is the thematic purpose of the contrast b...,It juxtaposes institutional cruelty with famil...
4,What narrative technique does Dickens use when...,"Contradictory descriptors (""like a child; yet ..."


# Let's call Fast GraphRAG to get the answers

In [7]:
import tqdm

# Golden questions and answers as plain Python lists.
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()

# Ask Fast GraphRAG every golden question in order, keeping only the response
# text of each result. tqdm renders a progress bar over the questions.
graphrag_answers = [
    fast_grag.query(question).response
    for question in tqdm.tqdm(reference_questions)
]

# Attach the answers to the golden data and persist them for the evaluation step.
df["fastgraphrag_answer"] = graphrag_answers
df.to_json("result_fastgraphrag.json")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 98/98 [06:23<00:00,  3.91s/it]


# Define the evaluation code using GPT-4

In [8]:
import openai

# Create the OpenAI client used by evaluate_with_llm below. No API key is
# passed here. NOTE(review): presumably the key is picked up from the
# environment (e.g. OPENAI_API_KEY) — confirm.
openai_client = openai.OpenAI()  # <-- create a client

def evaluate_with_llm(question, golden, prediction, model="gpt-4", client=None):
    """Grade a model answer against the golden answer using an LLM judge.

    Args:
        question: The question that was asked.
        golden: The reference ("golden") answer.
        prediction: The answer produced by the system under evaluation.
        model: Name of the chat model used as the judge. Defaults to "gpt-4".
        client: Object exposing ``chat.completions.create``. Defaults to the
            notebook-level ``openai_client``.

    Returns:
        The judge's reply as a whitespace-stripped string. The prompt asks for
        the numeric score only, so this is expected to be a bare number such
        as "4". An empty string is returned when the reply has no content.
    """
    if client is None:
        client = openai_client

    # The last instruction matters: downstream cells average the replies as
    # numbers, so the judge must not wrap the score in an explanation.
    prompt = f"""
    Question: {question}
    Golden Answer: {golden}
    Model Answer: {prediction}

    Evaluate the model answer against the golden answer.
    Respond with a score between 1 (poor) and 5 (perfect) based on accuracy, relevance, and completeness.
    Reply with the numeric score only, with no explanation or extra text.
    """

    response = client.chat.completions.create(
        model=model,
        temperature=0,  # keep grading as repeatable as possible across runs
        messages=[
            {"role": "system", "content": "You are an expert evaluator."},
            {"role": "user", "content": prompt}
        ]
    )

    result_text = response.choices[0].message.content
    # `content` may be None; normalise to a clean string either way.
    return (result_text or "").strip()


# Call the Evaluation method for all the golden examples and store the scores

In [10]:
from tqdm import tqdm

# Reload the answers saved by the previous step so this cell can be run
# without re-querying Fast GraphRAG.
df = pd.read_json("result_fastgraphrag.json")
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()
graphrag_answers = df["fastgraphrag_answer"].tolist()

# zip() has no length, so tqdm needs an explicit total to draw a real
# progress bar with percentage and ETA instead of a bare iteration counter.
qa_triples = zip(reference_questions, reference_answers, graphrag_answers)
eval_scores = [
    evaluate_with_llm(question, golden, prediction)
    for question, golden, prediction in tqdm(qa_triples, total=len(df))
]

# Store the judge's reply for every example next to the answers and persist it.
df["gpt4_score"] = eval_scores
df.to_json("result_fastgraphrag_score.json")

98it [01:08,  1.44it/s]


# Mean score for all the examples in the golden dataset

In [11]:
df = pd.read_json("result_fastgraphrag_score.json")

# The judge's replies are free text, so a reply that is not a plain number
# would make a direct .mean() on the column fail. Coerce to numeric first;
# replies that cannot be parsed become NaN and are skipped by mean().
scores = pd.to_numeric(df["gpt4_score"], errors="coerce")

# Report excluded rows so a silently shrinking sample does not go unnoticed.
n_unparsed = int(scores.isna().sum())
if n_unparsed:
    print(f"{n_unparsed} of {len(scores)} evaluator replies were not numeric and are excluded from the mean")

scores.mean()

np.float64(4.25765306122449)