# Let's visualize the graph

In [1]:
import networkx as nx
from pyvis.network import Network

# Read the GraphML export of the book index into a NetworkX graph
graphml_path = "book_index/output/graph.graphml"
G = nx.read_graphml(graphml_path)

# Wrap the graph in a pyvis network created in notebook mode
# (use notebook=False instead to produce a full HTML page)
net = Network(notebook=True)
net.from_nx(G)

# Write the interactive view to an HTML file and display it as the cell result
net.show("graph.html")

graph.html


# Define the function that calls GraphRAG to answer a query

`ask_graph` runs the `graphrag query` CLI in a subprocess and then post-processes its output:

- Splits the output using `"Search Response:\n"` to find the answer portion.
- Uses a regular expression `re.sub(r'\[Data:.*?\]', '', text)` to remove any `[Data: …]` metadata tags embedded in the result.
- Trims any leading/trailing whitespace.

In [2]:
import subprocess
from IPython.display import Markdown
import os
import re
    

def ask_graph(query,
              python_executable='/opt/anaconda3/envs/ms-graphrag-dev/bin/python',
              root='./book_index',
              method='global'):
    """Run a GraphRAG query through its CLI and return the cleaned answer text.

    Args:
        query: The question to pass to ``graphrag query --query``.
        python_executable: Interpreter used to run ``python -m graphrag``.
            Defaults to the environment this notebook was developed against;
            override it on any other machine.
        root: GraphRAG project root passed to ``--root``.
        method: Search method passed to ``--method``.

    Returns:
        The text that follows the first ``"Search Response:\\n"`` marker in the
        CLI output, with ``[Data: ...]`` tags removed and surrounding
        whitespace stripped.

    Raises:
        subprocess.CalledProcessError: If the CLI exits with a non-zero status.
        ValueError: If the CLI output does not contain the response marker.
    """
    command = [python_executable,
               '-m',
               'graphrag',
               'query',
               '--root',
               root,
               '--method',
               method,
               '--query',
               query]
    # stderr is merged into stdout, so CLI diagnostics end up in `output` and
    # can be surfaced in the error below when no answer is found.
    output = subprocess.check_output(command, universal_newlines=True, stderr=subprocess.STDOUT)

    # Keep everything after the FIRST marker: an answer that itself contains
    # the marker text is not truncated, and a missing marker is reported
    # explicitly instead of failing with a bare IndexError.
    marker = "Search Response:\n"
    _, found, text = output.partition(marker)
    if not found:
        raise ValueError(
            "graphrag output did not contain the 'Search Response:' marker; "
            "last output was:\n" + output[-500:]
        )
    return re.sub(r'\[Data:.*?\]', '', text).strip()

# Sample question used to smoke-test ask_graph; the length limit is part of
# the prompt text itself. (The literal previously opened with four quote
# characters, which put a stray '"' at the start of the query.)
query = """How does Dickens establish Scrooge's character through 
environmental imagery rather than direct description? 
Make sure the answer does not exceed 300 characters."""

ask_graph(query)

"**Dickens portrays Scrooge's character through environmental imagery, like his cold, dark office contrasting with warm Christmas settings, reflecting his isolation and lack of empathy . This indirect approach reveals Scrooge's inner traits through the external world he inhabits, adding depth to his characterization.**"

# Load the golden QnA data generated by Claude 3.7 Sonnet

In [3]:
import pandas as pd

# Read the golden question/answer pairs into a DataFrame
golden_path = "golden_data.json"
df = pd.read_json(golden_path)

# Preview the first five rows (the default for head())
df.head()

Unnamed: 0,reference_question,reference_answer
0,What literary device does Dickens use in the o...,"Repetition (""Marley was dead"") and paradox (""d..."
1,What is the symbolic significance of Scrooge k...,It symbolizes Scrooge's inability to let go of...
2,How does Dickens establish Scrooge's character...,"Through cold imagery: he ""iced his office,"" ca..."
3,What is the thematic purpose of the contrast b...,It juxtaposes institutional cruelty with famil...
4,What narrative technique does Dickens use when...,"Contradictory descriptors (""like a child; yet ..."


# Let's call GraphRAG to get the answers

In [4]:
import tqdm

# Pull the golden questions and answers out of the DataFrame as plain lists
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()

# Ask GraphRAG each question in order, with a progress bar over the list
graphrag_answers = [
    ask_graph(question) for question in tqdm.tqdm(reference_questions)
]

# Attach the answers as a new column and save the result for the evaluation step
df["graphrag_answer"] = graphrag_answers
df.to_json("result_graphrag.json")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 98/98 [14:04<00:00,  8.62s/it]


# Define the evaluation code using GPT-4

In [5]:
import openai

# Create the OpenAI client used by the evaluation code. No API key is passed
# here, so the client has to obtain it from its own configuration
# (presumably the OPENAI_API_KEY environment variable — confirm).
openai_client = openai.OpenAI()  # shared client instance, created with default settings

def evaluate_with_llm(question, golden, prediction, model="gpt-4", client=None):
    """Ask a chat model to grade a predicted answer against the golden answer.

    Args:
        question: The reference question.
        golden: The golden (reference) answer.
        prediction: The model answer to be graded.
        model: Chat model name used for grading, e.g. "gpt-4" or
            "gpt-3.5-turbo".
        client: Object exposing ``chat.completions.create(...)``. Defaults to
            the notebook-level ``openai_client``.

    Returns:
        The score as a string (e.g. ``"4"`` or ``"4.5"``) when the reply is a
        bare score, optionally written as ``"Score: 4"`` or ``"4/5"``.
        Otherwise the raw reply is returned unchanged so nothing is lost.
    """
    import re  # local import keeps this cell independent of other cells' imports

    if client is None:
        client = openai_client

    # The last prompt line pins the reply format: the scores are averaged
    # later, so free-form prose in the reply would break that step.
    prompt = f"""
    Question: {question}
    Golden Answer: {golden}
    Model Answer: {prediction}

    Evaluate the model answer against the golden answer. 
    Respond with a score between 1 (poor) and 5 (perfect) based on accuracy, relevance, and completeness.
    Reply with the numeric score only, with no explanation or extra text.
    """

    response = client.chat.completions.create(
        model=model,
        temperature=0,  # keep grading as repeatable as the API allows
        messages=[
            {"role": "system", "content": "You are an expert evaluator."},
            {"role": "user", "content": prompt}
        ]
    )

    result_text = response.choices[0].message.content

    # Normalize replies that are recognizably just a score; anything else is
    # passed through untouched rather than guessed at.
    score_match = re.fullmatch(
        r"\s*(?:score\s*[:=]\s*)?([1-5](?:\.\d+)?)(?:\s*/\s*5)?\s*\.?\s*",
        result_text or "",
        flags=re.IGNORECASE,
    )
    return score_match.group(1) if score_match else result_text


# Call the Evaluation method for all the golden examples and store the scores

In [6]:
from tqdm import tqdm

# Reload the saved GraphRAG answers from disk rather than relying on the
# DataFrame left in memory by the generation step.
df = pd.read_json("result_graphrag.json")
reference_questions = df["reference_question"].tolist()
reference_answers = df["reference_answer"].tolist()
graphrag_answers = df["graphrag_answer"].tolist()
eval_scores = list()

# zip() has no length, so tqdm needs an explicit total to render a
# percentage bar and ETA instead of a bare iteration counter.
qa_triples = zip(reference_questions, reference_answers, graphrag_answers)
for reference_question, reference_answer, graphrag_answer in tqdm(qa_triples, total=len(reference_questions)):
    eval_scores.append(evaluate_with_llm(reference_question, reference_answer, graphrag_answer))

# Store the scores next to the answers and save them for the summary step
df["gpt4_score"] = eval_scores
df.to_json("result_graphrag_score.json")

98it [01:14,  1.32it/s]


# Mean score for all the examples in the golden dataset

In [7]:
# Reload the scored results from disk
df = pd.read_json("result_graphrag_score.json")

# Average the per-question scores; the value is shown as the cell result
gpt4_scores = df["gpt4_score"]
gpt4_scores.mean()

4.4290816326530615