# This notebook is for evaluating the [MultiHop-RAG Benchmark](https://arxiv.org/pdf/2401.15391)

In [77]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
import json
import os
import re
import logging

# Load the JSON data
input_file = "multihop_rag_corpus/multihoprag_kb.json"
output_dir = "documents/"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Function to sanitize filenames
def sanitize_filename(text):
    """Turn arbitrary text into a filename-friendly token.

    Every character that is not a word character, whitespace or a hyphen is
    dropped, each whitespace run becomes one underscore, and underscores at
    either end are trimmed.
    """
    collapsed = re.sub(r'\s+', '_', re.sub(r'[^\w\s-]', '', text))
    return collapsed.strip('_')

# Function to clean text into raw plaintext
def clean_text(text):
    """Reduce ``text`` to plain ASCII while keeping it readable.

    Typographic punctuation (smart quotes, en/em dashes, ellipsis) is mapped
    to its ASCII equivalent and JSON-escaped slashes (``\\/``) become ``/``.
    The text is then compatibility-decomposed (NFKD) so that accented letters
    keep their base letter and non-breaking spaces become ordinary spaces.
    Whatever is still outside ASCII after that is dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned ASCII-only string. If cleaning raises (for example because
        ``text`` is not a string), the error is logged and ``text`` is
        returned in whatever state it had reached.
    """
    import unicodedata  # local import: only this function needs it

    # Single-character punctuation that has an obvious ASCII spelling. The em
    # dash is included so that "word—word" does not collapse into "wordword".
    punctuation = (
        ("“", '"'), ("”", '"'),
        ("‘", "'"), ("’", "'"),
        ("–", "-"), ("—", "-"),
        ("…", "..."),
    )
    try:
        for fancy, plain in punctuation:
            text = text.replace(fancy, plain)

        # Normalize URL slashes
        text = text.replace("\\/", "/")

        # Decompose so "é" becomes "e" + combining accent and NBSP becomes " ".
        text = unicodedata.normalize("NFKD", text)
        # NFKD spells vulgar fractions with U+2044; keep it as a plain slash.
        text = text.replace("\u2044", "/")

        # One ASCII pass removes the combining marks and anything else left
        # over (a separate UTF-8 or Latin-1 pass cannot remove more than this).
        return text.encode("ascii", "ignore").decode("ascii")
    except Exception as e:
        logging.error(f"Encoding error: {e}")
        return text  # Best effort: hand back what we have

# Function to generate unique filenames if duplicates exist
def get_unique_filepath(base_filepath):
    """Return a path that does not exist yet, derived from ``base_filepath``.

    The path is returned unchanged when nothing exists there. Otherwise
    ``_1``, ``_2``, ... is inserted before the extension until a free name is
    found.
    """
    if not os.path.exists(base_filepath):
        return base_filepath

    stem, suffix = os.path.splitext(base_filepath)
    index = 1
    candidate = f"{stem}_{index}{suffix}"
    while os.path.exists(candidate):
        index += 1
        candidate = f"{stem}_{index}{suffix}"
    return candidate

# Read and process JSON entries
# Read and process JSON entries
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE(review): re-running this cell does not overwrite earlier output;
# get_unique_filepath() sees the files from the previous run and writes
# "_1", "_2", ... copies next to them. Empty `output_dir` before re-running.
written_count = 0  # files that actually reached disk

for entry in data:
    source = sanitize_filename(entry["source"])
    published_date = entry["published_at"].split("T")[0]  # Extract YYYY-MM-DD

    # Construct base filename
    base_filename = f"{source}_{published_date}.txt"
    base_filepath = os.path.join(output_dir, base_filename)

    # Get a unique filename if it already exists
    filepath = get_unique_filepath(base_filepath)

    # Ensure proper formatting of URL and body
    url = clean_text(entry["url"])  # Fully cleaned URL
    body = clean_text(entry["body"])  # FULLY CLEAN TEXT

    # Format content into pure plaintext
    content = f"""{clean_text(entry["title"])}

Author: {clean_text(entry["author"])}
Source: {clean_text(entry["source"])}
Date: {published_date}
Category: {clean_text(entry["category"])}
URL: {url}

{body}
"""

    # Write to file safely as **pure** UTF-8
    try:
        with open(filepath, "w", encoding="utf-8", errors="ignore") as file:
            file.write(content)
        written_count += 1
    except Exception as e:
        logging.error(f"Failed to write {filepath}: {e}")

# Report what was written, not how many entries were read, so that failed
# writes are visible in the summary.
print(f"Processed {written_count} of {len(data)} entries. Files saved in '{output_dir}'")

Processed 609 entries. Files saved in 'documents/'


In [79]:
import warnings 
warnings.filterwarnings('ignore')

from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain_community.graphs.neo4j_graph import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_core.documents.base import Document
from langchain_community.document_loaders import (
    TextLoader,
    UnstructuredFileLoader,
    JSONLoader,
)

from utils.langchainLGT import LLMGraphTransformer 
from utils.checkpoint_manager import CheckpointManager
from utils.neo4j_utils import *
from utils.document_utils import *
from typing import Sequence

from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_ollama import OllamaEmbeddings
import os

from langchain_experimental.llms.ollama_functions import OllamaFunctions
from neo4j import  Driver
from openai import OpenAI
from langchain.chat_models import ChatOpenAI

import logging
from tqdm import tqdm
import json
import re
from tqdm import tqdm
from time import sleep
from datetime import datetime

from PyPDF2 import PdfReader
import pdfplumber
from docx import Document as DocxDocument
import docx
from markdown import markdown

import pickle
from typing import List, Dict, Any
from pathlib import Path

from dotenv import load_dotenv

# load_dotenv() returns a falsy value when it could not set anything up.
if load_dotenv():
    print("Proceed")
else:
    print("Warning: .env not correctly set up")

Proceed


In [80]:
# Chat model name sent to the local OpenAI-compatible server (graph extraction
# and entity extraction both use it).
MODEL = "qwen2.5-7b-instruct"
# Embedding model name sent to the /v1/embeddings endpoint.
EMB_MODEL = "text-embedding-nomic-embed-text-v1.5@q8_0"
# File handed to CheckpointManager to remember processed work.
CHECKPOINT_PATH = "mhr_processing_checkpoint.json"
# Directory that is walked for documents to ingest.
DOC_DIR = "documents/"
# Passed straight to LLMGraphTransformer; empty lists presumably mean
# "no restriction" — TODO confirm against utils.langchainLGT.
ALLOWED_NODES = []
ALLOWED_RELATIONSHIPS = []
STRICT_MODE = False

In [81]:
# Quick sanity check before ingestion starts.
if is_database_populated():
    print("DB IS POPULATED")
else:
    print("DB is not populated")

DB is not populated


In [84]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

checkpoint = CheckpointManager(CHECKPOINT_PATH)

graph = Neo4jGraph()
# Chat client for the local OpenAI-compatible endpoint; the generous timeout
# leaves room for slow graph-extraction generations.
client = ChatOpenAI(
    base_url="http://127.0.0.1:8000/v1",
    api_key="lm-studio",
    model=MODEL,
    temperature=0,
    request_timeout=240,
)

llm_transformer = LLMGraphTransformer(
    llm=client,
    allowed_nodes=ALLOWED_NODES,
    allowed_relationships=ALLOWED_RELATIONSHIPS,
    strict_mode=STRICT_MODE,
)

# Extensions include the dot so that only real suffixes match; a bare "md"
# would also accept any file whose name merely ends in those letters.
SUPPORTED_EXTENSIONS = (".md", ".json", ".docx", ".pdf", ".txt")

# Sorted so the processing order is the same on every run.
file_paths = sorted(
    [
        os.path.join(root, f)
        for root, _, files in os.walk(DOC_DIR)
        if ".ipynb_checkpoints" not in root
        for f in files
        if f.lower().endswith(SUPPORTED_EXTENSIONS)
    ]
)

# length_function counts whitespace-separated words, so chunk_size and
# chunk_overlap below are measured in words, not characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=lambda x: len(x.split()),
    separators=[" "]
)

import hashlib


def _stable_hash(text: str) -> str:
    """Return the SHA-256 hex digest of ``text``.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    its values cannot be compared with ones stored by an earlier kernel
    session. A content digest is the same on every run, which is what the
    checkpoint and the Neo4j ``Document.id`` need.
    """
    return hashlib.sha256(text.encode("utf-8", "replace")).hexdigest()


documents: List[Document] = []

# NOTE(review): a file is marked as processed at the end of stage 1, before
# its chunks have been converted or inserted. If a run stops during stage 2
# or 3, the next run will skip that file entirely — confirm this is intended.

# Stage 1: File loading and splitting
for file_path in tqdm(file_paths, desc="Processing files"):
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()
    file_hash = _stable_hash(content)

    # Check if the file has been processed already by checking the content hash
    if checkpoint.is_file_processed(file_path, file_hash):
        logging.info(f"File {file_path} has already been processed.")
        continue  # Skip if the file has already been processed

    try:
        # Load and clean file content
        content = load_and_clean(file_path)
        if not content:
            logging.warning(f"File {file_path} is empty after cleaning. Skipping.")
            continue

        # Split the document into chunks
        chunks = text_splitter.create_documents([content])

        # Add metadata (filename and any other relevant details)
        for chunk in chunks:
            chunk.metadata = {
                'source': file_path,  # Store the filename in the chunk metadata
                'hash': file_hash  # Store the hash of the file content for comparison
            }

        documents.extend(chunks)

        # Add file to checkpoint with content hash
        checkpoint.add_processed_file(file_path, file_hash)  # Ensure file_hash is stored
        logging.info(f"Processed {file_path} into {len(chunks)} chunks")

    except Exception as e:
        logging.error(f"Failed to process {file_path}: {e}")
        continue  # Skip this file and continue with others

# Stage 2: Graph document conversion
max_retries = 5
retry_delay = 1  # Seconds before retrying

graph_documents = []
for doc in tqdm(documents, desc="Converting to graph"):
    doc_id = f"{doc.metadata.get('source', 'unknown')}_{_stable_hash(doc.page_content)}"

    # Check if this graph document has already been processed
    if checkpoint.is_graph_doc_processed(doc_id):
        logging.info(f"Graph document {doc_id} has already been processed.")
        continue

    for attempt in range(1, max_retries + 1):
        try:
            # Attempt to convert the document
            graph_doc = llm_transformer.convert_to_graph_documents([doc])[0]

            # Assign original metadata to GraphDocument
            graph_doc.source = doc
            graph_documents.append(graph_doc)

            # Mark this graph document as processed
            checkpoint.add_processed_graph_doc(doc_id)
            logging.info(f"Converted document {doc_id} to GraphDocument.")
            break  # Success, exit retry loop

        except Exception as e:
            logging.warning(f"Attempt {attempt}/{max_retries} failed for {doc_id}: {e}")
            if attempt < max_retries:
                sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"Failed to convert document {doc_id} after {max_retries} attempts.")

# Stage 3: Add to Neo4j
batch_size = 10  # NOTE(review): not used by the loop below
failed_inserts = 0

for doc in tqdm(graph_documents, desc="Inserting documents"):
    # One id for both the checkpoint lookup and the checkpoint write, defined
    # before the try so the except clause always reports the right document.
    doc_id = _stable_hash(doc.source.page_content)

    if checkpoint.is_document_inserted(doc_id):
        logging.info(f"Document with hash {doc_id} has already been inserted.")
        continue

    try:
        source = doc.source.metadata.get('source', 'unknown')
        file_hash = doc.source.metadata.get('hash', doc_id)

        # Line breaks are flattened as before; quoting is left to the driver
        # because every value is sent as a query parameter.
        doc_content = doc.source.page_content.replace("\n", " ").replace("\r", " ")

        # Insert or update the Document node with page_content
        graph.query(
            """
            MERGE (doc:Document {id: $doc_id})
            ON CREATE SET doc.source = $source, doc.hash = $file_hash, doc.page_content = $page_content
            ON MATCH SET doc.page_content = $page_content, doc.updated = timestamp()
            """,
            {
                "doc_id": doc_id,
                "source": source,
                "file_hash": file_hash,
                "page_content": doc_content,
            },
        )

        for node in doc.nodes:
            # Create the entity (only the __Entity__ label is applied) and
            # link it to its document in one statement, so the MENTIONS edge
            # can never be skipped because the entity does not exist yet.
            graph.query(
                """
                MERGE (n:__Entity__ {id: $node_id})
                ON CREATE SET n.type = $node_type
                WITH n
                MATCH (doc:Document {id: $doc_id})
                MERGE (doc)-[:MENTIONS]->(n)
                """,
                {"node_id": node.id, "node_type": node.type, "doc_id": doc_id},
            )

        for rel in doc.relationships:
            # Relationship types cannot be parameters; quote the name with
            # backticks and drop any backtick inside it instead.
            rel_type = rel.type.replace("`", "")
            if not rel_type:
                logging.warning(f"Skipping relationship with empty type in document {doc_id}")
                continue

            graph.query(
                f"""
                MERGE (source:__Entity__ {{id: $source_id}})
                MERGE (target:__Entity__ {{id: $target_id}})
                MERGE (source)-[r:`{rel_type}`]->(target)
                """,
                {"source_id": rel.source.id, "target_id": rel.target.id},
            )

        checkpoint.add_inserted_document(doc_id)
        logging.info(f"Inserted document {doc_id}")

    except Exception as e:
        failed_inserts += 1
        logging.error(f"Failed to insert document {doc_id}: {e}")
        continue

if failed_inserts:
    logging.warning(f"Processing completed with {failed_inserts} failed document inserts")
else:
    logging.info("Processing completed successfully")

2025-03-20 22:51:36,359 - INFO - Processed documents/Advanced_Science_News_2023-09-26.txt into 1 chunks[00:00<?, ?it/s]
2025-03-20 22:51:36,370 - INFO - Processed documents/Advanced_Science_News_2023-09-26_1.txt into 2 chunks
2025-03-20 22:51:36,371 - INFO - Processed documents/Advanced_Science_News_2023-11-01.txt into 1 chunks
2025-03-20 22:51:36,380 - INFO - Processed documents/BBC_News_-_Entertainment_Arts_2023-10-13.txt into 4 chunks
2025-03-20 22:51:36,380 - INFO - Processed documents/BBC_News_-_Entertainment_Arts_2023-10-24.txt into 2 chunks
2025-03-20 22:51:36,380 - INFO - Processed documents/BBC_News_-_Technology_2023-10-13.txt into 2 chunks
2025-03-20 22:51:36,388 - INFO - Processed documents/Business_Line_2023-10-14.txt into 1 chunks
2025-03-20 22:51:36,393 - INFO - Processed documents/Business_Line_2023-10-14_1.txt into 3 chunks
2025-03-20 22:51:36,397 - INFO - Processed documents/Business_Line_2023-10-28.txt into 2 chunks
2025-03-20 22:51:36,400 - INFO - Processed documents

In [85]:
import requests
import json

# Graph handle used by graph.query(...) in the retrieval cells below.
graph = Neo4jGraph()

def get_embedding(text, timeout=60):
    """Fetch the embedding vector for ``text`` from the local embeddings endpoint.

    Args:
        text: The string to embed.
        timeout: Seconds to wait for the server before giving up, so a stalled
            server cannot hang the notebook indefinitely.

    Returns:
        The ``embedding`` field of the first item in the response's ``data``
        list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.post(
        "http://127.0.0.1:8000/v1/embeddings",
        headers={"Content-Type": "application/json"},
        json={"input": text, "model": EMB_MODEL},
        timeout=timeout,
    )
    # Surface HTTP failures here instead of as a confusing KeyError below.
    response.raise_for_status()
    return response.json()["data"][0]["embedding"]

class LMStudioEmbeddings:
    """Embedding adapter that prefixes texts before calling ``get_embedding``.

    Documents and queries each get their own prefix string, which is
    prepended to the text before it is embedded.
    """

    def __init__(self, prepend_document="search_document: ", prepend_query="search_query: "):
        self.prepend_document, self.prepend_query = prepend_document, prepend_query

    def embed_documents(self, texts):
        """Embed each text in ``texts`` with the document prefix, in order."""
        vectors = []
        for text in texts:
            vectors.append(get_embedding(self.prepend_document + text))
        return vectors

    def embed_query(self, text):
        """Embed one query string with the query prefix."""
        prefixed = self.prepend_query + text
        return get_embedding(prefixed)

# Embedding adapter with the default document/query prefixes.
embeddings = LMStudioEmbeddings()

# Vector index over existing `Document` nodes: text comes from their
# `page_content` property and vectors are kept in the `embedding` property.
# search_type="hybrid" is passed through to Neo4jVector.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["page_content"],
    embedding_node_property="embedding"
)

class CustomRetriever:
    """Wraps a vector index and labels every hit with its source document."""

    def __init__(self, vector_index):
        self.vector_index = vector_index

    def get_relevant_documents(self, query):
        """Run a similarity search and return the hits as new ``Document`` objects.

        Each returned document's text is prefixed with ``In document: <source>:``
        where the source is read from the hit's metadata (``Unknown Source`` when
        absent); the hit's metadata is carried over unchanged.
        """
        return [
            Document(
                page_content=f"In document: {hit.metadata.get('source', 'Unknown Source')}: {hit.page_content}\n",
                metadata=hit.metadata,
            )
            for hit in self.vector_index.similarity_search(query)
        ]

# Retriever instance used by full_retriever() further down.
vector_retriever = CustomRetriever(vector_index)

# Smoke test: print whatever the vector index returns for a sample query.
query = "Elon Musk"
results = vector_retriever.get_relevant_documents(query)
for result in results:
    print(result.page_content)

In document: documents/The_Age_2023-12-09.txt: 
page_content: Musk built his own lab. OpenAI was founded in late 2015, just a few months after he met with Altman at the Rosewood hotel in Silicon Valley. Sam Altman Credit: Bloomberg Businessweek Musk pumped money into the lab, and his former PayPal buddies - Hoffman and Thiel - came along for the ride. The three men and others pledged to put $US1 billion into the project, which Altman, who was 30 at the time, would help run. To get them started, they poached Ilya Sutskever from Google. (Sutskever was one of the graduate students Google "bought" in Hinton's auction.) Initially, Musk wanted to operate OpenAI as a nonprofit, free from the economic incentives that were driving Google and other corporations. But by the time Google wowed the tech community with its Go stunt, Musk was changing his mind about how it should be run. He desperately wanted OpenAI to invent something that would capture the world's imagination and close the gap with 

In [86]:
# Connection details come from the environment (loaded from .env earlier).
driver = GraphDatabase.driver(
    uri=os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
)


def create_fulltext_index(tx):
    """Create the full-text index over ``__Entity__`` ids and types.

    ``IF NOT EXISTS`` makes the statement safe to run again, so an index that
    is already there is no longer reported as an error.
    """
    query = '''
CREATE FULLTEXT INDEX `fulltext_entity_index` IF NOT EXISTS
FOR (n:__Entity__)
ON EACH [n.id, n.type];
    '''
    tx.run(query)


def create_index():
    """Run ``create_fulltext_index`` in a write transaction and report success."""
    with driver.session() as session:
        session.execute_write(create_fulltext_index)
        print("Fulltext index created successfully.")


try:
    create_index()
except Exception as e:
    # Keep the notebook running, but say why the index is missing: without it
    # the graph retriever's full-text lookup cannot work.
    logging.warning(f"Could not create fulltext index: {e}")
finally:
    driver.close()

In [87]:
class Entities(BaseModel):
    """Validated container for the entity names extracted from a question."""
    names: list[str] = Field(..., description="Extracted entities from the text.")

# Deterministic (temperature 0) chat client used only for entity extraction.
llm = ChatOpenAI(
    openai_api_base="http://127.0.0.1:8000/v1",
    openai_api_key="lm-studio",
    model=MODEL,
    temperature=0
)

# Few-shot prompt asking for a bare JSON object with an "entities" list.
# The doubled braces are literal braces in the template; {question} is the
# only placeholder.
prompt = ChatPromptTemplate.from_template(
"""
You need to extract all entites present within the user's query.
Only return a JSON object with the key "entities" containing the extracted entities.
ONLY RETURN THE JSON OBJECT. NO FURTHER EXPLANATION.

#####
EXAMPLES:

Query: "Who is the president of the USA?"
Return: {{ "entities": ["President", "USA"] }}

Query: "What products does Apple offer?"
Return: {{ "entities": ["Apple", "product"] }}

Query: "Who is the founder of Microsoft?"
Return: {{ "entities": ["founder", "Microsoft"] }}

#####
Now extract the entites from the query: {question}
"""
)

def extract_entities(question):
    """Ask the LLM for the entities in ``question`` and validate the reply.

    Args:
        question: The user's question.

    Returns:
        An ``Entities`` instance on success. On any parsing or validation
        problem a dict with an ``error`` message and the ``raw`` model reply
        is returned instead, so callers can tell the two apart with
        ``isinstance(result, dict)``.
    """
    formatted_prompt = prompt.format(question=question)
    response = llm.invoke(formatted_prompt)

    response_text = response.content if hasattr(response, "content") else str(response)

    # Models often wrap JSON in a Markdown code fence; strip the fence markers.
    cleaned_response = re.sub(r"```json|```", "", response_text).strip()

    try:
        parsed_response = json.loads(cleaned_response)
    except json.JSONDecodeError as e:
        return {"error": f"JSON Decode Error: {str(e)}", "raw": cleaned_response}

    # A JSON list, string or number is valid JSON but not the object we asked for.
    if not isinstance(parsed_response, dict) or "entities" not in parsed_response:
        return {"error": "No 'entities' field found in response", "raw": cleaned_response}

    try:
        return Entities(names=parsed_response["entities"])
    except TypeError as e:
        return {"error": f"Type Error: {str(e)}", "raw": cleaned_response}
    except ValueError as e:
        # pydantic's ValidationError is a ValueError: e.g. "entities" was a
        # single string or contained non-string items.
        return {"error": f"Validation Error: {str(e)}", "raw": cleaned_response}

# Smoke test: prints an Entities object, or the error dict if parsing failed.
question = "Does Microsoft have any employee named John?"
entities = extract_entities(question)

print(entities)

2025-03-21 09:11:10,892 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


names=['Microsoft', 'employee', 'John']


In [117]:
def graph_retriever(question: str) -> str:
    """Collect graph facts about the entities mentioned in ``question``.

    Entities are extracted with ``extract_entities``. Each one is looked up in
    the ``fulltext_entity_index`` as a prefix query, and the incoming and
    outgoing relationships of the matching nodes are rendered as
    ``source - TYPE -> target`` lines.

    Args:
        question: The user's question.

    Returns:
        The rendered lines joined by newlines, or a fixed "no information"
        sentence when entity extraction failed or nothing matched.
    """
    neighborhood_query = """
CALL db.index.fulltext.queryNodes('fulltext_entity_index', $query, {limit:10}) 
YIELD node, score
WITH node, score, $query_length AS query_length, size(node.id) AS id_length
WHERE score >= 0.8 OR (toFloat(query_length) / id_length) >= 0.8
CALL {
  WITH node
  MATCH (node)-[r]->(neighbor)
  RETURN CASE 
    WHEN 'Document' IN labels(node) 
    THEN coalesce([label IN labels(node) WHERE label <> '__Entity__'][0], 'Unknown') + ": " + node.source + ' - ' + type(r) + ' -> ' + neighbor.id 
    ELSE node.id + ' - ' + type(r) + ' -> ' + neighbor.id
  END AS output
  UNION ALL
  WITH node
  MATCH (node)<-[r]-(neighbor)
  RETURN CASE 
    WHEN 'Document' IN labels(neighbor) 
    THEN coalesce([label IN labels(neighbor) WHERE label <> '__Entity__'][0], 'Unknown') + ": " + neighbor.source + ' - ' + type(r) + ' -> ' + node.id
    ELSE neighbor.id + ' - ' + type(r) + ' -> ' + node.id
  END AS output
}
RETURN output LIMIT 15
            """
    result = []

    entities = extract_entities(question)
    # extract_entities returns a dict (not an Entities object) on failure.
    if isinstance(entities, dict):
        return "Doesn't seem to have any information on this."

    for entity in entities.names:
        # Lucene operators inside an entity name (quotes, parentheses, ':',
        # '&', ...) would otherwise be parsed as query syntax.
        sanitized = remove_lucene_chars(entity)
        formatted_query = "_".join(sanitized.split())
        if not formatted_query:
            continue  # nothing searchable left; a bare "*" is not a valid query

        fuzzy_query = f"{formatted_query}*"
        query_length = len(formatted_query)

        response = graph.query(
            neighborhood_query,
            {"query": fuzzy_query, "query_length": query_length},
        )

        for row in response or []:
            line = row['output']
            # String concatenation with a null property yields null in
            # Cypher; skip such rows instead of crashing on None.
            if not line:
                continue
            # Drop lines that start with a hex-digest style id.
            if re.search(r"^[0-9a-f]{32}", line):
                continue
            result.append(line.replace("_", " "))

    return "\n".join(result) if result else "Doesn't seem to have sufficient information on this."

In [118]:
# Smoke test for the graph-only retrieval path.
print(graph_retriever("What is IELTS?"))

2025-03-21 09:45:31,904 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


Doesn't seem to have sufficient information on this.


In [119]:
def full_retriever(question: str, use_graph: bool = True, use_vector: bool = True):
    """Build the context string for ``question`` from the enabled retrievers.

    The graph context (if enabled) comes first, then the concatenated text of
    the first three vector hits (if enabled). Every ``page_content: `` marker
    is removed from the combined text before it is returned.
    """
    graph_data = graph_retriever(question) if use_graph else ""

    if use_vector:
        top_hits = vector_retriever.get_relevant_documents(question)[:3]
        vector_data = "".join(hit.page_content for hit in top_hits)
    else:
        vector_data = ""

    # Same layout as a template with a blank line between the two parts and
    # an indented closing line.
    combined = "\n" + f"{graph_data}" + "\n\n" + f"{vector_data}" + "\n    "
    return combined.replace("page_content: ", "")

In [120]:
# Smoke test: graph context only, vector search disabled.
print(full_retriever("Does The Verge have any reports on Elon Musk?", use_graph=True, use_vector=False))

2025-03-21 09:45:37,376 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"



The Verge - PUBLISHED BY -> Sheena Vasani
The Verge - PUBLISHED BY -> Antonio G. Di Benedetto
The Verge - REPORTS -> Wikimedia Foundation s challenge in complying with regulations
Sheena Vasani - WRITTEN BY -> The Verge
Document: documents/The Verge 2023-12-24.txt - MENTIONS -> The Verge
Document: documents/The Verge 2023-12-18.txt - MENTIONS -> The Verge
Document: documents/The Verge 2023-11-09.txt - MENTIONS -> The Verge
Alex Cranz - WORKS FOR -> The Verge
Natalie Weiner - AUTHOR OF -> The Verge article about Discogs
Meta - IGNORES REPORTS OF UNDERAGE ACCOUNTS -> reports of accounts owned by users under 13 in 2021
Elon Musk - STATEMENT MADE BY -> Cybertruck will be safer than other trucks on the road for occupants and pedestrians
Elon Musk - FOUNDERS OF -> X
Elon Musk - CO FOUNDER OF -> OpenAI  Inc.
Elon Musk - EXECUTIVE OF COMPANY -> Tesla
Elon Musk - OWNER OF -> X (formerly Twitter)
Elon Musk - OWNS -> X  former Twitter
Document: documents/The Verge 2023-12-12 1.txt - MENTIONS -> 

In [139]:
import requests

# Chat-completions endpoint of the local server; also used by the evaluation cell.
LM_STUDIO_URL = "http://127.0.0.1:8000/v1/chat/completions"
# Model that writes the answers.
ANSWER_MODEL = "qwen2.5-7b-instruct"

def query_lm_studio(question, use_graph=True, use_vector=True, timeout=240):
    """Answer ``question`` with the local model, grounded on retrieved context.

    Args:
        question: The user's question.
        use_graph: Include graph context from ``full_retriever``.
        use_vector: Include vector-search context from ``full_retriever``.
        timeout: Seconds to wait for the completion before giving up.

    Returns:
        The model's answer text. Any failure of the HTTP call (error status,
        connection problem, timeout) is returned as a string starting with
        ``"Error: "``, so callers can filter failed answers by prefix.
    """
    context = full_retriever(question, use_graph, use_vector)

    prompt = f"""
You are a smart and helpful assitant.
Answer the user's query based on the given context.
Never give an answer containing false information.

#####
CONTEXT:
{context}

#####
USER'S QUERY:
{question} Answer Consisely.
"""

    payload = {
        "model": ANSWER_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.5
    }

    try:
        response = requests.post(LM_STUDIO_URL, json=payload, timeout=timeout)
    except requests.RequestException as e:
        # Report transport failures the same way as HTTP error statuses.
        return f"Error: {e}"

    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        return f"Error: {response.text}"

In [140]:
# Smoke test: one multi-hop benchmark question with both retrievers enabled.
print(query_lm_studio("Which two public figures, linked by rumors of a romance covered by 'CBSSports.com' and 'The Independent - Life and Style', have been seen enjoying time together and showing affection, with one being spotted wearing a themed bracelet and the other cheering enthusiastically from box seats at a sporting event?"))

2025-03-21 10:25:16,986 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"


Travis Kelce and Taylor Swift have been seen enjoying time together and showing affection. Travis Kelce was spotted wearing a themed bracelet (specifically a friendship bracelet with lyrics referencing Taylor Swift's song "Vigilante S***") before an NFL game, while Taylor Swift was seen cheering enthusiastically from box seats at Arrowhead Stadium during the game.


In [144]:
input_file = "multihop_rag_corpus/multihoprag_qa.json"
output_file = "multihop_rag_corpus/multihoprag_qa_answered.json"

# Sampling stride: every `hop`-th query of each question type is answered.
hop = 10

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Bucket the queries by question type, keeping file order inside each bucket.
grouped_data = {}
for item in data:
    grouped_data.setdefault(item["question_type"], []).append(item)

for q_type, queries in grouped_data.items():
    # Count exactly the indices the processing loop will visit, so the
    # summary cannot drift from the sampling stride.
    num_to_process = len(range(0, len(queries), hop))
    print(f"  {q_type}: {num_to_process} queries out of {len(queries)} total")

processed_queries = []

def clean_text(text):
    """Decode literal backslash escapes (``\\u00e9``, ``\\n``, ...) in ``text``.

    NOTE: this redefines the ``clean_text`` from the corpus-export cell; after
    this cell runs, the name refers to this version.

    Args:
        text: Any value. Only strings are processed.

    Returns:
        The string with escape sequences decoded. Characters that are already
        real non-ASCII text are preserved. Non-strings, and strings whose
        escapes cannot be decoded, are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    try:
        # unicode_escape reads bytes as Latin-1, so encode to Latin-1 and turn
        # everything outside it into \uXXXX escapes. Going through UTF-8
        # instead would decode every multi-byte character into mojibake.
        return text.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError:
        # e.g. a lone trailing backslash or a truncated \u escape
        return text

# Result key -> (use_graph, use_vector), in the order the answers are produced.
answer_settings = (
    ("answer_raw", False, False),
    ("answer_graph", True, False),
    ("answer_vector", False, True),
    ("answer_hybrid", True, True),
)

for q_type, queries in grouped_data.items():
    print(f"\nProcessing question type: {q_type}")
    sampled_positions = range(0, len(queries), hop)
    for position in tqdm(sampled_positions, desc=f"Processing {q_type}", unit="query"):
        item = queries[position]  # This is a query we are actually processing
        try:
            question = clean_text(item["query"])

            # Query the model once per retrieval setting
            for key, with_graph, with_vector in answer_settings:
                item[key] = query_lm_studio(question, use_graph=with_graph, use_vector=with_vector)

            # Keep the query only when every answer is non-empty and not an error string
            has_bad_answer = any(
                not item[key] or item[key].startswith("Error:")
                for key, _, _ in answer_settings
            )
            if not has_bad_answer:
                processed_queries.append(item)

        except Exception as e:
            # Skip this query and move on to the next one
            logging.error(f"Error processing query '{item['query']}': {str(e)}")

# Save only the processed queries where all four answers are present
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(processed_queries, f, ensure_ascii=False, indent=4)

print(f"Processed data saved to {output_file}")

  inference_query: 82 queries out of 816 total
  comparison_query: 86 queries out of 856 total
  null_query: 31 queries out of 301 total
  temporal_query: 59 queries out of 583 total

Processing question type: inference_query


2025-03-21 10:28:36,959 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"query/s]
2025-03-21 10:28:42,229 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:28:46,323 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 10:28:52,348 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:28:57,449 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 10:29:03,850 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:29:10,349 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 10:29:17,850 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:29:26,394 - INFO - HTTP Request: POST http://127.0.0.1:800


Processing question type: comparison_query


2025-03-21 10:46:32,721 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"query/s]
2025-03-21 10:46:38,285 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:46:44,095 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 10:46:51,248 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:46:57,871 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 10:47:03,272 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:47:09,044 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 10:47:15,325 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 10:47:21,988 - INFO - HTTP Request: POST http://127.0.0.1:800


Processing question type: null_query


2025-03-21 11:07:50,728 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"query/s]
2025-03-21 11:07:57,688 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:08:03,136 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 11:08:08,891 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:08:13,120 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 11:08:18,658 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:08:23,997 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 11:08:29,820 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:08:35,520 - INFO - HTTP Request: POST http://127.0.0.1:800


Processing question type: temporal_query


2025-03-21 11:14:10,192 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"query/s]
2025-03-21 11:14:19,343 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:14:28,029 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 11:14:36,471 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:14:46,183 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 11:14:54,933 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:15:01,753 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"s/query]
2025-03-21 11:15:08,998 - INFO - HTTP Request: POST http://127.0.0.1:8000/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-21 11:15:16,184 - INFO - HTTP Request: POST http://127.0.0.1:800

Processed data saved to multihop_rag_corpus/multihoprag_qa_answered.json





In [163]:
# NOTE(review): the answering cell writes "multihoprag_qa_answered.json",
# while this cell reads a "qwen_"-prefixed file — presumably the output was
# renamed by hand; confirm, otherwise this cell fails on a fresh run.
answered_file = "multihop_rag_corpus/qwen_multihoprag_qa_answered.json"
evaluated_file = "multihop_rag_corpus/qwen_multihoprag_qa_evaluated.json"

# Judge model; deliberately a different model from the one that answered.
EVAL_MODEL = "deepseek-r1-distill-qwen-7b"

# Load answered dataset
with open(answered_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def _extract_verdict(content):
    """Reduce a raw judge reply to "YES" or "NO", ignoring any <think> reasoning section."""
    text = content.upper()

    # Reasoning models typically emit their chain-of-thought before a closing
    # </think> tag; only what follows the last closing tag is the actual reply.
    _, closing_tag, after_reasoning = text.rpartition("</THINK>")
    if closing_tag:
        verdict_text = after_reasoning
    else:
        # An opening tag without a closing one means the reasoning was cut off
        # before any verdict was produced, so nothing after it can be trusted.
        verdict_text = text.split("<THINK>", 1)[0]

    # Whole-word match so that e.g. "YESTERDAY" or "EYES" is not read as a verdict.
    return "YES" if re.search(r"\bYES\b", verdict_text) else "NO"


def evaluate_answer(ground_truth, generated_answer, timeout=600):
    """
    Queries an LLM to check if the generated answer correctly matches the ground truth.

    Args:
        ground_truth: Reference answer inserted into the prompt.
        generated_answer: Model-produced answer to be judged.
        timeout: Seconds to wait for the HTTP response before `requests` raises
            a timeout error (propagated to the caller).

    Returns:
        "YES" if the judge's reply (with any <think>...</think> reasoning
        removed) contains the whole word YES, otherwise "NO". A reply with no
        recognisable verdict therefore counts as "NO". On a non-200 HTTP status
        the string "Error: <response body>" is returned instead.
    """
    prompt = f"""
You are a strict evaluator. Your task is to determine whether the given answer correctly conveys the same factual meaning as the ground truth.  

#####
GROUND TRUTH:
{ground_truth}

#####
GENERATED ANSWER:
{generated_answer}

#####
Does the generated answer correctly convey the same factual meaning as the ground truth?  

- Ignore wording differences, synonyms, or rephrasings unless they **change the actual meaning**.  
- Do NOT reject answers just because they include additional correct details.  
- Only respond with "YES" or "NO". No explanations.  
"""

    payload = {
        "model": EVAL_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.0
    }

    # Without a timeout a stalled server would block the whole evaluation run.
    response = requests.post(LM_STUDIO_URL, json=payload, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.text}"

    content = response.json()["choices"][0]["message"]["content"]
    return _extract_verdict(content)

evaluated_data = []
skipped_count = 0

# Suffixes shared by the "answer_<mode>" inputs and "evaluation_<mode>" outputs
evaluation_modes = ("raw", "graph", "vector", "hybrid")

print("\nEvaluating model-generated answers against ground truth...")
for item in tqdm(data, desc="Evaluating", unit="query"):
    try:
        ground_truth = item["answer"]

        # Judge all four answers before touching the record, so a failure
        # part-way through never leaves a half-evaluated item behind.
        verdicts = {
            mode: evaluate_answer(ground_truth, item[f"answer_{mode}"])
            for mode in evaluation_modes
        }
    except Exception as e:
        # Look the query up defensively: the handler itself must not raise,
        # otherwise one malformed record would abort the whole run.
        query_label = item.get("query", "<unknown>") if isinstance(item, dict) else "<unknown>"
        logging.error(f"Error evaluating query '{query_label}': {str(e)}")
        skipped_count += 1
        continue

    for mode, verdict in verdicts.items():
        item[f"evaluation_{mode}"] = verdict
    evaluated_data.append(item)

# Failed records are left out of the saved file; say so explicitly because
# every accuracy figure computed from that file uses the reduced set.
if skipped_count:
    logging.warning(
        f"Skipped {skipped_count} of {len(data)} queries because of evaluation errors; "
        "they are absent from the saved results."
    )

# Save the evaluated dataset
with open(evaluated_file, "w", encoding="utf-8") as f:
    json.dump(evaluated_data, f, ensure_ascii=False, indent=4)

print(f"\nEvaluation results saved to {evaluated_file}")


Evaluating model-generated answers against ground truth...


2025-03-21 13:04:06,570 - ERROR - Error evaluating query 'Does the TechCrunch article suggest that Google's behavior towards news publishers is anticompetitive, while The Verge focuses on Google's role in the Epic v. Google trial without making a similar claim about anticompetitive actions?': ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-03-21 14:29:56,519 - ERROR - Error evaluating query 'Did 'The Guardian' fail to report on Eintracht Frankfurt's historic Bundesliga achievement against Bayern Munich before 'The Independent - Sports' mentioned Joelinton's powerful goal for Newcastle?': ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
2025-03-21 14:29:58,552 - ERROR - Error evaluating query 'After the Sporting News report on December 12, 2023, detailing Manchester United's exit from European competitions, and The Roar | Sports Writers Blog report on December 23, 2023, discussing Manchester Un


Evaluation results saved to multihop_rag_corpus/qwen_multihoprag_qa_evaluated.json





In [164]:
evaluated_file = "multihop_rag_corpus/qwen_multihoprag_qa_evaluated.json"

# Read the evaluated records back from disk
with open(evaluated_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Each record must carry one of the accepted verdicts in all four fields
evaluation_fields = (
    "evaluation_raw",
    "evaluation_graph",
    "evaluation_vector",
    "evaluation_hybrid",
)
accepted_verdicts = {"YES", "NO"}

# A record is invalid as soon as one field is missing or holds anything else
invalid_entries = [
    item
    for item in data
    if any(item.get(field, "") not in accepted_verdicts for field in evaluation_fields)
]
invalid_count = len(invalid_entries)
valid_count = len(data) - invalid_count

# Report the totals
print("\nValidation Summary:")
print(f"✅ Valid evaluations: {valid_count}")
print(f"❌ Invalid evaluations: {invalid_count}")

# Dump at most three offending records for inspection
if invalid_entries:
    print("\n⚠️ Examples of invalid entries:")
    for i, entry in enumerate(invalid_entries[:3]):
        print(f"\n--- Invalid Entry {i+1} ---")
        print(json.dumps(entry, indent=4, ensure_ascii=False))


Validation Summary:
✅ Valid evaluations: 238
❌ Invalid evaluations: 0


In [168]:
import json

def compute_accuracy(evaluated_file):
    """Print the percentage of "YES" verdicts per retrieval mode.

    Reads the JSON list stored at `evaluated_file`. For each of the modes
    raw, graph, vector and hybrid it counts the records that contain the
    field "evaluation_<mode>" and how many of those equal "YES". The
    accuracy is the ratio as a percentage rounded to two decimals, or 0
    when no record carries the field. Results are printed; nothing is returned.
    """
    with open(evaluated_file, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    accuracy_results = {}
    for key in ("raw", "graph", "vector", "hybrid"):
        eval_key = f"evaluation_{key}"

        # Only records that actually have this field count towards the total
        verdicts = [record[eval_key] for record in records if eval_key in record]
        if not verdicts:
            accuracy_results[key] = 0
            continue

        hits = sum(1 for verdict in verdicts if verdict == "YES")
        accuracy_results[key] = round((hits / len(verdicts)) * 100, 2)

    # Print accuracy results
    print("\n🔍 Accuracy Results:")
    for key, accuracy in accuracy_results.items():
        print(f"✅ {key.capitalize()} Accuracy: {accuracy:.2f}%")

# Example usage
# NOTE(review): this path differs from the "qwen_multihoprag_qa_evaluated.json" file
# written by the evaluation cell in this notebook - confirm it is the intended results file.
evaluated_file = "multihop_rag_corpus/deepseek_proof_qwen_multihoprag_qa_evaluated.json"
compute_accuracy(evaluated_file)


🔍 Accuracy Results:
✅ Raw Accuracy: 47.90%
✅ Graph Accuracy: 65.13%
✅ Vector Accuracy: 75.63%
✅ Hybrid Accuracy: 75.63%


## Some useful utilities

#### Start Neo4j
```bash
docker-compose up -d --build
```

#### Stop Neo4j
```bash
docker-compose down
```

#### Delete all nodes and relationships
```cypher
MATCH (n) DETACH DELETE n;
```

#### Display up to n nodes (here n = 25)
```cypher
MATCH (n)
RETURN n LIMIT 25
```

#### Index manipulation
```cypher
SHOW INDEXES;
DROP INDEX vector;
```