In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from typing import List
import json
import argparse
import logging
import pandas as pd
from tqdm import tqdm
import evaluate
import shutil

# Instantiate the three generation-quality metrics used below (BLEU, ROUGE, METEOR),
# loading them once, in that order, so every experiment run can reuse them.
bleu_metric, rouge_metric, meteor_metric = (
    evaluate.load(metric_id) for metric_id in ("bleu", "rouge", "meteor")
)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to /home/dptn1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/dptn1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/dptn1/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
import sys
sys.path.append("../")

In [4]:
from src.hipporag import HippoRAG
from dotenv import load_dotenv
load_dotenv()

2025-05-22 14:34:45,722	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


True

In [None]:
# Experiment sweep: for every (major, dataset style, embedding model) combination,
# build a fresh HippoRAG index over that major's course corpus, answer the
# open-ended questions, score the answers with BLEU / METEOR / ROUGE-L, and write
# one JSON result file per combination into `save_dir`.
cache_dir = 'outputs/openai_test'  # Define save directory for HippoRAG objects (each LLM/Embedding model combination will create a new subdirectory)
save_dir = "outputs/qa_results"  # One JSON result file per (major, dataset style, embedding model) run
llm_model_name = 'gpt-4o-mini'  # Any OpenAI model name
answer_marker = "Answer: "  # Text that precedes the final answer inside each raw response message

for major in ["MCS", "DS", "AM"]:
    # The corpus and the QA set depend only on the major, so they are read once
    # here instead of being re-parsed for every style/embedding combination.
    corpus_path = f"../data/courses_{major}.json"
    with open(corpus_path, "r", encoding="utf-8") as f:
        corpus = json.load(f)

    # Each indexed document is the course title followed by its text.
    docs = [f"{doc['title']}\n{doc['text']}" for doc in corpus]

    # Use a context manager so the QA file handle is closed deterministically.
    with open(f"../data/{major}_opened_end.json", "r", encoding="utf-8") as f:
        open_end_qa_ds = pd.DataFrame(json.load(f))
    queries = open_end_qa_ds["question"].tolist()
    references = open_end_qa_ds["answer"].tolist()
    # Only the first paragraph of each question is used as its gold document,
    # formatted exactly like the indexed docs so the two can be matched.
    gold_docs = [[f"{item[0]['title']}\n{item[0]['text']}"] for item in open_end_qa_ds["paragraphs"].tolist()]

    for dataset_style in [None, "hcmus"]:
        print(f"Running HippoRAG for major: {major}, dataset style: {dataset_style}")
        for embedding_model_name in ["text-embedding-3-small", "nvidia/NV-Embed-v2", "GritLM/GritLM-7B", "facebook/contriever"]:
            # Start every run from an empty HippoRAG cache. Only `cache_dir` is
            # removed: deleting the whole "outputs" tree would also delete
            # `save_dir` and with it the results of all previous runs.
            shutil.rmtree(cache_dir, ignore_errors=True)

            # Startup a HippoRAG instance
            hipporag = HippoRAG(save_dir=cache_dir,
                                llm_model_name=llm_model_name,
                                embedding_model_name=embedding_model_name,
                                dataset=dataset_style,  # None presumably selects the base HippoRAG setup -- TODO confirm
                                embedding_batch_size=4
                                )

            # Run indexing
            hipporag.index(docs=docs)

            # Retrieval + QA in one call; passing gold docs/answers makes it also
            # return the aggregated retrieval and QA evaluation results.
            queries_solutions, all_response_message, all_metadata, overall_retrieval_result, overall_qa_results = hipporag.rag_qa(
                queries=queries,
                gold_docs=gold_docs,
                gold_answers=references
            )

            # Keep the text following the answer marker; if a response does not
            # contain the marker, score the whole response instead of raising
            # IndexError and losing the entire (expensive) run.
            predictions = [
                response.split(answer_marker)[1] if answer_marker in response else response
                for response in all_response_message
            ]

            bleu_results = bleu_metric.compute(predictions=predictions, references=references)
            rouge_results = rouge_metric.compute(predictions=predictions, references=references)
            meteor_results = meteor_metric.compute(predictions=predictions, references=references)

            # Scores are stored as percentage strings with two decimals.
            open_end_results = {
                "bleu": f"{bleu_results['bleu'] * 100:.2f}",
                "meteor": f"{meteor_results['meteor'] * 100:.2f}",
                "rougeL": f"{rouge_results['rougeL'] * 100:.2f}",
            }
            results = {
                "major": major,
                "dataset_style": dataset_style,
                "embedding_model_name": embedding_model_name,
                "graph_info": hipporag.get_graph_info(),
                "retrieval_results": overall_retrieval_result,
                "bleu": open_end_results["bleu"],
                "meteor": open_end_results["meteor"],
                "rougeL": open_end_results["rougeL"],
            }

            ## save results
            # "/" in Hugging Face model ids is replaced so the id is a valid file name.
            results_save_path = os.path.join(save_dir, f"{major}_{dataset_style}_{embedding_model_name.replace('/', '_')}.json")
            os.makedirs(save_dir, exist_ok=True)
            with open(results_save_path, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=4)

A new version of the following files was downloaded from https://huggingface.co/GritLM/GritLM-7B:
- modeling_gritlm7b.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|██████████| 3/3 [02:18<00:00, 46.30s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it]


Created GritLM: torch.bfloat16 dtype, mean pool, unified mode, bbcc attn


NER: 100%|██████████| 94/94 [00:12<00:00,  7.83it/s, total_prompt_tokens=43122, total_completion_tokens=11841, num_cache_hit=0]
Extracting triples: 100%|██████████| 94/94 [00:35<00:00,  2.64it/s, total_prompt_tokens=81122, total_completion_tokens=36014, num_cache_hit=0]
Batches: 100%|██████████| 395/395 [05:01<00:00,  1.31it/s]
Batches: 100%|██████████| 492/492 [06:13<00:00,  1.32it/s]
94it [00:00, 12358.23it/s]
94it [00:00, 35718.84it/s]
KNN for Queries: 100%|██████████| 2/2 [00:00<00:00,  4.35it/s]
100%|██████████| 1580/1580 [00:00<00:00, 37116.90it/s]


{'num_phrase_nodes': 1580, 'num_passage_nodes': 94, 'num_total_nodes': 1674, 'num_extracted_triples': 1968, 'num_triples_with_passage_node': 2107, 'num_synonymy_triples': 16185, 'num_total_triples': 20260}
major: AM | style: None | Graph info: {'num_phrase_nodes': 1580, 'num_passage_nodes': 94, 'num_total_nodes': 1674, 'num_extracted_triples': 1968, 'num_triples_with_passage_node': 2107, 'num_synonymy_triples': 16185, 'num_total_triples': 20260}


Retrieving: 100%|██████████| 25/25 [00:55<00:00,  2.24s/it]
Collecting QA prompts: 100%|██████████| 25/25 [00:00<00:00, 2220.57it/s]
QA Reading: 100%|██████████| 25/25 [01:33<00:00,  3.75s/it]
Extraction Answers from LLM Response: 25it [00:00, 131236.05it/s]


major: AM | style: None | Overall retrieval results: {'Recall@1': 0.64, 'Recall@2': 0.8, 'Recall@5': 0.84}
major: AM | style: None | Open-end results: {'bleu': '8.61', 'meteor': '32.40', 'rougeL': '39.01'}
