In [38]:
from classify import classify_text as classifier
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from graph_rag import GraphRAG
from tqdm import tqdm
import pandas as pd
import evaluate


In [40]:
# Load the dataset
df = pd.read_excel("dataset.xlsx")

graph_rag = GraphRAG(model_name="gpt-4o-mini")

# Function to generate answers using GraphRAG
def generate_answer(question):
    """Generate answer using the GraphRAG system"""
    try:
        # Try to classify the question if classifier is available
        label = None
        if classifier is not None:
            try:
                label = classifier(question)
                print(f"Question classified as: {label}")
            except Exception as e:
                print(f"Classification failed: {e}")
                label = None
        
        # Generate response using GraphRAG
        result = graph_rag.generate_response(question, label)
        return result["response"]
    except Exception as e:
        print(f"Error generating answer for question: {question}")
        print(f"Error: {e}")
        return "Error generating answer"

# Generate answers for all questions in the dataset
print("Generating answers using GraphRAG...")
tqdm.pandas(desc="Generating Answers")
df['generated_answer'] = df['question'].progress_apply(generate_answer)

# Initialize the evaluation metrics from Hugging Face
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# --- LLM as a Judge ---
# This section uses a simple LLM judge implementation
# You can enhance this by using a more sophisticated LLM for judging

def llm_judge(question, generated_answer, ground_truth):
    """
    LLM Judge function to evaluate generated answers using actual LLM calls
    """
    try:
        # Create a more sophisticated prompt template
        template = """
        You are an expert evaluator. Your task is to evaluate a generated answer based on a given question and a ground truth answer.
        
        Question: {question}
        Ground Truth Answer: {ground_truth}
        Generated Answer: {generated_answer}
        
        Please evaluate the generated answer for correctness, relevance, and completeness based on the ground truth.
        Respond with a single integer score:
        1 - The generated answer is correct and aligns well with the ground truth.
        0 - The generated answer is incorrect or does not align with the ground truth.
        
        Only respond with the number (0 or 1), no additional text.
        """

        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
        prompt = ChatPromptTemplate.from_template(template)

        chain = (
            prompt
            | llm
            | StrOutputParser()
        )

        response = chain.invoke({
            "question": question, 
            "ground_truth": ground_truth, 
            "generated_answer": generated_answer
        })
        
        # Extract the numeric score from the response
        import re
        score_match = re.search(r'([01])', response.strip())
        if score_match:
            return int(score_match.group(1))
        else:
            # Fallback to simple string matching if LLM doesn't return expected format
            if generated_answer.lower() in ground_truth.lower() or ground_truth.lower() in generated_answer.lower():
                return 1
            else:
                return 0
                
    except Exception as e:
        print(f"Error in LLM judge: {e}")
        # Fallback to simple string matching
        if generated_answer.lower() in ground_truth.lower() or ground_truth.lower() in generated_answer.lower():
            return 1
        else:
            return 0

# Apply LLM judge to evaluate all generated answers
print("Evaluating answers using LLM Judge...")
df['llm_judge_score'] = df.progress_apply(lambda row: llm_judge(row['question'], row['ground truth'], row['ground_truth']), axis=1)

# --- Calculate Precision, Recall, F1-Score, Accuracy ---
# Based on the LLM Judge scores
true_positives = df['llm_judge_score'].sum()
false_positives = len(df) - true_positives # Simplified for this example

accuracy = true_positives / len(df)
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / len(df)
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("\n--- LLM Judge Evaluation ---")
print(f"Total Questions: {len(df)}")
print(f"Correct Answers: {true_positives}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")

# --- BLEU and ROUGE Score Calculation ---
predictions = df['generated_answer'].tolist()
references = [[gt] for gt in df['ground_truth'].tolist()]

print("\nCalculating BLEU and ROUGE scores...")
bleu_results = bleu.compute(predictions=predictions, references=references)
rouge_results = rouge.compute(predictions=predictions, references=references)

print("\n--- BLEU Score ---")
print(f"BLEU Score: {bleu_results['bleu']:.4f}")
print(f"BLEU Precisions: {bleu_results['precisions']}")

print("\n--- ROUGE Score ---")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

# Save results to a CSV file for further analysis
df.to_csv('evaluation_results.csv', index=False)
print("\nResults saved to 'evaluation_results.csv'")

# Display sample results
print("\n--- Sample Results ---")
print(df[['Question', 'ground_truth', 'generated_answer', 'llm_judge_score']].head())


Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating answers using GraphRAG...


Generating Answers:  19%|█▉        | 88/469 [00:00<00:00, 877.04it/s]

Predicted label: __label__KTX
Probability: 0.9839826822280884
Question classified as: KTX
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Xe vô chủ và xe gửi quá thời hạn sẽ được xử lý như thế nào theo quy định của trung tâm PVSV?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__KTX
Probability: 0.7270471453666687
Question classified as: KTX
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Thẻ giữ xe và phiếu giữ xe quá hạn có giá trị như thế nào sau mỗi học kỳ?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__Hoc_tap_ren_luyen
Probability: 0.5236092805862427
Question classified as: Hoc_tap_ren_luyen
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer

Generating Answers:  40%|███▉      | 187/469 [00:00<00:00, 915.32it/s]

Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Hành vi uống rượu bia trong khuôn viên KTX sẽ bị xử lý ra sao?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__KTX
Probability: 0.731459379196167
Question classified as: KTX
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Sinh viên có thể bị buộc ra khỏi KTX nếu tổ chức nấu ăn trong phòng ở không phép, đúng không?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__KTX
Probability: 0.9972632527351379
Question classified as: KTX
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Nội quy về công tác nội trú tại KTX được phát cho ai và khi nào?
Error: [Errno 2] No such file or directory: 'd:/G

Generating Answers:  65%|██████▍   | 303/469 [00:00<00:00, 1021.06it/s]

Predicted label: __label__Dao_tao
Probability: 0.953640341758728
Question classified as: Dao_tao
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Lớp học phần được tổ chức như thế nào?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__Dao_tao
Probability: 0.9986956715583801
Question classified as: Dao_tao
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Học phần tiên quyết là gì?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__Hoc_tap_ren_luyen
Probability: 0.7319497466087341
Question classified as: Hoc_tap_ren_luyen
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Học phí của từng CTĐTCLC và CTĐTTT được xác định dựa trên cơ sở nào?
Er

Generating Answers: 100%|██████████| 469/469 [00:00<00:00, 1085.65it/s]


Predicted label: __label__Dao_tao
Probability: 0.9582071900367737
Question classified as: Dao_tao
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Chứng chỉ TOEIC được cấp bởi tổ chức nào?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__Dao_tao
Probability: 0.9427523612976074
Question classified as: Dao_tao
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Chứng chỉ nào được cấp bởi tổ chức Cambridge English?
Error: [Errno 2] No such file or directory: 'd:/GraphRAG/evaluate/storage/docstore.json'
Predicted label: __label__Dao_tao
Probability: 0.997276782989502
Question classified as: Dao_tao
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage\docstore.json.
Error generating answer for question: Khung năng lực ngoại ngữ được sử dụng cho Việt Nam có bao nhi

Downloading builder script: 5.94kB [00:00, ?B/s]
Downloading extra modules: 4.07kB [00:00, ?B/s]                       
Downloading extra modules: 3.34kB [00:00, ?B/s]
Downloading builder script: 6.27kB [00:00, ?B/s]


ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['absl', 'rouge_score'] using 'pip install # Here to have a nice missing dependency error message early on rouge_score' for instance'