In [None]:
#!!!!!!!!!!!ADJUST PROMPT AS IN GIVE KNOWLEDGE AND QUESTION BOTH TO ALIF TO FIGURE OUT HALLUCINATION!!!

In [1]:
# Install required packages
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install sentencepiece
!pip install sentence_transformers
!pip install tqdm
!pip install pandas

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
# Create necessary directories
!mkdir -p halueval_urdu_data
!mkdir -p evaluation_results

In [3]:
import torch
import json
import os
import time
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [4]:
# Constants and configurations - optimized for Google Colab T4 GPU
CATEGORIES = ["knowledge", "dialogue", "general"]
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
DATA_DIR = "halueval_urdu_data"  # Directory containing the translated HaluEval datasets
OUTPUT_DIR = "evaluation_results"
SAMPLE_LIMIT = 50  # Limit samples per category to avoid long runtime on Colab
BATCH_SIZE = 1  # T4 can only handle one sample at a time for 8B model
MAX_LENGTH = 1024  # Reduced to save memory

In [5]:
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [6]:
# Initialize sentence transformer model for semantic similarity
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')  # Multilingual model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
def format_prompt(input_text, category=None, knowledge=None):
    """Format input text with appropriate prompt template for Alif model

    Args:
        input_text: The main question or instruction
        category: The evaluation category (knowledge, dialogue, etc.)
        knowledge: Optional knowledge context to include in the prompt
    """
    base_template = """
    You are Urdu Chatbot.
    ### Instruction:
    Below is an instruction that describes a task. Write a response in urdu that appropriately completes the request. Don't say you don't know unless you really don't.
    Please be expressive when needed. Give long and detailed answers.
    """

    # Category-specific templates that include knowledge with explicit instruction
    if category == "knowledge" and knowledge:
        prompt_template = base_template + """
    ### Knowledge:
    {knowledge}

    ### Input:
    آپ کو دی گئی معلومات کے مطابق درج ذیل سوال کا جواب دیں۔ صرف دی گئی معلومات کا استعمال کریں۔

    {prompt}

    ### Response:
    """
        return prompt_template.format(knowledge=knowledge, prompt=input_text)

    elif category == "dialogue" and knowledge:
        prompt_template = base_template + """
    ### Knowledge:
    {knowledge}

    ### Input:
    آپ کو دی گئی معلومات کے مطابق ہی گفتگو میں حصہ لیں۔ صرف دی گئی معلومات کا استعمال کریں اور کوئی اضافی معلومات یا اختراع شامل نہ کریں۔

    {prompt}

    ### Response:
    """
        return prompt_template.format(knowledge=knowledge, prompt=input_text)

    elif category == "summarization" and knowledge:
        # For summarization, the document is already in the prompt
        prompt_template = base_template + """
    ### Input:
    درج ذیل مضمون کا مختصر خلاصہ کریں۔ صرف وہی معلومات شامل کریں جو مضمون میں موجود ہیں۔

    {prompt}

    ### Response:
    """
        return prompt_template.format(prompt=input_text)

    # Default template without knowledge
    else:
        prompt_template = base_template + """
    ### Input:
    {prompt}

    ### Response:
    """
        return prompt_template.format(prompt=input_text)

In [8]:
def load_model():
    """Load the Alif model and tokenizer - optimized for Google Colab T4 GPU"""
    print("Loading Alif model and tokenizer...")

    # Clear CUDA cache before loading model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"Available GPU memory before loading: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"Used GPU memory before loading: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    # 4-bit quantization configuration - optimized for T4 GPU
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load tokenizer first
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Load model with optimized settings for T4
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,  # Use fp16 for better performance
        low_cpu_mem_usage=True      # Reduce CPU memory usage during loading
    )

    # Report memory usage after loading
    if torch.cuda.is_available():
        print(f"Used GPU memory after loading: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    return model, tokenizer

In [9]:
def generate_response(model, tokenizer, query, category=None, knowledge=None):
    """Generate response from Alif model for a given query - optimized for T4 GPU

    Args:
        model: The loaded model
        tokenizer: The loaded tokenizer
        query: The query text
        category: The evaluation category (knowledge, dialogue, etc.)
        knowledge: Optional knowledge context to include
    """
    prompt = format_prompt(query, category, knowledge)

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

    # Generate response with reduced max_new_tokens to save memory
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,  # Reduced from 1024 to save memory
            do_sample=True,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            repetition_penalty=1.2,
        )

    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

    # Clear CUDA cache to prevent OOM
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return response.strip()

In [10]:
def load_halueval_dataset(category):
    """Load the translated HaluEval dataset for a specific category"""
    file_path = os.path.join(DATA_DIR, f"{category}_data_urdu.json")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"Loaded {len(data)} samples from {category} dataset")
        return data
    except Exception as e:
        print(f"Error loading {category} dataset: {e}")
        return []

In [11]:
def calculate_semantic_similarity(text1, text2):
    """Calculate semantic similarity between two texts using sentence embeddings - T4 optimized"""
    try:
        # Handle empty strings
        if not text1.strip() or not text2.strip():
            return 0.0

        # Truncate very long texts to prevent OOM
        max_text_length = 1000  # Characters
        if len(text1) > max_text_length:
            text1 = text1[:max_text_length]
        if len(text2) > max_text_length:
            text2 = text2[:max_text_length]

        # Generate embeddings with CPU to save GPU memory
        with torch.no_grad():  # Ensure no gradients are calculated
            embedding1 = embedding_model.encode(text1, convert_to_tensor=True)
            embedding2 = embedding_model.encode(text2, convert_to_tensor=True)

        # Calculate cosine similarity
        similarity = torch.cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0)).item()

        return similarity
    except Exception as e:
        print(f"Error calculating similarity: {e}")
        return 0.0

In [12]:
def evaluate_knowledge_qa(model, tokenizer, data, results_file):
    """Evaluate model on Knowledge QA category - T4 GPU optimized"""
    results = []

    # Limit the number of samples to evaluate (for Colab T4 environment)
    sample_count = min(SAMPLE_LIMIT, len(data))
    eval_data = data[:sample_count]

    for i, sample in enumerate(tqdm(eval_data)):
        try:
            knowledge = sample.get("knowledge", "")
            question = sample.get("question", "")
            right_answer = sample.get("right_answer", "")
            hallucinated_answer = sample.get("hallucinated_answer", "")

            # Generate model response with knowledge in the prompt
            model_response = generate_response(model, tokenizer, question,
                                              category="knowledge",
                                              knowledge=knowledge)

            # Calculate similarity scores
            similarity_to_right = calculate_semantic_similarity(model_response, right_answer)
            similarity_to_hallucinated = calculate_semantic_similarity(model_response, hallucinated_answer)

            # Determine if response is hallucinated
            is_hallucinated = similarity_to_hallucinated > similarity_to_right

            # Save result
            result = {
                "knowledge": knowledge,
                "question": question,
                "model_response": model_response,
                "right_answer": right_answer,
                "hallucinated_answer": hallucinated_answer,
                "similarity_to_right": similarity_to_right,
                "similarity_to_hallucinated": similarity_to_hallucinated,
                "is_hallucinated": is_hallucinated
            }

            results.append(result)

            # Save progress after each sample for Colab (in case of disconnections)
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            # Run garbage collection to free memory
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"Error processing knowledge QA sample {i}: {e}")
            # Save progress even when errors occur
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

    # Save final results
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return results

In [13]:
def evaluate_dialogue(model, tokenizer, data, results_file):
    """Evaluate model on Dialogue category - T4 GPU optimized"""
    results = []

    # Limit the number of samples to evaluate (for Colab T4 environment)
    sample_count = min(SAMPLE_LIMIT, len(data))
    eval_data = data[:sample_count]

    for i, sample in enumerate(tqdm(eval_data)):
        try:
            knowledge = sample.get("knowledge", "")
            dialogue_history = sample.get("dialogue_history", "")
            right_response = sample.get("right_response", "")
            hallucinated_response = sample.get("hallucinated_response", "")

            # Generate model response with knowledge included
            model_response = generate_response(model, tokenizer, dialogue_history,
                                              category="dialogue",
                                              knowledge=knowledge)

            # Calculate similarity scores
            similarity_to_right = calculate_semantic_similarity(model_response, right_response)
            similarity_to_hallucinated = calculate_semantic_similarity(model_response, hallucinated_response)

            # Determine if response is hallucinated
            is_hallucinated = similarity_to_hallucinated > similarity_to_right

            # Save result
            result = {
                "knowledge": knowledge,
                "dialogue_history": dialogue_history,
                "model_response": model_response,
                "right_response": right_response,
                "hallucinated_response": hallucinated_response,
                "similarity_to_right": similarity_to_right,
                "similarity_to_hallucinated": similarity_to_hallucinated,
                "is_hallucinated": is_hallucinated
            }

            results.append(result)

            # Save progress after each sample for Colab (in case of disconnections)
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            # Run garbage collection to free memory
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"Error processing dialogue sample {i}: {e}")
            # Save progress even when errors occur
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

    # Save final results
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return results

In [14]:
# def evaluate_summarization(model, tokenizer, data, results_file):
#     """Evaluate model on Summarization category - T4 GPU optimized"""
#     results = []

#     # Limit the number of samples to evaluate (for Colab T4 environment)
#     # Use even smaller sample size for summarization since documents are longer
#     sample_count = min(int(SAMPLE_LIMIT/2), len(data))
#     eval_data = data[:sample_count]

#     for i, sample in enumerate(tqdm(eval_data)):
#         try:
#             document = sample.get("document", "")
#             right_summary = sample.get("right_summary", "")
#             hallucinated_summary = sample.get("hallucinated_summary", "")

#             # Truncate long documents to save memory
#             if len(document) > 4000:
#                 document = document[:4000] + "..."

#             # Create prompt for summarization
#             query = f"اس مضمون کا خلاصہ کریں: {document}"

#             # Generate model summary
#             model_summary = generate_response(model, tokenizer, query)

#             # Calculate similarity scores
#             similarity_to_right = calculate_semantic_similarity(model_summary, right_summary)
#             similarity_to_hallucinated = calculate_semantic_similarity(model_summary, hallucinated_summary)

#             # Determine if summary is hallucinated
#             is_hallucinated = similarity_to_hallucinated > similarity_to_right

#             # Save result
#             result = {
#                 # Store just the beginning of the document to save file size
#                 "document": document[:500] + "..." if len(document) > 500 else document,
#                 "model_summary": model_summary,
#                 "right_summary": right_summary,
#                 "hallucinated_summary": hallucinated_summary,
#                 "similarity_to_right": similarity_to_right,
#                 "similarity_to_hallucinated": similarity_to_hallucinated,
#                 "is_hallucinated": is_hallucinated
#             }

#             results.append(result)

#             # Save progress after each sample for Colab (in case of disconnections)
#             with open(results_file, 'w', encoding='utf-8') as f:
#                 json.dump(results, f, ensure_ascii=False, indent=2)

#             # Run garbage collection to free memory - critical for summarization
#             gc.collect()
#             if torch.cuda.is_available():
#                 torch.cuda.empty_cache()

#         except Exception as e:
#             print(f"Error processing summarization sample {i}: {e}")
#             # Save progress even when errors occur
#             with open(results_file, 'w', encoding='utf-8') as f:
#                 json.dump(results, f, ensure_ascii=False, indent=2)

#     # Save final results
#     with open(results_file, 'w', encoding='utf-8') as f:
#         json.dump(results, f, ensure_ascii=False, indent=2)

#     return results

In [15]:
def evaluate_general(model, tokenizer, data, results_file):
    """Evaluate model on General category - T4 GPU optimized"""
    results = []

    # Limit the number of samples to evaluate (for Colab T4 environment)
    sample_count = min(SAMPLE_LIMIT, len(data))
    eval_data = data[:sample_count]

    for i, sample in enumerate(tqdm(eval_data)):
        try:
            user_query = sample.get("user_query", "")
            reference_response = sample.get("chatgpt_response", "")  # Using this as reference
            hallucination_label = sample.get("hallucination", "no")  # "yes" or "no"

            # Generate model response
            model_response = generate_response(model, tokenizer, user_query)

            # For general category, we check if the model response contains hallucinations
            # based on similarity to reference and presence of potential hallucination spans

            # Calculate similarity to reference
            similarity_to_reference = calculate_semantic_similarity(model_response, reference_response)

            # Determine hallucination based on reference similarity threshold
            # This is a simple heuristic - you may need to adjust the threshold
            is_hallucinated = similarity_to_reference < 0.5

            # Save result
            result = {
                "user_query": user_query,
                "model_response": model_response,
                "reference_response": reference_response,
                "reference_hallucination_label": hallucination_label,
                "similarity_to_reference": similarity_to_reference,
                "is_hallucinated": is_hallucinated
            }

            results.append(result)

            # Save progress after each sample for Colab (in case of disconnections)
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            # Run garbage collection to free memory
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"Error processing general sample {i}: {e}")
            # Save progress even when errors occur
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

    # Save final results
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return results

In [16]:
def generate_report(evaluation_results):
    """Generate comprehensive evaluation report"""
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_file = os.path.join(OUTPUT_DIR, f"evaluation_report_{timestamp}.txt")

    # Calculate overall metrics
    overall_stats = {}

    for category in CATEGORIES:
        results = evaluation_results.get(category, [])
        if not results:
            continue

        # Calculate hallucination rate
        hallucination_count = sum(1 for r in results if r.get("is_hallucinated", False))
        hallucination_rate = hallucination_count / len(results) if results else 0

        # Calculate average similarity scores
        if category in ["knowledge", "dialogue", "summarization"]:
            avg_similarity_to_right = np.mean([r.get("similarity_to_right", 0) for r in results])
            avg_similarity_to_hallucinated = np.mean([r.get("similarity_to_hallucinated", 0) for r in results])
        else:  # General category
            avg_similarity_to_right = np.mean([r.get("similarity_to_reference", 0) for r in results])
            avg_similarity_to_hallucinated = 0  # Not applicable for general

        overall_stats[category] = {
            "sample_count": len(results),
            "hallucination_count": hallucination_count,
            "hallucination_rate": hallucination_rate,
            "avg_similarity_to_right": avg_similarity_to_right,
            "avg_similarity_to_hallucinated": avg_similarity_to_hallucinated
        }

    # Calculate overall hallucination rate across all categories
    total_samples = sum(stats["sample_count"] for stats in overall_stats.values())
    total_hallucinations = sum(stats["hallucination_count"] for stats in overall_stats.values())
    overall_hallucination_rate = total_hallucinations / total_samples if total_samples > 0 else 0

    # Generate report text
    report = []
    report.append("=" * 80)
    report.append(f"ALIF MODEL HALLUCINATION EVALUATION REPORT - {timestamp}")
    report.append("=" * 80)
    report.append(f"\nModel: {MODEL_ID}")
    report.append(f"Total samples evaluated: {total_samples}")
    report.append(f"Overall hallucination rate: {overall_hallucination_rate:.4f} ({total_hallucinations}/{total_samples})")
    report.append("\n" + "=" * 80)

    # Add category-specific stats
    for category in CATEGORIES:
        if category in overall_stats:
            stats = overall_stats[category]
            report.append(f"\n{category.upper()} CATEGORY:")
            report.append(f"- Samples evaluated: {stats['sample_count']}")
            report.append(f"- Hallucination rate: {stats['hallucination_rate']:.4f} ({stats['hallucination_count']}/{stats['sample_count']})")
            report.append(f"- Avg. similarity to correct response: {stats['avg_similarity_to_right']:.4f}")
            if category != "general":
                report.append(f"- Avg. similarity to hallucinated response: {stats['avg_similarity_to_hallucinated']:.4f}")
            report.append("-" * 40)

    # Add examples of hallucinated and non-hallucinated responses
    report.append("\n" + "=" * 80)
    report.append("\nEXAMPLE RESPONSES:")

    for category in CATEGORIES:
        results = evaluation_results.get(category, [])
        if not results:
            continue

        # Get hallucinated examples
        hallucinated_examples = [r for r in results if r.get("is_hallucinated", False)]
        non_hallucinated_examples = [r for r in results if not r.get("is_hallucinated", False)]

        report.append(f"\n{category.upper()} - HALLUCINATED EXAMPLE:")
        if hallucinated_examples:
            example = hallucinated_examples[0]
            if category == "knowledge":
                report.append(f"Question: {example.get('question', '')}")
                report.append(f"Model response: {example.get('model_response', '')}")
                report.append(f"Right answer: {example.get('right_answer', '')}")
                report.append(f"Similarity to right: {example.get('similarity_to_right', 0):.4f}")
                report.append(f"Similarity to hallucinated: {example.get('similarity_to_hallucinated', 0):.4f}")
            elif category == "dialogue":
                report.append(f"Dialogue history: {example.get('dialogue_history', '')}")
                report.append(f"Model response: {example.get('model_response', '')}")
                report.append(f"Right response: {example.get('right_response', '')}")
                report.append(f"Similarity to right: {example.get('similarity_to_right', 0):.4f}")
                report.append(f"Similarity to hallucinated: {example.get('similarity_to_hallucinated', 0):.4f}")
            elif category == "summarization":
                report.append(f"Document: {example.get('document', '')[:200]}...")  # First 200 chars
                report.append(f"Model summary: {example.get('model_summary', '')}")
                report.append(f"Right summary: {example.get('right_summary', '')}")
                report.append(f"Similarity to right: {example.get('similarity_to_right', 0):.4f}")
                report.append(f"Similarity to hallucinated: {example.get('similarity_to_hallucinated', 0):.4f}")
            else:  # General
                report.append(f"User query: {example.get('user_query', '')}")
                report.append(f"Model response: {example.get('model_response', '')}")
                report.append(f"Reference response: {example.get('reference_response', '')}")
                report.append(f"Similarity to reference: {example.get('similarity_to_reference', 0):.4f}")
        else:
            report.append("No hallucinated examples found.")

        report.append(f"\n{category.upper()} - NON-HALLUCINATED EXAMPLE:")
        if non_hallucinated_examples:
            example = non_hallucinated_examples[0]
            if category == "knowledge":
                report.append(f"Question: {example.get('question', '')}")
                report.append(f"Model response: {example.get('model_response', '')}")
                report.append(f"Right answer: {example.get('right_answer', '')}")
                report.append(f"Similarity to right: {example.get('similarity_to_right', 0):.4f}")
                report.append(f"Similarity to hallucinated: {example.get('similarity_to_hallucinated', 0):.4f}")
            elif category == "dialogue":
                report.append(f"Dialogue history: {example.get('dialogue_history', '')}")
                report.append(f"Model response: {example.get('model_response', '')}")
                report.append(f"Right response: {example.get('right_response', '')}")
                report.append(f"Similarity to right: {example.get('similarity_to_right', 0):.4f}")
                report.append(f"Similarity to hallucinated: {example.get('similarity_to_hallucinated', 0):.4f}")
            elif category == "summarization":
                report.append(f"Document: {example.get('document', '')[:200]}...")  # First 200 chars
                report.append(f"Model summary: {example.get('model_summary', '')}")
                report.append(f"Right summary: {example.get('right_summary', '')}")
                report.append(f"Similarity to right: {example.get('similarity_to_right', 0):.4f}")
                report.append(f"Similarity to hallucinated: {example.get('similarity_to_hallucinated', 0):.4f}")
            else:  # General
                report.append(f"User query: {example.get('user_query', '')}")
                report.append(f"Model response: {example.get('model_response', '')}")
                report.append(f"Reference response: {example.get('reference_response', '')}")
                report.append(f"Similarity to reference: {example.get('similarity_to_reference', 0):.4f}")
        else:
            report.append("No non-hallucinated examples found.")

    # Save the report
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    # Print the report
    print('\n'.join(report))

    print(f"\nReport saved to {report_file}")

    # Also create CSV summary for easy analysis
    summary_data = []
    for category, stats in overall_stats.items():
        summary_data.append({
            "Category": category,
            "Samples": stats["sample_count"],
            "Hallucinations": stats["hallucination_count"],
            "Hallucination_Rate": stats["hallucination_rate"],
            "Avg_Similarity_To_Right": stats["avg_similarity_to_right"],
            "Avg_Similarity_To_Hallucinated": stats["avg_similarity_to_hallucinated"]
        })

    # Add overall row
    summary_data.append({
        "Category": "OVERALL",
        "Samples": total_samples,
        "Hallucinations": total_hallucinations,
        "Hallucination_Rate": overall_hallucination_rate,
        "Avg_Similarity_To_Right": np.mean([s["avg_similarity_to_right"] for s in overall_stats.values()]),
        "Avg_Similarity_To_Hallucinated": np.mean([s["avg_similarity_to_hallucinated"] for s in overall_stats.values() if category != "general"])
    })

    # Save summary to CSV
    summary_df = pd.DataFrame(summary_data)
    summary_csv = os.path.join(OUTPUT_DIR, f"evaluation_summary_{timestamp}.csv")
    summary_df.to_csv(summary_csv, index=False)

    print(f"Summary saved to {summary_csv}")

    return report_file, summary_csv

In [17]:
def main():
    """Main evaluation pipeline - optimized for Google Colab T4 GPU"""
    start_time = time.time()

    # Create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Check for CUDA availability and print GPU info
    if torch.cuda.is_available():
        print(f"CUDA available: {torch.cuda.is_available()}")
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("WARNING: CUDA not available. Running on CPU will be extremely slow.")

    # Initialize results dictionary
    evaluation_results = {}

    # Process each category separately to avoid memory leaks
    for category in CATEGORIES:
        print(f"\n{'='*50}\nEvaluating {category.upper()} category\n{'='*50}")

        # Load dataset
        data = load_halueval_dataset(category)

        if not data:
            print(f"Skipping {category} due to missing or empty dataset")
            continue

        # Set up results file
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        results_file = os.path.join(OUTPUT_DIR, f"{category}_results_{timestamp}.json")

        # Load model and tokenizer fresh for each category to prevent memory leaks
        model, tokenizer = load_model()

        try:
            # Run evaluation based on category
            if category == "knowledge":
                results = evaluate_knowledge_qa(model, tokenizer, data, results_file)
            elif category == "dialogue":
                results = evaluate_dialogue(model, tokenizer, data, results_file)
            #elif category == "summarization":
             #   results = evaluate_summarization(model, tokenizer, data, results_file)
            elif category == "general":
                results = evaluate_general(model, tokenizer, data, results_file)

            # Store results
            evaluation_results[category] = results

            print(f"Completed evaluation for {category}, results saved to {results_file}")

        except Exception as e:
            print(f"Error during {category} evaluation: {e}")
        finally:
            # Delete model and tokenizer to free up memory for next category
            del model
            del tokenizer
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"Memory freed for next category. Current GPU usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    # Generate comprehensive report
    try:
        report_file, summary_csv = generate_report(evaluation_results)

        # Print execution time
        total_time = time.time() - start_time
        hours, remainder = divmod(total_time, 3600)
        minutes, seconds = divmod(remainder, 60)
        print(f"\nTotal evaluation time: {int(hours)}h {int(minutes)}m {int(seconds)}s")
        print(f"Report: {report_file}")
        print(f"Summary: {summary_csv}")
    except Exception as e:
        print(f"Error generating final report: {e}")
        # Still save evaluation results in case report generation fails
        with open(os.path.join(OUTPUT_DIR, f"evaluation_results_raw_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"), 'w', encoding='utf-8') as f:
            json.dump(evaluation_results, f, ensure_ascii=False, indent=2)

In [19]:
if __name__ == "__main__":
    main()

CUDA available: True
GPU: Tesla T4
Total GPU memory: 15.83 GB

Evaluating KNOWLEDGE category
Loaded 10 samples from knowledge dataset
Loading Alif model and tokenizer...
Available GPU memory before loading: 15.83 GB
Used GPU memory before loading: 1.11 GB


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Used GPU memory after loading: 10.29 GB


100%|██████████| 10/10 [01:41<00:00, 10.14s/it]


Completed evaluation for knowledge, results saved to evaluation_results/knowledge_results_2025-04-27_13-31-42.json
Memory freed for next category. Current GPU usage: 1.12 GB

Evaluating DIALOGUE category
Loaded 10 samples from dialogue dataset
Loading Alif model and tokenizer...
Available GPU memory before loading: 15.83 GB
Used GPU memory before loading: 1.12 GB


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Used GPU memory after loading: 10.29 GB


100%|██████████| 10/10 [01:50<00:00, 11.05s/it]


Completed evaluation for dialogue, results saved to evaluation_results/dialogue_results_2025-04-27_13-40-11.json
Memory freed for next category. Current GPU usage: 1.12 GB

Evaluating GENERAL category
Loaded 10 samples from general dataset
Loading Alif model and tokenizer...
Available GPU memory before loading: 15.83 GB
Used GPU memory before loading: 1.12 GB


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Used GPU memory after loading: 10.29 GB


100%|██████████| 10/10 [07:32<00:00, 45.24s/it]


Completed evaluation for general, results saved to evaluation_results/general_results_2025-04-27_13-43-43.json
Memory freed for next category. Current GPU usage: 1.12 GB
ALIF MODEL HALLUCINATION EVALUATION REPORT - 2025-04-27_13-52-55

Model: large-traversaal/Alif-1.0-8B-Instruct
Total samples evaluated: 30
Overall hallucination rate: 0.4000 (12/30)


KNOWLEDGE CATEGORY:
- Samples evaluated: 10
- Hallucination rate: 0.3000 (3/10)
- Avg. similarity to correct response: 0.3479
- Avg. similarity to hallucinated response: 0.2330
----------------------------------------

DIALOGUE CATEGORY:
- Samples evaluated: 10
- Hallucination rate: 0.3000 (3/10)
- Avg. similarity to correct response: 0.1481
- Avg. similarity to hallucinated response: 0.1642
----------------------------------------

GENERAL CATEGORY:
- Samples evaluated: 10
- Hallucination rate: 0.6000 (6/10)
- Avg. similarity to correct response: 0.4464
----------------------------------------


EXAMPLE RESPONSES:

KNOWLEDGE - HALLUCINAT