In [1]:
from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HF_READ_TOKEN'))

In [8]:
!pip install nltk rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a934f302707923d22f681d26635b4ce90ca9f1be807124bbf3ee252f1a551fe1
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [13]:
"""
Testing script for base Qwen/Qwen3-4B-2507 model on WebQSP dataset
Optimized for CUDA/Google Colab environment
"""

import json
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import numpy as np
from typing import Dict, List
import os
from tqdm import tqdm
import time
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Set environment variables for optimization
os.environ["TOKENIZERS_PARALLELISM"] = "false"


class WebQSPTestDataset(Dataset):
    """Dataset class for WebQSP test data"""

    def __init__(self, json_file: str, max_length: int = 512):
        self.max_length = max_length
        self.data = self.load_and_process_data(json_file)

    def load_and_process_data(self, json_file: str) -> List[Dict]:
        """Load and process the WebQSP JSON file"""
        with open(json_file, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        processed_data = []

        # Process the data format: list of dicts with question, reasoning_graph, answer
        for idx, item in enumerate(raw_data):
            question = item.get('question', '').strip()
            reasoning_graph = item.get('reasoning_graph', [])
            answer = item.get('answer', [])

            # Skip if no question or answer
            if not question or not answer:
                continue

            # Format reasoning graph as a string
            if reasoning_graph:
                # Handle nested lists and ensure all items are strings
                flat_graph = []
                for graph_item in reasoning_graph:
                    if isinstance(graph_item, list):
                        flat_graph.extend(str(x) for x in graph_item)
                    else:
                        flat_graph.append(str(graph_item))
                reasoning_str = " -> ".join(flat_graph)
            else:
                reasoning_str = ""

            # Format answers as list of strings
            if isinstance(answer, list):
                answer_list = [str(ans) for ans in answer if ans]
            else:
                answer_list = [str(answer)]

            if question and answer_list:
                processed_data.append({
                    'question': question,
                    'reasoning_graph': reasoning_str,
                    'ground_truth_answers': answer_list,
                    'question_id': f"test_{idx}"
                })

        print(f"Loaded {len(processed_data)} test examples")
        return processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


def load_model_and_tokenizer(
    model_name: str,
    use_4bit: bool = True
):
    """Load the model and tokenizer"""

    print(f"Loading tokenizer from {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        padding_side='right'
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if use_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        print(f"Loading model with 4-bit quantization from {model_name}")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )
    else:
        print(f"Loading model in FP16 from {model_name}")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )

    model.eval()
    return model, tokenizer


def create_prompt(question: str, reasoning_graph: str) -> str:
    """Create prompt for the model"""
    prompt = f"""<|im_start|>system
You are a helpful assistant who answers questions using knowledge graph reasoning. You only use the information obtained from the context provided by the user. If you don't know the answer, just say that you don't know.
<|im_end|>
<|im_start|>user
Question: {question}
Reasoning Graph: {reasoning_graph}
<|im_end|>
<|im_start|>assistant
"""
    return prompt


def generate_answer(
    model,
    tokenizer,
    question: str,
    reasoning_graph: str,
    max_new_tokens: int = 20,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> str:
    """Generate answer for a given question and reasoning graph"""

    prompt = create_prompt(question, reasoning_graph)

    # Tokenize and move inputs to the model device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs['input_ids']
    input_len = input_ids.shape[1]
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate and decode only the new tokens (slice off the prompt)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # outputs is (batch_size, seq_len); slice off prompt tokens
    gen_ids = outputs[0]
    if gen_ids.dim() == 2:
        gen_ids = gen_ids[0]

    # If generation includes the prompt, remove prompt tokens
    if gen_ids.shape[0] > input_len:
        new_tokens = gen_ids[input_len:]
    else:
        new_tokens = gen_ids

    # Decode only generated tokens and remove special tokens
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Extract assistant response (Qwen format) from generated_text
    response = generated_text
    # remove any leading header markers if present
    if "<|im_start|>assistant" in response:
        response = response.split("<|im_start|>assistant")[-1]
    if "<|im_end|>" in response:
        response = response.split("<|im_end|>")[0]

    response = response.strip()
    return response


def calculate_bleu_score(prediction: str, ground_truths: List[str]) -> float:
    """Calculate BLEU score"""
    if not prediction.strip():
        return 0.0

    # Tokenize prediction
    pred_tokens = prediction.lower().split()

    # Tokenize all ground truths (BLEU expects list of reference token lists)
    reference_tokens = [gt.lower().split() for gt in ground_truths]

    # Use smoothing function to handle edge cases
    smoothing = SmoothingFunction().method1

    try:
        # Calculate BLEU score (using BLEU-4)
        bleu = sentence_bleu(reference_tokens, pred_tokens, smoothing_function=smoothing)
        return bleu
    except Exception as e:
        return 0.0


def calculate_f1_score(prediction: str, ground_truths: List[str]) -> float:
    """Calculate F1 score based on token overlap"""
    if not prediction.strip():
        return 0.0

    pred_tokens = set(prediction.lower().split())

    max_f1 = 0.0
    for gt in ground_truths:
        gt_tokens = set(gt.lower().split())

        if len(pred_tokens) == 0 or len(gt_tokens) == 0:
            continue

        common_tokens = pred_tokens & gt_tokens
        if len(common_tokens) == 0:
            continue

        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(gt_tokens)
        f1 = 2 * (precision * recall) / (precision + recall)
        max_f1 = max(max_f1, f1)

    return max_f1


def calculate_rouge_scores(prediction: str, ground_truths: List[str]) -> Dict[str, float]:
    """Calculate ROUGE-1, ROUGE-2, and ROUGE-L scores with recall, precision, and F-measure"""
    if not prediction.strip():
        return {
            'rouge1_recall': 0.0, 'rouge1_precision': 0.0, 'rouge1_fmeasure': 0.0,
            'rouge2_recall': 0.0, 'rouge2_precision': 0.0, 'rouge2_fmeasure': 0.0,
            'rougeL_recall': 0.0, 'rougeL_precision': 0.0, 'rougeL_fmeasure': 0.0
        }

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate ROUGE scores against all ground truths and take the maximum for each metric
    max_scores = {
        'rouge1_recall': 0.0, 'rouge1_precision': 0.0, 'rouge1_fmeasure': 0.0,
        'rouge2_recall': 0.0, 'rouge2_precision': 0.0, 'rouge2_fmeasure': 0.0,
        'rougeL_recall': 0.0, 'rougeL_precision': 0.0, 'rougeL_fmeasure': 0.0
    }

    for gt in ground_truths:
        scores = scorer.score(gt, prediction)

        # ROUGE-1 scores
        max_scores['rouge1_recall'] = max(max_scores['rouge1_recall'], scores['rouge1'].recall)
        max_scores['rouge1_precision'] = max(max_scores['rouge1_precision'], scores['rouge1'].precision)
        max_scores['rouge1_fmeasure'] = max(max_scores['rouge1_fmeasure'], scores['rouge1'].fmeasure)

        # ROUGE-2 scores
        max_scores['rouge2_recall'] = max(max_scores['rouge2_recall'], scores['rouge2'].recall)
        max_scores['rouge2_precision'] = max(max_scores['rouge2_precision'], scores['rouge2'].precision)
        max_scores['rouge2_fmeasure'] = max(max_scores['rouge2_fmeasure'], scores['rouge2'].fmeasure)

        # ROUGE-L scores
        max_scores['rougeL_recall'] = max(max_scores['rougeL_recall'], scores['rougeL'].recall)
        max_scores['rougeL_precision'] = max(max_scores['rougeL_precision'], scores['rougeL'].precision)
        max_scores['rougeL_fmeasure'] = max(max_scores['rougeL_fmeasure'], scores['rougeL'].fmeasure)

    return max_scores


def test_model(
    model,
    tokenizer,
    dataset: WebQSPTestDataset,
    output_file: str = "base_model_test_results.json",
    num_samples: int = None,
    save_predictions: bool = True
) -> Dict:
    """Test the model on test dataset"""

    results = []
    metrics = {
        'bleu': [],
        'f1': [],
        'rouge1_recall': [], 'rouge1_precision': [], 'rouge1_fmeasure': [],
        'rouge2_recall': [], 'rouge2_precision': [], 'rouge2_fmeasure': [],
        'rougeL_recall': [], 'rougeL_precision': [], 'rougeL_fmeasure': []
    }

    num_samples = num_samples or len(dataset)
    num_samples = min(num_samples, len(dataset))

    print(f"\nTesting on {num_samples} samples...")

    start_time = time.time()

    for idx in tqdm(range(num_samples), desc="Testing"):
        item = dataset[idx]

        try:
            prediction = generate_answer(
                model,
                tokenizer,
                item['question'],
                item['reasoning_graph'],
                max_new_tokens=100,
                temperature=0.7,
                top_p=0.9
            )
        except Exception as e:
            print(f"\nError generating answer for sample {idx}: {e}")
            prediction = ""

        ground_truths = item['ground_truth_answers']

        # Calculate metrics
        bleu = calculate_bleu_score(prediction, ground_truths)
        f1 = calculate_f1_score(prediction, ground_truths)
        rouge_scores = calculate_rouge_scores(prediction, ground_truths)

        metrics['bleu'].append(bleu)
        metrics['f1'].append(f1)
        metrics['rouge1_recall'].append(rouge_scores['rouge1_recall'])
        metrics['rouge1_precision'].append(rouge_scores['rouge1_precision'])
        metrics['rouge1_fmeasure'].append(rouge_scores['rouge1_fmeasure'])
        metrics['rouge2_recall'].append(rouge_scores['rouge2_recall'])
        metrics['rouge2_precision'].append(rouge_scores['rouge2_precision'])
        metrics['rouge2_fmeasure'].append(rouge_scores['rouge2_fmeasure'])
        metrics['rougeL_recall'].append(rouge_scores['rougeL_recall'])
        metrics['rougeL_precision'].append(rouge_scores['rougeL_precision'])
        metrics['rougeL_fmeasure'].append(rouge_scores['rougeL_fmeasure'])

        # Store result
        result = {
            'question_id': item['question_id'],
            'question': item['question'],
            'reasoning_graph': item['reasoning_graph'],
            'ground_truth': ground_truths,
            'prediction': prediction,
            'bleu_score': bleu,
            'f1_score': f1,
            'rouge1_recall': rouge_scores['rouge1_recall'],
            'rouge1_precision': rouge_scores['rouge1_precision'],
            'rouge1_fmeasure': rouge_scores['rouge1_fmeasure'],
            'rouge2_recall': rouge_scores['rouge2_recall'],
            'rouge2_precision': rouge_scores['rouge2_precision'],
            'rouge2_fmeasure': rouge_scores['rouge2_fmeasure'],
            'rougeL_recall': rouge_scores['rougeL_recall'],
            'rougeL_precision': rouge_scores['rougeL_precision'],
            'rougeL_fmeasure': rouge_scores['rougeL_fmeasure']
        }
        results.append(result)

        # Print sample results every 100 samples
        if (idx + 1) % 100 == 0:
            print(f"\n--- Sample {idx + 1} ---")
            print(f"Question: {item['question']}")
            print(f"Prediction: {prediction}")
            print(f"Ground Truth: {ground_truths}")
            print(f"BLEU: {bleu:.3f}, F1: {f1:.3f}, ROUGE-L F1: {rouge_scores['rougeL_fmeasure']:.3f}")

    test_time = time.time() - start_time

    # Calculate aggregate metrics
    aggregate_metrics = {
        'bleu_score': np.mean(metrics['bleu']),
        'f1_score': np.mean(metrics['f1']),
        'rouge1_recall': np.mean(metrics['rouge1_recall']),
        'rouge1_precision': np.mean(metrics['rouge1_precision']),
        'rouge1_fmeasure': np.mean(metrics['rouge1_fmeasure']),
        'rouge2_recall': np.mean(metrics['rouge2_recall']),
        'rouge2_precision': np.mean(metrics['rouge2_precision']),
        'rouge2_fmeasure': np.mean(metrics['rouge2_fmeasure']),
        'rougeL_recall': np.mean(metrics['rougeL_recall']),
        'rougeL_precision': np.mean(metrics['rougeL_precision']),
        'rougeL_fmeasure': np.mean(metrics['rougeL_fmeasure']),
        'total_samples': num_samples,
        'test_time_minutes': test_time / 60,
        'average_time_per_sample': test_time / num_samples
    }

    # Calculate additional statistics (standard deviations)
    aggregate_metrics['bleu_score_std'] = np.std(metrics['bleu'])
    aggregate_metrics['f1_score_std'] = np.std(metrics['f1'])
    aggregate_metrics['rouge1_recall_std'] = np.std(metrics['rouge1_recall'])
    aggregate_metrics['rouge1_precision_std'] = np.std(metrics['rouge1_precision'])
    aggregate_metrics['rouge1_fmeasure_std'] = np.std(metrics['rouge1_fmeasure'])
    aggregate_metrics['rouge2_recall_std'] = np.std(metrics['rouge2_recall'])
    aggregate_metrics['rouge2_precision_std'] = np.std(metrics['rouge2_precision'])
    aggregate_metrics['rouge2_fmeasure_std'] = np.std(metrics['rouge2_fmeasure'])
    aggregate_metrics['rougeL_recall_std'] = np.std(metrics['rougeL_recall'])
    aggregate_metrics['rougeL_precision_std'] = np.std(metrics['rougeL_precision'])
    aggregate_metrics['rougeL_fmeasure_std'] = np.std(metrics['rougeL_fmeasure'])

    # Save results to file
    if save_predictions:
        output_data = {
            'aggregate_metrics': aggregate_metrics,
            'detailed_results': results
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"\nResults saved to {output_file}")

    return aggregate_metrics


def print_test_summary(metrics: Dict):
    """Print test summary in a formatted way"""
    print("\n" + "="*70)
    print("BASE MODEL TEST RESULTS SUMMARY")
    print("="*70)
    print(f"Total Samples:              {metrics['total_samples']}")
    print(f"Test Time:                  {metrics['test_time_minutes']:.2f} minutes")
    print(f"Avg Time per Sample:        {metrics['average_time_per_sample']:.2f} seconds")
    print("-"*70)
    print(f"BLEU Score:                 {metrics['bleu_score']:.4f} ({metrics['bleu_score']*100:.2f}%)")
    print(f"  ± Std Dev:                {metrics['bleu_score_std']:.4f}")
    print("-"*70)
    print(f"F1 Score:                   {metrics['f1_score']:.4f} ({metrics['f1_score']*100:.2f}%)")
    print(f"  ± Std Dev:                {metrics['f1_score_std']:.4f}")
    print("-"*70)
    print("ROUGE-1 Scores:")
    print(f"  Recall:                   {metrics['rouge1_recall']:.4f} ({metrics['rouge1_recall']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rouge1_recall_std']:.4f}")
    print(f"  Precision:                {metrics['rouge1_precision']:.4f} ({metrics['rouge1_precision']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rouge1_precision_std']:.4f}")
    print(f"  F-measure:                {metrics['rouge1_fmeasure']:.4f} ({metrics['rouge1_fmeasure']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rouge1_fmeasure_std']:.4f}")
    print("-"*70)
    print("ROUGE-2 Scores:")
    print(f"  Recall:                   {metrics['rouge2_recall']:.4f} ({metrics['rouge2_recall']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rouge2_recall_std']:.4f}")
    print(f"  Precision:                {metrics['rouge2_precision']:.4f} ({metrics['rouge2_precision']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rouge2_precision_std']:.4f}")
    print(f"  F-measure:                {metrics['rouge2_fmeasure']:.4f} ({metrics['rouge2_fmeasure']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rouge2_fmeasure_std']:.4f}")
    print("-"*70)
    print("ROUGE-L Scores:")
    print(f"  Recall:                   {metrics['rougeL_recall']:.4f} ({metrics['rougeL_recall']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rougeL_recall_std']:.4f}")
    print(f"  Precision:                {metrics['rougeL_precision']:.4f} ({metrics['rougeL_precision']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rougeL_precision_std']:.4f}")
    print(f"  F-measure:                {metrics['rougeL_fmeasure']:.4f} ({metrics['rougeL_fmeasure']*100:.2f}%)")
    print(f"    ± Std Dev:              {metrics['rougeL_fmeasure_std']:.4f}")
    print("="*70)


def analyze_results(results_file: str):
    """Analyze test results and provide insights"""

    with open(results_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = data['detailed_results']

    # Analyze performance patterns
    # Sort by F1 score to find best and worst predictions
    sorted_by_f1 = sorted(results, key=lambda x: x['f1_score'], reverse=True)
    high_quality = sorted_by_f1[:len(sorted_by_f1)//3]  # Top 1/3
    medium_quality = sorted_by_f1[len(sorted_by_f1)//3:2*len(sorted_by_f1)//3]  # Middle 1/3
    low_quality = sorted_by_f1[2*len(sorted_by_f1)//3:]  # Bottom 1/3

    print("\n" + "="*70)
    print("DETAILED ANALYSIS")
    print("="*70)
    print(f"High Quality Predictions (Top 33%):     {len(high_quality)}")
    print(f"Medium Quality Predictions (Mid 33%):   {len(medium_quality)}")
    print(f"Low Quality Predictions (Bottom 33%):   {len(low_quality)}")
    print("-"*70)

    # Show some examples
    if high_quality:
        print("\n✓ Sample High Quality Prediction:")
        sample = high_quality[0]
        print(f"  Question: {sample['question']}")
        print(f"  Prediction: {sample['prediction']}")
        print(f"  Ground Truth: {sample['ground_truth']}")
        print(f"  Scores - BLEU: {sample['bleu_score']:.3f}, F1: {sample['f1_score']:.3f}, ROUGE-L F1: {sample['rougeL_fmeasure']:.3f}")

    if medium_quality:
        print("\n≈ Sample Medium Quality Prediction:")
        sample = medium_quality[len(medium_quality)//2]
        print(f"  Question: {sample['question']}")
        print(f"  Prediction: {sample['prediction']}")
        print(f"  Ground Truth: {sample['ground_truth']}")
        print(f"  Scores - BLEU: {sample['bleu_score']:.3f}, F1: {sample['f1_score']:.3f}, ROUGE-L F1: {sample['rougeL_fmeasure']:.3f}")

    if low_quality:
        print("\n✗ Sample Low Quality Prediction:")
        sample = low_quality[0]
        print(f"  Question: {sample['question']}")
        print(f"  Prediction: {sample['prediction']}")
        print(f"  Ground Truth: {sample['ground_truth']}")
        print(f"  Scores - BLEU: {sample['bleu_score']:.3f}, F1: {sample['f1_score']:.3f}, ROUGE-L F1: {sample['rougeL_fmeasure']:.3f}")

    print("="*70)


def main():
    """Main testing pipeline"""

    # Configuration
    CONFIG = {
        'model_name': "meta-llama/Llama-3.2-1B-Instruct",
        'test_file': "webqsp_test_validated.json",
        'output_file': "llama_1B_test_exhops_results.json",
        'use_4bit': False,
        'num_samples': None,  # None = all samples
    }

    print("="*70)
    print("QWEN BASE MODEL TESTING PIPELINE")
    print(f"Model: {CONFIG['model_name']}")
    print("="*70)

    # Check CUDA
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("WARNING: CUDA not available, testing will be slow on CPU")

    # Load model and tokenizer
    model, tokenizer = load_model_and_tokenizer(
        CONFIG['model_name'],
        use_4bit=CONFIG['use_4bit']
    )

    # Load test dataset
    print(f"\nLoading test dataset from {CONFIG['test_file']}")
    test_dataset = WebQSPTestDataset(CONFIG['test_file'])

    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Run testing
    metrics = test_model(
        model,
        tokenizer,
        test_dataset,
        output_file=CONFIG['output_file'],
        num_samples=CONFIG['num_samples']
    )

    # Print summary
    print_test_summary(metrics)

    # Analyze results
    analyze_results(CONFIG['output_file'])

    print(f"\n✓ Testing completed successfully!")
    print(f"  Results saved to: {CONFIG['output_file']}")
    print("\n" + "="*70)


if __name__ == "__main__":
    main()


QWEN BASE MODEL TESTING PIPELINE
Model: meta-llama/Llama-3.2-1B-Instruct
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.17 GB
Loading tokenizer from meta-llama/Llama-3.2-1B-Instruct
Loading model in FP16 from meta-llama/Llama-3.2-1B-Instruct

Loading test dataset from webqsp_test_validated.json
Loaded 854 test examples

Testing on 854 samples...


Testing:  12%|█▏        | 100/854 [01:27<14:53,  1.19s/it]


--- Sample 100 ---
Question: what did fred durst do?
Prediction: According to the provided context, Fred Durst is a famous American musician and singer who has been involved in various roles, including being a member of the band Limp Bizkit and starring in a TV role. He was born in Jacksonville, Florida, and played the guitar. He has also been involved in other roles, such as being a professional musician, artist, and actor.
Ground Truth: ['Actor', 'Musician', 'Rapper', 'Guitarist', 'Film director', 'Businessperson', 'Songwriter', 'Singer']
BLEU: 0.003, F1: 0.043, ROUGE-L F1: 0.031


Testing:  23%|██▎       | 200/854 [03:10<13:00,  1.19s/it]


--- Sample 200 ---
Question: what currency is used in panama?
Prediction: The currency used in Panama is the United States Dollar (USD).
Ground Truth: ['United States Dollar', 'Panamanian balboa']
BLEU: 0.093, F1: 0.462, ROUGE-L F1: 0.429


Testing:  35%|███▌      | 300/854 [04:52<04:17,  2.15it/s]


--- Sample 300 ---
Question: what team is reggie bush on 2011?
Prediction: Based on the provided reasoning graph, Reggie Bush is a football player who was drafted by the New Orleans Saints in the 2005 NFL Draft.
Ground Truth: ['Miami Dolphins', 'New Orleans Saints']
BLEU: 0.037, F1: 0.231, ROUGE-L F1: 0.214


Testing:  47%|████▋     | 400/854 [06:23<07:50,  1.04s/it]


--- Sample 400 ---
Question: what county is san diego ca?
Prediction: The county is San Diego County.
Ground Truth: ['San Diego County']
BLEU: 0.096, F1: 0.667, ROUGE-L F1: 0.667


Testing:  59%|█████▊    | 500/854 [08:04<05:40,  1.04it/s]


--- Sample 500 ---
Question: where did the greek alphabet come from?
Prediction: The Greek alphabet is believed to have originated from the Phoenician alphabet, which was used by the Phoenicians to write various languages, including Greek. The Phoenician alphabet is thought to have been created around 1500 BCE and consisted of 22 characters. Over time, the Phoenician alphabet evolved and was adapted by the Greeks to create their own writing system. The Greek alphabet is unique in that it is the first alphabet to have 24 characters, and it is still used today
Ground Truth: ['Phoenician alphabet']
BLEU: 0.005, F1: 0.075, ROUGE-L F1: 0.048


Testing:  70%|███████   | 601/854 [09:40<02:22,  1.77it/s]


--- Sample 600 ---
Question: what kind of government is south korea?
Prediction: Based on the provided information, I would say that South Korea is a unitary state with a presidential system.
Ground Truth: ['Constitutional republic', 'Presidential system', 'Unitary state']
BLEU: 0.024, F1: 0.200, ROUGE-L F1: 0.190


Testing:  82%|████████▏ | 700/854 [11:09<01:52,  1.36it/s]


--- Sample 700 ---
Question: who did ben stiller date?
Prediction: Ben Stiller dated Amanda Peet.
Ground Truth: ['Calista Flockhart', 'Amanda Peet', 'Jeanne Tripplehorn', 'Janeane Garofalo', 'Christine Taylor', 'Claire Forlani', 'Rhea Durham']
BLEU: 0.054, F1: 0.286, ROUGE-L F1: 0.571


Testing:  94%|█████████▎| 800/854 [12:48<00:33,  1.62it/s]


--- Sample 800 ---
Question: who was william mckinley married to?
Prediction: Based on the information provided, Ida McKinley was William McKinley's wife.
Ground Truth: ['Ida Saxton McKinley']
BLEU: 0.022, F1: 0.286, ROUGE-L F1: 0.267


Testing: 100%|██████████| 854/854 [13:37<00:00,  1.04it/s]


Results saved to llama_1B_test_exhops_results.json

BASE MODEL TEST RESULTS SUMMARY
Total Samples:              854
Test Time:                  13.63 minutes
Avg Time per Sample:        0.96 seconds
----------------------------------------------------------------------
BLEU Score:                 0.0191 (1.91%)
  ± Std Dev:                0.0389
----------------------------------------------------------------------
F1 Score:                   0.1172 (11.72%)
  ± Std Dev:                0.1365
----------------------------------------------------------------------
ROUGE-1 Scores:
  Recall:                   0.7823 (78.23%)
    ± Std Dev:              0.3885
  Precision:                0.1034 (10.34%)
    ± Std Dev:              0.1060
  F-measure:                0.1711 (17.11%)
    ± Std Dev:              0.1584
----------------------------------------------------------------------
ROUGE-2 Scores:
  Recall:                   0.4490 (44.90%)
    ± Std Dev:              0.4912
  Precision




In [None]:
_