In [1]:
"""
Initial Evaluation - Prompting Strategy Comparison
Testing naive RAG with different prompting approaches
"""
import sys
sys.path.insert(0, '../src')

from naive_rag import NaiveRAG
from utils import calculate_f1_score, exact_match
import pandas as pd
import json
import time
from tqdm import tqdm

# Build the RAG pipeline once; the cells below reuse `rag` and `qa_pairs`.
print("Loading RAG system...")
rag = NaiveRAG()
documents, qa_pairs = rag.load_dataset()

# Embed the corpus in batches of 32, then index the embeddings.
# In the recorded run this was the slow part of the cell (about two minutes
# between the "Creating embeddings" and "Created embeddings" log lines on CPU).
rag.create_embeddings(batch_size=32)
vector_db = rag.build_vector_db()

# NOTE(review): an optional memory-saving step (`del rag.embeddings` followed by
# `gc.collect()`) used to sit here, commented out. It is intentionally left
# disabled -- confirm nothing downstream still reads `rag.embeddings` before
# re-enabling it.
rag.load_generator()

# Plain string literal: there is nothing to interpolate in this message.
print("\nSystem ready")
print(f"Total test questions: {len(qa_pairs)}")

2025-10-02 02:12:58,427 - naive_rag - INFO - Initializing RAG with embedding model: all-MiniLM-L6-v2
2025-10-02 02:12:58,441 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2025-10-02 02:12:58,441 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Utils module loaded successfully!
Loading RAG system...


2025-10-02 02:13:00,714 - naive_rag - INFO - RAG system initialized successfully
2025-10-02 02:13:00,714 - naive_rag - INFO - Loading RAG Mini Wikipedia dataset...
2025-10-02 02:13:00,714 - naive_rag - INFO - Loading text corpus...
2025-10-02 02:13:04,803 - naive_rag - INFO - Loading Q&A pairs...
2025-10-02 02:13:06,823 - naive_rag - INFO - Loaded 3200 documents and 918 Q&A pairs
2025-10-02 02:13:06,823 - naive_rag - INFO - Creating embeddings for 3200 documents...


Batches:   0%|          | 0/100 [00:00<?, ?it/s]

2025-10-02 02:14:56,992 - naive_rag - INFO - Created embeddings with shape: (3200, 384)
2025-10-02 02:14:56,992 - naive_rag - INFO - Building FAISS vector database...
2025-10-02 02:14:56,999 - naive_rag - INFO - FAISS index built with 3200 vectors
2025-10-02 02:14:56,999 - naive_rag - INFO - Loading generation model: google/flan-t5-base
2025-10-02 02:14:57,835 - naive_rag - INFO - Generation model loaded successfully



System ready
Total test questions: 918


In [3]:
# Materialise the first `test_size` Q&A pairs as a plain Python list.
# Items are fetched one index at a time; min() keeps the range valid when the
# dataset holds fewer than `test_size` pairs.
test_size = 100
test_questions = [qa_pairs[i] for i in range(min(test_size, len(qa_pairs)))]

# Report the number of questions actually selected, which can be smaller than
# the requested `test_size`.
print(f"Evaluating on {len(test_questions)} questions")
print("\nSample Questions:")

# Preview at most five questions; the cap avoids an IndexError on a short subset.
for i in range(min(5, len(test_questions))):
    print(f"{i+1} Q: {test_questions[i]['question']}")
    print(f"{i+1} A: {test_questions[i]['answer']}")

Evaluating on 100 questions

Sample Questions:
1 Q: Was Abraham Lincoln the sixteenth President of the United States?
1 A: yes
2 Q: Did Lincoln sign the National Banking Act of 1863?
2 A: yes
3 Q: Did his mother die of pneumonia?
3 A: no
4 Q: How many long was Lincoln's formal education?
4 A: 18 months
5 Q: When did Lincoln begin his political career?
5 A: 1832


In [5]:
# Compare the four prompting strategies on the same question subset.
#
# For every strategy, each question is sent through `rag.query` (top_k=1) and the
# predicted answer is scored against the reference answer with F1 and exact match.
# Per-question records plus the aggregate metrics are appended to `results`.
strategies = ["basic", "cot", "persona", "instruction"]
results = []

print("Starting evaluation across all strategies...")
print("This takes ~15-20 minutes\n")

for strategy in strategies:
    print("\n")
    print(f"Testing strategy: {strategy.upper()}")
    print("\n")

    strategy_results = []
    n_errors = 0
    # perf_counter is monotonic, so the elapsed time is immune to wall-clock
    # adjustments during the run.
    start_time = time.perf_counter()

    for i, qa in enumerate(tqdm(test_questions, desc=strategy)):
        question = qa['question']
        expected_answer = qa['answer']

        try:
            # Get RAG answer
            result = rag.query(question, top_k=1, prompt_strategy=strategy)
            predicted_answer = result['answer']

            # Calculate metrics
            f1 = calculate_f1_score(predicted_answer, expected_answer)
            em = exact_match(predicted_answer, expected_answer)

            strategy_results.append({
                'question': question,
                'expected': expected_answer,
                'predicted': predicted_answer,
                'f1_score': f1,
                'exact_match': em,
                'retrieval_score': result['retrieval_score']
            })

        except Exception as e:
            # Best-effort: one failing query must not abort the whole run. The
            # failure is recorded as a zero-score row, so it lowers the averages;
            # the count is reported below so this is visible in the summary.
            n_errors += 1
            print(f"\nError on question {i}: {e}")
            strategy_results.append({
                'question': question,
                'expected': expected_answer,
                'predicted': "ERROR",
                'f1_score': 0.0,
                'exact_match': False,
                'retrieval_score': 0.0
            })

    # Calculate aggregate metrics (guarded so an empty question list yields 0.0
    # instead of a ZeroDivisionError).
    n_scored = len(strategy_results)
    avg_f1 = sum(r['f1_score'] for r in strategy_results) / n_scored if n_scored else 0.0
    avg_em = sum(r['exact_match'] for r in strategy_results) / n_scored if n_scored else 0.0
    elapsed = time.perf_counter() - start_time

    print(f"\n{strategy.upper()} Results:")
    print(f"  Average F1 Score: {avg_f1:.3f}")
    # The percent format rounds; int(avg_em*100) truncated binary floats, so a
    # rate of 0.29 (0.29*100 == 28.999999999999996) was displayed as 28%.
    print(f"  Exact Match Rate: {avg_em:.3f} ({avg_em:.0%})")
    print(f"  Time: {elapsed/60:.1f} minutes")
    if n_errors:
        print(f"  Failed queries (scored as 0): {n_errors}/{n_scored}")

    results.append({
        'strategy': strategy,
        'avg_f1': avg_f1,
        'avg_em': avg_em,
        'time_minutes': elapsed/60,
        'detailed_results': strategy_results
    })

print("\n")
print("EVALUATION COMPLETE")
print("\n")

Starting evaluation across all strategies...
This tales ~15-20 minutes



Testing strategy: BASIC




basic:   0%|                                                                                   | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   1%|▊                                                                          | 1/100 [00:01<02:30,  1.52s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   2%|█▌                                                                         | 2/100 [00:02<02:22,  1.45s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   3%|██▎                                                                        | 3/100 [00:03<02:03,  1.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   4%|███                                                                        | 4/100 [00:05<02:07,  1.33s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   5%|███▊                                                                       | 5/100 [00:06<01:52,  1.18s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   6%|████▌                                                                      | 6/100 [00:07<02:03,  1.32s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   7%|█████▎                                                                     | 7/100 [00:08<01:48,  1.17s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   8%|██████                                                                     | 8/100 [00:10<02:02,  1.33s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:   9%|██████▊                                                                    | 9/100 [00:11<02:01,  1.34s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  10%|███████▍                                                                  | 10/100 [00:12<01:37,  1.08s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  11%|████████▏                                                                 | 11/100 [00:13<01:26,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  12%|████████▉                                                                 | 12/100 [00:13<01:24,  1.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  13%|█████████▌                                                                | 13/100 [00:14<01:19,  1.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  14%|██████████▎                                                               | 14/100 [00:15<01:08,  1.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  15%|███████████                                                               | 15/100 [00:16<01:17,  1.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  16%|███████████▊                                                              | 16/100 [00:18<01:41,  1.21s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  17%|████████████▌                                                             | 17/100 [00:20<01:53,  1.36s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  18%|█████████████▎                                                            | 18/100 [00:22<02:18,  1.69s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  19%|██████████████                                                            | 19/100 [00:23<01:56,  1.44s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  20%|██████████████▊                                                           | 20/100 [00:24<01:48,  1.35s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  21%|███████████████▌                                                          | 21/100 [00:26<01:57,  1.48s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  22%|████████████████▎                                                         | 22/100 [00:26<01:32,  1.19s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  23%|█████████████████                                                         | 23/100 [00:27<01:18,  1.02s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  24%|█████████████████▊                                                        | 24/100 [00:28<01:14,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  25%|██████████████████▌                                                       | 25/100 [00:29<01:12,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  26%|███████████████████▏                                                      | 26/100 [00:29<00:59,  1.25it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  27%|███████████████████▉                                                      | 27/100 [00:30<00:55,  1.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  28%|████████████████████▋                                                     | 28/100 [00:32<01:21,  1.14s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  29%|█████████████████████▍                                                    | 29/100 [00:33<01:11,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  30%|██████████████████████▏                                                   | 30/100 [00:33<01:08,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  31%|██████████████████████▉                                                   | 31/100 [00:34<01:02,  1.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  32%|███████████████████████▋                                                  | 32/100 [00:35<01:00,  1.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  33%|████████████████████████▍                                                 | 33/100 [00:36<01:03,  1.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  34%|█████████████████████████▏                                                | 34/100 [00:37<00:54,  1.21it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  35%|█████████████████████████▉                                                | 35/100 [00:38<00:56,  1.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  36%|██████████████████████████▋                                               | 36/100 [00:39<01:11,  1.12s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  37%|███████████████████████████▍                                              | 37/100 [00:41<01:24,  1.35s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  38%|████████████████████████████                                              | 38/100 [00:43<01:30,  1.47s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  39%|████████████████████████████▊                                             | 39/100 [00:44<01:23,  1.37s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  40%|█████████████████████████████▌                                            | 40/100 [00:46<01:27,  1.45s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  41%|██████████████████████████████▎                                           | 41/100 [00:47<01:19,  1.36s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  42%|███████████████████████████████                                           | 42/100 [00:48<01:11,  1.24s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  43%|███████████████████████████████▊                                          | 43/100 [00:49<01:02,  1.09s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  44%|████████████████████████████████▌                                         | 44/100 [00:50<01:01,  1.10s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  45%|█████████████████████████████████▎                                        | 45/100 [00:51<01:03,  1.15s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  46%|██████████████████████████████████                                        | 46/100 [00:52<00:53,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  47%|██████████████████████████████████▊                                       | 47/100 [00:52<00:44,  1.18it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  48%|███████████████████████████████████▌                                      | 48/100 [00:53<00:41,  1.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  49%|████████████████████████████████████▎                                     | 49/100 [00:54<00:49,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  50%|█████████████████████████████████████                                     | 50/100 [00:55<00:43,  1.15it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  51%|█████████████████████████████████████▋                                    | 51/100 [00:55<00:38,  1.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  52%|██████████████████████████████████████▍                                   | 52/100 [00:58<01:00,  1.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  53%|███████████████████████████████████████▏                                  | 53/100 [00:58<00:50,  1.07s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  54%|███████████████████████████████████████▉                                  | 54/100 [01:00<00:49,  1.08s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  55%|████████████████████████████████████████▋                                 | 55/100 [01:01<00:47,  1.07s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  56%|█████████████████████████████████████████▍                                | 56/100 [01:02<00:47,  1.08s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  57%|██████████████████████████████████████████▏                               | 57/100 [01:03<00:50,  1.16s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  58%|██████████████████████████████████████████▉                               | 58/100 [01:04<00:50,  1.20s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  59%|███████████████████████████████████████████▋                              | 59/100 [01:05<00:48,  1.18s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  60%|████████████████████████████████████████████▍                             | 60/100 [01:08<01:00,  1.51s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  61%|█████████████████████████████████████████████▏                            | 61/100 [01:09<00:53,  1.38s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  62%|█████████████████████████████████████████████▉                            | 62/100 [01:11<01:00,  1.58s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  63%|██████████████████████████████████████████████▌                           | 63/100 [01:13<01:06,  1.79s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  64%|███████████████████████████████████████████████▎                          | 64/100 [01:15<01:03,  1.78s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  65%|████████████████████████████████████████████████                          | 65/100 [01:16<00:56,  1.60s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  66%|████████████████████████████████████████████████▊                         | 66/100 [01:17<00:52,  1.53s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  67%|█████████████████████████████████████████████████▌                        | 67/100 [01:19<00:50,  1.52s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  68%|██████████████████████████████████████████████████▎                       | 68/100 [01:19<00:39,  1.22s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  69%|███████████████████████████████████████████████████                       | 69/100 [01:21<00:38,  1.26s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  70%|███████████████████████████████████████████████████▊                      | 70/100 [01:22<00:33,  1.11s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  71%|████████████████████████████████████████████████████▌                     | 71/100 [01:23<00:32,  1.11s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  72%|█████████████████████████████████████████████████████▎                    | 72/100 [01:24<00:29,  1.07s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  73%|██████████████████████████████████████████████████████                    | 73/100 [01:24<00:25,  1.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  74%|██████████████████████████████████████████████████████▊                   | 74/100 [01:25<00:24,  1.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  75%|███████████████████████████████████████████████████████▌                  | 75/100 [01:26<00:25,  1.00s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  76%|████████████████████████████████████████████████████████▏                 | 76/100 [01:27<00:23,  1.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  77%|████████████████████████████████████████████████████████▉                 | 77/100 [01:28<00:20,  1.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  78%|█████████████████████████████████████████████████████████▋                | 78/100 [01:29<00:17,  1.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  79%|██████████████████████████████████████████████████████████▍               | 79/100 [01:29<00:15,  1.37it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  80%|███████████████████████████████████████████████████████████▏              | 80/100 [01:30<00:17,  1.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  81%|███████████████████████████████████████████████████████████▉              | 81/100 [01:32<00:19,  1.05s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  82%|████████████████████████████████████████████████████████████▋             | 82/100 [01:33<00:19,  1.06s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  83%|█████████████████████████████████████████████████████████████▍            | 83/100 [01:34<00:16,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  84%|██████████████████████████████████████████████████████████████▏           | 84/100 [01:35<00:19,  1.21s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  85%|██████████████████████████████████████████████████████████████▉           | 85/100 [01:36<00:14,  1.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  86%|███████████████████████████████████████████████████████████████▋          | 86/100 [01:38<00:17,  1.22s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  87%|████████████████████████████████████████████████████████████████▍         | 87/100 [01:38<00:12,  1.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  88%|█████████████████████████████████████████████████████████████████         | 88/100 [01:40<00:14,  1.18s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  89%|█████████████████████████████████████████████████████████████████▊        | 89/100 [01:40<00:10,  1.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  90%|██████████████████████████████████████████████████████████████████▌       | 90/100 [01:41<00:08,  1.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  91%|███████████████████████████████████████████████████████████████████▎      | 91/100 [01:43<00:11,  1.25s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  92%|████████████████████████████████████████████████████████████████████      | 92/100 [01:45<00:11,  1.48s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  93%|████████████████████████████████████████████████████████████████████▊     | 93/100 [01:46<00:09,  1.32s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  94%|█████████████████████████████████████████████████████████████████████▌    | 94/100 [01:47<00:06,  1.09s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  95%|██████████████████████████████████████████████████████████████████████▎   | 95/100 [01:47<00:04,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  96%|███████████████████████████████████████████████████████████████████████   | 96/100 [01:48<00:03,  1.21it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  97%|███████████████████████████████████████████████████████████████████████▊  | 97/100 [01:48<00:02,  1.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  98%|████████████████████████████████████████████████████████████████████████▌ | 98/100 [01:49<00:01,  1.57it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic:  99%|█████████████████████████████████████████████████████████████████████████▎| 99/100 [01:49<00:00,  1.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

basic: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:50<00:00,  1.11s/it]



BASIC Results:
  Average F1 Score: 0.444
  Exact Match Rate: 0.390 (39%)
  Time: 1.8 minutes


Testing strategy: COT




cot:   0%|                                                                                     | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   1%|▊                                                                            | 1/100 [00:02<04:37,  2.80s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   2%|█▌                                                                           | 2/100 [00:09<08:36,  5.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   3%|██▎                                                                          | 3/100 [00:16<09:18,  5.76s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   4%|███                                                                          | 4/100 [00:19<07:56,  4.97s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   5%|███▊                                                                         | 5/100 [00:23<07:21,  4.65s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   6%|████▌                                                                        | 6/100 [00:27<06:41,  4.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   7%|█████▍                                                                       | 7/100 [00:32<06:58,  4.50s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   8%|██████▏                                                                      | 8/100 [00:39<08:11,  5.34s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:   9%|██████▉                                                                      | 9/100 [00:47<09:27,  6.24s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  10%|███████▌                                                                    | 10/100 [00:50<07:43,  5.15s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  11%|████████▎                                                                   | 11/100 [00:53<06:47,  4.57s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  12%|█████████                                                                   | 12/100 [00:59<07:02,  4.80s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  13%|█████████▉                                                                  | 13/100 [01:02<06:30,  4.49s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  14%|██████████▋                                                                 | 14/100 [01:05<05:43,  4.00s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  15%|███████████▍                                                                | 15/100 [01:09<05:33,  3.92s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  16%|████████████▏                                                               | 16/100 [01:21<08:50,  6.31s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  17%|████████████▉                                                               | 17/100 [01:25<07:46,  5.62s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  18%|█████████████▋                                                              | 18/100 [01:33<08:30,  6.23s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  19%|██████████████▍                                                             | 19/100 [01:38<08:00,  5.94s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  20%|███████████████▏                                                            | 20/100 [01:42<07:10,  5.38s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  21%|███████████████▉                                                            | 21/100 [01:48<07:29,  5.69s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  22%|████████████████▋                                                           | 22/100 [01:51<06:05,  4.68s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  23%|█████████████████▍                                                          | 23/100 [01:55<05:50,  4.56s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  24%|██████████████████▏                                                         | 24/100 [02:01<06:32,  5.16s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  25%|███████████████████                                                         | 25/100 [02:07<06:29,  5.19s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  26%|███████████████████▊                                                        | 26/100 [02:10<05:31,  4.49s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  27%|████████████████████▌                                                       | 27/100 [02:14<05:35,  4.60s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  28%|█████████████████████▎                                                      | 28/100 [02:22<06:29,  5.41s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  29%|██████████████████████                                                      | 29/100 [02:25<05:40,  4.79s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  30%|██████████████████████▊                                                     | 30/100 [02:35<07:32,  6.47s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  31%|███████████████████████▌                                                    | 31/100 [02:41<07:05,  6.17s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  32%|████████████████████████▎                                                   | 32/100 [02:49<07:47,  6.87s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  33%|█████████████████████████                                                   | 33/100 [02:56<07:43,  6.91s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  34%|█████████████████████████▊                                                  | 34/100 [03:02<07:01,  6.39s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  35%|██████████████████████████▌                                                 | 35/100 [03:09<07:17,  6.73s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  36%|███████████████████████████▎                                                | 36/100 [03:16<07:07,  6.69s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  37%|████████████████████████████                                                | 37/100 [03:19<05:52,  5.60s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  38%|████████████████████████████▉                                               | 38/100 [03:23<05:14,  5.07s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  39%|█████████████████████████████▋                                              | 39/100 [03:34<06:56,  6.84s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  40%|██████████████████████████████▍                                             | 40/100 [03:39<06:29,  6.49s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  41%|███████████████████████████████▏                                            | 41/100 [03:45<06:05,  6.20s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  42%|███████████████████████████████▉                                            | 42/100 [03:48<05:04,  5.26s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  43%|████████████████████████████████▋                                           | 43/100 [03:54<05:11,  5.47s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  44%|█████████████████████████████████▍                                          | 44/100 [04:01<05:34,  5.98s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  45%|██████████████████████████████████▏                                         | 45/100 [04:06<05:17,  5.77s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  46%|██████████████████████████████████▉                                         | 46/100 [04:10<04:32,  5.04s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  47%|███████████████████████████████████▋                                        | 47/100 [04:14<04:11,  4.75s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  48%|████████████████████████████████████▍                                       | 48/100 [04:16<03:29,  4.04s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  49%|█████████████████████████████████████▏                                      | 49/100 [04:20<03:30,  4.12s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  50%|██████████████████████████████████████                                      | 50/100 [04:23<03:05,  3.71s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  51%|██████████████████████████████████████▊                                     | 51/100 [04:25<02:40,  3.29s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  52%|███████████████████████████████████████▌                                    | 52/100 [04:37<04:35,  5.74s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  53%|████████████████████████████████████████▎                                   | 53/100 [04:40<03:53,  4.97s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  54%|█████████████████████████████████████████                                   | 54/100 [04:45<03:51,  5.04s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  55%|█████████████████████████████████████████▊                                  | 55/100 [04:52<04:11,  5.58s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  56%|██████████████████████████████████████████▌                                 | 56/100 [05:00<04:41,  6.39s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  57%|███████████████████████████████████████████▎                                | 57/100 [05:04<03:52,  5.42s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  58%|████████████████████████████████████████████                                | 58/100 [05:08<03:40,  5.26s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  59%|████████████████████████████████████████████▊                               | 59/100 [05:14<03:39,  5.36s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  60%|█████████████████████████████████████████████▌                              | 60/100 [05:19<03:24,  5.12s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  61%|██████████████████████████████████████████████▎                             | 61/100 [05:23<03:08,  4.84s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  62%|███████████████████████████████████████████████                             | 62/100 [05:29<03:18,  5.23s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  63%|███████████████████████████████████████████████▉                            | 63/100 [05:37<03:42,  6.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  64%|████████████████████████████████████████████████▋                           | 64/100 [05:42<03:32,  5.91s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  65%|█████████████████████████████████████████████████▍                          | 65/100 [05:46<03:01,  5.18s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  66%|██████████████████████████████████████████████████▏                         | 66/100 [05:49<02:34,  4.55s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  67%|██████████████████████████████████████████████████▉                         | 67/100 [05:55<02:41,  4.89s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  68%|███████████████████████████████████████████████████▋                        | 68/100 [05:58<02:23,  4.49s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  69%|████████████████████████████████████████████████████▍                       | 69/100 [06:02<02:17,  4.42s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  70%|█████████████████████████████████████████████████████▏                      | 70/100 [06:07<02:10,  4.36s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  71%|█████████████████████████████████████████████████████▉                      | 71/100 [06:12<02:15,  4.67s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  72%|██████████████████████████████████████████████████████▋                     | 72/100 [06:16<02:02,  4.37s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  73%|███████████████████████████████████████████████████████▍                    | 73/100 [06:22<02:14,  4.99s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  74%|████████████████████████████████████████████████████████▏                   | 74/100 [06:37<03:29,  8.06s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  75%|█████████████████████████████████████████████████████████                   | 75/100 [06:49<03:45,  9.00s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  76%|█████████████████████████████████████████████████████████▊                  | 76/100 [06:53<03:01,  7.58s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  77%|██████████████████████████████████████████████████████████▌                 | 77/100 [06:57<02:27,  6.43s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  78%|███████████████████████████████████████████████████████████▎                | 78/100 [07:00<02:01,  5.50s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  79%|████████████████████████████████████████████████████████████                | 79/100 [07:04<01:44,  4.96s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  80%|████████████████████████████████████████████████████████████▊               | 80/100 [07:08<01:36,  4.85s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  81%|█████████████████████████████████████████████████████████████▌              | 81/100 [07:12<01:26,  4.57s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  82%|██████████████████████████████████████████████████████████████▎             | 82/100 [07:16<01:16,  4.26s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  83%|███████████████████████████████████████████████████████████████             | 83/100 [07:18<01:04,  3.81s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  84%|███████████████████████████████████████████████████████████████▊            | 84/100 [07:21<00:52,  3.31s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  85%|████████████████████████████████████████████████████████████████▌           | 85/100 [07:22<00:42,  2.81s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  86%|█████████████████████████████████████████████████████████████████▎          | 86/100 [07:32<01:07,  4.84s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  87%|██████████████████████████████████████████████████████████████████          | 87/100 [07:36<00:59,  4.61s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  88%|██████████████████████████████████████████████████████████████████▉         | 88/100 [07:41<00:56,  4.73s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  89%|███████████████████████████████████████████████████████████████████▋        | 89/100 [07:43<00:43,  3.99s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  90%|████████████████████████████████████████████████████████████████████▍       | 90/100 [07:45<00:34,  3.43s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  91%|█████████████████████████████████████████████████████████████████████▏      | 91/100 [07:49<00:31,  3.47s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  92%|█████████████████████████████████████████████████████████████████████▉      | 92/100 [07:53<00:30,  3.83s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  93%|██████████████████████████████████████████████████████████████████████▋     | 93/100 [07:56<00:23,  3.38s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  94%|███████████████████████████████████████████████████████████████████████▍    | 94/100 [07:58<00:18,  3.02s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  95%|████████████████████████████████████████████████████████████████████████▏   | 95/100 [08:00<00:13,  2.78s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  96%|████████████████████████████████████████████████████████████████████████▉   | 96/100 [08:03<00:10,  2.74s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  97%|█████████████████████████████████████████████████████████████████████████▋  | 97/100 [08:05<00:08,  2.68s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  98%|██████████████████████████████████████████████████████████████████████████▍ | 98/100 [08:09<00:06,  3.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot:  99%|███████████████████████████████████████████████████████████████████████████▏| 99/100 [08:11<00:02,  2.71s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

cot: 100%|███████████████████████████████████████████████████████████████████████████| 100/100 [08:16<00:00,  4.96s/it]



COT Results:
  Average F1 Score: 0.064
  Exact Match Rate: 0.000 (0%)
  Time: 8.3 minutes


Testing strategy: PERSONA




persona:   0%|                                                                                 | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   1%|▋                                                                        | 1/100 [00:01<02:51,  1.73s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   2%|█▍                                                                       | 2/100 [00:03<02:31,  1.54s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   3%|██▏                                                                      | 3/100 [00:05<02:49,  1.74s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   4%|██▉                                                                      | 4/100 [00:06<02:30,  1.57s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   5%|███▋                                                                     | 5/100 [00:07<02:00,  1.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   6%|████▍                                                                    | 6/100 [00:07<01:42,  1.09s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   7%|█████                                                                    | 7/100 [00:08<01:26,  1.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   8%|█████▊                                                                   | 8/100 [00:09<01:33,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:   9%|██████▌                                                                  | 9/100 [00:10<01:27,  1.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  10%|███████▏                                                                | 10/100 [00:11<01:34,  1.05s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  11%|███████▉                                                                | 11/100 [00:12<01:21,  1.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  12%|████████▋                                                               | 12/100 [00:13<01:12,  1.21it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  13%|█████████▎                                                              | 13/100 [00:13<01:06,  1.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  14%|██████████                                                              | 14/100 [00:14<00:59,  1.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  15%|██████████▊                                                             | 15/100 [00:14<00:55,  1.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  16%|███████████▌                                                            | 16/100 [00:16<01:22,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  17%|████████████▏                                                           | 17/100 [00:17<01:23,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  18%|████████████▉                                                           | 18/100 [00:19<01:54,  1.40s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  19%|█████████████▋                                                          | 19/100 [00:21<01:53,  1.40s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  20%|██████████████▍                                                         | 20/100 [00:22<01:36,  1.21s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  21%|███████████████                                                         | 21/100 [00:23<01:51,  1.41s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  22%|███████████████▊                                                        | 22/100 [00:24<01:26,  1.10s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  23%|████████████████▌                                                       | 23/100 [00:24<01:13,  1.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  24%|█████████████████▎                                                      | 24/100 [00:25<01:05,  1.17it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  25%|██████████████████                                                      | 25/100 [00:26<01:13,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  26%|██████████████████▋                                                     | 26/100 [00:27<01:14,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  27%|███████████████████▍                                                    | 27/100 [00:28<01:11,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  28%|████████████████████▏                                                   | 28/100 [00:30<01:18,  1.09s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  29%|████████████████████▉                                                   | 29/100 [00:30<01:03,  1.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  30%|█████████████████████▌                                                  | 30/100 [00:31<00:55,  1.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  31%|██████████████████████▎                                                 | 31/100 [00:31<00:49,  1.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  32%|███████████████████████                                                 | 32/100 [00:32<00:42,  1.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  33%|███████████████████████▊                                                | 33/100 [00:33<00:49,  1.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  34%|████████████████████████▍                                               | 34/100 [00:34<00:52,  1.25it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  35%|█████████████████████████▏                                              | 35/100 [00:35<00:55,  1.18it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  36%|█████████████████████████▉                                              | 36/100 [00:37<01:20,  1.26s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  37%|██████████████████████████▋                                             | 37/100 [00:38<01:22,  1.31s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  38%|███████████████████████████▎                                            | 38/100 [00:39<01:08,  1.11s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  39%|████████████████████████████                                            | 39/100 [00:40<01:00,  1.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  40%|████████████████████████████▊                                           | 40/100 [00:41<01:01,  1.02s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  41%|█████████████████████████████▌                                          | 41/100 [00:41<00:54,  1.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  42%|██████████████████████████████▏                                         | 42/100 [00:42<00:47,  1.21it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  43%|██████████████████████████████▉                                         | 43/100 [00:42<00:42,  1.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  44%|███████████████████████████████▋                                        | 44/100 [00:43<00:41,  1.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  45%|████████████████████████████████▍                                       | 45/100 [00:44<00:37,  1.46it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  46%|█████████████████████████████████                                       | 46/100 [00:44<00:33,  1.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  47%|█████████████████████████████████▊                                      | 47/100 [00:45<00:29,  1.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  48%|██████████████████████████████████▌                                     | 48/100 [00:45<00:25,  2.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  49%|███████████████████████████████████▎                                    | 49/100 [00:46<00:32,  1.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  50%|████████████████████████████████████                                    | 50/100 [00:46<00:30,  1.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  51%|████████████████████████████████████▋                                   | 51/100 [00:47<00:25,  1.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  52%|█████████████████████████████████████▍                                  | 52/100 [00:48<00:36,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  53%|██████████████████████████████████████▏                                 | 53/100 [00:49<00:38,  1.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  54%|██████████████████████████████████████▉                                 | 54/100 [00:50<00:36,  1.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  55%|███████████████████████████████████████▌                                | 55/100 [00:51<00:39,  1.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  56%|████████████████████████████████████████▎                               | 56/100 [00:52<00:40,  1.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  57%|█████████████████████████████████████████                               | 57/100 [00:52<00:32,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  58%|█████████████████████████████████████████▊                              | 58/100 [00:53<00:37,  1.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  59%|██████████████████████████████████████████▍                             | 59/100 [00:54<00:33,  1.24it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  60%|███████████████████████████████████████████▏                            | 60/100 [00:56<00:42,  1.07s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  61%|███████████████████████████████████████████▉                            | 61/100 [00:56<00:37,  1.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  62%|████████████████████████████████████████████▋                           | 62/100 [00:57<00:37,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  63%|█████████████████████████████████████████████▎                          | 63/100 [01:01<00:59,  1.60s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  64%|██████████████████████████████████████████████                          | 64/100 [01:02<00:51,  1.44s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  65%|██████████████████████████████████████████████▊                         | 65/100 [01:02<00:44,  1.28s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  66%|███████████████████████████████████████████████▌                        | 66/100 [01:03<00:34,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  67%|████████████████████████████████████████████████▏                       | 67/100 [01:04<00:32,  1.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  68%|████████████████████████████████████████████████▉                       | 68/100 [01:04<00:25,  1.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  69%|█████████████████████████████████████████████████▋                      | 69/100 [01:05<00:28,  1.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  70%|██████████████████████████████████████████████████▍                     | 70/100 [01:06<00:25,  1.19it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  71%|███████████████████████████████████████████████████                     | 71/100 [01:07<00:26,  1.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  72%|███████████████████████████████████████████████████▊                    | 72/100 [01:08<00:23,  1.17it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  73%|████████████████████████████████████████████████████▌                   | 73/100 [01:09<00:24,  1.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  74%|█████████████████████████████████████████████████████▎                  | 74/100 [01:10<00:23,  1.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  75%|██████████████████████████████████████████████████████                  | 75/100 [01:11<00:22,  1.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  76%|██████████████████████████████████████████████████████▋                 | 76/100 [01:11<00:19,  1.20it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  77%|███████████████████████████████████████████████████████▍                | 77/100 [01:12<00:17,  1.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  78%|████████████████████████████████████████████████████████▏               | 78/100 [01:12<00:13,  1.62it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  79%|████████████████████████████████████████████████████████▉               | 79/100 [01:13<00:11,  1.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  80%|█████████████████████████████████████████████████████████▌              | 80/100 [01:14<00:13,  1.48it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  81%|██████████████████████████████████████████████████████████▎             | 81/100 [01:15<00:14,  1.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  82%|███████████████████████████████████████████████████████████             | 82/100 [01:15<00:14,  1.25it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  83%|███████████████████████████████████████████████████████████▊            | 83/100 [01:16<00:14,  1.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  84%|████████████████████████████████████████████████████████████▍           | 84/100 [01:18<00:16,  1.06s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  85%|█████████████████████████████████████████████████████████████▏          | 85/100 [01:18<00:12,  1.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  86%|█████████████████████████████████████████████████████████████▉          | 86/100 [01:20<00:16,  1.19s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  87%|██████████████████████████████████████████████████████████████▋         | 87/100 [01:21<00:12,  1.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  88%|███████████████████████████████████████████████████████████████▎        | 88/100 [01:22<00:11,  1.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  89%|████████████████████████████████████████████████████████████████        | 89/100 [01:22<00:08,  1.22it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  90%|████████████████████████████████████████████████████████████████▊       | 90/100 [01:23<00:07,  1.33it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  91%|█████████████████████████████████████████████████████████████████▌      | 91/100 [01:24<00:08,  1.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  92%|██████████████████████████████████████████████████████████████████▏     | 92/100 [01:25<00:08,  1.03s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  93%|██████████████████████████████████████████████████████████████████▉     | 93/100 [01:26<00:06,  1.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  94%|███████████████████████████████████████████████████████████████████▋    | 94/100 [01:27<00:04,  1.27it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  95%|████████████████████████████████████████████████████████████████████▍   | 95/100 [01:27<00:03,  1.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  96%|█████████████████████████████████████████████████████████████████████   | 96/100 [01:28<00:03,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  97%|█████████████████████████████████████████████████████████████████████▊  | 97/100 [01:28<00:01,  1.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  98%|██████████████████████████████████████████████████████████████████████▌ | 98/100 [01:29<00:01,  1.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona:  99%|███████████████████████████████████████████████████████████████████████▎| 99/100 [01:29<00:00,  1.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

persona: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [01:30<00:00,  1.10it/s]



PERSONA Results:
  Average F1 Score: 0.403
  Exact Match Rate: 0.350 (35%)
  Time: 1.5 minutes


Testing strategy: INSTRUCTION




instruction:   0%|                                                                             | 0/100 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   1%|▋                                                                    | 1/100 [00:00<00:56,  1.75it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   2%|█▍                                                                   | 2/100 [00:01<01:39,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   3%|██                                                                   | 3/100 [00:02<01:29,  1.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   4%|██▊                                                                  | 4/100 [00:03<01:40,  1.05s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   5%|███▍                                                                 | 5/100 [00:04<01:28,  1.07it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   6%|████▏                                                                | 6/100 [00:05<01:25,  1.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   7%|████▊                                                                | 7/100 [00:06<01:13,  1.27it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   8%|█████▌                                                               | 8/100 [00:07<01:20,  1.15it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:   9%|██████▏                                                              | 9/100 [00:08<01:26,  1.05it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  10%|██████▊                                                             | 10/100 [00:08<01:11,  1.27it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  11%|███████▍                                                            | 11/100 [00:09<01:04,  1.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  12%|████████▏                                                           | 12/100 [00:10<01:05,  1.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  13%|████████▊                                                           | 13/100 [00:10<01:06,  1.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  14%|█████████▌                                                          | 14/100 [00:11<00:59,  1.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  15%|██████████▏                                                         | 15/100 [00:11<00:53,  1.59it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  16%|██████████▉                                                         | 16/100 [00:12<01:02,  1.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  17%|███████████▌                                                        | 17/100 [00:13<01:07,  1.24it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  18%|████████████▏                                                       | 18/100 [00:15<01:24,  1.04s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  19%|████████████▉                                                       | 19/100 [00:16<01:14,  1.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  20%|█████████████▌                                                      | 20/100 [00:17<01:17,  1.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  21%|██████████████▎                                                     | 21/100 [00:18<01:19,  1.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  22%|██████████████▉                                                     | 22/100 [00:18<01:05,  1.19it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  23%|███████████████▋                                                    | 23/100 [00:19<00:57,  1.35it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  24%|████████████████▎                                                   | 24/100 [00:19<00:55,  1.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  25%|█████████████████                                                   | 25/100 [00:20<00:53,  1.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  26%|█████████████████▋                                                  | 26/100 [00:21<00:47,  1.54it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  27%|██████████████████▎                                                 | 27/100 [00:21<00:48,  1.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  28%|███████████████████                                                 | 28/100 [00:23<01:05,  1.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  29%|███████████████████▋                                                | 29/100 [00:23<00:56,  1.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  30%|████████████████████▍                                               | 30/100 [00:24<00:53,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  31%|█████████████████████                                               | 31/100 [00:25<00:49,  1.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  32%|█████████████████████▊                                              | 32/100 [00:25<00:47,  1.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  33%|██████████████████████▍                                             | 33/100 [00:26<00:48,  1.37it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  34%|███████████████████████                                             | 34/100 [00:27<00:44,  1.48it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  35%|███████████████████████▊                                            | 35/100 [00:27<00:45,  1.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  36%|████████████████████████▍                                           | 36/100 [00:28<00:48,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  37%|█████████████████████████▏                                          | 37/100 [00:29<00:46,  1.37it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  38%|█████████████████████████▊                                          | 38/100 [00:30<00:44,  1.40it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  39%|██████████████████████████▌                                         | 39/100 [00:31<00:49,  1.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  40%|███████████████████████████▏                                        | 40/100 [00:31<00:44,  1.36it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  41%|███████████████████████████▉                                        | 41/100 [00:32<00:47,  1.24it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  42%|████████████████████████████▌                                       | 42/100 [00:33<00:39,  1.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  43%|█████████████████████████████▏                                      | 43/100 [00:33<00:38,  1.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  44%|█████████████████████████████▉                                      | 44/100 [00:34<00:40,  1.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  45%|██████████████████████████████▌                                     | 45/100 [00:35<00:40,  1.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  46%|███████████████████████████████▎                                    | 46/100 [00:35<00:37,  1.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  47%|███████████████████████████████▉                                    | 47/100 [00:36<00:34,  1.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  48%|████████████████████████████████▋                                   | 48/100 [00:36<00:30,  1.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  49%|█████████████████████████████████▎                                  | 49/100 [00:37<00:35,  1.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  50%|██████████████████████████████████                                  | 50/100 [00:38<00:32,  1.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  51%|██████████████████████████████████▋                                 | 51/100 [00:38<00:31,  1.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  52%|███████████████████████████████████▎                                | 52/100 [00:39<00:34,  1.39it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  53%|████████████████████████████████████                                | 53/100 [00:40<00:33,  1.42it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  54%|████████████████████████████████████▋                               | 54/100 [00:41<00:33,  1.39it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  55%|█████████████████████████████████████▍                              | 55/100 [00:41<00:31,  1.42it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  56%|██████████████████████████████████████                              | 56/100 [00:42<00:29,  1.47it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  57%|██████████████████████████████████████▊                             | 57/100 [00:43<00:27,  1.54it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  58%|███████████████████████████████████████▍                            | 58/100 [00:44<00:30,  1.39it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  59%|████████████████████████████████████████                            | 59/100 [00:44<00:32,  1.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  60%|████████████████████████████████████████▊                           | 60/100 [00:46<00:44,  1.11s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  61%|█████████████████████████████████████████▍                          | 61/100 [00:47<00:38,  1.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  62%|██████████████████████████████████████████▏                         | 62/100 [00:48<00:37,  1.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  63%|██████████████████████████████████████████▊                         | 63/100 [00:51<00:59,  1.61s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  64%|███████████████████████████████████████████▌                        | 64/100 [00:53<01:00,  1.67s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  65%|████████████████████████████████████████████▏                       | 65/100 [00:53<00:46,  1.32s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  66%|████████████████████████████████████████████▉                       | 66/100 [00:54<00:35,  1.05s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  67%|█████████████████████████████████████████████▌                      | 67/100 [00:55<00:35,  1.06s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  68%|██████████████████████████████████████████████▏                     | 68/100 [00:55<00:28,  1.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  69%|██████████████████████████████████████████████▉                     | 69/100 [00:56<00:29,  1.07it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  70%|███████████████████████████████████████████████▌                    | 70/100 [00:57<00:24,  1.20it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  71%|████████████████████████████████████████████████▎                   | 71/100 [00:58<00:24,  1.21it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  72%|████████████████████████████████████████████████▉                   | 72/100 [00:59<00:21,  1.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  73%|█████████████████████████████████████████████████▋                  | 73/100 [00:59<00:20,  1.30it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  74%|██████████████████████████████████████████████████▎                 | 74/100 [01:00<00:19,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  75%|███████████████████████████████████████████████████                 | 75/100 [01:01<00:21,  1.15it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  76%|███████████████████████████████████████████████████▋                | 76/100 [01:02<00:21,  1.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  77%|████████████████████████████████████████████████████▎               | 77/100 [01:03<00:18,  1.25it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  78%|█████████████████████████████████████████████████████               | 78/100 [01:03<00:15,  1.39it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  79%|█████████████████████████████████████████████████████▋              | 79/100 [01:04<00:13,  1.57it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  80%|██████████████████████████████████████████████████████▍             | 80/100 [01:04<00:12,  1.54it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  81%|███████████████████████████████████████████████████████             | 81/100 [01:05<00:11,  1.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  82%|███████████████████████████████████████████████████████▊            | 82/100 [01:06<00:11,  1.54it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  83%|████████████████████████████████████████████████████████▍           | 83/100 [01:07<00:12,  1.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  84%|█████████████████████████████████████████████████████████           | 84/100 [01:07<00:12,  1.30it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  85%|█████████████████████████████████████████████████████████▊          | 85/100 [01:08<00:10,  1.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  86%|██████████████████████████████████████████████████████████▍         | 86/100 [01:09<00:10,  1.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  87%|███████████████████████████████████████████████████████████▏        | 87/100 [01:09<00:08,  1.51it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  88%|███████████████████████████████████████████████████████████▊        | 88/100 [01:10<00:09,  1.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  89%|████████████████████████████████████████████████████████████▌       | 89/100 [01:11<00:07,  1.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  90%|█████████████████████████████████████████████████████████████▏      | 90/100 [01:11<00:06,  1.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  91%|█████████████████████████████████████████████████████████████▉      | 91/100 [01:13<00:08,  1.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  92%|██████████████████████████████████████████████████████████████▌     | 92/100 [01:14<00:08,  1.08s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  93%|███████████████████████████████████████████████████████████████▏    | 93/100 [01:15<00:06,  1.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  94%|███████████████████████████████████████████████████████████████▉    | 94/100 [01:15<00:04,  1.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  95%|████████████████████████████████████████████████████████████████▌   | 95/100 [01:16<00:03,  1.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  96%|█████████████████████████████████████████████████████████████████▎  | 96/100 [01:16<00:02,  1.67it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  97%|█████████████████████████████████████████████████████████████████▉  | 97/100 [01:17<00:01,  1.73it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  98%|██████████████████████████████████████████████████████████████████▋ | 98/100 [01:17<00:01,  1.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction:  99%|███████████████████████████████████████████████████████████████████▎| 99/100 [01:18<00:00,  1.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

instruction: 100%|███████████████████████████████████████████████████████████████████| 100/100 [01:18<00:00,  1.27it/s]


INSTRUCTION Results:
  Average F1 Score: 0.267
  Exact Match Rate: 0.200 (20%)
  Time: 1.3 minutes


EVALUATION COMPLETE







In [13]:
# Inspect the weakest chain-of-thought answers.
# Reads the module-level `results` list built by the evaluation cell; the first
# entry whose 'strategy' is 'cot' supplies the per-question records.
# NOTE(review): this cell carries a later execution count than the cells that
# follow it in the notebook - confirm it still works on Restart & Run All.
cot_runs = [run for run in results if run['strategy'] == 'cot']
cot_results = cot_runs[0]['detailed_results']

# Ascending sort on F1 puts the poorest answers first; keep the first 20.
worst_cot = sorted(cot_results, key=lambda rec: rec['f1_score'])[:20]

print("CoT Strategy - Worst 20 Examples:\n")
for rank, rec in enumerate(worst_cot, start=1):
    print(f"{rank}. Q: {rec['question']}")
    print(f"   Expected: {rec['expected']}")
    print(f"   Got: {rec['predicted']}")
    print(f"   F1: {rec['f1_score']:.3f}")
    print(f"   Retrieval: {rec['retrieval_score']:.3f}\n")

CoT Strategy - Worst 20 Examples:

1. Q: Was Abraham Lincoln the sixteenth President of the United States?
   Expected: yes
   Got: Abraham Lincoln was the 16th President of the United States. Abraham Lincoln was born in 1865. The answer: no.
   F1: 0.000
   Retrieval: 0.710

2. Q: Did Lincoln sign the National Banking Act of 1863?
   Expected: yes
   Got: The relevant sentence in the passage is: Also included was the creation of the system of national banks by the National Banking Acts of 1863, 1864, and 1865, which allowed the creation of a strong national financial system. The answer: yes.
   F1: 0.000
   Retrieval: 0.641

3. Q: Did his mother die of pneumonia?
   Expected: no
   Got: The cause of his continual poor health was never precisely determined, though speculation focuses on tuberculosis, stomach cancer, or a combination of the two. Muir, 103. The answer: no.
   F1: 0.000
   Retrieval: 0.483

4. Q: When did Lincoln begin his political career?
   Expected: 1832
   Got: The r

In [7]:
# Summarise every prompting strategy in one table.
# Each row holds the strategy name plus its metrics, already formatted as
# strings (3-decimal F1, 1-decimal percentage, 1-decimal minutes).
summary_rows = []
for run in results:
    summary_rows.append({
        'Strategy': run['strategy'],
        'Avg F1 Score': f"{run['avg_f1']:.3f}",
        'Exact Match %': f"{run['avg_em']*100:.1f}%",
        'Time (min)': f"{run['time_minutes']:.1f}",
    })
comparison = pd.DataFrame(summary_rows)

print("\nPROMPTING STRATEGY COMPARISON")
print(comparison.to_string(index=False))

# Pick the winners from the raw numeric values in `results`, not from the
# formatted strings in the table.
best_f1 = max(results, key=lambda run: run['avg_f1'])
best_em = max(results, key=lambda run: run['avg_em'])

print(f"\nBest F1 Score: {best_f1['strategy']} ({best_f1['avg_f1']:.3f})")
print(f"Best Exact Match: {best_em['strategy']} ({best_em['avg_em']*100:.1f}%)")


PROMPTING STRATEGY COMPARISON
   Strategy Avg F1 Score Exact Match % Time (min)
      basic        0.444         39.0%        1.8
        cot        0.064          0.0%        8.3
    persona        0.403         35.0%        1.5
instruction        0.267         20.0%        1.3

Best F1 Score: basic (0.444)
Best Exact Match: basic (39.0%)


In [9]:
# Persist the evaluation artefacts under ../results (created if missing).
import os

output_dir = '../results'
csv_path = '../results/step3_prompting_comparison.csv'
json_path = '../results/step3_detailed_results.json'

os.makedirs(output_dir, exist_ok=True)

# Formatted per-strategy summary table, written without the index column.
comparison.to_csv(csv_path, index=False)

# Full `results` structure, pretty-printed with a 2-space indent.
with open(json_path, 'w') as json_file:
    json.dump(results, json_file, indent=2)

print("Results saved to ../results/")
print("  - step3_prompting_comparison.csv")
print("  - step3_detailed_results.json")

Results saved to ../results/
  - step3_prompting_comparison.csv
  - step3_detailed_results.json


In [11]:
# Failure analysis for the strategy with the highest average F1 (`best_f1`).
best_strategy_results = best_f1['detailed_results']

# Order a copy of the records from lowest to highest F1 so the original
# list is left untouched; list.sort is stable, so ties keep their order.
sorted_by_f1 = list(best_strategy_results)
sorted_by_f1.sort(key=lambda rec: rec['f1_score'])

print("Worst performing questions (lowest F1):")
for rank, rec in enumerate(sorted_by_f1[:10], start=1):
    print(f"\n{rank}. Q: {rec['question']}")
    print(f"   Expected: {rec['expected']}")
    print(f"   Got: {rec['predicted']}")
    print(f"   F1: {rec['f1_score']:.3f}")

Worst performing questions (lowest F1):

1. Q: Was Abraham Lincoln the sixteenth President of the United States?
   Expected: yes
   Got: no
   F1: 0.000

2. Q: Did his mother die of pneumonia?
   Expected: no
   Got: unanswerable
   F1: 0.000

3. Q: Did Lincoln ever represent Alton & Sangamon Railroad?
   Expected: Yes
   Got: no
   F1: 0.000

4. Q: Which county was Lincoln born in?
   Expected: Hardin County
   Got: Illinois
   F1: 0.000

5. Q: When did Lincoln first serve as President?
   Expected: March 4, 1861
   Got: 1865
   F1: 0.000

6. Q: Who assassinated Lincoln?
   Expected: John Wilkes Booth
   Got: charles dickens
   F1: 0.000

7. Q: Was Lincoln chosen as a presidential candidate in 1860?
   Expected: Yes
   Got: Lincoln was eventually chosen
   F1: 0.000

8. Q: How old was Lincoln in 1816?
   Expected: seven
   Got: 18
   F1: 0.000

9. Q: How long was Lincoln's legal Career?
   Expected: 23 years
   Got: 1837
   F1: 0.000

10. Q: What trail did Lincoln use a Farmers' Alma