In [1]:
import huggingface_hub
import os
from datasets import load_dataset
import ollama

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
huggingface_hub.login(os.environ['HF_API_TOKEN'])
def load_eval_dataset(dataset_name="squad_v2"):
    # Load the SQuAD v2 dataset by default
    file_path = "Meta-Llama-3.1-8B-evals/Details_squad_2024-07-22T14-58-08.291117.parquet.gzip"
    dataset = load_dataset(dataset_name, data_files=file_path, split="train")
    return dataset


def generate_llm(prompt: str, model: str = "llama3.1", temperature=0.01) -> str:
    return ollama.generate(
        model=model, prompt=prompt, options=ollama.Options(temperature=temperature, num_predict=32)
    )["response"]

dataset = load_eval_dataset(dataset_name="meta-llama/Meta-Llama-3.1-8B-evals")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/edwin/.cache/huggingface/token
Login successful


In [3]:
dataset

Dataset({
    features: ['task_type', 'task_name', 'subtask_name', 'input_question', 'input_choice_list', 'input_final_prompts', 'input_correct_responses', 'output_prediction_text', 'output_parsed_answer', 'output_choice_completions', 'output_choice_negative_log_likelihoods', 'output_metrics', 'is_correct', 'input_question_hash', 'input_final_prompts_hash', 'benchmark_label', 'eval_config'],
    num_rows: 11873
})

In [4]:
generate_llm("What is the capital of Texas?")


'The capital of Texas is Austin.'

In [15]:
'{hi}'.format(hi='hello')

'hello'

In [19]:
import pickle
from collections import defaultdict
import time

############################################
# Attempt 3:
############################################
predictions = defaultdict(list)
references = defaultdict(list)


eval_prompt = """ 
You are a evaluator that needs to compare if response1 matches any of the other_responses to determine if they are the same or not.

response1: {llm_response}

other_responses: {input_correct_responses}

Answer just "yes" or "no" to the question: "Do the responses match?"
"""

def evaluate_llm(temp=0.01, total_evals=1):
    evaluation_history = []

    for i, example in enumerate(dataset):
        # print(i, example)
        if i == total_evals:
            break
        
        generation = generate_llm(example["input_question"])
        print(generation)
        print(example["input_correct_responses"])

        correct = generate_llm(
            eval_prompt.format(
                llm_response=generation,
                input_correct_responses=example["input_correct_responses"],
            )
        )
        evaluation_history.append(
            {
                "id": example["input_question_hash"],
                "generation": generation,
                "input_correct_responses": example["input_correct_responses"],
                "correct": correct,
                "temperature": temp,
            }
        )
        
    return evaluation_history


In [27]:

all_evaluations ={}
for temp in [0.001, 1.0]:
    evaluation_history = evaluate_llm(temp, total_evals=100)
    all_evaluations[temp] = evaluation_history

Not in background. The background only mentions that John Paul II's visits to his native country brought support to the solidarity movement, but it does not mention that he
['john paul ii', 'john paul ii', 'john paul ii']
Near Ballarat, gold was discovered in 1851.
['1851', 'in 1851', '1851']
20 million ounces
['20 million ounces', '20 million ounces', '20 million ounces']
Not in background. The text does not mention anything about a New South Wales president or the issuance of writs for their election. It only talks about Victoria's
['not in background']
Not in background.
['not in background']
Here are the answers to each question based on the provided background paragraph:

1. What was the name of Beyoncé's first solo album?
A: Dangerously
['not in background']
One of the largest gold rushes the world has ever seen.
['gold rush', 'gold rush', 'gold rushes']
Based on the background information, here are the answers to your questions:

Q: On what date was Victoria declared independent

In [21]:
all_evaluations

{0.001: [{'id': '8bda73a1a16177a7b1a27cd99ca72b17ba96f04c86c36a256e2abea3e1b3eeb1',
   'generation': "Not in background. The background only mentions that John Paul II's visits to his native country brought support to the solidarity movement, but it does not mention that he",
   'input_correct_responses': ['john paul ii', 'john paul ii', 'john paul ii'],
   'correct': 'no',
   'temperature': 0.001}],
 1.0: [{'id': '8bda73a1a16177a7b1a27cd99ca72b17ba96f04c86c36a256e2abea3e1b3eeb1',
   'generation': "Not in background. The background only mentions that John Paul II's visits to his native country brought support to the solidarity movement, but it does not mention that he",
   'input_correct_responses': ['john paul ii', 'john paul ii', 'john paul ii'],
   'correct': 'no',
   'temperature': 1.0}]}

In [43]:
import pandas as pd
o1 = pd.DataFrame(all_evaluations[0.001])
o1['correct'] = list(map(lambda x: x.lower().strip("."), o1["correct"]))
o1["correct"].value_counts()

correct
yes    59
no     41
Name: count, dtype: int64

In [44]:
import pandas as pd

one = pd.DataFrame(all_evaluations[1.0])
one["correct"] = list(map(lambda x: x.lower().strip("."), one["correct"]))
one["correct"].value_counts()

correct
yes    58
no     42
Name: count, dtype: int64

In [32]:
o1

Unnamed: 0,id,generation,input_correct_responses,correct,temperature
0,8bda73a1a16177a7b1a27cd99ca72b17ba96f04c86c36a...,Not in background. The background only mention...,"[john paul ii, john paul ii, john paul ii]",no,0.001
1,07fac974d3c5714d8a990a8b1fdd61838939c613926530...,"Near Ballarat, gold was discovered in 1851.","[1851, in 1851, 1851]",Yes.,0.001
2,d568ae5d2c9056b505e6ee7c42ccc4d612c2b28d466372...,20 million ounces,"[20 million ounces, 20 million ounces, 20 mill...",Yes.,0.001
3,9c42a3f7f7debe911ba20717c6ec1ff5176f848eaf17d0...,Not in background. The text does not mention a...,[not in background],Yes,0.001
4,e5e3bae01a94b7d75508c24c2fde048ad295188d160d4d...,Not in background.,[not in background],yes,0.001
...,...,...,...,...,...
95,d6f86f97949e50afcd2580a38f39f0743a9773acc731f2...,Here are the answers to each question based on...,[not in background],no,0.001
96,93ae398b3aab76904fafff74151b2d1c721dc67c70ddd5...,Here are the answers to each question based on...,[not in background],no,0.001
97,04ac4ae3c2a36c7e078a64530a1f9440ee4abee2de20ae...,Here are the answers to each question based on...,[not in background],no,0.001
98,c8059e2fd92eb9e34d064c12e495e764a9635f0657ed25...,"Based on the background information, here are ...",[not in background],no,0.001


['no',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no']