In [1]:
import huggingface_hub
import os
from datasets import load_dataset
import ollama

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Authenticate against the Hugging Face Hub with the token from the environment
# (raises KeyError if HF_API_TOKEN is not set).
huggingface_hub.login(token=os.environ["HF_API_TOKEN"])
# Details file that was previously hard-coded inside load_eval_dataset.
DEFAULT_EVAL_DATA_FILE = (
    "Meta-Llama-3.1-8B-evals/Details_squad_2024-07-22T14-58-08.291117.parquet.gzip"
)


def load_eval_dataset(dataset_name="squad_v2", data_files=DEFAULT_EVAL_DATA_FILE, split="train"):
    """Load one split of an evaluation dataset via ``datasets.load_dataset``.

    Args:
        dataset_name: Dataset identifier forwarded as the first argument of
            ``load_dataset``.
        data_files: File(s) inside the dataset to load. Defaults to the
            details file this function always loaded before the parameter
            existed.
        split: Split to return; defaults to ``"train"``, the value that was
            previously fixed.

    Returns:
        Whatever ``load_dataset`` returns for the requested split.

    NOTE(review): the default ``data_files`` names a Meta-Llama-3.1-8B-evals
    file while ``dataset_name`` defaults to ``"squad_v2"``; the only call
    visible in this notebook overrides ``dataset_name``. Confirm the default
    pairing is intended.
    """
    return load_dataset(dataset_name, data_files=data_files, split=split)


def generate_llm(
    prompt: str,
    model: str = "llama3.1",
    temperature: float = 0.01,
    num_predict: int = 32,
) -> str:
    """Return the text completion for ``prompt`` from ``ollama.generate``.

    Args:
        prompt: Prompt text sent to the model.
        model: Model name passed to ``ollama.generate``.
        temperature: Sampling temperature placed in the request options.
        num_predict: Value for the ``num_predict`` request option; defaults to
            32, the value that was previously hard-coded.

    Returns:
        The ``"response"`` entry of the result returned by ``ollama.generate``.
    """
    options = ollama.Options(temperature=temperature, num_predict=num_predict)
    result = ollama.generate(model=model, prompt=prompt, options=options)
    return result["response"]

# Load the evaluation split used by every cell below.
dataset = load_eval_dataset("meta-llama/Meta-Llama-3.1-8B-evals")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/edwin/.cache/huggingface/token
Login successful


In [3]:
# Display the loaded dataset's repr (feature names and row count).
dataset

Dataset({
    features: ['task_type', 'task_name', 'subtask_name', 'input_question', 'input_choice_list', 'input_final_prompts', 'input_correct_responses', 'output_prediction_text', 'output_parsed_answer', 'output_choice_completions', 'output_choice_negative_log_likelihoods', 'output_metrics', 'is_correct', 'input_question_hash', 'input_final_prompts_hash', 'benchmark_label', 'eval_config'],
    num_rows: 11873
})

In [4]:
# Smoke-test the generation helper with a simple factual prompt.
generate_llm("What is the capital of Texas?")


'The capital of Texas is Austin.'

In [5]:
# Scratch check that str.format fills a named {placeholder}; eval_prompt is
# filled the same way inside evaluate_llm.
'{hi}'.format(hi='hello')

'hello'

In [6]:
import pickle
from collections import defaultdict
import time

############################################
# Attempt 3: LLM-as-judge evaluation setup
############################################
# NOTE(review): `predictions` and `references` are not read by any cell
# visible in this notebook — presumably left over from an earlier attempt.
predictions = defaultdict(list)
references = defaultdict(list)


# Prompt template for the judge model. evaluate_llm fills {llm_response} and
# {input_correct_responses} through str.format, and the judge is asked to reply
# with just "yes" or "no".
eval_prompt = """ 
You are a evaluator that needs to compare if response1 matches any of the other_responses to determine if they are the same or not.

response1: {llm_response}

other_responses: {input_correct_responses}

Answer just "yes" or "no" to the question: "Do the responses match?"
"""

def evaluate_llm(temp=0.01, total_evals=1, model="llama3.1", judge_model="llama3.1"):
    """Answer the first ``total_evals`` dataset rows and have a judge model grade them.

    Reads the module-level ``dataset``, ``eval_prompt`` and ``generate_llm``.
    For every row the answering model is prompted with ``input_question``;
    its reply is cut at the first blank line and the judge model is then asked
    whether it matches ``input_correct_responses``.

    Args:
        temp: Sampling temperature used for the answering model. The judge
            call passes no temperature, so ``generate_llm``'s default applies.
        total_evals: Number of rows to evaluate, taken from the start of
            ``dataset``.
        model: Model name used to answer the questions.
        judge_model: Model name used to grade the answers; defaults to
            ``"llama3.1"``, the value that was previously hard-coded.

    Returns:
        A list with one dict per evaluated row, holding the keys ``"id"``,
        ``"generation"``, ``"input_correct_responses"``, ``"correct"`` and
        ``"temperature"``. ``"correct"`` is the judge's reply lower-cased with
        surrounding whitespace and periods removed; it is stored as returned,
        not validated against "yes"/"no".
    """
    evaluation_history = []

    for i, example in enumerate(dataset):
        if i == total_evals:
            break

        generation = generate_llm(example["input_question"], model=model, temperature=temp)
        # Keep only the text before the first blank line. The isinstance guard
        # replaces a bare `except: pass`, which would also have hidden
        # unrelated errors; a non-string reply is still passed through as-is.
        if isinstance(generation, str):
            generation = generation.split('\n\n', 1)[0]
        print('START GENERATION:\n',  generation)
        print('END GENERATION\n')
        print('Correct responses:', example["input_correct_responses"])

        judge_reply = generate_llm(
            eval_prompt.format(
                llm_response=generation,
                input_correct_responses=example["input_correct_responses"],
            ),
            model=judge_model,
        )
        # Strip whitespace before and after removing periods so replies such
        # as " Yes.\n" normalize to "yes" (strip('.') alone left them intact).
        verdict = judge_reply.lower().strip().strip('.').strip()
        evaluation_history.append(
            {
                "id": example["input_question_hash"],
                "generation": generation,
                "input_correct_responses": example["input_correct_responses"],
                "correct": verdict,
                "temperature": temp,
            }
        )

    return evaluation_history


In [10]:

# Evaluate the same 100 examples once per sampling temperature and keep each
# run's history under its temperature.
all_evaluations = {}
for temp in (0.001, 1.0):
    all_evaluations[temp] = evaluation_history = evaluate_llm(
        temp, total_evals=100, model='mistral:text'
    )

START GENERATION:
  John Paul II
END GENERATION

Correct responses: ['john paul ii', 'john paul ii', 'john paul ii']
START GENERATION:
  1851
END GENERATION

Correct responses: ['1851', 'in 1851', '1851']
START GENERATION:
  20 million ounces
END GENERATION

Correct responses: ['20 million ounces', '20 million ounces', '20 million ounces']
START GENERATION:
  1 July 1851
END GENERATION

Correct responses: ['not in background']
START GENERATION:
  Not in background
END GENERATION

Correct responses: ['not in background']
START GENERATION:
  Not in background
END GENERATION

Correct responses: ['not in background']
START GENERATION:
  The largest gold rush the world has ever seen.
END GENERATION

Correct responses: ['gold rush', 'gold rush', 'gold rushes']
START GENERATION:
  sevenfold from 76,000 to 540,000.
END GENERATION

Correct responses: ['sevenfold', 'sevenfold', '76000 to 540000']
START GENERATION:
  Not in background
END GENERATION

Correct responses: ['not in background']
START

In [None]:
# Candidate model names (presumably Ollama tags — 'mistral:text' is the one
# passed to evaluate_llm above). NOTE(review): this list is not referenced by
# any cell visible here and the cell has no execution count.
models = ["gemma2:2b-text-fp16", "mistral:text", "gemma2:9b-text-fp16", "qwen2:7b-text"]

In [11]:
# Show the raw evaluation records keyed by temperature.
# NOTE(review): this prints every record of every run; the DataFrame cells
# below give a more compact view of the same data.
all_evaluations



{0.001: [{'id': '8bda73a1a16177a7b1a27cd99ca72b17ba96f04c86c36a256e2abea3e1b3eeb1',
   'generation': ' John Paul II',
   'input_correct_responses': ['john paul ii', 'john paul ii', 'john paul ii'],
   'correct': 'yes',
   'temperature': 0.001},
  {'id': '07fac974d3c5714d8a990a8b1fdd61838939c6139265307baf0770058dd1f7aa',
   'generation': ' 1851',
   'input_correct_responses': ['1851', 'in 1851', '1851'],
   'correct': 'yes',
   'temperature': 0.001},
  {'id': 'd568ae5d2c9056b505e6ee7c42ccc4d612c2b28d4663720298a38bda20f027e4',
   'generation': ' 20 million ounces',
   'input_correct_responses': ['20 million ounces',
    '20 million ounces',
    '20 million ounces'],
   'correct': 'yes',
   'temperature': 0.001},
  {'id': '9c42a3f7f7debe911ba20717c6ec1ff5176f848eaf17d01c5c82927e196fb31b',
   'generation': ' 1 July 1851',
   'input_correct_responses': ['not in background'],
   'correct': 'no',
   'temperature': 0.001},
  {'id': 'e5e3bae01a94b7d75508c24c2fde048ad295188d160d4dee3ddbd0023e130

In [17]:
import pandas as pd

# Tabulate the temperature=0.001 run: one row per record returned by evaluate_llm.
df001 = pd.DataFrame(all_evaluations[0.001])
df001

Unnamed: 0,id,generation,input_correct_responses,correct,temperature
0,8bda73a1a16177a7b1a27cd99ca72b17ba96f04c86c36a...,John Paul II,"[john paul ii, john paul ii, john paul ii]",yes,0.001
1,07fac974d3c5714d8a990a8b1fdd61838939c613926530...,1851,"[1851, in 1851, 1851]",yes,0.001
2,d568ae5d2c9056b505e6ee7c42ccc4d612c2b28d466372...,20 million ounces,"[20 million ounces, 20 million ounces, 20 mill...",yes,0.001
3,9c42a3f7f7debe911ba20717c6ec1ff5176f848eaf17d0...,1 July 1851,[not in background],no,0.001
4,e5e3bae01a94b7d75508c24c2fde048ad295188d160d4d...,Not in background,[not in background],yes,0.001
...,...,...,...,...,...
95,d6f86f97949e50afcd2580a38f39f0743a9773acc731f2...,Not in background,[not in background],yes,0.001
96,93ae398b3aab76904fafff74151b2d1c721dc67c70ddd5...,Not in background,[not in background],yes,0.001
97,04ac4ae3c2a36c7e078a64530a1f9440ee4abee2de20ae...,Members of the Victorian Parliament,[not in background],no,0.001
98,c8059e2fd92eb9e34d064c12e495e764a9635f0657ed25...,Miners,[not in background],no,0.001


In [18]:
import pandas as pd

# Tabulate the temperature=1.0 run: one row per record returned by evaluate_llm.
df1 = pd.DataFrame(all_evaluations[1.0])
df1

Unnamed: 0,id,generation,input_correct_responses,correct,temperature
0,8bda73a1a16177a7b1a27cd99ca72b17ba96f04c86c36a...,John Paul II,"[john paul ii, john paul ii, john paul ii]",yes,1.0
1,07fac974d3c5714d8a990a8b1fdd61838939c613926530...,1851,"[1851, in 1851, 1851]",yes,1.0
2,d568ae5d2c9056b505e6ee7c42ccc4d612c2b28d466372...,20 million ounces of gold,"[20 million ounces, 20 million ounces, 20 mill...",yes,1.0
3,9c42a3f7f7debe911ba20717c6ec1ff5176f848eaf17d0...,1 July 1851,[not in background],no,1.0
4,e5e3bae01a94b7d75508c24c2fde048ad295188d160d4d...,Not in background,[not in background],yes,1.0
...,...,...,...,...,...
95,d6f86f97949e50afcd2580a38f39f0743a9773acc731f2...,Not in background,[not in background],yes,1.0
96,93ae398b3aab76904fafff74151b2d1c721dc67c70ddd5...,Not in background,[not in background],yes,1.0
97,04ac4ae3c2a36c7e078a64530a1f9440ee4abee2de20ae...,Members of the Victorian Parliament,[not in background],no,1.0
98,c8059e2fd92eb9e34d064c12e495e764a9635f0657ed25...,Miners,[not in background],no,1.0
