In [None]:
import json

import torch
import torch.nn as nn
import bitsandbytes as bnb

import datasets
from datasets import load_dataset, load_from_disk
from datasets.arrow_dataset import Dataset
from evaluate import load

from transformers import pipeline
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
from transformers.pipelines.pt_utils import KeyDataset

import accelerate

from tqdm import tqdm

import matplotlib.pyplot as plt

In [None]:
from huggingface_hub import login
login(token = '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')

In [None]:
judge_model_id = "meta-llama/Llama-3.2-3B-Instruct"

judge_pipe = pipeline('text-generation', model=judge_model_id, torch_dtype=torch.bfloat16)
# LLama 3.2 has multiple eos_token_id. We use the "128001"
judge_pipe.tokenizer.pad_token_id = judge_pipe.model.config.eos_token_id[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# model_id = "meta-llama/Llama-3.2-1B"
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipe = pipeline('text-generation', model=model_id, torch_dtype=torch.bfloat16)
# LLama 3.2 has multiple eos_token_id. We use the "128001"
pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id[0]

_model = pipe.model
_tokenizer = pipe.tokenizer

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
quant8_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
quant8_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant8_config)
quant8_pipe = pipeline('text-generation', model=quant8_model, tokenizer=_tokenizer, torch_dtype='auto')
# LLama 3.2 has multiple eos_token_id. We use the "128001"
quant8_pipe.tokenizer.pad_token_id = quant8_pipe.model.config.eos_token_id[0]

Device set to use cuda:0


In [None]:
quant4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4')
quant4_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant4_config)
quant4_pipe = pipeline('text-generation', model=quant4_model, tokenizer=_tokenizer, torch_dtype='auto')
# LLama 3.2 has multiple eos_token_id. We use the "128001"
quant4_pipe.tokenizer.pad_token_id = quant4_pipe.model.config.eos_token_id[0]

Device set to use cuda:0


In [None]:
llama_1b = sum(param.numel() * param.element_size() for param in _model.parameters()) / (1024 ** 2)
llama_1b_int8 = sum(param.numel() * param.element_size() for param in quant8_model.parameters()) /  (1024 ** 2)
llama_1b_int4 = sum(param.numel() * param.element_size() for param in quant4_model.parameters()) /  (1024 ** 2)

print(f'LLama 3.2 1B Instruct with bfloat16 uses {round(llama_1b, 2)}MB of memory')
print(f'LLama 3.2 1B Instruct with bfloat16 and int8 uses {round(llama_1b_int8, 2)}MB of memory')
print(f'LLama 3.2 1B Instruct with bfloat16 and int4 uses {round(llama_1b_int4, 2)}MB of memory')

LLama 3.2 1B Instruct with bfloat16 uses 2357.13MB of memory
LLama 3.2 1B Instruct with bfloat16 and int8 uses 1429.13MB of memory
LLama 3.2 1B Instruct with bfloat16 and int4 uses 965.13MB of memory


In [None]:
triviqa_dataset = load_dataset('mandarjoshi/trivia_qa', 'unfiltered', split='all')

README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/47 [00:00<?, ?files/s]

train-00000-of-00047.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

train-00001-of-00047.parquet:   0%|          | 0.00/279M [00:00<?, ?B/s]

train-00002-of-00047.parquet:   0%|          | 0.00/250M [00:00<?, ?B/s]

train-00003-of-00047.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

train-00004-of-00047.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00005-of-00047.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

train-00006-of-00047.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00007-of-00047.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00008-of-00047.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

train-00009-of-00047.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

train-00010-of-00047.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

train-00011-of-00047.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00012-of-00047.parquet:   0%|          | 0.00/367M [00:00<?, ?B/s]

train-00013-of-00047.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00014-of-00047.parquet:   0%|          | 0.00/310M [00:00<?, ?B/s]

train-00015-of-00047.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00016-of-00047.parquet:   0%|          | 0.00/397M [00:00<?, ?B/s]

train-00017-of-00047.parquet:   0%|          | 0.00/372M [00:00<?, ?B/s]

train-00018-of-00047.parquet:   0%|          | 0.00/320M [00:00<?, ?B/s]

train-00019-of-00047.parquet:   0%|          | 0.00/351M [00:00<?, ?B/s]

train-00020-of-00047.parquet:   0%|          | 0.00/314M [00:00<?, ?B/s]

train-00021-of-00047.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00022-of-00047.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00023-of-00047.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

train-00024-of-00047.parquet:   0%|          | 0.00/271M [00:00<?, ?B/s]

train-00025-of-00047.parquet:   0%|          | 0.00/252M [00:00<?, ?B/s]

train-00026-of-00047.parquet:   0%|          | 0.00/278M [00:00<?, ?B/s]

train-00027-of-00047.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00028-of-00047.parquet:   0%|          | 0.00/252M [00:00<?, ?B/s]

train-00029-of-00047.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00030-of-00047.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

train-00031-of-00047.parquet:   0%|          | 0.00/264M [00:00<?, ?B/s]

train-00032-of-00047.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00033-of-00047.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00034-of-00047.parquet:   0%|          | 0.00/338M [00:00<?, ?B/s]

train-00035-of-00047.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00036-of-00047.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00037-of-00047.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00038-of-00047.parquet:   0%|          | 0.00/271M [00:00<?, ?B/s]

train-00039-of-00047.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00040-of-00047.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00041-of-00047.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

train-00042-of-00047.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00043-of-00047.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00044-of-00047.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00045-of-00047.parquet:   0%|          | 0.00/349M [00:00<?, ?B/s]

train-00046-of-00047.parquet:   0%|          | 0.00/361M [00:00<?, ?B/s]

validation-00000-of-00007.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

validation-00001-of-00007.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

validation-00002-of-00007.parquet:   0%|          | 0.00/308M [00:00<?, ?B/s]

validation-00003-of-00007.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

validation-00004-of-00007.parquet:   0%|          | 0.00/234M [00:00<?, ?B/s]

validation-00005-of-00007.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00006-of-00007.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

test-00000-of-00006.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

test-00001-of-00006.parquet:   0%|          | 0.00/316M [00:00<?, ?B/s]

test-00002-of-00006.parquet:   0%|          | 0.00/300M [00:00<?, ?B/s]

test-00003-of-00006.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

test-00004-of-00006.parquet:   0%|          | 0.00/295M [00:00<?, ?B/s]

test-00005-of-00006.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87622 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11313 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10832 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/46 [00:00<?, ?it/s]

In [None]:
ds_100 = triviqa_dataset.select(range(100))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
responses = pipe([
    {'role': 'system', 'content': 'You are a chatbot which answers user question in the most concise manner possible.'},
    {'role': 'user', 'content': ds_100[0]['question']}
], pad_token_id =  _tokenizer.pad_token_id)

print(json.dumps(responses, indent=4))

[
    {
        "generated_text": [
            {
                "role": "system",
                "content": "You are a chatbot which answers user question in the most concise manner possible."
            },
            {
                "role": "user",
                "content": "Who was President when the first Peanuts cartoon was published?"
            },
            {
                "role": "assistant",
                "content": "Paul Winchell was the President when the first Peanuts cartoon was published."
            }
        ]
    }
]


In [None]:
responses[0]['generated_text'][-1]['content']

'Paul Winchell was the President when the first Peanuts cartoon was published.'

In [None]:
chat_item = [
    {'role': 'system', 'content': 'For the following query give response in True or False only, nothing more.'},
    {'role': 'user', 'content': f'Does "{ds_100[0]["answer"]["normalized_value"]}" appear in the following text "{responses[0]["generated_text"][-1]["content"]}"'}
]

verification_response = pipe([chat_item] * 10, pad_token_id =  _tokenizer.pad_token_id)

print(json.dumps(verification_response, indent=4))

[
    [
        {
            "generated_text": [
                {
                    "role": "system",
                    "content": "For the following query give response in True or False only, nothing more."
                },
                {
                    "role": "user",
                    "content": "Does \"harry truman\" appear in the following text \"Paul Winchell was the President when the first Peanuts cartoon was published.\""
                },
                {
                    "role": "assistant",
                    "content": "False"
                }
            ]
        }
    ],
    [
        {
            "generated_text": [
                {
                    "role": "system",
                    "content": "For the following query give response in True or False only, nothing more."
                },
                {
                    "role": "user",
                    "content": "Does \"harry truman\" appear in the following text \"Paul Winche

In [None]:
verification_response = judge_pipe([
    {'role': 'system', 'content': 'For the following query give response in False or True only, nothing more.'},
    {'role': 'user', 'content': f'For the question "{ds_100[0]["question"]}", the correct answer is "{ds_100[0]["answer"]["normalized_value"]}". Does the response "{responses[0]["generated_text"][-1]["content"]}" match the correct answer?'}
], pad_token_id=_tokenizer.pad_token_id)

import json
print(json.dumps(verification_response, indent=4))


[
    {
        "generated_text": [
            {
                "role": "system",
                "content": "For the following query give response in False or True only, nothing more."
            },
            {
                "role": "user",
                "content": "For the question \"Who was President when the first Peanuts cartoon was published?\", the correct answer is \"harry truman\". Does the response \"Paul Winchell was the President when the first Peanuts cartoon was published.\" match the correct answer?"
            },
            {
                "role": "assistant",
                "content": "False"
            }
        ]
    }
]


In [None]:
responses[0]['generated_text'][-1]['content']

'Paul Winchell was the President when the first Peanuts cartoon was published.'

In [None]:
response_kwargs = {
    'max_new_tokens': 128,
    'pad_token_id': _tokenizer.pad_token_id,
    # 'batch_size': 5 * MULTIPLIER
}


responses = pipe([
    {'role': 'system', 'content': 'You are a chatbot which answers user question in the most succinct manner possible.'},
    {'role': 'user', 'content': ds_100[0]['question']}
], **response_kwargs)

response_kwargs = {
    'max_new_tokens': 128,
    'pad_token_id': _tokenizer.pad_token_id,
    # 'batch_size': 5 * MULTIPLIER
}


responses = pipe([
    {'role': 'system', 'content': 'You are a chatbot which answers user question in the most succinct manner possible.'},
    {'role': 'user', 'content': ds_100[0]['question']}
], **response_kwargs)

print(json.dumps(responses, indent=4))

response_item = [
    {'role': 'system', 'content': 'For the following query give response as True or False, nothing more.'}, # , nothing more
    {'role': 'user', 'content': f'For the question "{ds_100[0]["question"]}", the correct answer is "{ds_100[0]["answer"]["normalized_value"]}". Does the response "{responses[0]["generated_text"][-1]["content"]}" fits the correct answer?'}
]

verification_SAMPLES = 25

verification_response = judge_pipe([response_item] * verification_SAMPLES, pad_token_id =  _tokenizer.pad_token_id)

# print(json.dumps(verification_response, indent=4))
print(sum([int(item[0]['generated_text'][-1]['content'].lower() == 'true') for item in verification_response]) / verification_SAMPLES)
verification_SAMPLES = 25

verification_response = judge_pipe([response_item] * verification_SAMPLES, pad_token_id =  _tokenizer.pad_token_id)

# print(json.dumps(verification_response, indent=4))
print(sum([int(item[0]['generated_text'][-1]['content'].lower() == 'true') for item in verification_response]) / verification_SAMPLES)

responses = pipe([
    {'role': 'system', 'content': 'You are a chatbot which answers user question in the most succinct manner possible.'},
    {'role': 'user', 'content': ds_100[0]['question']}
], **response_kwargs)

print(json.dumps(responses, indent=4))

response_item = [
    {'role': 'system', 'content': 'For the following query give response as True or False, nothing more.'}, # , nothing more
    # {'role': 'user', 'content': f'For the question "{ds_100[0]['question']}", the correct answer is "{ds_100[0]['answer']['normalized_value']}". Does the response "{responses[0]['generated_text'][-1]['content']} the time was Harry Truman" the correct answer?'}
    # {'role': 'user', 'content': f'Does "{ds_100[0]['answer']['normalized_value']}" appears in the following text "{responses[0]['generated_text'][-1]['content']}"'}
    {'role': 'user', 'content': f'For the question "{ds_100[0]["question"]}", the correct answer is "{ds_100[0]["answer"]["normalized_value"]}". Does the response "{responses[0]["generated_text"][-1]["content"]}" fits the correct answer?'}
]

verification_SAMPLES = 25

verification_response = judge_pipe([response_item] * verification_SAMPLES, pad_token_id =  _tokenizer.pad_token_id)

# print(json.dumps(verification_response, indent=4))
print(sum([int(item[0]['generated_text'][-1]['content'].lower() == 'true') for item in verification_response]) / verification_SAMPLES)
verification_SAMPLES = 25

verification_response = judge_pipe([response_item] * verification_SAMPLES, pad_token_id =  _tokenizer.pad_token_id)

# print(json.dumps(verification_response, indent=4))
print(sum([int(item[0]['generated_text'][-1]['content'].lower() == 'true') for item in verification_response]) / verification_SAMPLES)

[
    {
        "generated_text": [
            {
                "role": "system",
                "content": "You are a chatbot which answers user question in the most succinct manner possible."
            },
            {
                "role": "user",
                "content": "Who was President when the first Peanuts cartoon was published?"
            },
            {
                "role": "assistant",
                "content": "The first Peanuts cartoon was published in 1950, and the President of the United States at that time was Harry S. Truman."
            }
        ]
    }
]
0.92
0.72
[
    {
        "generated_text": [
            {
                "role": "system",
                "content": "You are a chatbot which answers user question in the most succinct manner possible."
            },
            {
                "role": "user",
                "content": "Who was President when the first Peanuts cartoon was published?"
            },
            {
          

In [None]:
def get_prompt_for_question(question: str) -> list:
    """
    Get prompt in chat format. This includes a system and an user prompt.
    """
    return [
        {'role': 'system', 'content': 'You are a chatbot which answers user question in the most concise manner possible.'},
        {'role': 'user', 'content': question}
    ]

In [None]:
ds_100 = ds_100.add_column('question_prompt', column=list(map(get_prompt_for_question, ds_100['question'])))
ds_100

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer', 'question_prompt'],
    num_rows: 100
})

In [None]:
SAMPLING = 50
MULTIPLIER = 10

# params for beam
pipe_kwargs = {
    'num_beams': 5,
    'do_sample': True,
    'num_return_sequences': 1,
    'early_stopping': True,
    'max_new_tokens': 128,
    'pad_token_id': _tokenizer.pad_token_id,
    'batch_size': 5 * MULTIPLIER
}


del MULTIPLIER

# Originally, it's right side, but huggingface throws warning.
# "A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer."
_tokenizer.padding_side = 'left'


generations = {
    'bfloat16': [],
    'int8': [],
    'int4': []
}

inference_ds = KeyDataset(ds_100.repeat(SAMPLING), key='question')
n = len(inference_ds)

with tqdm(total=n, desc='bfloat16') as pbar:
    for out in pipe(inference_ds, **pipe_kwargs):
        generations['bfloat16'].append(out[0]['generated_text'])

        pbar.update()

with tqdm(total=n, desc='int8') as pbar:
    for out in quant8_pipe(inference_ds, **pipe_kwargs):
        generations['int8'].append(out[0]['generated_text'])

        pbar.update()

with tqdm(total=n, desc='int4') as pbar:
    for out in quant4_pipe(inference_ds, **pipe_kwargs):
        generations['int4'].append(out[0]['generated_text'])

        pbar.update()


torch.cuda.empty_cache()

bfloat16: 100%|██████████| 5000/5000 [24:38<00:00,  3.38it/s]
int8: 100%|██████████| 5000/5000 [34:27<00:00,  2.42it/s]
int4: 100%|██████████| 5000/5000 [29:50<00:00,  2.79it/s]


In [None]:
def get_prompt_for_verification(question, answer, generated_response):
    normalized_value = answer['normalized_value'] if isinstance(answer, dict) and 'normalized_value' in answer else answer

    return [
        {'role': 'system', 'content': 'For the following query give response in False or True only, nothing more.'},
        {'role': 'user', 'content': f'For the question "{question}", the correct answer is "{normalized_value}". Does the response "{generated_response}" match the correct answer?'}
    ]


In [None]:
ds_100_generations = ds_100.repeat(SAMPLING)

for key, value in generations.items():
    name = f'{key}_response'
    ds_100_generations = ds_100_generations.add_column(name, value)

    prompts = list(map(get_prompt_for_verification,
                       ds_100_generations['question'],
                       ds_100_generations['answer'],
                       ds_100_generations[name]))

    ds_100_generations = ds_100_generations.add_column(f'{key}_verification_prompt', prompts)

ds_100_generations

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer', 'question_prompt', 'bfloat16_response', 'bfloat16_verification_prompt', 'int8_response', 'int8_verification_prompt', 'int4_response', 'int4_verification_prompt'],
    num_rows: 5000
})

## Evaluating responses on "Correctness" as seen by a judge model.

In [None]:
# Select as per the GPU?
MULTIPLIER = 10

# params for Judge, no specific sampling methodology. LLama 3.2 3B follows the instructions quite well.
verification_pipe_kwargs = {
    'pad_token_id': _tokenizer.pad_token_id,
    'batch_size': 5 * MULTIPLIER
}

del MULTIPLIER

# Originally, it's right side, but huggingface throws warning.
# "A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer."
# _tokenizer.padding_side = 'left'
judge_pipe.tokenizer.padding_side = 'left'

verifications = {
    'bfloat16': [],
    'int8': [],
    'int4': []
}

torch.cuda.empty_cache()

# We ask the judge multiple times and accumulate "True" or "False".
VERIFICATION_SAMPLES = 10
# VERIFICATION_SAMPLES = 10


for key in verifications.keys():
    verification_ds = KeyDataset(ds_100_generations.repeat(VERIFICATION_SAMPLES), key=f'{key}_verification_prompt')
    n = len(verification_ds)

    with tqdm(total=n, desc=f'{key}_verification') as pbar:
        for out in judge_pipe(verification_ds, **verification_pipe_kwargs):
            verifications[key].append(out[0]['generated_text'])

            pbar.update()

    torch.cuda.empty_cache()

bfloat16_verification: 100%|██████████| 50000/50000 [52:04<00:00, 16.00it/s]    
int8_verification: 100%|██████████| 50000/50000 [34:45<00:00, 23.98it/s]    
int4_verification: 100%|██████████| 50000/50000 [34:24<00:00, 24.22it/s]    


In [None]:
generations['bfloat16'][0]

'Who was President when the first Peanuts cartoon was published? Franklin D. Roosevelt\nThe first Peanuts cartoon was published on October 2, 1950. Franklin D. Roosevelt was the President of the United States at that time. The cartoon was created by Charles M. Schulz.'

In [None]:
ds_100_verifications_25 = ds_100_generations.repeat(VERIFICATION_SAMPLES)

for key, value in verifications.items():
    name = f'{key}_verification_response'
    ds_100_verifications_25 = ds_100_verifications_25.add_column(name, value)

# ds_100_generations.save_to_disk('ds_100_slice')
# ds_100_verifications_25.save_to_disk('ds_100_verifications_25_slice')

# Define the path inside your Google Drive
output_path = '/content/drive/MyDrive/ds_100_slice'
verifications_path = '/content/drive/MyDrive/ds_100_verifications_25_slice'

# Save the datasets
ds_100_generations.save_to_disk(output_path)
ds_100_verifications_25.save_to_disk(verifications_path)

Saving the dataset (0/3 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/21 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
ds_100_generations = load_from_disk('/content/drive/MyDrive/ds_100_slice')
ds_100_verifications_25 = load_from_disk('/content/drive/MyDrive/ds_100_verifications_25_slice')

Loading dataset from disk:   0%|          | 0/21 [00:00<?, ?it/s]

In [None]:
ds_100_verifications_25['bfloat16_verification_response']

[[{'content': 'For the following query give response in False or True only, nothing more.',
   'role': 'system'},
  {'content': 'For the question "Who was President when the first Peanuts cartoon was published?", the correct answer is "harry truman". Does the response "Who was President when the first Peanuts cartoon was published? Franklin D. Roosevelt\nThe first Peanuts cartoon was published on October 2, 1950. Franklin D. Roosevelt was the President of the United States at that time. The cartoon was created by Charles M. Schulz." match the correct answer?',
   'role': 'user'},
  {'content': 'False', 'role': 'assistant'}],
 [{'content': 'For the following query give response in False or True only, nothing more.',
   'role': 'system'},
  {'content': 'For the question "Which American-born Sinclair won the Nobel Prize for Literature in 1930?", the correct answer is "sinclair lewis". Does the response "Which American-born Sinclair won the Nobel Prize for Literature in 1930? Sinclair Lewi

In [None]:
VERIFICATION_SAMPLES = 10
SAMPLING = 50

In [None]:
len(ds_100_generations)

5000

In [None]:
import numpy as np

for verification_key in ('bfloat16', 'int8', 'int4'):
    verification_key = f'{verification_key}_verification_response'
    arr = np.array([int(chat[-1]['content'].lower() == 'true') for chat in ds_100_verifications_25[verification_key]])

    # The first two dims are ordered following FIFO strategy to
    # balance the 'tile' operations applied on the original 100 questions.
    arr = arr.reshape(VERIFICATION_SAMPLES, SAMPLING, -1)
    # We average the results over each verification for each sample.
    mean_accuracy_per_sample : np.ndarray = arr.mean(axis=0)
    # We then average and std over each sample for each question.
    accuracy_per_question : np.ndarray = mean_accuracy_per_sample.mean(axis=0)
    std_per_question : np.ndarray = mean_accuracy_per_sample.std(axis=0)

    mean_accuracy = accuracy_per_question.mean(axis=0)
    avg_std = std_per_question.mean(axis=0)

    print(mean_accuracy, avg_std)

0.3549 0.09259082219483507
0.32583999999999996 0.16092776476670612
0.25898 0.06721536730367031


In [None]:
final_accuracy = 0.0
num_questions = 100
num_samples_per_question = SAMPLING  # 50
verifications_bf16 = verifications['bfloat16']

for q_idx in range(num_questions):
    start = q_idx * num_samples_per_question
    end = start + num_samples_per_question
    relevant_verifications = verifications_bf16[start:end]

    # Get the last N judged responses (e.g. 10)
    last_n = relevant_verifications[-VERIFICATION_SAMPLES:]

    # Count how many of those the judge said "true"
    correct = sum(int(j[-1]['content'].strip().lower() == 'true') for j in last_n)

    final_accuracy += correct / VERIFICATION_SAMPLES

final_accuracy /= num_questions
print(f'Accuracy is {final_accuracy:.3%}')


Accuracy is 48.900%


In [None]:
verifications['bfloat16'][0]

[{'content': 'For the following query give response in False or True only, nothing more.',
  'role': 'system'},
 {'content': 'For the question "Who was President when the first Peanuts cartoon was published?", the correct answer is "harry truman". Does the response "Who was President when the first Peanuts cartoon was published? Franklin D. Roosevelt\nThe first Peanuts cartoon was published on October 2, 1950. Franklin D. Roosevelt was the 32nd President of the United States, serving from 1933 to 1945." match the correct answer?',
  'role': 'user'},
 {'role': 'assistant', 'content': 'False'}]

In [None]:
def verify_response_via_judge_model(ds: Dataset, idx: int, verification_samples: int = 10) -> list:
    row = ds[idx]
    question = row['question']
    normalized_answer = row['answer']['normalized_value']

    response_types = ['bfloat16_response', 'int8_response', 'int4_response']

    response_accuracies = []

    for key in response_types:
        response = row[key]
        # response = row[key].removeprefix(question)
        # print(f'Question: {question}')
        # print(f'Response: {response}')

        response_item = [
            {'role': 'system', 'content': 'For the following query give response as True or False, nothing more.'},
            # {'role': 'user', 'content': f'Does "{ds_100[0]['answer']['normalized_value']}" appears in the following text "{responses[0]['generated_text'][-1]['content']}"'}
            {'role': 'user', 'content': f'For the question "{question}", the correct answer is "{normalized_answer}". Does the response "{response}" fits the correct answer?'}
        ]

        verification_responses = judge_pipe([response_item] * verification_samples, pad_token_id = _tokenizer.pad_token_id)

        print(json.dumps(verification_responses, indent=4))

        accuracy_score = sum(int(veri_response[0]['generated_text'][-1]['content'].lower() == 'true') for veri_response in verification_responses) / verification_samples
        response_accuracies.append(accuracy_score)

    return response_accuracies

In [None]:
verify_response_via_judge_model(ds_100_generations, idx=1)

[
    [
        {
            "generated_text": [
                {
                    "role": "system",
                    "content": "For the following query give response as True or False, nothing more."
                },
                {
                    "role": "user",
                    "content": "For the question \"Which American-born Sinclair won the Nobel Prize for Literature in 1930?\", the correct answer is \"sinclair lewis\". Does the response \"Which American-born Sinclair won the Nobel Prize for Literature in 1930? Sinclair Lewis.\nSinclair Lewis won the Nobel Prize for Literature in 1930.\nSinclair Lewis was an American novelist, short story writer, and playwright. He is best known for his novels such as _Main Street_ and _Babbitt_. Lewis was awarded the Nobel Prize in Literature in 1930 for his contributions to American literature.\nSinclair Lewis was an American novelist, short story writer, and playwright. He is best known for his novels such as _Main Street_

[0.4, 1.0, 0.2]

## Loading Bert Score metric

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
bert_score_metric = load('bertscore')

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

#### We choose the best F1 score from the TriviaQA dataset for each sample.
So, we repeat the responses to match the references for the bert score metric calculation.

In [None]:
possible_answers_count = [len(item['normalized_aliases']) for item in ds_100_generations['answer']]
ds_100_generations = ds_100_generations.add_column('repeat_samples_count', possible_answers_count)

ds_100_generations.save_to_disk('ds_100_slice')

ds_100_generations

Saving the dataset (0/3 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer', 'question_prompt', 'bfloat16_response', 'bfloat16_verification_prompt', 'int8_response', 'int8_verification_prompt', 'int4_response', 'int4_verification_prompt', 'repeat_samples_count'],
    num_rows: 5000
})

In [None]:
ds_100_generations = load_from_disk('/content/ds_100_slice')

### A possible problem with BERTScore???

In [None]:
bert_score_entries = []

for item in ds_100_generations.select_columns(['question', 'answer', 'bfloat16_response', 'int8_response', 'int4_response', 'repeat_samples_count']):
    prefix = item['question']
    repeat_count = item['repeat_samples_count']
    bfloat16_response = item['bfloat16_response']
    normalized_ground_truth = item['answer']['normalized_aliases']

    bfloat16_scores = bert_score_metric.compute(predictions=[bfloat16_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en')
    print(bfloat16_response.removeprefix(prefix))
    print(normalized_ground_truth)
    print(bfloat16_scores['f1'])
    break

 Franklin D. Roosevelt
The first Peanuts cartoon was published on October 2, 1950. Franklin D. Roosevelt was the 32nd President of the United States, serving from 1933 to 1945.
['presidency of harry s truman', '33rd president of united states', 'truman administration', 's truman', 'mr citizen', 'harry truman s', 'harry truman', 'hary truman', 'harry shipp truman', 'h truman', 'harry shippe truman', 'h s truman', 'president truman', 'president harry truman', 'hst president', 'presidency of harry truman', 'mary jane truman', 'harry solomon truman', 'harold truman', 'harry s truman']
[0.7917824983596802, 0.8284944891929626, 0.7929587364196777, 0.7770087718963623, 0.7798301577568054, 0.7742905020713806, 0.7695346474647522, 0.772251546382904, 0.773760974407196, 0.775693416595459, 0.7778941988945007, 0.7731899619102478, 0.7822277545928955, 0.8005311489105225, 0.7934330105781555, 0.8043298125267029, 0.7771598100662231, 0.773851752281189, 0.7765055894851685, 0.7725012302398682]


In [None]:
prediction = """
 Harry Truman
When the first Peanuts cartoon was published in 1950, President Harry Truman was in office.
The cartoon was created by Charles M. Schulz and was first published in the Washington Post on October 2, 1950. It was later syndicated and became a huge success. The cartoon was originally called "Li'l Folks" and was later renamed "Peanuts." It was known for its humorous and relatable portrayal of everyday life and its characters, including Charlie Brown, Snoopy, Lucy, and Linus. The cartoon was a huge success and became a cultural phenomenon, running for over 50
"""
print(prediction)


 Harry Truman
When the first Peanuts cartoon was published in 1950, President Harry Truman was in office.
The cartoon was created by Charles M. Schulz and was first published in the Washington Post on October 2, 1950. It was later syndicated and became a huge success. The cartoon was originally called "Li'l Folks" and was later renamed "Peanuts." It was known for its humorous and relatable portrayal of everyday life and its characters, including Charlie Brown, Snoopy, Lucy, and Linus. The cartoon was a huge success and became a cultural phenomenon, running for over 50



In [None]:
value = ds_100_generations['answer'][0]['value']
print(value)

Harry Truman


In [None]:
bert_score_metric.compute(predictions=[prediction], references=[ds_100_generations['answer'][0]['value']], lang='en')

{'precision': [0.7731978893280029],
 'recall': [0.9096446633338928],
 'f1': [0.8358896970748901],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.51.3)'}

In [None]:
bert_score_entries = []
batch_size = max(ds_100_generations['repeat_samples_count'])
batch_size = 1000000

with tqdm(total=len(ds_100_generations), desc='Calculating BERT Score') as pbar:
    for item in ds_100_generations.select_columns(['question', 'answer', 'bfloat16_response', 'int8_response', 'int4_response', 'repeat_samples_count']):
        prefix = item['question']
        repeat_count = item['repeat_samples_count']
        bfloat16_response = item['bfloat16_response']
        int8_response = item['int8_response']
        int4_response = item['int4_response']
        normalized_ground_truth = item['answer']['normalized_aliases']
        # repeat_count = len(normalized_ground_truth)

        # wrt normalized ground truth.

        bfloat16_scores = bert_score_metric.compute(predictions=[bfloat16_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en', batch_size=batch_size)
        int8_scores = bert_score_metric.compute(predictions=[int8_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en', batch_size=batch_size)
        int4_scores = bert_score_metric.compute(predictions=[int4_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en', batch_size=batch_size)

        entry = {
            'bfloat16': max(zip(bfloat16_scores['f1'], normalized_ground_truth)),
            'int8': max(zip(int8_scores['f1'], normalized_ground_truth)),
            'int4': max(zip(int4_scores['f1'], normalized_ground_truth))
        }

        bert_score_entries.append(entry)
        pbar.update()

Calculating BERT Score: 100%|██████████| 5000/5000 [13:37<00:00,  6.11it/s]


In [None]:
bfloat16_avg = sum([entry['bfloat16'][0] for entry in bert_score_entries]) / len(bert_score_entries)
int8_avg = sum([entry['int8'][0] for entry in bert_score_entries]) / len(bert_score_entries)
int4_avg = sum([entry['int4'][0] for entry in bert_score_entries]) / len(bert_score_entries)

print(f'Following are the results:\nbfloat16: {bfloat16_avg}\nint8: {int8_avg}\nint4: {int4_avg}')

Following are the results:
bfloat16: 0.794823805809021
int8: 0.79628797519207
int4: 0.7928388661146164


In [None]:
bert_score_kwargs = {
    'batch_size': 200
}

predictions_int8 = [response.removeprefix(question) for response, question in zip(ds_100_generations['int8_response'], ds_100_generations['question'])]
predictions_int4 = [response.removeprefix(question) for response, question in zip(ds_100_generations['int4_response'], ds_100_generations['question'])]
references_bfloat16 = [response.removeprefix(question) for response, question in zip(ds_100_generations['bfloat16_response'], ds_100_generations['question'])]

int8_to_bfloat16_scores = bert_score_metric.compute(predictions=predictions_int8, references=references_bfloat16, lang='en', **bert_score_kwargs)
int4_to_bfloat16_scores = bert_score_metric.compute(predictions=predictions_int4, references=references_bfloat16, lang='en', **bert_score_kwargs)

avg_int8_bfloat16 = sum(int8_to_bfloat16_scores['f1']) / len(int8_to_bfloat16_scores['f1'])
avg_int4_bfloat16 = sum(int4_to_bfloat16_scores['f1']) / len(int4_to_bfloat16_scores['f1'])

print(avg_int8_bfloat16)
print(avg_int4_bfloat16)

0.9232553355455398
0.890827811217308


In [None]:
# bert_score_entries = []

accumulated_predictions = {
    'bfloat16': [],
    'int8': [],
    'int4': [],
    'ground_truth': []
}

for item in ds_100_generations.select_columns(['question', 'answer', 'bfloat16_response', 'int8_response', 'int4_response', 'repeat_samples_count']):
    prefix = item['question']
    repeat_count = item['repeat_samples_count']
    bfloat16_response = item['bfloat16_response']
    int8_response = item['int8_response']
    int4_response = item['int4_response']
    normalized_ground_truth = item['answer']['normalized_aliases']
    ground_truth = item['answer']['value']
    # repeat_count = len(normalized_ground_truth)

    # wrt normalized ground truth.

    # accumulated_predictions['bfloat16'].extend([bfloat16_response.removeprefix(prefix)] * repeat_count)
    # accumulated_predictions['int8'].extend([int8_response.removeprefix(prefix)] * repeat_count)
    # accumulated_predictions['int4'].extend([int4_response.removeprefix(prefix)] * repeat_count)
    # accumulated_predictions['ground_truth'].extend(normalized_ground_truth)

    accumulated_predictions['bfloat16'].append(bfloat16_response.removeprefix(prefix))
    accumulated_predictions['int8'].append(int8_response.removeprefix(prefix))
    accumulated_predictions['int4'].append(int4_response.removeprefix(prefix))
    # accumulated_predictions['ground_truth'].append(' '.join(normalized_ground_truth))
    accumulated_predictions['ground_truth'].append(ground_truth)

bert_score_kwargs = {
    'batch_size': 200
}

bfloat16_scores = bert_score_metric.compute(predictions=accumulated_predictions['bfloat16'], references=accumulated_predictions['ground_truth'], lang='en', **bert_score_kwargs)
int8_scores = bert_score_metric.compute(predictions=accumulated_predictions['int8'], references=accumulated_predictions['ground_truth'], lang='en', **bert_score_kwargs)
int4_scores = bert_score_metric.compute(predictions=accumulated_predictions['int4'], references=accumulated_predictions['ground_truth'], lang='en', **bert_score_kwargs)

# bfloat16_scores = bert_score_metric.compute(predictions=[bfloat16_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en')
# int8_scores = bert_score_metric.compute(predictions=[int8_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en')
# int4_scores = bert_score_metric.compute(predictions=[int4_response.removeprefix(prefix)] * repeat_count, references=normalized_ground_truth, lang='en')

avg_bfloat16_scores = sum(bfloat16_scores['f1']) / len(bfloat16_scores['f1'])
avg_int8_scores = sum(int8_scores['f1']) / len(int8_scores['f1'])
avg_int4_scores = sum(int4_scores['f1']) / len(int4_scores['f1'])

print(avg_bfloat16_scores)
print(avg_int8_scores)
print(avg_int4_scores)

# entry = {
#     'bfloat16': max(zip(bfloat16_scores['f1'], normalized_ground_truth)),
#     'int8': max(zip(int8_scores['f1'], normalized_ground_truth)),
#     'int4': max(zip(int4_scores['f1'], normalized_ground_truth))
# }

# bert_score_entries.append(entry)

    # Wrt bfloat16

    # int8_to_bfloat16_scores = bert_score_metric.compute(predictions=[int8_response], references=[bfloat16_response], lang='en')

    # print(int8_to_bfloat16_scores['f1'])

0.7892080343365669
0.7908733216524124
0.7848084559798241


In [None]:
scores = bert_score_metric.compute(predictions=['harry truman'] * 20, references=triviqa_dataset[0]['answer']['normalized_aliases'], lang='en')

In [None]:
bert_score_metric.compute(predictions=['harry truman'], references=[' '.join(triviqa_dataset[0]['answer']['normalized_aliases'])], lang='en')

{'precision': [0.8335356712341309],
 'recall': [0.7481536865234375],
 'f1': [0.7885401248931885],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.51.3)'}

In [None]:
scores['f1']

[0.893217921257019,
 0.759381890296936,
 0.8547483682632446,
 0.941124439239502,
 0.7692708373069763,
 0.9693154692649841,
 1.0,
 0.9410976767539978,
 0.930251955986023,
 0.9214672446250916,
 0.9041467905044556,
 0.8703683614730835,
 0.9432796835899353,
 0.9292186498641968,
 0.7604589462280273,
 0.9056100845336914,
 0.8884553909301758,
 0.9333701133728027,
 0.9273219704627991,
 0.9495211839675903]

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
modernBERT_similarity_id = 'joe32140/ModernBERT-large-msmarco'

In [None]:
bfloat16_responses = ds_100_generations['bfloat16_response'][:10]
ground_truth = [item['value'] for item in ds_100_generations['answer'][:10]]

In [None]:
bfloat16_responses

['Who was President when the first Peanuts cartoon was published? Franklin D. Roosevelt\nThe first Peanuts cartoon was published on October 2, 1950. Franklin D. Roosevelt was the 32nd President of the United States, serving from 1933 to 1945.',
 'Which American-born Sinclair won the Nobel Prize for Literature in 1930? Sinclair Lewis.\nSinclair Lewis won the Nobel Prize for Literature in 1930.\nSinclair Lewis was an American novelist, short story writer, and playwright. He is best known for his novels such as _Main Street_ and _Babbitt_. Lewis was awarded the Nobel Prize in Literature in 1930 for his contributions to American literature.\nSinclair Lewis was an American novelist, short story writer, and playwright. He is best known for his novels such as _Main Street_ and _Babbitt_. Lewis was awarded the Nobel Prize in Literature in 1930 for his contributions to American literature.\nSinclair Lewis was an',
 'Where in England was Dame Judi Dench born? \nDame Judi Dench was born in Acton,

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer(modernBERT_similarity_id, device='cuda')

KeyboardInterrupt: 

In [None]:
ls ~/.cache/huggingface/hub/models--joe32140--ModernBERT-large-msmarco


[0m[01;34mblobs[0m/  [01;34mrefs[0m/  [01;34msnapshots[0m/


In [None]:
bfloat16_responses_embeddings = model.encode(bfloat16_responses)
ground_truth_embeddings = model.encode(ground_truth)

In [None]:
simalrity_matrtix = model.similarity(bfloat16_responses_embeddings, ground_truth_embeddings)

In [None]:
simalrity_matrtix.argmax(dim=-1)

In [None]:
plt.imshow(simalrity_matrtix)
plt.colorbar()

In [None]:
spuriously_correct = (simalrity_matrtix.argmax(dim=-1) == torch.arange(simalrity_matrtix.size(dim=0))).tolist()
print(*zip(spuriously_correct, ground_truth, bfloat16_responses), sep='\n')