In [None]:
!pip install datasets
!pip install evaluate
!pip install accelerate
!pip install bitsandbytes

In [None]:
import json

import numpy as np
import torch
import torch.nn as nn
import bitsandbytes as bnb

import datasets
from datasets import load_dataset, load_from_disk
from datasets.arrow_dataset import Dataset
from evaluate import load

from transformers import pipeline
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
from transformers.pipelines.pt_utils import KeyDataset

import accelerate

from tqdm import tqdm

import matplotlib.pyplot as plt

In [None]:
# Judge model: used further down to decide whether a generated answer is correct.
judge_model_id = "meta-llama/Llama-3.2-3B-Instruct"

judge_pipe = pipeline(
    task='text-generation',
    model=judge_model_id,
    torch_dtype=torch.bfloat16,
)

# The Llama 3.2 config lists several EOS token ids; the first one ("128001")
# is used as the padding token.
judge_pipe.tokenizer.pad_token_id = judge_pipe.model.config.eos_token_id[0]

In [None]:
# Model under evaluation. The base (non-instruct) checkpoint
# "meta-llama/Llama-3.2-1B" is the alternative that was tried.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipe = pipeline(
    task='text-generation',
    model=model_id,
    torch_dtype=torch.bfloat16,
)

# The Llama 3.2 config lists several EOS token ids; the first one ("128001")
# is used as the padding token.
pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id[0]

# Short handles: the tokenizer is shared with the quantized pipelines and the
# model is used for the parameter-size report.
_tokenizer = pipe.tokenizer
_model = pipe.model

In [None]:
# 8-bit (bitsandbytes) variant of the same checkpoint.
# NOTE(review): `bnb_4bit_compute_dtype` looks like a 4-bit-only option and
# presumably has no effect together with `load_in_8bit` — confirm.
quant8_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
quant8_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant8_config,
)
quant8_pipe = pipeline(
    task='text-generation',
    model=quant8_model,
    tokenizer=_tokenizer,
    torch_dtype='auto',
)

# The Llama 3.2 config lists several EOS token ids; the first one ("128001")
# is used as the padding token.
quant8_pipe.tokenizer.pad_token_id = quant8_pipe.model.config.eos_token_id[0]

In [None]:
# 4-bit NF4 (bitsandbytes) variant of the same checkpoint, computing in bfloat16.
quant4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
)
quant4_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant4_config,
)
quant4_pipe = pipeline(
    task='text-generation',
    model=quant4_model,
    tokenizer=_tokenizer,
    torch_dtype='auto',
)

# The Llama 3.2 config lists several EOS token ids; the first one ("128001")
# is used as the padding token.
quant4_pipe.tokenizer.pad_token_id = quant4_pipe.model.config.eos_token_id[0]

In [None]:
def _parameter_size(model) -> float:
    """Sum `numel * element_size` over the model's parameters and divide by 1024**2."""
    total_bytes = 0
    for param in model.parameters():
        total_bytes += param.numel() * param.element_size()
    return total_bytes / (1024 ** 2)


# Only parameters are counted (whatever `model.parameters()` yields).
llama_1b = _parameter_size(_model)
llama_1b_int8 = _parameter_size(quant8_model)
llama_1b_int4 = _parameter_size(quant4_model)

print(f'LLama 3.2 1B Instruct with bfloat16 uses {round(llama_1b, 2)}MB of memory')
print(f'LLama 3.2 1B Instruct with bfloat16 and int8 uses {round(llama_1b_int8, 2)}MB of memory')
print(f'LLama 3.2 1B Instruct with bfloat16 and int4 uses {round(llama_1b_int4, 2)}MB of memory')

## TriviaQA open-ended short-form generation

In [None]:
# Load the 'unfiltered' configuration of TriviaQA, requesting split='all'.
triviqa_dataset = load_dataset('mandarjoshi/trivia_qa', 'unfiltered', split='all')

In [None]:
# Keep only the first 100 rows; everything below works on this slice.
ds_100 = triviqa_dataset.select(range(100))

In [None]:
def get_prompt_for_question(question: str) -> list:
    """
    Wrap a question in a two-message chat: a fixed system instruction followed
    by the question itself as the user message.
    """
    system_message = {
        'role': 'system',
        'content': 'You are a chatbot which answers user question in the most concise manner possible.',
    }
    user_message = {'role': 'user', 'content': question}
    return [system_message, user_message]

In [None]:
# Attach the chat-formatted prompt of every question as a new column.
question_prompts = [get_prompt_for_question(question) for question in ds_100['question']]
ds_100 = ds_100.add_column('question_prompt', column=question_prompts)
ds_100

In [None]:
# Number of answers sampled for every question.
SAMPLING = 50

# Batch-size scale factor; choose it according to the available GPU memory.
MULTIPLIER = 100

pipe_kwargs = dict(
    top_k=0,
    top_p=0.9,  # nucleus sampling: cumulative probability threshold
    max_new_tokens=128,
    pad_token_id=_tokenizer.pad_token_id,
    batch_size=5 * MULTIPLIER,
)

del MULTIPLIER

# The tokenizer pads on the right by default, which makes Hugging Face warn:
# "A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer."
_tokenizer.padding_side = 'left'

# One list of generations per precision, filled in the order below.
generations = {precision: [] for precision in ('bfloat16', 'int8', 'int4')}

# The chat prompts of the 100-question slice, repeated SAMPLING times.
inference_ds = KeyDataset(ds_100.repeat(SAMPLING), key='question_prompt')
n = len(inference_ds)

# Run the same prompts through the bfloat16, 8-bit and 4-bit pipelines in turn.
for precision, generator in zip(generations, (pipe, quant8_pipe, quant4_pipe)):
    with tqdm(total=n, desc=precision) as pbar:
        for out in generator(inference_ds, **pipe_kwargs):
            generations[precision].append(out[0]['generated_text'])
            pbar.update()


torch.cuda.empty_cache()

In [None]:
def get_prompt_for_verification(question: str, answer: dict, chat_history: list) -> list:
    """
    Build the judge prompt in chat format (one system and one user message).

    The reference answer is `answer['normalized_value']`; the response being
    judged is the `content` of the last message in `chat_history`.
    """
    reference = answer['normalized_value']
    response = chat_history[-1]['content']

    system_message = {
        'role': 'system',
        'content': 'For the following query give response as True or False, nothing more.',
    }
    user_message = {
        'role': 'user',
        'content': f'For the question "{question}", the correct answer is "{reference}". Does the response "{response}" fits the correct answer?',
    }
    return [system_message, user_message]

In [None]:
# Same repetition as used for inference, so row i lines up with generation i.
ds_100_generations = ds_100.repeat(SAMPLING)

for key, value in generations.items():
    # Raw generations of this precision.
    name = f'{key}_response'
    ds_100_generations = ds_100_generations.add_column(name, value)

    # Matching judge prompt for every row (question, reference answer, response).
    verification_prompts = [
        get_prompt_for_verification(question, answer, chat_history)
        for question, answer, chat_history in zip(
            ds_100_generations['question'],
            ds_100_generations['answer'],
            ds_100_generations[name],
        )
    ]
    ds_100_generations = ds_100_generations.add_column(f'{key}_verification_prompt', column=verification_prompts)

ds_100_generations

## Evaluating responses on "Correctness" as seen by a judge model.

In [None]:
# Batch-size scale factor; choose it according to the available GPU memory.
MULTIPLIER = 35

# Generation parameters for the judge. No specific sampling settings are given:
# Llama 3.2 3B follows the True/False instruction quite well.
# The pad id comes from the judge's own tokenizer rather than from the
# generator's `_tokenizer`, so this cell does not depend on the two models
# sharing a vocabulary.
verification_pipe_kwargs = {
    'pad_token_id': judge_pipe.tokenizer.pad_token_id,
    'batch_size': 5 * MULTIPLIER
}

del MULTIPLIER

# The tokenizer pads on the right by default, which makes Hugging Face warn:
# "A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer."
judge_pipe.tokenizer.padding_side = 'left'

# Judge outputs per precision, in the order of the repeated dataset.
verifications = {
    'bfloat16': [],
    'int8': [],
    'int4': []
}

torch.cuda.empty_cache()

# The judge is asked several times per response and the "True"/"False"
# answers are accumulated later.
VERIFICATION_SAMPLES = 25


for key in verifications.keys():
    # Every verification prompt is repeated VERIFICATION_SAMPLES times.
    verification_ds = KeyDataset(ds_100_generations.repeat(VERIFICATION_SAMPLES), key=f'{key}_verification_prompt')
    n = len(verification_ds)

    with tqdm(total=n, desc=f'{key}_verification') as pbar:
        for out in judge_pipe(verification_ds, **verification_pipe_kwargs):
            verifications[key].append(out[0]['generated_text'])

            pbar.update()

    # Release cached GPU memory before the next precision is processed.
    torch.cuda.empty_cache()



In [None]:
# Display the first stored bfloat16 generation.
generations['bfloat16'][0]

In [None]:
# Repeat the generation rows exactly as was done for the judge run, then
# attach the judge outputs of each precision as a new column.
ds_100_verifications_25 = ds_100_generations.repeat(VERIFICATION_SAMPLES)

for key, value in verifications.items():
    ds_100_verifications_25 = ds_100_verifications_25.add_column(f'{key}_verification_response', value)

# Persist both datasets so the analysis can be resumed without re-running the models.
ds_100_generations.save_to_disk('ds_100_slice')
ds_100_verifications_25.save_to_disk('ds_100_verifications_25_slice')

In [None]:
# Reload the two datasets that were saved to disk above.
ds_100_generations = load_from_disk('ds_100_slice')
ds_100_verifications_25 = load_from_disk('ds_100_verifications_25_slice')

### Resample from tile to repeat (numpy style tile and repeat).
Example: `[1, 2, 3, 1, 2, 3]` (tile order) → `[1, 1, 2, 2, 3, 3]` (repeat order).

In [None]:
for precision in ('bfloat16', 'int8', 'int4'):
    verification_key = f'{precision}_verification_response'

    # 1 when the judge's last message is exactly "true" (case-insensitive), else 0.
    verdicts = [
        int(chat[-1]['content'].lower() == 'true')
        for chat in ds_100_verifications_25[verification_key]
    ]

    # The rows were produced by repeating the questions SAMPLING times and then
    # repeating that result VERIFICATION_SAMPLES times, hence the axis order
    # (verification, sample, question).
    arr = np.array(verdicts).reshape(VERIFICATION_SAMPLES, SAMPLING, -1)

    # Average over the judge repetitions of every sampled answer ...
    mean_accuracy_per_sample: np.ndarray = np.mean(arr, axis=0)
    # ... then take mean and standard deviation over the samples of each question.
    accuracy_per_question: np.ndarray = np.mean(mean_accuracy_per_sample, axis=0)
    std_per_question: np.ndarray = np.std(mean_accuracy_per_sample, axis=0)

    # Finally average both statistics over the questions.
    mean_accuracy = accuracy_per_question.mean(axis=0)
    avg_std = std_per_question.mean(axis=0)

    print(mean_accuracy, avg_std, arr.mean(), arr.std())

In [None]:
# Manual cross-check of the reshape-based statistics: the bfloat16 accuracy of
# ONE question, averaged over all of its samples and judge repetitions.
original_question_idx = 0

# Sizes are derived from the data instead of a leftover global `n` and a
# hard-coded 100, so the cell also works after reloading the datasets from disk.
bfloat16_verifications = ds_100_verifications_25['bfloat16_verification_response']
n_verifications = len(bfloat16_verifications)
n_generations = len(ds_100_generations)  # questions x SAMPLING rows
n_questions = n_generations // SAMPLING

final_accuracy = 0.0

for sample_question_idx in range(SAMPLING):
    accuracy = 0.0

    # Row of this question inside the `sample_question_idx`-th repetition of
    # the questions; stepping by `n_generations` visits its judge repetitions.
    starting_idx = original_question_idx + n_questions * sample_question_idx
    for idx in range(starting_idx, n_verifications, n_generations):
        item = bfloat16_verifications[idx]
        accuracy += int(item[-1]['content'].lower() == 'true')

    final_accuracy += accuracy / VERIFICATION_SAMPLES

# `final_accuracy / SAMPLING` is a fraction in [0, 1]; the `%` presentation
# type multiplies by 100 so the printed unit is a real percentage.
print(f'Accuracy is {(final_accuracy / SAMPLING):.3%}')

In [None]:
def verify_response_via_judge_model(ds: Dataset, idx: int, verification_samples: int = 25, verbose: bool = True) -> list:
    """
    Ask the judge model whether each precision's response for one row is correct.

    For row `idx` of `ds`, the response of every precision (bfloat16, int8,
    int4) is placed in a True/False verification prompt. The prompt is sent to
    `judge_pipe` `verification_samples` times and the fraction of replies that
    equal "true" (case-insensitive, exact match) is recorded.

    Args:
        ds: dataset with the columns `question`, `answer` (a dict containing
            `normalized_value`) and `bfloat16_response`, `int8_response`,
            `int4_response`.
        idx: index of the row to verify.
        verification_samples: how many times the judge is queried per response.
        verbose: when True (the default), print the raw judge outputs as JSON.

    Returns:
        A list with three fractions, in the order bfloat16, int8, int4.
    """
    row = ds[idx]
    question = row['question']
    normalized_answer = row['answer']['normalized_value']

    response_types = ['bfloat16_response', 'int8_response', 'int4_response']

    response_accuracies = []

    for key in response_types:
        response = row[key]
        # Responses generated from chat prompts are stored as the whole message
        # list; only the text of the last message is the answer to be judged
        # (same convention as `get_prompt_for_verification`). Plain strings are
        # passed through unchanged.
        if not isinstance(response, str):
            response = response[-1]['content']

        response_item = [
            {'role': 'system', 'content': 'For the following query give response as True or False, nothing more.'},
            {'role': 'user', 'content': f'For the question "{question}", the correct answer is "{normalized_answer}". Does the response "{response}" fits the correct answer?'}
        ]

        # Pad with the judge's own tokenizer id rather than the generator's.
        verification_responses = judge_pipe(
            [response_item] * verification_samples,
            pad_token_id=judge_pipe.tokenizer.pad_token_id,
        )

        if verbose:
            print(json.dumps(verification_responses, indent=4))

        true_count = sum(
            int(veri_response[0]['generated_text'][-1]['content'].lower() == 'true')
            for veri_response in verification_responses
        )
        response_accuracies.append(true_count / verification_samples)

    return response_accuracies

In [None]:
# Spot-check the judge on a single row (index 1) with the default number of judge samples.
verify_response_via_judge_model(ds_100_generations, idx=1)

In [None]:
# Judge-based accuracy of every generation row, collected per precision.
generation_accuracies = {precision: [] for precision in ('bfloat16', 'int8', 'int4')}

for idx in tqdm(range(len(ds_100_generations))):
    row_accuracies = verify_response_via_judge_model(ds_100_generations, idx=idx)
    # The function returns its scores in bfloat16, int8, int4 order, which is
    # also the key order of `generation_accuracies`.
    for position, precision in enumerate(generation_accuracies):
        generation_accuracies[precision].append(row_accuracies[position])

torch.cuda.empty_cache()

## Loading Bert Score metric

In [None]:
# Load the 'bertscore' metric through `evaluate.load`.
bert_score_metric = load('bertscore')

#### For each sample, we keep the best F1 score over all TriviaQA answer aliases.
To do so, the response is repeated once per alias so that predictions and references line up for the BERTScore computation.

In [None]:
# Number of normalized aliases of every row's answer, i.e. how many times a
# response has to be repeated to be paired with each alias.
alias_lists = (answer['normalized_aliases'] for answer in ds_100_generations['answer'])
possible_answers_count = list(map(len, alias_lists))
ds_100_generations = ds_100_generations.add_column('repeat_samples_count', possible_answers_count)

# ds_100_generations.save_to_disk('ds_100_slice')

ds_100_generations

In [None]:
# Reload the generations from 'ds_100_slice'.
# NOTE(review): this replaces the in-memory dataset, so the `repeat_samples_count`
# column added in the previous cell survives only if the saved slice already
# contains it (the save call in that cell is commented out) — confirm.
ds_100_generations = load_from_disk('ds_100_slice')

### A possible problem with BERTScore???

In [None]:
def _response_text(response, question: str) -> str:
    """
    Return the generated answer text of a stored response.

    Chat-format responses are message lists whose last message holds the
    answer; plain-text responses are strings that start with the question.
    """
    if isinstance(response, str):
        return response.removeprefix(question)
    return response[-1]['content']


bert_score_entries = []

# Probe on the first row only: score the bfloat16 answer against every
# normalized alias and look at the individual F1 values.
for item in ds_100_generations.select_columns(['question', 'answer', 'bfloat16_response']):
    normalized_ground_truth = item['answer']['normalized_aliases']
    bfloat16_text = _response_text(item['bfloat16_response'], item['question'])

    # One copy of the prediction per alias, so both lists have the same length
    # without relying on a precomputed count column.
    bfloat16_scores = bert_score_metric.compute(
        predictions=[bfloat16_text] * len(normalized_ground_truth),
        references=normalized_ground_truth,
        lang='en',
    )
    print(bfloat16_text)
    print(normalized_ground_truth)
    print(bfloat16_scores['f1'])
    break

In [None]:
# A fixed multi-line prediction (the answer followed by a long continuation),
# scored against the first row's reference answer further below.
prediction = """
 Harry Truman
When the first Peanuts cartoon was published in 1950, President Harry Truman was in office.
The cartoon was created by Charles M. Schulz and was first published in the Washington Post on October 2, 1950. It was later syndicated and became a huge success. The cartoon was originally called "Li'l Folks" and was later renamed "Peanuts." It was known for its humorous and relatable portrayal of everyday life and its characters, including Charlie Brown, Snoopy, Lucy, and Linus. The cartoon was a huge success and became a cultural phenomenon, running for over 50
"""
print(prediction)

In [None]:
# Reference answer (`value` field) of the first row.
value = ds_100_generations['answer'][0]['value']
print(value)

In [None]:
# BERTScore of the long sample prediction against the first row's reference answer.
bert_score_metric.compute(predictions=[prediction], references=[ds_100_generations['answer'][0]['value']], lang='en')

In [None]:
def _response_text(response, question: str) -> str:
    """
    Return the generated answer text of a stored response.

    Chat-format responses are message lists whose last message holds the
    answer; plain-text responses are strings that start with the question.
    """
    if isinstance(response, str):
        return response.removeprefix(question)
    return response[-1]['content']


# For every row: (best F1, alias that achieved it) per precision.
bert_score_entries = []

# The largest number of aliases of any answer is used as the BERTScore batch
# size, so one row's aliases always fit in a single batch.
batch_size = max(len(answer['normalized_aliases']) for answer in ds_100_generations['answer'])

# The progress total is the number of rows iterated here, not a leftover
# global from an earlier cell.
with tqdm(total=len(ds_100_generations), desc='Calculating BERT Score') as pbar:
    for item in ds_100_generations.select_columns(['question', 'answer', 'bfloat16_response', 'int8_response', 'int4_response']):
        question = item['question']
        normalized_ground_truth = item['answer']['normalized_aliases']

        entry = {}
        for precision in ('bfloat16', 'int8', 'int4'):
            prediction_text = _response_text(item[f'{precision}_response'], question)
            # The prediction is repeated once per alias and scored against each.
            precision_scores = bert_score_metric.compute(
                predictions=[prediction_text] * len(normalized_ground_truth),
                references=normalized_ground_truth,
                lang='en',
                batch_size=batch_size,
            )
            # Keep the highest F1 together with its alias.
            entry[precision] = max(zip(precision_scores['f1'], normalized_ground_truth))

        bert_score_entries.append(entry)
        pbar.update()

In [None]:
# Mean of the best-alias F1 (first element of every stored tuple) per precision.
entry_count = len(bert_score_entries)

bfloat16_avg = sum(entry['bfloat16'][0] for entry in bert_score_entries) / entry_count
int8_avg = sum(entry['int8'][0] for entry in bert_score_entries) / entry_count
int4_avg = sum(entry['int4'][0] for entry in bert_score_entries) / entry_count

print(f'Following are the results:\nbfloat16: {bfloat16_avg}\nint8: {int8_avg}\nint4: {int4_avg}')

In [None]:
def _response_text(response, question: str) -> str:
    """
    Return the generated answer text of a stored response.

    Chat-format responses are message lists whose last message holds the
    answer; plain-text responses are strings that start with the question.
    """
    if isinstance(response, str):
        return response.removeprefix(question)
    return response[-1]['content']


bert_score_kwargs = {
    'batch_size': 200
}

# Answer texts per precision; the bfloat16 answers act as the references here.
questions = ds_100_generations['question']
predictions_int8 = [_response_text(response, question) for response, question in zip(ds_100_generations['int8_response'], questions)]
predictions_int4 = [_response_text(response, question) for response, question in zip(ds_100_generations['int4_response'], questions)]
references_bfloat16 = [_response_text(response, question) for response, question in zip(ds_100_generations['bfloat16_response'], questions)]

# How close the quantized answers stay to the bfloat16 answers.
int8_to_bfloat16_scores = bert_score_metric.compute(predictions=predictions_int8, references=references_bfloat16, lang='en', **bert_score_kwargs)
int4_to_bfloat16_scores = bert_score_metric.compute(predictions=predictions_int4, references=references_bfloat16, lang='en', **bert_score_kwargs)

avg_int8_bfloat16 = sum(int8_to_bfloat16_scores['f1']) / len(int8_to_bfloat16_scores['f1'])
avg_int4_bfloat16 = sum(int4_to_bfloat16_scores['f1']) / len(int4_to_bfloat16_scores['f1'])

print(avg_int8_bfloat16)
print(avg_int4_bfloat16)

In [None]:
def _response_text(response, question: str) -> str:
    """
    Return the generated answer text of a stored response.

    Chat-format responses are message lists whose last message holds the
    answer; plain-text responses are strings that start with the question.
    """
    if isinstance(response, str):
        return response.removeprefix(question)
    return response[-1]['content']


# Answer texts of every precision plus the reference answer (`value` field),
# all in row order, so each precision can be scored in one batched call.
accumulated_predictions = {
    'bfloat16': [],
    'int8': [],
    'int4': [],
    'ground_truth': []
}

for item in ds_100_generations.select_columns(['question', 'answer', 'bfloat16_response', 'int8_response', 'int4_response']):
    question = item['question']

    for precision in ('bfloat16', 'int8', 'int4'):
        accumulated_predictions[precision].append(_response_text(item[f'{precision}_response'], question))

    accumulated_predictions['ground_truth'].append(item['answer']['value'])

bert_score_kwargs = {
    'batch_size': 200
}

bfloat16_scores = bert_score_metric.compute(predictions=accumulated_predictions['bfloat16'], references=accumulated_predictions['ground_truth'], lang='en', **bert_score_kwargs)
int8_scores = bert_score_metric.compute(predictions=accumulated_predictions['int8'], references=accumulated_predictions['ground_truth'], lang='en', **bert_score_kwargs)
int4_scores = bert_score_metric.compute(predictions=accumulated_predictions['int4'], references=accumulated_predictions['ground_truth'], lang='en', **bert_score_kwargs)

# Mean F1 over all rows, per precision.
avg_bfloat16_scores = sum(bfloat16_scores['f1']) / len(bfloat16_scores['f1'])
avg_int8_scores = sum(int8_scores['f1']) / len(int8_scores['f1'])
avg_int4_scores = sum(int4_scores['f1']) / len(int4_scores['f1'])

print(avg_bfloat16_scores)
print(avg_int8_scores)
print(avg_int4_scores)

In [None]:
# Score 'harry truman' against every normalized alias of the first question.
# The prediction is repeated once per alias, so predictions and references
# always have the same length whatever the number of aliases is.
first_question_aliases = triviqa_dataset[0]['answer']['normalized_aliases']
scores = bert_score_metric.compute(predictions=['harry truman'] * len(first_question_aliases), references=first_question_aliases, lang='en')

In [None]:
# Score 'harry truman' against a single reference made of all normalized
# aliases of the first question joined by spaces.
bert_score_metric.compute(predictions=['harry truman'], references=[' '.join(triviqa_dataset[0]['answer']['normalized_aliases'])], lang='en')