## DeepEval

### Imports and logging in

In [1]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from deepeval.models.base_model import DeepEvalBaseLLM
import torch

load_dotenv()
login(os.getenv("HUGGINGFACE_API_KEY"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\filip\.cache\huggingface\token
Login successful


### Loading Llama-3 8B

In [2]:
checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    # torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Testing loaded model

In [3]:
terminators_test = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

messages_test = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

input_ids_test = tokenizer.apply_chat_template(
    messages_test,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)


outputs_test = model.generate(
    input_ids_test,
    max_new_tokens=256,
    eos_token_id=terminators_test,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response_test = outputs_test[0][input_ids_test.shape[-1]:]
print(tokenizer.decode(response_test, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Arrrr, me hearty! Me name be Captain Chatty, the scurviest pirate chatbot to ever sail the Seven Seas! Me be here to swab yer decks with me wit and me wisdom, so hoist the sails and set course fer a swashbucklin' good time!


In [4]:
messages_test = [
    {"role": "user", "content": "Generate very short JSON file."},
]

input_ids_test = tokenizer.apply_chat_template(
    messages_test,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

outputs_test = model.generate(
    input_ids_test,
    max_new_tokens=256,
    eos_token_id=terminators_test,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response_test = outputs_test[0][input_ids_test.shape[-1]:]
print(tokenizer.decode(response_test, skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Here is a very short JSON file:
```
{
  "name": "John",
  "age": 30
}
```
This JSON file has only two key-value pairs: "name" with value "John", and "age" with value 30.


### Class for DeepEval

In [15]:
class CustomLlama(DeepEvalBaseLLM):
    def __init__(self, init_model, init_tokenizer):
        self.model = init_model
        self.tokenizer = init_tokenizer
        self.terminators = [
            init_tokenizer.eos_token_id,
            init_tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        prompt = [
            {"role": "user", "content": prompt}
        ]
        
        input_ids = tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=self.terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
        return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "Llama-3 8B"

In [16]:
custom_LLM = CustomLlama(init_model=model, init_tokenizer=tokenizer)

In [30]:
print(custom_LLM.generate("Tell a joke."))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Why don't scientists trust atoms?

Because they make up everything!


### DeepEval metrics
#### Hallucination

In [None]:
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Replace this with the actual documents that you are passing as input to your LLM.
context=["A man with blond-hair, and a brown shirt drinking out of a public water fountain."]

# Replace this with the actual output from your LLM application
output="A blond drinking water in public."

test_case = LLMTestCase(
    input="What was the blond doing?",
    actual_output=output,
    context=context
)
metric = HallucinationMetric(threshold=0.5, model=custom_LLM)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
# evaluate([test_case], [metric])

In [None]:
''' Output:
0.0
The score is 0.00 because the actual output aligns with the provided context, indicating no hallucination.
'''

#### Summarization

In [None]:
from deepeval.metrics import SummarizationMetric

# This is the original text to be summarized
input_text = """
The 'coverage score' is calculated as the percentage of assessment questions
for which both the summary and the original document provide a 'yes' answer. This
method ensures that the summary not only includes key information from the original
text but also accurately represents it. A higher coverage score indicates a
more comprehensive and faithful summary, signifying that the summary effectively
encapsulates the crucial points and details from the original content.
"""

# This is the summary, replace this with the actual output from your LLM application
output = """
The coverage score quantifies how well a summary captures and
accurately represents key information from the original text,
with a higher score indicating greater comprehensiveness.
"""


test_case = LLMTestCase(input=input_text, actual_output=output)
metric = SummarizationMetric(
    threshold=0.5,
    model=custom_LLM,
    assessment_questions=[
        "Is the coverage score based on a percentage of 'yes' answers?",
        "Does the score ensure the summary's accuracy with the source?",
        "Does a higher score mean a more comprehensive summary?"
    ]
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

In [ ]:
''' Output:
0.3333333333333333
The score is 0.33 because the summary fails to accurately capture the original text's content, as it introduces new information and contradicts itself. The lack of attention to detail and inconsistencies in the summary lead to a poor summarization score.
'''

#### Answer Relevancy

In [None]:
from deepeval.metrics import AnswerRelevancyMetric

# Replace this with the actual output from your LLM application
output = "If shoes don't fit you can refund them at no extra cost."
input_text = "What if these shoes don't fit?"

metric = AnswerRelevancyMetric(
    threshold=0.5,
    model=custom_LLM,
    include_reason=True
)
test_case = LLMTestCase(
    input=input_text,
    actual_output=output
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

In [ ]:
''' Output:
0.5
The score is 0.50 because the model provided an irrelevant statement about refunds, which distracted from the main concern of the input question, which is about what to do if the shoes don't fit.
'''

#### G-Eval

In [None]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    # evaluation_steps=[
    #     "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
    #     "You should also heavily penalize omission of detail",
    #     "Vague language, or contradicting OPINIONS, are OK"
    # ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=custom_LLM,
)


test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.",
    expected_output="The cat."
)

correctness_metric.measure(test_case)
print(correctness_metric.score)
print(correctness_metric.reason)

In [None]:
''' Output:
0.0
The actual output does not accurately represent the expected output, as it does not provide a direct answer to the question, instead offering a subjective interpretation.
'''