# Base model evaluation


## Overview
Evaluation is a crucial step in the Knowledge Tuning workflow. It allows you to measure the performance and generalization ability of your model. In this notebook, you systematically evaluate the base model by using appropriate metrics and validation datasets before you fine-tune the model on the example Bank of Montreal (BMO) data.

Throughout this notebook, you use visualizations and quantitative metrics to analyze the performance of the base model.

## Install dependencies

In [None]:
!pip install -qqU .

## Set up paths and directories

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Load variables through python-dotenv before any os.getenv() lookup below.
load_dotenv()

# Model identifier; can be overridden with the STUDENT_MODEL_NAME variable.
MODEL_NAME = os.getenv("STUDENT_MODEL_NAME", "RedHatAI/Llama-3.1-8B-Instruct")

# The workspace is the parent of the current working directory.
WORKSPACE = Path.cwd().parent

# All artifacts go under <workspace>/output; create it if it does not exist.
OUTPUT_DIR = WORKSPACE / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# "/" in the model name is replaced so the model maps to one directory name.
MODEL_PATH = OUTPUT_DIR / "base_model" / MODEL_NAME.replace("/", "__")


print(f"Model name : {MODEL_NAME}")

## Save the model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import shutil

# Save the model locally, for easier access in the following steps.
if not MODEL_PATH.exists():
    print("Model not available locally, Downloading the model locally ")

    # Write into a sibling staging directory and rename it only after both the
    # model and the tokenizer were saved. An interrupted run then never leaves
    # a partial MODEL_PATH that the exists() check above would treat as complete.
    staging_path = MODEL_PATH.with_name(MODEL_PATH.name + ".partial")
    shutil.rmtree(staging_path, ignore_errors=True)  # drop leftovers of a failed run

    # Save the model
    print(f"Loading model {MODEL_NAME}")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model.save_pretrained(staging_path)
    del model  # release the weights before anything else is loaded

    # Save the tokenizer
    print(f"Loading tokenizer {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(staging_path)
    del tokenizer

    # Publish the completed directory under its final name.
    staging_path.rename(MODEL_PATH)
    print(f"Model saved to {MODEL_PATH}")
    print(f"Tokenizer saved to {MODEL_PATH}")
else:
    print(f"Model Available locally : {MODEL_PATH}")

## Load the model

In [None]:
# Load the locally saved base model as float16 on the first CUDA device, and
# the tokenizer from the same directory. Both names are used as globals by
# run_experimentation().
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, dtype=torch.float16, device_map="cuda:0"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print("✅ Successfully loaded the model")

## LLM sampling parameters

Use the following parameters to test the base model:

In [None]:
################################################################################
# 🎯 Sampling/Generation Parameters                                            #
################################################################################
# Passed to model.generate() as max_new_tokens.
MAX_NEW_TOKENS = 256
# Passed to model.generate() as do_sample.
DO_SAMPLE = True
# NOTE(review): described as Meta's recommended temperature for Llama; confirm
# whether 0.7 or 0.6 is the intended value. The summary printed below is
# derived from this constant so the two can never disagree.
TEMPERATURE = 0.7
TOP_P = 0.9  # Standard top_p for Llama models

print(f"MAX_NEW_TOKENS: {MAX_NEW_TOKENS}")
print(f"DO_SAMPLE: {DO_SAMPLE}")
print(f"TEMPERATURE: {TEMPERATURE}")
print(f"TOP_P: {TOP_P}")
print("✅ LLM sampling parameters defined")
print()
print("📊 Using Meta's recommended Llama sampling settings:")
print(f"  • Temperature {TEMPERATURE} for balanced creativity/consistency")
print(f"  • Top-p {TOP_P} for good token diversity")
print("  • Stop on both EOS and <|eot_id|> tokens")

## Utility functions

In [None]:
from rich import print as pprint


def prompt_runner(model, tokenizer, prompt):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Text passed to the tokenizer as-is.

    Returns:
        The decoded continuation, without the prompt tokens and with special
        tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; used below to slice the echoed prompt away.
    prompt_length = encoded.input_ids.shape[1]

    generation_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "temperature": TEMPERATURE,
        "do_sample": DO_SAMPLE,
        "top_p": TOP_P,
    }

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


def run_experimentation(prompt):
    """Run ``prompt`` through the base model and pretty-print the outcome.

    Uses the module-level ``model`` and ``tokenizer`` and prints the sampling
    settings, the prompt, and the model response with Rich markup.

    Args:
        prompt: Text sent to the model.

    Returns:
        None. The report is printed, not returned.
    """
    # Local import: rich is already a dependency of this notebook (see pprint).
    from rich.markup import escape

    # Run prompt in base model
    base_model_response = prompt_runner(model=model, tokenizer=tokenizer, prompt=prompt)

    # Print the response of the model. The prompt and the response are free
    # text, so they are escaped: otherwise bracketed text such as "[note]" or
    # "[/INST]" would be parsed as Rich markup and be dropped or raise.
    pprint(f"""
    [bold]EXPERIMENTATION DETAILS[/bold]:
        MODEL NAME     : {MODEL_NAME}
        MAX NEW TOKENS : {MAX_NEW_TOKENS}
        DO SAMPLE      : {DO_SAMPLE}
        TEMPERATURE    : {TEMPERATURE}
        TOP P          : {TOP_P}


    [bold]PROMPT 💬[/bold]:

        [green]{escape(prompt)}[/green]

    [bold]MODEL RESPONSE 🤖[/bold]:

        {escape(base_model_response)}

    """)

## Test 1

Test the knowledge of the base model on BMO data:

Question: `what is the meaning of verifying the identity of a person or an entity`

In [None]:
# Test 1 question, sent to the base model as the prompt without modification.
prompt = "what is the meaning of verifying the identity of a person or an entity"

run_experimentation(prompt)