# Evaluating the model

To evaluate how good a model is, we need to think in percentages, for example:
- "On average, the model responds appropriately to questions"

In [1]:
import myllm.gpt as gpt
import myllm.util
import myllm.data as data
import json
import torch
import tiktoken

In [4]:
# Model setup: tokenizer, architecture, fine-tuned weights and dataset splits.

# Everything in this notebook runs on the CPU.
device = torch.device("cpu")
tokenizer = tiktoken.get_encoding('gpt2')

# Start from the "gpt2-medium" configuration and turn on the qkv_bias option.
gpt_config = myllm.util.gpt_config("gpt2-medium")
gpt_config.update({'qkv_bias': True})
model = gpt.GPTModel(gpt_config)

# Load the instruction-fine-tuned weights into the freshly built model.
model_state_dict = torch.load("model_instruction.pth", map_location=device)
model.load_state_dict(model_state_dict)

# Keep the model on the same device the inputs are moved to, and switch to
# evaluation mode so training-only layers (e.g. dropout), if any, do not
# perturb the generated answers.
model.to(device)
model.eval()

# Load the dataset; only the test split is used for evaluation below.
train_data, val_data, test_data = data.split_instruction_data("data/instruction-data.json")



In [5]:
# Iterate over the entire test set and store the model's answer on every entry
# under the "model_response" key.
from tqdm import tqdm

# Inference only: disable autograd tracking so no gradient bookkeeping is done
# during this long CPU run.
with torch.no_grad():
    for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
        input_text = data.format_instruction_input(entry)

        token_ids = model.generate(
            idx=myllm.util.text_to_token_ids(input_text, tokenizer).to(device),
            max_new_tokens=256,
            context_size=gpt_config["context_length"],
            eos_id=data.PAD_TOKEN_ID,
        ).to(device)

        generated_text = myllm.util.token_ids_to_text(token_ids, tokenizer)

        # The decoded text starts with the prompt; keep only what follows it and
        # drop the response header so just the answer remains.
        # NOTE(review): the saved output of this notebook shows an empty
        # "model_response" for the first entry — verify that generation does not
        # stop immediately on eos_id.
        response_text = (
            generated_text[len(input_text):]
            .replace("### Response:", "")
            .strip()
        )

        test_data[i]["model_response"] = response_text



100%|██████████| 110/110 [2:06:14<00:00, 68.86s/it] 


TypeError: JSONEncoder.__init__() got an unexpected keyword argument 'ident'

In [8]:

# Quick sanity check of the first annotated entry.
print(test_data[0])

# Persist the test split together with the generated responses.
# This goes to its own file: writing back to "data/instruction-data.json" would
# replace the full source dataset (which the setup cell loads and splits) with
# only the test subset, breaking any later re-run of the notebook.
with open("data/instruction-data-with-response.json", "w", encoding="utf-8") as file:
    json.dump(test_data, file, indent=4)


{'instruction': 'Rewrite the sentence using a simile.', 'input': 'The car is very fast.', 'output': 'The car is as fast as lightning.', 'model_response': ''}


In [10]:
# Save the model weights under a name derived from the model-size label
import re

# Remove spaces and parentheses from the label before using it in a file name.
size_label = re.sub(r'[ ()]', '', '355M')
file_name = f"{size_label}-sft.pth"

weights = model.state_dict()
torch.save(weights, file_name)
