In [1]:
import json
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_from_disk
from torch.utils.data import DataLoader
from peft import PeftModel
import evaluate
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [None]:
# One-time setup, kept for reference: download the eval split and cache it on disk.
#dataset = load_dataset("tatsu-lab/alpaca_eval")  # NOTE: this call does not work
#eval_set = load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval")["eval"]
#eval_set.save_to_disk("./AlpacaEval")

In [3]:
# Read the dataset cached on disk and keep only its first 500 rows.
data_path = "./AlpacaEval"
eval_set = load_from_disk(data_path).select(range(500))

In [4]:
# Preview the first three rows as a sanity check on the loaded data.
eval_set[:3]

{'instruction': ['What are the names of some famous actors that started their careers on Broadway?',
  'How did US states get their names?',
  "Hi, my sister and her girlfriends want me to play kickball with them. Can you explain how the game is played, so they don't take advantage of me?"],
 'output': ['Some famous actors that started their careers on Broadway include: \n1. Hugh Jackman \n2. Meryl Streep \n3. Denzel Washington \n4. Julia Roberts \n5. Christopher Walken \n6. Anthony Rapp \n7. Audra McDonald \n8. Nathan Lane \n9. Sarah Jessica Parker \n10. Lin-Manuel Miranda',
  'US states get their names from a variety of sources, including Native American tribes, Spanish explorers, British colonists, and even presidents. For example, the state of Alabama was named after the Native American tribe that lived in the area, while the state of Florida gets its name from the Spanish explorer, Ponce de Leon, who explored the area in the 1500s. Other states are named after English kings (like 

### Load models

In [5]:
#tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
# Load the tokenizer from a local directory rather than the Hub id in the
# commented-out line above.
bloom_path = "./Bloom1b1"
tokenizer = AutoTokenizer.from_pretrained(bloom_path)
# Bare expression so the notebook displays the tokenizer's configuration.
tokenizer

BloomTokenizerFast(name_or_path='./Bloom1b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
#base_model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b1", low_cpu_mem_usage=True)
# Load the causal LM from the same local directory as the tokenizer (bloom_path).
base_model = AutoModelForCausalLM.from_pretrained(bloom_path, low_cpu_mem_usage=True)

In [7]:
# Name of the PEFT method under evaluation; it selects the checkpoint directory.
method = "p_tuning"
# Forward slashes are portable across operating systems; backslash separators
# in this f-string would form invalid escape sequences and only resolve on Windows.
model_path = f"./{method}/chatbot/checkpoint-10000"
# Wrap the base model with the adapter stored in the checkpoint directory.
ft_model = PeftModel.from_pretrained(model=base_model, model_id=model_path)


### Evaluation

In [33]:
# Load the three metrics used to compare predictions with the reference answers.
eval_bleu = evaluate.load("bleu")
eval_rouge = evaluate.load("rouge")
eval_bertscore = evaluate.load("bertscore")

In [9]:
def generate_model_output(model, instruction):
    """Generate one sampled reply from `model` for a single instruction.

    The instruction is placed in the Human turn of the chat prompt, tokenized
    with the notebook-level `tokenizer`, and passed to `model.generate` with
    sampling enabled and at most 256 new tokens.

    Args:
        model: Object exposing `.device` and `.generate(...)`.
        instruction: Instruction text for the Human turn.

    Returns:
        str: Decoded text of the newly generated tokens only; the prompt is
        excluded and special tokens are skipped.
    """
    # Prompt template: Human turn, blank line, then the Assistant cue.
    input_string = "Human: {}\n{}".format(instruction, "").strip() + "\n\nAssistant: "
    ipt = tokenizer(input_string, return_tensors="pt").to(model.device)
    prompt_len = ipt["input_ids"].shape[1]

    with torch.no_grad():
        otpt = model.generate(**ipt, max_new_tokens=256, do_sample=True)[0]

    # Drop the prompt by token position instead of by string replacement:
    # str.replace leaves the prompt in place when decoding does not reproduce
    # it exactly, and removes any later repetition of it from the reply.
    # Assumes generate() returns the prompt tokens followed by the new tokens,
    # as causal language models do.
    return tokenizer.decode(otpt[prompt_len:], skip_special_tokens=True)

In [10]:
# Move the base model to the GPU and switch it to eval mode.
base_model.cuda()
base_model.eval()

# One sampled completion per evaluation example, in dataset order.
with torch.no_grad():
    pred_base = [
        generate_model_output(model=base_model, instruction=example['instruction'])
        for example in eval_set
    ]

In [12]:
# Move the adapter-wrapped model to the GPU and switch it to eval mode.
ft_model.cuda()
ft_model.eval()

# One sampled completion per evaluation example, in dataset order.
with torch.no_grad():
    pred_ft = [
        generate_model_output(model=ft_model, instruction=example['instruction'])
        for example in eval_set
    ]

In [13]:
# Reference answers, in the same order as the predictions.
ref = [example['output'] for example in eval_set]

In [41]:
# Score the base-model predictions against the reference answers.
base_bleu = eval_bleu.compute(predictions=pred_base, references=ref)
base_rouge = eval_rouge.compute(predictions=pred_base, references=ref)

# Keep only the 'precision' entry of the BERTScore result; it is averaged below.
base_bert = eval_bertscore.compute(
    predictions=pred_base, references=ref, lang="en"
)['precision']

print(
    base_bleu,
    base_rouge,
    "Average bertscore precision: ",
    sum(base_bert) / len(base_bert),
    sep="\n",
)

{'bleu': 0.010969173044895468, 'precisions': [0.1449696279422931, 0.019511729925615107, 0.003967037178995784, 0.0012901983439245139], 'brevity_penalty': 1.0, 'length_ratio': 1.9976110573915022, 'translation_length': 105360, 'reference_length': 52743}
{'rouge1': 0.16237897969571732, 'rouge2': 0.02475328223523595, 'rougeL': 0.09984475869642855, 'rougeLsum': 0.13522026779994906}
Average bertscore precision: 
0.7960452679395675


In [38]:
# Score the fine-tuned model's predictions against the reference answers.
ft_bleu = eval_bleu.compute(predictions=pred_ft, references=ref)
ft_rouge = eval_rouge.compute(predictions=pred_ft, references=ref)

# Keep only the 'precision' entry of the BERTScore result; it is averaged below.
ft_bert = eval_bertscore.compute(
    predictions=pred_ft, references=ref, lang="en"
)['precision']

print(
    ft_bleu,
    ft_rouge,
    "Average bertscore precision: ",
    sum(ft_bert) / len(ft_bert),
    sep="\n",
)



{'bleu': 0.031113092632829267, 'precisions': [0.2223935550870126, 0.04792565338746511, 0.01432225063938619, 0.006138626623272001], 'brevity_penalty': 1.0, 'length_ratio': 1.2049750677815065, 'translation_length': 63554, 'reference_length': 52743}
{'rouge1': 0.2229655337233108, 'rouge2': 0.051207244107131546, 'rougeL': 0.14802214894464744, 'rougeLsum': 0.17962834147424978}
Average bertscore precision: 
0.8341480239629745




In [39]:
# Save result
# Write the fine-tuned model's metrics as JSON into the directory named after
# the PEFT method.
Eval_result = {
    "bleu": ft_bleu,
    "rouge": ft_rouge,
    # Mean of the BERTScore precision values.
    "bertscore": sum(ft_bert)/len(ft_bert),
}

directory = f"./{method}/"
json_filename = "eval_result.json"

# exist_ok=True creates the directory only when it is missing, with no separate
# existence check that could race with another process.
os.makedirs(directory, exist_ok=True)

json_file_path = os.path.join(directory, json_filename)

with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(Eval_result, f, ensure_ascii=False, indent=4)
