In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import LogitsProcessor
from typing import Iterable
import envs
import pandas as pd
import string
from leaderboard import SummaryGenerator, EvaluationModel

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.1')
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.1',
                                             device_map="auto",
                                             torch_dtype="auto")

2024-05-07 20:30:53,288 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


In [3]:
tokenizer.chat_template = open("mistral_template.jinja", "r").read()

In [4]:
class WhiteListLogitsProcessor(LogitsProcessor):
    """
    A simple LogitsProcessor constraining the generation to "white list", i.e. a set of allowed tokens.     
    """
    def __init__(self, white_list_word_ids: Iterable[int]):
        self.white_list = white_list_word_ids 
        self.mask = None 
            
    def __call__(self, input_ids, scores):
        """
        This method will be called during each step of the beam search algorithm. 
        The method takes as input the input_ids sequence of the partially generated beam and the scores of the next possible tokens.
        By manipulating these scores based on the tokens present in the input_ids, we can control the structure of the generated sentence.
        """
        if self.mask is None:
            self.mask = torch.ones(scores.shape).to(scores.device)
            # put zeros in allowed tokens
            self.mask[:, self.white_list] = 0
            self.mask = self.mask.bool()
        scores = scores.masked_fill(self.mask, -float("inf"))
        return scores

In [5]:
good_words = "only these words can occur in the generated text"
good_word_ids = tokenizer.encode(good_words)
white_list_processor = WhiteListLogitsProcessor(good_word_ids)
input_seq = "here are the input words to condition generated text upon"
input_ids = tokenizer.encode(input_seq, return_tensors='pt').to("cuda")
out = model.generate(input_ids, do_sample=False, logits_processor=[white_list_processor])
print(tokenizer.batch_decode(out))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['<s> here are the input words to condition generated text upon in the text generated in the text generated in']


In [6]:
def gen_func(source):
    messages = [{"role": "system", "content": envs.SYSTEM_PROMPT},
                {"role": "user", "content": envs.USER_PROMPT.format(passage=source)}]
    good_word_ids = tokenizer.encode(source + "\n" + "\n".join(string.printable) + "\n" + tokenizer.eos_token)
    white_list_processor = WhiteListLogitsProcessor(good_word_ids)
    input_ids = tokenizer.apply_chat_template(messages, 
                                              add_generation_prompt=True, 
                                              return_tensors="pt").to("cuda")
    out = model.generate(input_ids, 
                        #  do_sample=False, 
                        #  logits_processor=[white_list_processor],
                         max_new_tokens=512,
                         num_beams=10,
                         pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(out[0][len(input_ids[0]):], skip_special_tokens=True)
    return text

In [7]:
text = gen_func("The first vaccine for Ebola was approved by the FDA in 2019 in the US, five years after the initial outbreak in 2014. To produce the vaccine, scientists had to sequence the DNA of Ebola, then identify possible vaccines, and finally show successful clinical trials. Scientists say a vaccine for COVID-19 is unlikely to be ready this year, although clinical trials have already started.")
print(text)

1. The first Ebola vaccine was approved by the FDA in 2019 in the US, five years after the initial outbreak in 2014.
2. To produce the vaccine, scientists had to sequence the DNA of Ebola, then identify possible vaccines, and finally show successful clinical trials.
3. Scientists say a vaccine for COVID-19 is unlikely to be ready this year, although clinical trials have already started.


In [3]:
summ = SummaryGenerator()

In [9]:
df = summ.generate_summaries(pd.read_csv("leaderboard_dataset.csv"), gen_func)
df.to_csv("generated.csv", index=False)

  0%|          | 0/1006 [00:00<?, ?it/s]

 78%|███████▊  | 780/1006 [3:10:07<2:06:34, 33.61s/it]

In [4]:
summ.summaries_df = pd.read_csv("generated.csv")
summ._compute_avg_length()
summ._compute_answer_rate()

In [5]:
hem = EvaluationModel("vectara/hallucination_evaluation_model")

  return self.fget.__get__(instance, owner)()
2024-05-08 20:10:00,007 - INFO - Use pytorch device: cuda


In [6]:
hscore = hem.evaluate_hallucination(summ.summaries_df)
hrate = hem.compute_factual_consistency_rate()

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.12it/s]:00<?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 78.29it/s]:00<07:58,  2.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 95.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 104.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 103.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 96.78it/s]:00<01:19, 12.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 110.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 106.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 101.66it/s]:00<00:46, 21.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 112.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 111.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 96.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 105.20it/s]
Batches: 100%|██████████| 1/1 [00:00

In [7]:
print("Average Length", summ.avg_length)
print("Answer Rate", summ.answer_rate)
print("Consistent Rate", hrate)

Average Length 95.48111332007953
Answer Rate 1.0
Consistent Rate 94.53280318091451


In [None]:
summ.summaries_df.insert(0, "Score", hscore, allow_duplicates=True)
summ.summaries_df.to_csv("hhem_eval.csv", index=False)