In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import LogitsProcessor
from typing import Iterable
import envs
import pandas as pd
import string
from leaderboard import SummaryGenerator, EvaluationModel, run_eval

# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
# Alternative checkpoint kept for reference (presumably the base model -- TODO confirm):
# MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
MODEL_NAME = "mistral_dpo_5k"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer and the causal LM from the same checkpoint (MODEL_NAME).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",                        # automatic device placement
    torch_dtype="auto",                       # dtype chosen automatically rather than forced here
    attn_implementation="flash_attention_2",  # request the FlashAttention-2 attention backend
)

2024-06-13 22:31:03,002 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.34it/s]


In [4]:
# Override the tokenizer's chat template with the Jinja template stored next to the notebook.
# The context manager closes the file handle deterministically, and the explicit encoding
# keeps the result independent of the machine's locale.
with open("mistral_template.jinja", "r", encoding="utf-8") as template_file:
    tokenizer.chat_template = template_file.read()

In [5]:
class WhiteListLogitsProcessor(LogitsProcessor):
    """
    A simple LogitsProcessor constraining the generation to a "white list", i.e. a set of allowed tokens.

    Every token id that is NOT in the white list gets a score of ``-inf`` at each
    generation step, so it can never be selected.

    Args:
        white_list_word_ids: ids of the tokens that may be generated. Any iterable of
            ints is accepted (list, set, generator, 1-D tensor); it is materialised once
            into a sorted list of unique ids, so duplicates are harmless.
    """
    def __init__(self, white_list_word_ids: Iterable[int]):
        # Materialise the iterable: a generator could only be consumed once and a set
        # cannot be used as a tensor index.
        self.white_list = sorted({int(token_id) for token_id in white_list_word_ids})
        # Lazily built boolean mask of shape (vocab_size,); True marks a forbidden token.
        self.mask = None

    def __call__(self, input_ids, scores):
        """
        Called at each generation step with the partially generated sequences
        (``input_ids``) and the scores of the next possible tokens (``scores``).

        Args:
            input_ids: ids generated so far; unused, the constraint is position-independent.
            scores: next-token scores whose last dimension is the vocabulary.

        Returns:
            A tensor shaped like ``scores`` where every non-white-listed token is ``-inf``.
        """
        vocab_size = scores.shape[-1]
        # The mask is 1-D over the vocabulary and broadcast across rows, so the same
        # processor keeps working when the batch size / number of beams changes between
        # calls. It is rebuilt only if the vocabulary size or the device changes.
        if (self.mask is None
                or self.mask.shape[-1] != vocab_size
                or self.mask.device != scores.device):
            mask = torch.ones(vocab_size, dtype=torch.bool, device=scores.device)
            allowed = torch.tensor(self.white_list, dtype=torch.long, device=scores.device)
            # clear the flag for allowed tokens
            mask[allowed] = False
            self.mask = mask
        return scores.masked_fill(self.mask, -float("inf"))

In [6]:
# good_words = "only these words can occur in the generated text"
# good_word_ids = tokenizer.encode(good_words)
# white_list_processor = WhiteListLogitsProcessor(good_word_ids)
# input_seq = "here are the input words to condition generated text upon"
# input_ids = tokenizer.encode(input_seq, return_tensors='pt').to("cuda")
# out = model.generate(input_ids, do_sample=False, logits_processor=[white_list_processor])
# print(tokenizer.batch_decode(out))

In [7]:
def gen_func(source, debug=False, max_new_tokens=512):
    """
    Generate a summary of ``source`` with the loaded chat model.

    The passage is inserted into ``envs.USER_PROMPT`` and sent together with
    ``envs.SYSTEM_PROMPT`` through the tokenizer's chat template; decoding runs
    with ``do_sample=False``.

    Args:
        source: passage text to summarise.
        debug: when True, also print the full decoded sequence (prompt + completion,
            special tokens included).
        max_new_tokens: upper bound on the number of generated tokens (default 512,
            the value previously hard-coded).

    Returns:
        The decoded completion only (prompt tokens stripped, special tokens skipped).
    """
    messages = [{"role": "system", "content": envs.SYSTEM_PROMPT},
                {"role": "user", "content": envs.USER_PROMPT.format(passage=source)}]
    # Optional constrained decoding (disabled):
    # good_word_ids = tokenizer.encode(source + "\n" + "\n".join(string.printable) + "\n" + tokenizer.eos_token)
    # white_list_processor = WhiteListLogitsProcessor(good_word_ids)
    # Send the prompt to wherever the model's weights live instead of a hard-coded "cuda".
    input_ids = tokenizer.apply_chat_template(messages,
                                              add_generation_prompt=True,
                                              return_tensors="pt").to(model.device)
    prompt_len = input_ids.shape[-1]
    out = model.generate(input_ids,
                         # Single unpadded prompt: every position is a real token. Passing the
                         # mask explicitly is needed because pad_token_id == eos_token_id, so
                         # it cannot be inferred from the ids.
                         attention_mask=torch.ones_like(input_ids),
                         do_sample=False,
                        #  logits_processor=[white_list_processor],
                         max_new_tokens=max_new_tokens,
                        #  num_beams=10,
                         pad_token_id=tokenizer.eos_token_id)
    # Keep only the tokens produced after the prompt.
    text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    if debug:
        print(tokenizer.decode(out[0], skip_special_tokens=False))
    return text

In [8]:
# Smoke test: summarise one short passage and print the full prompt + completion.
text = gen_func(
    "The first vaccine for Ebola was approved by the FDA in 2019 in the US, five years after the initial outbreak in 2014. "
    "To produce the vaccine, scientists had to sequence the DNA of Ebola, then identify possible vaccines, and finally show successful clinical trials. "
    "Scientists say a vaccine for COVID-19 is unlikely to be ready this year, although clinical trials have already started.",
    debug=True,
)

<s> [INST] You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided. 

You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': 
Passage:
The first vaccine for Ebola was approved by the FDA in 2019 in the US, five years after the initial outbreak in 2014. To produce the vaccine, scientists had to sequence the DNA of Ebola, then identify possible vaccines, and finally show successful clinical trials. Scientists say a vaccine for COVID-19 is unlikely to be ready this year, although clinical trials have already started. [/INST] The first Ebola vaccine was approved by the FDA in the US in 2019, five years after the initial outbreak in 2014. Scientists had to sequence the DNA of Ebola, identify possible vaccines, and conduct successful clinical trials to produce the vaccine. A vaccine for COVID-19 is unlikely to be ready this year, although c

In [9]:
# Instantiate the summary-generation helper imported from `leaderboard`.
summ = SummaryGenerator()

In [10]:
# Run `gen_func` over the leaderboard dataset and save the result for the evaluation step.
# NOTE: the recorded run took about 1h19m for 1006 items, so avoid re-running it needlessly.
df = summ.generate_summaries(pd.read_csv("leaderboard_dataset.csv"), gen_func)
df.to_csv("generated.csv", index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1006/1006 [1:19:11<00:00,  4.72s/it]


In [11]:
# Evaluate the saved summaries; the recorded output reports average length, answer rate
# and consistent rate.
run_eval("generated.csv")

  return self.fget.__get__(instance, owner)()
2024-06-13 23:56:42,701 - INFO - Use pytorch device: cuda
Evaluating hallucinations: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1006/1006 [00:14<00:00, 67.74it/s]

Average Length 96.98508946322067
Answer Rate 1.0
Consistent Rate 95.82504970178927



