In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import LogitsProcessor
from typing import Iterable
import envs
import pandas as pd
import string
from leaderboard import SummaryGenerator, EvaluationModel, run_eval, run_eval_TT

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
                                             device_map="auto",
                                             torch_dtype="auto",
                                             attn_implementation="flash_attention_2")

2024-07-25 17:46:55,069 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.20s/it]


In [3]:
tokenizer.chat_template = open("mistral_template.jinja", "r").read()

In [4]:
def gen_func(source, debug=False):
    messages = [{"role": "system", "content": envs.SYSTEM_PROMPT},
                {"role": "user", "content": envs.USER_PROMPT.format(passage=source)}]
    input_ids = tokenizer.apply_chat_template(messages, 
                                              add_generation_prompt=True, 
                                              return_tensors="pt").to("cuda")
    out = model.generate(input_ids, 
                         do_sample=False, 
                         max_new_tokens=512,
                         pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(out[0][len(input_ids[0]):], skip_special_tokens=True)
    if debug:
        print(tokenizer.decode(out[0], skip_special_tokens=False))
    return text

In [5]:
text = gen_func("The first vaccine for [CONDITION_1] was approved by the [ORGANIZATION_1] in [DATE_INTERVAL_1] in [LOCATION_COUNTRY_1], [DURATION_1] after the initial outbreak in [DATE_INTERVAL_2]. To produce the vaccine, scientists had to sequence the DNA of [CONDITION_1], then identify possible vaccines, and finally show successful clinical trials. Scientists say a vaccine for [CONDITION_2] is unlikely to be ready this year, although clinical trials have already started.", debug=True)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<s>[INST] You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided. 

You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': 
Passage:
The first vaccine for [CONDITION_1] was approved by the [ORGANIZATION_1] in [DATE_INTERVAL_1] in [LOCATION_COUNTRY_1], [DURATION_1] after the initial outbreak in [DATE_INTERVAL_2]. To produce the vaccine, scientists had to sequence the DNA of [CONDITION_1], then identify possible vaccines, and finally show successful clinical trials. Scientists say a vaccine for [CONDITION_2] is unlikely to be ready this year, although clinical trials have already started. [/INST] The first vaccine for [CONDITION_1] was approved by [ORGANIZATION_1] in [LOCATION_COUNTRY_1] in [DATE_INTERVAL_1] after the initial outbreak in [DATE_INTERVAL_2]. The vaccine was produced by sequencing the DNA of [CONDITION_1], identifying pos

In [6]:
summ = SummaryGenerator()

In [7]:
df = summ.generate_summaries(pd.read_csv("anonymized_leaderboard.csv"), gen_func)
df.to_csv("generated.csv", index=False)

 47%|████▋     | 477/1006 [29:51<56:02,  6.36s/it]  

# De-anonymize

In [6]:
gen_sum = pd.read_csv("generated.csv")
ano_src = pd.read_csv("anonymized_leaderboard.csv")

In [10]:
import json

In [14]:
dea_sum = []
for ano_sum, ent_info in zip(gen_sum["summary"], ano_src["entities"]):
    ent_info = json.loads(ent_info)
    for entity in ent_info:
        ano_sum = ano_sum.replace(f"[{entity['processed_text']}]", entity["text"])
    dea_sum.append(ano_sum)

In [21]:
ldb = pd.read_csv("leaderboard_dataset.csv")
ldb["summary"] = dea_sum
ldb = ldb.rename(columns={"text":"source"})
ldb.to_csv("generated_deanonymized.csv", index=False)

In [None]:
run_eval_TT("generated.csv")