In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import LogitsProcessor
from typing import Iterable
import envs
import pandas as pd
import string
from leaderboard import SummaryGenerator, EvaluationModel, run_eval
from tqdm import tqdm

MODEL_NAME = "fava-uw/fava-model"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
                                             device_map="auto",
                                             torch_dtype="auto",
                                             attn_implementation="flash_attention_2")

2024-06-13 17:12:42,939 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|███████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.66s/it]


In [3]:
from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter

def post_process(edited_text):
    soup = BeautifulSoup(edited_text, "html.parser")
    for tag in soup.findAll(["delete", "subjective", "unverifiable", "invented", "contradictory"]):
        tag.decompose()
    clean_text = soup.get_text().strip()
    if "Edited:" in clean_text:
        clean_text = clean_text.rpartition("Edited:")[2].strip()
    return clean_text

In [4]:
def gen_func(source, summary):
    input_text = "Read the following references:\n{evidence}\nPlease identify all the errors in the following text using the information in the references provided and suggest edits if necessary:\n[Text] {output}\n[Edited] "
    input_text = input_text.format(evidence=source, output=summary)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    out = model.generate(input_ids,
                         do_sample=False,
                         max_new_tokens=512,
                         pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(out[0][len(input_ids[0]):], skip_special_tokens=True)
    return text

In [5]:
# Success case
source = "Banff National Park is Canada's oldest national park, established in 1885 as Rocky Mountains Park. Located in Alberta's Rocky Mountains, 110–180 kilometres (68–112 mi) west of Calgary, Banff encompasses 6,641 square kilometres (2,564 sq mi) of mountainous terrain."
summary = "Canada's oldest national park, Banff, was established in 1886. It recently won a Nature's Choice 2023 award for its beautiful mountainous terrain. It's the best national park ever."

# Failure case of unverifiable
# source = "Fist of Legend . It is a remake of the 1972 Fist of Fury , which starred Bruce Lee as the lead character ."
# summary = "Fist of Legend is a remake of the 1972 film Fist of Fury. The original film starred Bruce Lee as the lead character. The remake is a reinterpretation of the original story and characters. The remake was released in 1994 and starred Jet Li as the lead character. The film was directed by Corey Yuen and produced by Golden Harvest. The film was a commercial success and received positive reviews for its action sequences and choreography."
text = gen_func(source, summary)
print(text)




Edited: Canada's oldest national park, Banff, was established in <entity><mark>1885</mark><delete>1886</delete></entity>. <invented><delete>It recently won a Nature's Choice 2023 award for its beautiful mountainous terrain.</delete></invented> <subjective>It's the best national park ever.</subjective>


In [6]:
post_process(text)

"Canada's oldest national park, Banff, was established in 1885."

In [7]:
df = pd.read_csv("generated_greedy.csv")

In [None]:
import util
source_summary_pairs = util.create_pairs(df)
edited = []
for doc, summary in tqdm(source_summary_pairs, desc="Fava Editing"):
    edited_summary = ""
    if util.is_summary_valid(summary):
        edited_summary = gen_func(doc, summary)
    edited.append(edited_summary)

Fava Editing:  37%|█████████████████████▉                                     | 375/1006 [20:06<56:54,  5.41s/it]

In [9]:
df = df.rename(columns={"summary": "original_summary"})
processed_text = [post_process(edited[i]) for i, _ in enumerate(edited)]
df["summary"] = processed_text
df.to_csv("fava.csv", index=False)

  soup = BeautifulSoup(edited_text, "html.parser")


In [10]:
run_eval("fava.csv")

2024-06-13 20:00:10,931 - INFO - Use pytorch device: cuda
Evaluating hallucinations: 100%|█████████████████████████████████████████████| 1006/1006 [00:14<00:00, 68.19it/s]


Average Length 93.27932405566601
Answer Rate 1.0
Consistent Rate 93.7375745526839


In [12]:
for _, row in df.iterrows():
    if row["summary"] != row["original_summary"]:
        print(_)
        print(row)
        input()

11
source              Team-mates Neymar and Dani Alves proved their ...
original_summary    The passage describes the actions of Neymar an...
dataset                                                summeval_valid
summary             The passage describes the actions of Neymar an...
Name: 11, dtype: object


KeyboardInterrupt: Interrupted by user