In [1]:
from argparse import Namespace
import string
import json
import numpy as np
import os
import logging
from tqdm import tqdm

# Notebook-wide settings, gathered on a single Namespace so the cells below can
# read them as attributes (args.data_dir, args.cache_dir, ...).
args = Namespace(
    input_path="./data/labeled/test.jsonl",
    model_name="retrieval+llama",
    data_dir="./data/",
    model_dir="~/.cache/huggingface/hub",
    cache_dir=".cache/factscore/",
    openai_key="./data/openaikey.txt",  # path of a key file, not the key itself
    cost_estimate="consider_cache",
    abstain_detection_type=None,
    print_rate_limit_error=True,
    batch_size=256,
)

# ERROR-level logging when rate-limit errors should be shown, CRITICAL-only otherwise.
logging.basicConfig(
    level=logging.ERROR if args.print_rate_limit_error else logging.CRITICAL,
    format='%(asctime)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)

from factscore.abstain_detection import is_response_abstained
from factscore.atomic_facts import AtomicFactGenerator
# from factscore.clm import CLM
# from factscore.npm import NPM
from factscore.openai_lm import OpenAIModel
from factscore.retrieval import DocDB, Retrieval

def register_knowledge_source(args, name="enwiki-20230401", db_path=None, data_path=None):
    """Create the document store and BM25 retriever for one knowledge source.

    Args:
        args: Settings object; ``data_dir``, ``cache_dir``, ``batch_size`` and
            ``model_name`` are read from it.
        name: Knowledge-source identifier. It keys the returned dicts and is
            used to derive the default database, data and cache file names.
        db_path: Path handed to ``DocDB``; defaults to ``<data_dir>/<name>.db``.
        data_path: Path handed to ``DocDB``; defaults to
            ``<data_dir>/<name>.jsonl``.

    Returns:
        A ``(db, retrieval, npm)`` tuple of dicts keyed by ``name``. ``npm``
        stays empty unless ``"npm"`` occurs in ``args.model_name``.
    """
    db, retrieval, npm = {}, {}, {}
    if db_path is None:
        db_path = os.path.join(args.data_dir, f"{name}.db")

    if data_path is None:
        data_path = os.path.join(args.data_dir, f"{name}.jsonl")

    cache_path = os.path.join(args.cache_dir, f"retrieval-{name}.json")
    embed_cache_path = os.path.join(args.cache_dir, f"retrieval-{name}.pkl")

    db[name] = DocDB(db_path=db_path, data_path=data_path)
    retrieval[name] = Retrieval(db[name], cache_path, embed_cache_path,
                                batch_size=args.batch_size, retrieval_type="bm25")

    if "npm" in args.model_name:
        # The notebook's top-level NPM import is commented out, so import it
        # only when this branch actually needs it.
        from factscore.npm import NPM

        npm_cache_path = os.path.join(args.cache_dir, f"bm25-{name}.json")
        npm_embed_cache_path = os.path.join(args.cache_dir, f"bm25-{name}.pkl")
        # This is a plain function: store into the local dict that is returned
        # (the previous `self.npm[name]` raised NameError).
        npm[name] = NPM(Retrieval(db[name], npm_cache_path, npm_embed_cache_path,
                                  retrieval_type="bm25"),
                        "npm-single",
                        cache_file=os.path.join(args.cache_dir, f"npm-{name}.pkl"))
    return db, retrieval, npm

# Register the Wikipedia dump; db_path/data_path keep their None defaults, so
# both file locations are derived from args.data_dir and the source name.
db, retrieval, npm = register_knowledge_source(args, name="enwiki-20230401")

  from .autonotebook import tqdm as notebook_tqdm
2024-04-28 18:46:09,636	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
[nltk_data] Downloading package punkt to /home/yuxiawang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Atomic claims

In [2]:
# Atomic-fact generator, configured with the key file, the demo directory under
# args.data_dir, and a cache file under args.cache_dir.
af_generator = AtomicFactGenerator(
    gpt3_cache_file=os.path.join(args.cache_dir, "ChatGPT.pkl"),
    demon_dir=os.path.join(args.data_dir, "demos"),
    key_path=args.openai_key,
)

# Sample model response to decompose.
generation = "Doug Sheehan is an American actor best known for his role as Ben Galvin in the hit television drama series, Knots Landing. He has also appeared in films such as The Big Easy and The Last Boy Scout, and television shows such as NYPD Blue and Beverly Hills, 90210. He was born in Los Angeles, California on August 24, 1956. Sheehan attended the University of California, Los Angeles, where he received a Bachelor of Arts degree in Theatre Arts. He has been married since 1984 to Lisa Cooper and has two children."

# run() yields a pair; its first element is iterated as two-item entries whose
# second item is a list of facts. Flatten those lists into one list.
curr_afs, para_break = af_generator.run(generation)
curr_afs = [fact for entry in curr_afs for fact in entry[1]]
print(len(curr_afs))

25


### Evaluate each atomic claim

In [3]:
# Verify every atomic fact of one generation against passages retrieved for its topic.
atomic_facts = curr_afs
topic = "Doug Sheehan"
knowledge_source = "enwiki-20230401"
from factscore.clm import LLaMA3
lm = LLaMA3(model_name="meta-llama/Meta-Llama-3-8B-Instruct",
            cache_file=os.path.join(args.cache_dir, "inst-llama3-8B.pkl"))

decisions = []
# Unused in this cell; kept in case other cells still read them.
prompts = []
total_words = 0
for atom in atomic_facts:
    atom = atom.strip()
    # `lm` is created unconditionally above, so the former `if lm:` guard is
    # dropped: with a falsy `lm` it skipped the evaluation but still appended
    # a decision, using an unbound or stale `is_supported`.
    passages = retrieval[knowledge_source].get_passages(topic, atom, k=5)

    # Prompt layout: instruction, retrieved passages (in reversed order), then the claim.
    definition = "Answer the question about {} based on the given context.\n\n".format(topic)
    context = "".join(
        "Title: {}\nText: {}\n\n".format(psg["title"], psg["text"].replace("<s>", "").replace("</s>", ""))
        for psg in reversed(passages)
    )
    definition += context.strip()
    if definition[-1] not in string.punctuation:
        definition += "."
    prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())

    output = lm.generate(prompt)

    # Logits are unavailable, so the verdict is parsed from the generated text.
    generated_answer = output[0].lower()
    has_true = "true" in generated_answer
    has_false = "false" in generated_answer
    if has_true and has_false:
        # Both words occur: supported only if "true" first appears after "false".
        is_supported = generated_answer.index("true") > generated_answer.index("false")
    elif has_true or has_false:
        is_supported = has_true
    else:
        # Neither word occurs: unsupported if any hedging keyword appears as a
        # whole word once punctuation is stripped.
        answer_words = generated_answer.translate(str.maketrans("", "", string.punctuation)).split()
        is_supported = all(keyword not in answer_words
                           for keyword in ["not", "cannot", "unknown", "information"])

    decisions.append({"atom": atom, "is_supported": is_supported})

2024-04-28 15:03:10,018	INFO worker.py:1749 -- Started a local Ray instance.


INFO 04-28 15:03:10 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-28 15:03:13 utils.py:608] Found nccl from library /home/yuxiawang/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 04-28 15:03:13 selector.py:28] Using FlashAttention backend.
[36m(RayWorkerWrapper pid=180096)[0m INFO 04-28 15:03:13 utils.py:608] Found nccl from library /home/yuxiawang/.config/vllm/nccl/cu12/libnccl.so.2.18.1
[36m(RayWorkerWrapper pid=180096)[0m INFO 04-28 15:03:13 selector.py:28] Using FlashAttention backend.
INFO 04-28 15:03:14 pynccl_utils.py:43] vLLM is using nccl==2.18.1
[36m(RayWorkerWrapper pid=180096)[0m INFO 04-28 15:03:14 pynccl_utils.py:43] vLLM is using nccl==2.18.1
INFO 04-28 15:03:14 utils.py:129] reading GPU P2P access cache from /home/yuxiawang/.config/vllm/gpu_p2p_access_cache_for_0,1.json
[36m(RayWorkerWrapper pid=180096)[0m INFO 04-28 15:03:14 utils.py:129] reading GPU P2P access cache from /home/yuxiawang/.config/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 04-28 15:03:15 weight_utils.py:193] Using model weights format ['*.safetensors']




[36m(RayWorkerWrapper pid=180096)[0m INFO 04-28 15:03:23 model_runner.py:1057] Graph capturing finished in 4 secs.
INFO 04-28 15:03:23 model_runner.py:1057] Graph capturing finished in 4 secs.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  9.69it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.82it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  9.83it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.54it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  9.83it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.74it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.19it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.83it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.24it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.48it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1

### Evaluate on FacTool-QA and FELM-WK

#### Get the Wikipedia title to use as the topic input

In [2]:
import re
import requests
from bs4 import BeautifulSoup
from typing import List, Dict

def search_google_first_wikipedia_title(query: str, num_web_pages: int = 1, timeout: int = 6):
    """Return the title of the first Google search result for ``query``.

    Used to obtain a Wikipedia page title as the topic for a claim or prompt:
    the caller searches for "<prompt> Wikipedia" and the title of the first
    result is taken as the topic.

    Args:
        query: Plain-text search query; it is URL-encoded by ``requests``.
        num_web_pages: Number of results to scan. One request is issued per
            block of 10 results (``start=0, 10, ...``), and scanning stops at
            the first page that yields a title.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The text of the first result heading found, or ``None`` when no
        scanned page contains one (a failure message is printed in that case).
    """
    # Google returns different pages depending on the agent device; use a desktop one.
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {'User-Agent': USER_AGENT}

    # hl sets the Google interface language and lr the preferred result language.
    # Both are set to English; otherwise many translated (Arabic) pages that
    # cannot be opened correctly are returned.
    lang = "en"

    for start in range(0, num_web_pages, 10):
        # `start` is the result offset of a Google results page (page 2 -> start=10).
        # Passing the query through `params` lets requests percent-encode it, so
        # characters such as "&" or "#" can no longer corrupt the URL.
        r = requests.get(
            "https://www.google.com/search",
            params={"q": query, "lr": "lang_{}".format(lang), "hl": lang, "start": start},
            headers=headers,
            timeout=timeout,
        )

        soup = BeautifulSoup(r.text, "html.parser")
        # NOTE(review): this class string is tied to Google's current result
        # markup and presumably breaks when that markup changes; verify.
        first_result = soup.find("h3", class_="LC20lb MBeuO DKV0Md")
        if first_result:
            return first_result.text

    print("Fail to get the first page title.")
    return None


# Example lookup: search for a benchmark prompt plus the word "Wikipedia" and
# print whatever the helper returns.
# NOTE(review): "Wikepedia" is misspelled in the query string; the recorded
# output is still a Wikipedia article title, but consider correcting it.
query = "Which country or city has the maximum number of nuclear power plants? Wikepedia"
a = search_google_first_wikipedia_title(query)
print(a)

Nuclear power by country


#### Load data

In [3]:
import pandas as pd
# Load the benchmark, one JSON record per line.
# NOTE(review): this path is relative to the parent directory ("../data"),
# whereas args.data_dir and the topics file use "./data" — confirm the intended
# working directory.
df = pd.read_json("../data/Factbench.jsonl", lines = True)

In [5]:
# Show rows 49 and 50 by position, where the recorded output switches from
# source 'factool-qa' to 'felm-wk'.
df.iloc[49:51]

Unnamed: 0,prompt,response,response_label,claims,claim_labels,ability_to_test,source,hallucination_spans
49,What did SOS originally stand for?,"SOS originally stood for ""Save Our Souls"" or ""...",False,"[SOS originally stood for ""Save Our Souls"", SO...","[False, False, True, True, False, True, True]",knowledge,factool-qa,
50,Which country or city has the maximum number o...,The United States has the highest number of nu...,False,[The United States has the highest number of n...,"[False, True]",knowledge,felm-wk,


#### Load the model and evaluate

In [6]:
# Key of the retriever (in `retrieval`) that supplies the evidence passages.
knowledge_source = "enwiki-20230401"

# Verifier model, given a cache file under args.cache_dir.
from factscore.clm import LLaMA3
lm = LLaMA3(
    cache_file=os.path.join(args.cache_dir, "inst-llama3-8B.pkl"),
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
)

In [7]:
def read_txt(input_file):
    """Return the lines of a UTF-8 text file as a list, line endings included."""
    with open(input_file, encoding="utf-8") as handle:
        return list(handle)

def save_txt(data, output_file):
    """Write the strings in ``data`` to a UTF-8 file, one per line.

    Items are separated by a newline; no newline follows the last item.
    """
    with open(output_file, mode="w", encoding="utf-8") as sink:
        joined = "\n".join(data)
        sink.write(joined)

# The topics file was presumably produced once by the call below; `factoolqa`
# and `felmwk` are not defined in the visible cells — TODO confirm.
# save_txt(factoolqa + felmwk, "./data/labeled/topics.txt")
# One topic per line. read_txt keeps line endings, so consumers must strip each entry.
topics = read_txt("./data/labeled/topics.txt")
# Split by benchmark source: assumes the first 50 lines belong to 'factool-qa'
# and the rest to 'felm-wk' — TODO confirm against the dataset ordering.
topic_dict = {'factool-qa': topics[:50], 'felm-wk': topics[50:]}

In [8]:
def _build_verification_prompt(topic, atom, passages):
    """Assemble the True/False prompt: instruction, retrieved passages, then the claim."""
    definition = "Answer the question about {} based on the given context.\n\n".format(topic)
    # Passages are written in reversed order.
    context = "".join(
        "Title: {}\nText: {}\n\n".format(psg["title"], psg["text"].replace("<s>", "").replace("</s>", ""))
        for psg in reversed(passages)
    )
    definition += context.strip()
    if definition[-1] not in string.punctuation:
        definition += "."
    return "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip())


def _parse_is_supported(generated_text):
    """Turn the model's free-text answer into a boolean verdict (no logits are used)."""
    generated_answer = generated_text.lower()
    has_true = "true" in generated_answer
    has_false = "false" in generated_answer
    if has_true and has_false:
        # Both words occur: supported only if "true" first appears after "false".
        return generated_answer.index("true") > generated_answer.index("false")
    if has_true or has_false:
        return has_true
    # Neither word occurs: unsupported if any hedging keyword appears as a whole
    # word once punctuation is stripped.
    answer_words = generated_answer.translate(str.maketrans("", "", string.punctuation)).split()
    return all(keyword not in answer_words
               for keyword in ["not", "cannot", "unknown", "information"])


decisions = []
data = []

for source in ['factool-qa', 'felm-wk']:
    df1 = df[df['source'] == source]
    for k, v in df1.iterrows():
        atomic_facts = v['claims']
        # NOTE(review): assumes line k of topics.txt belongs to the row with
        # index label k in df — confirm the two files are aligned.
        topic = topics[k].strip()

        for atom in atomic_facts:
            atom = atom.strip()
            # The former `if lm:` guard is dropped: with a falsy `lm` the records
            # below were still appended, using unbound or stale values of
            # `passages`, `prompt`, `output` and `is_supported`.
            try:
                passages = retrieval[knowledge_source].get_passages(topic, atom, k=5)
            except Exception as err:
                # Presumably the topic is missing from the Wikipedia dump. Only
                # ordinary errors are caught, so Ctrl-C still interrupts the run,
                # and the cause is reported instead of being hidden.
                print("Fail to retrieve related passages! (topic={!r}: {!r})".format(topic, err))
                passages = [{"title": "", "text": ""}]

            prompt = _build_verification_prompt(topic, atom, passages)
            output = lm.generate(prompt)
            is_supported = _parse_is_supported(output[0])

            decisions.append({"atom": atom, "is_supported": is_supported})
            data.append({"source": source, "id": k, "atom": atom, "evidence": passages,
                         "prompt": prompt, "llm_eval_response": output[0], "is_supported": is_supported})
            # Checkpoint after every claim so an interrupted run keeps its results.
            pd.DataFrame(data).to_json("factscore_evaluation.jsonl", lines=True, orient="records")

2024-04-28 18:46:48,827	INFO worker.py:1749 -- Started a local Ray instance.


INFO 04-28 18:46:49 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-28 18:46:51 utils.py:608] Found nccl from library /home/yuxiawang/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 04-28 18:46:51 selector.py:28] Using FlashAttention backend.
[36m(RayWorkerWrapper pid=197910)[0m INFO 04-28 18:46:51 utils.py:608] Found nccl from library /home/yuxiawang/.config/vllm/nccl/cu12/libnccl.so.2.18.1
[36m(RayWorkerWrapper pid=197910)[0m INFO 04-28 18:46:52 selector.py:28] Using FlashAttention backend.
INFO 04-28 18:46:52 pynccl_utils.py:43] vLLM is using nccl==2.18.1
[36m(RayWorkerWrapper pid=197910)[0m INFO 04-28 18:46:52 pynccl_utils.py:43] vLLM is using nccl==2.18.1
INFO 04-28 18:46:52 utils.py:129] reading GPU P2P access cache from /home/yuxiawang/.config/vllm/gpu_p2p_access_cache_for_0,1.json
[36m(RayWorkerWrapper pid=197910)[0m INFO 04-28 18:46:52 utils.py:129] reading GPU P2P access cache from /home/yuxiawang/.config/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 04-28 18:46:53 weight_utils.py:193] Using model weights format ['*.safetensors']


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.09it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.35it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.24s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.01it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.19it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.17s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.19s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.40it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.06it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.61it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.27it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.38it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  8.72it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.80it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.37s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.00it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.00it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.74it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.36it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.59it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.63it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.08s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.26s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.94it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.49it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.81it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.06s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.22it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.41it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.20s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.54s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.48it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.21s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.86it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.05it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.20it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.70it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.23it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.34it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 14.14it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.49it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.14it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.18it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.16s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.47it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.27s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.04it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.25it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.25s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.55s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.00it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.28it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.15it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.52it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.22it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.86it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.12it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.83it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.27s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.27it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.89it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.54it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.10s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.11it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.24it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.25it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.13it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.29s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.75it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.81it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.74it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.46it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 12.18it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.51s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 10.47it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.36s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.81it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.34s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 12.06it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.93it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.46it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 12.12it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.51it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.37it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.25it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.58it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.23s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.61it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.18it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.47it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.03s/it]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.54it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  6.65it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.22it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.86it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.34it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.83it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.29it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.23s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.54it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.40it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.37s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.66it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.47it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.41it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.12s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.23it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.78it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.57it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.47it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.95it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.46it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.63it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.00it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.65it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.88it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.61it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.06it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.72it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.79it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.12s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.55it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.74it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.04s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.65it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.63it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.75it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.38it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.54it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.69it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.60it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.58it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.20it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 12.05it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 10.47it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.71it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.59it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.04it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.88it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.87it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.11s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.39s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.57s/it]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.83it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.48it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.40s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.70it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.27it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.54it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.09it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 10.70it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.81it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.03it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.34it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.71it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.60it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.02it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.35it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.02s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.57it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.27it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.26it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.24it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.83it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.93it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.20it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.18it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.62it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.35s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.46it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 13.43it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.47it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.07s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.81it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.72it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.90it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.65it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.08it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.81it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.66it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  7.27it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.54it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.85it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.46s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.87it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.37s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.84it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.26it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.55it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.43it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.40s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.40it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.06s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.24s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.98it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.23s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.31it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.32it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.55s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.69it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.60it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.24s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.03s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.12s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.08it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.42s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.61it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.92it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.00s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.20it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.14s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.34it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.01it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.12it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.04s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 10.48it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.20s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.42it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.44it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.15it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.20s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.29it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.40it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.40s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.57s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.91it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 12.25it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.86it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 11.77it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.05it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.93it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.85it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.58it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.26it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.28it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.81it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.02it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.22s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.21s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.21it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.51it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.00it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.96it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.25it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.28s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.75it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.11it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.07it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.52s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.92it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.33it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.16it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.95it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.43it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00, 10.38it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.05it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.77it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.49it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.11it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.08it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.33s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.18s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.17it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.77it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.30it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.78it/s]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.55it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.49it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.38it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.33it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.19it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.56it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.60it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.16it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.33it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.01s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.57s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.05it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.06it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.59it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.09s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.62it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.90it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.11s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.32it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.04s/it]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.10it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.47it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.98it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.36it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  3.92it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.41it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.33it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.51it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.08it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.10it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.58s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.06s/it]
Processed prompts: 100%|████

Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.29it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  5.29it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.56it/s]


Fail to retrieve related passages!


Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.59it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.24it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.39it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  2.06it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.60it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.40it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  4.08it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.49it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.69it/s]
Processed prompts: 100%|██████████████████████████| 1/1 [00:01<00:00,  1.20s/it]
Processed prompts: 100%|██████████████████████████| 1/1 [00:00<00:00,  1.25it/s]


### Evaluate results

In [8]:
import random
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, f1_score, recall_score
def eval_classification(y_true, y_pred, average="macro"):
    """Score predictions and return the headline metrics rounded to 3 decimals.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, passed to the scorers alongside ``y_true``.
        average: Forwarded unchanged as the ``average`` argument of
            ``precision_recall_fscore_support``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "F1",
        in that order.
    """
    # Only the first three values (precision, recall, F-score) are reported;
    # the trailing support value is ignored.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average=average)
    raw_scores = (accuracy_score(y_true, y_pred), *prf_scores[:3])
    metric_names = ("accuracy", "precision", "recall", "F1")
    return {name: round(score, 3) for name, score in zip(metric_names, raw_scores)}


def eval_binary_classification(y_true, y_pred, pos_label="yes"):
    """Score binary predictions with respect to one label, rounded to 2 decimals.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, passed to the scorers alongside ``y_true``.
        pos_label: Label treated as the positive class; forwarded to
            ``precision_score``, ``recall_score`` and ``f1_score``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "F1",
        in that order.
    """
    metrics = {"accuracy": round(accuracy_score(y_true, y_pred), 2)}
    # The three label-dependent scorers share one call signature, so run them
    # through a single loop in the order they appear in the result.
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("F1", f1_score),
    )
    for metric_name, scorer in label_scorers:
        metrics[metric_name] = round(scorer(y_true, y_pred, pos_label=pos_label), 2)
    return metrics

In [11]:
# Collect the gold labels of every benchmark source as one flat list per source.
df = pd.read_json("./data/labeled/Factbench.jsonl", lines=True)
gold_labels = {'factcheckgpt': [], 'factool-qa': [], 'felm-wk': [], 'halueval-dolly15k': []}
for source in list(gold_labels):
    source_rows = df[df['source'] == source]
    if source == 'halueval-dolly15k':
        # This source is read from the per-row `response_label` column.
        gold_labels[source] = list(source_rows['response_label'])
    else:
        # The other sources hold a sequence of labels per row in `claim_labels`;
        # concatenate them in row order.
        for row_labels in source_rows['claim_labels']:
            gold_labels[source].extend(row_labels)

# Score the saved verifier decisions against the gold labels.
# NOTE: predictions and gold labels are matched purely by position.
# Alternative prediction file:
# df = pd.read_json("./factscore_evaluation.jsonl", lines=True)
df = pd.read_json("./factscore_evaluation_chatgpt.jsonl", lines=True)
for source in ['factool-qa', 'felm-wk']:
    predicted = df.loc[df['source'] == source, "is_supported"]
    gold = gold_labels[source]
    # Report precision/recall/F1 twice: once with True as the positive label,
    # once with False.
    pos = eval_binary_classification(gold, predicted, pos_label=True)
    neg = eval_binary_classification(gold, predicted, pos_label=False)
    # One '&'-separated row per source (presumably for a LaTeX table).
    print(f"{pos['precision']} & {pos['recall']} & {pos['F1']} & {neg['precision']} & {neg['recall']} & {neg['F1']}")

0.82 & 0.58 & 0.68 & 0.31 & 0.59 & 0.4
0.77 & 0.71 & 0.74 & 0.36 & 0.43 & 0.39


### FactScore with ChatGPT as Verifier

In [1]:
from factscore.call_llms import gpt_easy

  from .autonotebook import tqdm as notebook_tqdm
2024-05-02 16:43:40,000	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [7]:
import pandas as pd
import string
def _parse_is_supported(llm_output):
    """Turn the verifier's free-text reply into a True/False support decision.

    Rules, applied to the lower-cased reply:
      * only "true" occurs  -> True
      * only "false" occurs -> False
      * both occur          -> True iff the first "true" appears after the
                               first "false"
      * neither occurs      -> True unless one of the words "not", "cannot",
                               "unknown" or "information" appears in the
                               reply once punctuation is stripped

    Args:
        llm_output: Reply text returned by the verifier model.

    Returns:
        bool: whether the reply is read as supporting the claim.
    """
    generated_answer = llm_output.lower()
    has_true = "true" in generated_answer
    has_false = "false" in generated_answer
    if has_true and has_false:
        return generated_answer.index("true") > generated_answer.index("false")
    if has_true or has_false:
        return has_true
    # No explicit verdict: fall back to scanning for hedging words. The reply
    # is already lower-cased, and the word list is built once for all keywords.
    words = generated_answer.translate(str.maketrans("", "", string.punctuation)).split()
    return all(keyword not in words for keyword in ["not", "cannot", "unknown", "information"])


# Re-verify every saved FactScore prompt with ChatGPT and record its decision.
decisions, data = [], []
df = pd.read_json("./factscore_evaluation.jsonl", lines=True)
for source in ['factool-qa', 'felm-wk']:
    d1 = df[df['source'] == source]
    print(len(d1))
    # `row_id` rather than `id`, so the builtin `id` is not shadowed in the
    # notebook's global namespace after this cell runs.
    for row_id, prompt in d1['prompt'].items():
        output = gpt_easy(prompt, model="gpt-3.5-turbo-0125", system_role="You are a helpful assistant.")
        # when logits are unavailable, the decision is parsed from the text
        is_supported = _parse_is_supported(output)

        decisions.append(is_supported)
        data.append({"source": source, "id": row_id,
                     "prompt": prompt, "llm_eval_response": output, "is_supported": is_supported})
        # Rewrite the results file after every reply so that an interrupted
        # run still leaves everything collected so far on disk.
        pd.DataFrame(data).to_json("factscore_evaluation_chatgpt.jsonl", lines=True, orient="records")

532


### Generate Topics

In [83]:
# Spot-check passage retrieval on a single claim: only the first claim of the
# first 'factool-qa' row is queried, then both loops stop.
df1 = df[df['source'] == 'factool-qa']
for _, row in df1.iterrows():
    # print(row['prompt'], row['claims'])
    # topic = row['prompt']
    # A fixed topic is used for this check instead of the row's prompt.
    topic = "Nuclear power by country"
    for atom in row['claims']:
        retriever = retrieval[knowledge_source]
        passages = retriever.get_passages(topic, atom, k=5)
        print(atom, passages)
        break  # first claim only
    break  # first row only

The United States has the highest number of nuclear power plants in the world [{'title': 'Nuclear power by country', 'text': "<s>Nuclear power by country Nuclear power plants operate in 32 countries and generate about a tenth of the world's electricity. Most are in Europe, North America, East Asia and South Asia. The United States is the largest producer of nuclear power, while France has the largest share of electricity generated by nuclear power, at about 70%. China has the fastest growing nuclear power programme with 16 new reactors under construction, followed by India, which has 8 under construction. Some countries operated nuclear reactors in the past but have no operating nuclear plants. Among them, Italy closed all of its nuclear stations by 1990 and nuclear power has since been discontinued because of the 1987 referendums. Kazakhstan is planning to reintroduce nuclear power in the future. Belarus began operating one unit of its first nuclear power plant in June 2021 and expect

In [67]:
# Look up a topic for every 'factool-qa' prompt by searching for
# "<prompt> Wikipedia" and keeping the title the search helper returns.
df1 = df[df['source'] == 'factool-qa']
topics = []
for prompt in df1['prompt']:
    query = prompt + " Wikipedia"
    # Results are appended one at a time, so the titles gathered so far stay
    # in `topics` if a lookup raises part-way through.
    topics.append(search_google_first_wikipedia_title(query))
    # Debug: uncomment to echo each query and the title found.
    # print(query)
    # print(topics[-1])
# Drop the "- Wikipedia" marker that some returned titles carry.
topics_new = [title.replace("- Wikipedia", "").strip() for title in topics]

Which country or city has the maximum number of nuclear power plants? Wikipedia
Nuclear power by country
Who is the CEO of Twitter? Wikipedia
Twitter - Wikipedia
Is Jupiter more dense than Saturn? Wikipedia
Saturn
How many sons had eise eisinga in total? Wikipedia
Eise Eisinga - Wikipedia
How many times did Argentina win the FIFA world cup? Wikipedia
FIFA World Cup - Wikipedia
What is the fastest animal with wings and fur? Wikipedia
Peregrine falcon - Wikipedia
How many female US. Supreme Court justices have there been? Wikipedia
List of justices of the Supreme Court of the United States
Where did fortune cookies originate? Wikipedia
Fortune cookie - Wikipedia
What is the most valuable Non-fungible Token (NFT)? Wikipedia
Non-fungible token - Wikipedia
What is the most commonly used language in the world? Wikipedia
English language - Wikipedia
What is the smallest ocean in the world? Wikipedia
Arctic Ocean - Wikipedia
Who was the founder of Buddhism? Wikipedia
The Buddha - Wikipedia
Whi

In [None]:
# Look up a topic for every 'felm-wk' prompt by searching for
# "<prompt> Wikipedia" and keeping the title the search helper returns.
df2 = df[df['source'] == 'felm-wk']
topics = []
for prompt in df2['prompt']:
    query = prompt + " Wikipedia"
    # Results are appended one at a time, so the titles gathered so far stay
    # in `topics` if a lookup raises part-way through.
    topics.append(search_google_first_wikipedia_title(query))
    # Debug: uncomment to echo each query and the title found.
    # print(query)
    # print(topics[-1])

# Drop the "- Wikipedia" marker that some returned titles carry.
topics_new = [title.replace("- Wikipedia", "").strip() for title in topics]