In [1]:
!pip install openai==0.28.0
!pip install peft
!pip install transformers
!pip install numpy
!pip install torch





In [2]:
import getpass
openai_api_key = getpass.getpass('Enter your OpenAI API key:')
%env OPENAI_API_KEY=$openai_api_key

Enter your OpenAI API key:········
env: OPENAI_API_KEY=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


In [3]:
# LLMs
import openai
import torch
from peft import PeftModel
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import os

# Initialize OpenAI API
openai.api_key = os.environ["OPENAI_API_KEY"]

def call_openai_model(prompt, model, temperature):
    response = None
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature = temperature
                )

        except Exception as e:
            if 'is greater than the maximum' in str(e):
                raise BatchSizeException()
            print(e)
            print('Retrying...')
            time.sleep(2)
        try:
            output = response.choices[0].message.content
        except Exception:
            output = 'do not have reponse from chatgpt'
    return output


def call_guanaco_33b(prompt, max_new_tokens):
    # 16 float
    model_name = "huggyllama/llama-30b"
    adapters_name = 'timdettmers/guanaco-33b'
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # offload_folder="/home/ec2-user/SageMaker/hf_cache",
        max_memory= {i: '16384MB' for i in range(torch.cuda.device_count())}, # V100 16GB
    )
    model = PeftModel.from_pretrained(model, adapters_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # prompt
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, concise, and polite answers to the user's questions.\n"
        f"### Human: {prompt} ### Assistant:"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda:0")
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    res = tokenizer.decode(outputs[0], skip_special_tokens=True)
    res_sp = res.split('###')
    output = res_sp[1] + res_sp[2]

    return output


def call_falcon_7b(prompt, max_new_tokens):
    # 16 float
    model = "tiiuae/falcon-7b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model)
    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto"
    )
    sequences = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    for seq in sequences:
        res = seq['generated_text']

    return res

In [4]:
# paraphraser
def paraphrase(question, number, model, temperature):
    '''
    Inputs:
    quesiton - original user query
    number - how many perturbed questions
    model - GPTs or open-sourced models
    temperature - typically we use 0 here

    Output:
    perb_questions - perturbed questions that are semantically equivalent to the question
    '''

    perb_questions = []
    prompt_temp = f'For question Q, provide {number} semantically equivalent questions.'
    prompt = prompt_temp + '\nQ:' + question

    res = call_openai_model(prompt, model, temperature) # openai model call
    res_split = res.split('\n')
    for i in range(len(res_split)):
        perb_questions.append(res_split[i])

    return perb_questions

In [5]:
# Consistency_Checker
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class SemanticConsistnecyCheck:
    def __init__(self, model):
        self.model = model
        self.prompt_temp = """
        Are the following two Question-Answer(QA) pairs semantically equivalent?
        Provide your best guess and the probability that it is correct (0.0 to 1.0).
        Given ONLY the guess (Yes or No) and probability, no other words or explanation.
        For example:
        Guess: <most likely guess, as short as possible; not a complete sentence, just the guess!>
        Probability: <the probability between 0.0 and 1.0 that your guess is correct, without any extra commentary whatsoever;
        just the probability!>
        """

    def openai_api_parallel(self, prompt, temperature):
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    def score_scc_api(self, question, target_answer, candidate_answers, temperature):

        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")

        sc_output = []
        target_pair = 'Q:' + question + '\nA:' + target_answer
        num_candidate_answer = len(candidate_answers)

        with ThreadPoolExecutor(max_workers=num_candidate_answer+2) as executor:
            all_res = []
            for i in range(num_candidate_answer):
                candidate_pair = 'Q:' + question + '\nA:' + candidate_answers[i]
                prompt = self.prompt_temp + '\nThe first QA pair is:\n' + target_pair + '\nThe second QA pair is:\n' + candidate_pair
                output = executor.submit(self.openai_api_parallel, prompt, temperature)
                all_res.append(output)

            for temp in concurrent.futures.as_completed(all_res):
                res = temp.result()
                guess = res.split(':')[1].split('\n')[0].strip()
                # print(res, guess)
                value = 0 if guess == 'Yes' else 1
                # print('value',value)
                sc_output.append(value)

        score = sum(sc_output)/num_candidate_answer
        return score, sc_output



    def score_scc(self, question, target_answer, candidate_answers, temperature):
        '''
        Inputs:
        question - original user query
        target_answer - generated response given the original question (temp=0) if not provided by user
        candidate_answers - generated responses given the question (original + perturbed)
        temperature - [0,1] for LLM randomness

        Outputs:
        score - inconsistency score (hallucination metric)
        sc_output - specific score for each candidate answers compared with the target answer
        '''

        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")

        sc_output = []
        target_pair = 'Q:' + question + '\nA:' + target_answer
        num_candidate_answer = len(candidate_answers)
        for i in range(num_candidate_answer):
            candidate_pair = 'Q:' + question + '\nA:' + candidate_answers[i]
            prompt = self.prompt_temp + '\nThe first QA pair is:\n' + target_pair + '\nThe second QA pair is:\n' + candidate_pair
            res = call_openai_model(prompt, self.model, temperature) # openai model call
            # res = call_guanaco_33b(prompt, max_new_tokens=200)  # guanaco_33b model call
            # res = call_falcon_7b(prompt, max_new_tokens = 200) # falcon_7b model call
            guess = res.split(':')[1].split('\n')[0].strip()
            # print(res, guess)
            value = 0 if guess == 'Yes' else 1
            # print('value',value)
            sc_output.append(value)

        score = sum(sc_output)/num_candidate_answer
        return score, sc_output

In [6]:
# evaluator
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class Evaluate:
    def __init__(self, model):
        self.model = model
        self.prompt_temp = 'Please answer the following question:\n'

    def openai_api_parallel(self, prompt, temperature):
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

#     def self_evaluate_api(self, self_question, temperature, self_num):

#         prompt = self.prompt_temp + '\nQ:' + self_question
#         self_responses = []
#         with ThreadPoolExecutor(max_workers=self_num) as executor:
#             outputs = executor.map(self.openai_api_parallel, prompt, temperature)
#             for res in outputs:
#                 self_responses.append(res)

#         return self_responses

    def self_evaluate_api(self, self_question, temperature, self_num):

        prompt = self.prompt_temp + self_question
        self_responses = []
        with ThreadPoolExecutor(max_workers=self_num) as executor:
            futures = [executor.submit(self.openai_api_parallel, prompt, temperature) for _ in range(self_num)]
            for future in concurrent.futures.as_completed(futures):
                self_responses.append(future.result())

        return self_responses



    def self_evaluate(self, self_question, temperature, self_num):
        '''
        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness
        self_num - how many generated responses given this question

        Outputs:
        self_responses - generated responses given this question with different temperatures
        '''

        self_responses = []
        prompt = self.prompt_temp + '\nQ:' + self_question

        for i in range(self_num):
            # llm model: GPTs, open-source models (falcon, guanaco)
            if self.model in ['gpt-3.5-turbo','gpt-4']:
                res = call_openai_model(prompt, self.model, temperature) # openai model call
            elif self.model == 'guanaco-33b':
                res = call_guanaco_33b(prompt, max_new_tokens = 200)
            elif self.model == 'falcon-7b':
                res = call_falcon_7b(prompt, max_new_tokens = 200)
            # other open-sourced llms
            self_responses.append(res)

        # thread

        return self_responses


    def perb_evaluate_api(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses given the perturbed questions
        '''

        perb_responses = []
        with ThreadPoolExecutor(max_workers=len(perb_questions)) as executor:
            future_to_pq = {
                executor.submit(self.openai_api_parallel, self.prompt_temp + perb_question, temperature): perb_question
                for perb_question in perb_questions
            }

            for future in concurrent.futures.as_completed(future_to_pq):
                perb_question = future_to_pq[future]
                try:
                    perb_responses.append(future.result())
                except Exception as exc:
                    print('%r generated an exception: %s' % (perb_question, exc))

        return perb_responses



    def perb_evaluate(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses given the perturbed questions
        '''

        perb_responses = []
        for i in range(len(perb_questions)):
            prompt = self.prompt_temp + '\nQ:' + perb_questions[i]
            # llm model: GPTs, open-source models (falcon, guanaco)
            if self.model in ['gpt-3.5-turbo','gpt-4']:
                res = call_openai_model(prompt, self.model, temperature) # openai model call
            elif self.model == 'guanaco-33b':
                res = call_guanaco_33b(prompt, max_new_tokens = 200)
            elif self.model == 'falcon-7b':
                res = call_falcon_7b(prompt, max_new_tokens = 200)
            # other open-sourced llms
            perb_responses.append(res)

        return perb_responses

In [7]:
import json

def load_dataset(filepath):
    """Loads the dataset from a specified file path.

    Args:
    filepath (str): The path to the dataset file.

    Returns:
    list: A list of JSON objects loaded from the file.
    """
    data = []
    with open(filepath, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj)
    return data

# Adjust the path below if your dataset file is in a different directory
dataset_path = 'hotpotQA_halu.json'
qa_data = load_dataset(dataset_path)

# Example usage:
print(qa_data[0])

{'knowledge': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.", 'question': "Which magazine was started first Arthur's Magazine or First for Women?", 'right_answer': "Arthur's Magazine", 'hallucinated_answer': 'First for Women was started first.'}


In [None]:
import json

def load_dataset(filepath):
    data = []
    with open(filepath, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj)
    return data

dataset_path = 'hotpotQA_halu.json'
qa_data = load_dataset(dataset_path)

In [8]:
# self-check consistency
def sac2_score(question, target_answer, model, num_samples):

    # llm evaluation
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)
    # fast self-evaluation
    self_responses = llm_evaluate.self_evaluate_api(self_question = question, temperature = 1.0, self_num = num_samples)
    # fast consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers = self_responses, temperature = 0.0)

    return consistency_res[0]

In [None]:
# AUROC score for self-check
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 50
model = 'gpt-3.5-turbo'
num_samples_list = [3,5,10,15]

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        if i <= n_data // 2:
            target_answer = qa_data[i]['hallucinated_answer']
        else:
            target_answer = qa_data[i]['right_answer']

        question = qa_data[i]['question']
        sc2_score = sac2_score(question, target_answer, model, num_samples)
        halu_score_all.append(sc2_score)

    # auroc
    print('Time per query', (time.time()-t0)/n_data)
    true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
    roc_auc = roc_auc_score(true_label,halu_score_all)
    print('AUROC score', roc_auc)

100%|██████████| 50/50 [02:32<00:00,  3.04s/it]


Time per query 3.041416354179382
AUROC score 0.7432000000000001


100%|██████████| 50/50 [04:59<00:00,  5.99s/it]


Time per query 5.992955040931702
AUROC score 0.736


100%|██████████| 50/50 [03:16<00:00,  3.93s/it]


Time per query 3.9344684362411497
AUROC score 0.7464


100%|██████████| 50/50 [03:46<00:00,  4.53s/it]

Time per query 4.532747850418091
AUROC score 0.7456





In [9]:
# AUROC score for self-consistency without num_samples (50 data points)
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

# self consistency
def hallucination_score(question, target_answer, model):

    # llm evaluation
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)
    # fast self-evaluation
    self_responses = llm_evaluate.self_evaluate_api(self_question = question, temperature = 1.0, self_num = 10)
    # fast consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers = self_responses, temperature = 0.0)

    return consistency_res[0]

n_data = 50
model = 'gpt-3.5-turbo'
halu_score_all = []
filename = 'halu_sac3_' + str(n_data) + '.txt'

t0 = time.time()

for i in tqdm(range(n_data)):
    if i < n_data // 2:
        target_answer = qa_data[i]['hallucinated_answer']
    else:
        target_answer = qa_data[i]['right_answer']
    question = qa_data[i]['question']

    halu_score = hallucination_score(question, target_answer, model)
    halu_score_all.append(halu_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print(f'AUROC score: {roc_auc}')

100%|███████████████████████████████████████████| 50/50 [02:03<00:00,  2.47s/it]

Time per query: 2.47478000164032
AUROC score: 0.732





In [None]:
# AUROC score for self-consistency without num_samples (100 data points)
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

# self consistency
def hallucination_score(question, target_answer, model):

    # llm evaluation
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)
    # fast self-evaluation
    fast_self_responses = llm_evaluate.self_evaluate_api(self_question = question, temperature = 1.0, self_num = 10)
    # fast consistency checker
    fast_consistency_res = scc.score_scc_api(question, target_answer, candidate_answers = fast_self_responses, temperature = 0.0)

    return fast_consistency_res[0]

n_data = 100
model = 'gpt-3.5-turbo'
halu_score_all = []
filename = 'halu_sac3_' + str(n_data) + '.txt'

t0 = time.time()

for i in tqdm(range(n_data)):
    if i < n_data // 2:
        target_answer = qa_data[i]['hallucinated_answer']
    else:
        target_answer = qa_data[i]['right_answer']
    question = qa_data[i]['question']

    halu_score = hallucination_score(question, target_answer, model)
    halu_score_all.append(halu_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print(f'AUROC score: {roc_auc}')

100%|██████████| 100/100 [07:18<00:00,  4.38s/it]

Time per query: 4.381981918811798
AUROC score: 0.7424





In [None]:
def sac3_score(question, target_answer, model, num_samples):

    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model='gpt-3.5-turbo')
    scc = SemanticConsistnecyCheck(model='gpt-3.5-turbo')

    # question pertubation
    gen_question = paraphrase(question, number = 10, model = 'gpt-3.5-turbo', temperature=0.0)

    # evaluation
    self_responses = llm_evaluate.self_evaluate_api(self_question=question, temperature=1.0, self_num=num_samples)
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=0.0)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# AUROC score for cross-check
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 50
model = 'gpt-3.5-turbo'
num_samples_list = [3,5,10,15]

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        if i <= n_data // 2:
            target_answer = qa_data[i]['hallucinated_answer']
        else:
            target_answer = qa_data[i]['right_answer']

        question = qa_data[i]['question']
        sc3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sc3_q_score)

    # auroc
    print('Time per query', (time.time()-t0)/n_data)
    true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
    roc_auc = roc_auc_score(true_label,halu_score_all)
    print('AUROC score', roc_auc)

100%|██████████| 50/50 [07:00<00:00,  8.41s/it]


Time per query 8.40940809249878
AUROC score 0.8016


100%|██████████| 50/50 [07:18<00:00,  8.76s/it]


Time per query 8.76126256942749
AUROC score 0.8168


100%|██████████| 50/50 [07:15<00:00,  8.71s/it]


Time per query 8.711775751113892
AUROC score 0.7951999999999999


100%|██████████| 50/50 [08:07<00:00,  9.75s/it]

Time per query 9.754176802635193
AUROC score 0.8168





In [None]:
# AUROC score of cross-consistency without num_samples
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def sac3_score(question, target_answer, model):

    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model='gpt-3.5-turbo')
    scc = SemanticConsistnecyCheck(model='gpt-3.5-turbo')

    # question pertubation
    gen_question = paraphrase(question, number = 10, model = 'gpt-3.5-turbo', temperature=0.0)

    # evaluation
    self_responses = llm_evaluate.self_evaluate_api(self_question=question, temperature=1.0, self_num=1)
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=0.0)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

n_data = 50
model = 'gpt-3.5-turbo'
halu_score_all = []
filename = 'halu_sac3_' + str(n_data) + '.txt'

t0 = time.time()

for i in tqdm(range(n_data)):
    if i < n_data // 2:
        target_answer = qa_data[i]['hallucinated_answer']
    else:
        target_answer = qa_data[i]['right_answer']
    question = qa_data[i]['question']

    sac3_q_score = sac3_score(question, target_answer, model)
    halu_score_all.append(sac3_q_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print(f'AUROC score: {roc_auc}')

100%|██████████| 50/50 [07:52<00:00,  9.44s/it]

Time per query: 9.441791810989379
AUROC score: 0.8128





In [10]:
# AUROC score of cross-consistency without num_samples
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def sac3_score(question, target_answer, model):

    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model='gpt-3.5-turbo')
    scc = SemanticConsistnecyCheck(model='gpt-3.5-turbo')

    # question pertubation
    gen_question = paraphrase(question, number = 10, model = 'gpt-3.5-turbo', temperature=0.0)

    # evaluation
    self_responses = llm_evaluate.self_evaluate_api(self_question=question, temperature=1.0, self_num=3)
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=0.0)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

n_data = 100
model = 'gpt-3.5-turbo'
halu_score_all = []
filename = 'halu_sac3_' + str(n_data) + '.txt'

t0 = time.time()

for i in tqdm(range(n_data)):
    if i < n_data // 2:
        target_answer = qa_data[i]['hallucinated_answer']
    else:
        target_answer = qa_data[i]['right_answer']
    question = qa_data[i]['question']

    sac3_q_score = sac3_score(question, target_answer, model)
    halu_score_all.append(sac3_q_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print(f'AUROC score: {roc_auc}')

100%|█████████████████████████████████████████| 100/100 [12:52<00:00,  7.73s/it]

Time per query: 7.727928912639618
AUROC score: 0.7527999999999999



