In [None]:
!pip install openai==0.28.0
!pip install peft
!pip install transformers
!pip install numpy
!pip install torch

Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.27.2 peft-0.9.0


In [None]:
# LLM Models
import openai
import torch
from peft import PeftModel
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Initialize OpenAI API
# The key is read from the environment so that no credential is stored in the
# notebook. NOTE: the key that used to be hard-coded here has been exposed and
# must be revoked.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook.')

class BatchSizeException(Exception):
    '''Raised when an API error message contains "is greater than the maximum".'''


def call_openai_model(prompt, model, temperature):
    '''
    Send one system + user chat request to OpenAI and return the reply text.

    Any API error is printed and the request is retried after a 2 second pause,
    without an upper bound on the number of attempts.

    Inputs:
    prompt - user message content
    model - OpenAI chat model name
    temperature - sampling temperature forwarded to the API

    Output:
    output - content of the first choice, or the fixed fallback string when the
             response object cannot be read

    Raises:
    BatchSizeException - when the API error text contains 'is greater than the maximum'
    '''
    response = None
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature = temperature
                )
        except Exception as e:
            if 'is greater than the maximum' in str(e):
                # Retrying cannot help here, so surface the error to the caller.
                raise BatchSizeException(str(e)) from e
            print(e)
            print('Retrying...')
            time.sleep(2)

    # Only reached once a response object exists, so extract the text once.
    try:
        output = response.choices[0].message.content
    except Exception:
        output = 'do not have reponse from chatgpt'
    return output

import functools


@functools.lru_cache(maxsize=1)
def _load_guanaco_33b():
    '''
    Load the base model, apply the Guanaco adapter and load the tokenizer.

    Cached so that repeated calls to call_guanaco_33b reuse the loaded objects
    instead of loading everything again for each prompt.

    Output:
    (model, tokenizer) - adapter-wrapped causal LM and the base model's tokenizer
    '''
    # 16 float
    model_name = "huggyllama/llama-30b"
    adapters_name = 'timdettmers/guanaco-33b'
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_memory= {i: '16384MB' for i in range(torch.cuda.device_count())}, # V100 16GB
    )
    model = PeftModel.from_pretrained(model, adapters_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def call_guanaco_33b(prompt, max_new_tokens):
    '''
    Generate a reply to the prompt with the Guanaco-33B adapter model.

    Inputs:
    prompt - user text inserted into the "### Human: ... ### Assistant:" template
    max_new_tokens - generation length limit passed to model.generate

    Output:
    output - the human turn and the assistant turn of the decoded text, concatenated
    '''
    model, tokenizer = _load_guanaco_33b()

    # prompt
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, concise, and polite answers to the user's questions.\n"
        f"### Human: {prompt} ### Assistant:"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda:0")
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    res = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Pieces 1 and 2 of the '###' split are the human turn and the assistant turn.
    res_sp = res.split('###')
    output = res_sp[1] + res_sp[2]

    return output


import functools


@functools.lru_cache(maxsize=1)
def _load_falcon_7b():
    '''
    Build the Falcon-7B-Instruct text-generation pipeline and its tokenizer.

    Cached so that repeated calls to call_falcon_7b reuse one pipeline instead
    of building a new one for each prompt.

    Output:
    (generator, tokenizer) - transformers pipeline and the tokenizer it uses
    '''
    # 16 float
    model_name = "tiiuae/falcon-7b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    generator = transformers.pipeline(
        "text-generation",
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto"
    )
    return generator, tokenizer


def call_falcon_7b(prompt, max_new_tokens):
    '''
    Generate text for the prompt with Falcon-7B-Instruct (sampling, top_k=10).

    Inputs:
    prompt - text passed to the pipeline unchanged
    max_new_tokens - generation length limit passed to the pipeline

    Output:
    res - 'generated_text' of the last returned sequence (one sequence is requested)
    '''
    generator, tokenizer = _load_falcon_7b()
    sequences = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    res = sequences[-1]['generated_text']

    return res

In [None]:
# Paraphraser
def paraphrase(question, number, model, temperature):
    '''
    Ask the LLM for semantically equivalent rewordings of a question.

    Inputs:
    question - original user query
    number - how many perturbed questions to request
    model - model name forwarded to call_openai_model
    temperature - sampling temperature forwarded to the model

    Output:
    perb_questions - non-empty lines of the model reply, whitespace-stripped;
                     any numbering the model adds is kept as part of the line
    '''

    prompt_temp = f'For question Q, provide {number} semantically equivalent questions.'
    prompt = prompt_temp + '\nQ:' + question

    res = call_openai_model(prompt, model, temperature) # openai model call
    # res = call_guanaco_33b(prompt, max_new_tokens = 200)  # guanaco_33b model call
    # res = call_falcon_7b (prompt, max_new_tokens = 200) # falcon_7b model call

    # Drop blank lines so that no empty "question" is sent to the evaluator later.
    perb_questions = [line.strip() for line in res.split('\n') if line.strip()]

    return perb_questions

In [None]:
# Consistency_Checker
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class SemanticConsistnecyCheck:
    '''
    LLM-based judge that compares a target answer against candidate answers.

    Each candidate is paired with the target in one prompt; the model's
    "Guess" is mapped to 0 (Yes, equivalent) or 1 (anything else), and the
    mean of these values is the inconsistency score.
    '''

    def __init__(self, model):
        self.model = model
        self.prompt_temp = """
        Are the following two Question-Answer(QA) pairs semantically equivalent?
        Provide your best guess and the probability that it is correct (0.0 to 1.0).
        Given ONLY the guess (Yes or No) and probability, no other words or explanation.
        For example:
        Guess: <most likely guess, as short as possible; not a complete sentence, just the guess!>
        Probability: <the probability between 0.0 and 1.0 that your guess is correct, without any extra commentary whatsoever;
        just the probability!>
        """

    def openai_api_parallel(self, prompt, temperature):
        '''Send one judge prompt to the configured model and return the raw reply.'''
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    @staticmethod
    def _parse_guess(response):
        '''Map a judge reply to 0 when its guess starts with "yes" (any case), else 1.'''
        import re

        # Prefer the text after "Guess:"; fall back to the whole reply when the
        # model omitted the label, instead of failing on a missing ':'.
        match = re.search(r'guess\s*:\s*(.*)', response, flags=re.IGNORECASE)
        guess = match.group(1) if match else response
        return 0 if guess.strip().lower().startswith('yes') else 1

    def _build_prompts(self, question, target_answer, candidate_answers):
        '''Validate the inputs and return one judge prompt per candidate, in order.'''
        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")
        if not candidate_answers:
            raise ValueError("candidate_answers must not be empty.")

        # str() lets non-string answers (e.g. booleans) be scored as well.
        target_pair = 'Q:' + question + '\nA:' + str(target_answer)
        prompts = []
        for candidate in candidate_answers:
            candidate_pair = 'Q:' + question + '\nA:' + str(candidate)
            prompts.append(self.prompt_temp + '\nThe first QA pair is:\n' + target_pair + '\nThe second QA pair is:\n' + candidate_pair)
        return prompts

    def score_scc_api(self, question, target_answer, candidate_answers, temperature):
        '''
        Threaded variant of score_scc: one judge request per candidate, run concurrently.

        Inputs and outputs are the same as score_scc.
        '''
        prompts = self._build_prompts(question, target_answer, candidate_answers)

        with ThreadPoolExecutor(max_workers=len(prompts)) as executor:
            futures = [executor.submit(self.openai_api_parallel, prompt, temperature) for prompt in prompts]
            # Read results in submission order (not completion order) so that
            # sc_output[i] belongs to candidate_answers[i].
            sc_output = [self._parse_guess(future.result()) for future in futures]

        score = sum(sc_output)/len(sc_output)
        return score, sc_output

    def score_scc(self, question, target_answer, candidate_answers, temperature):
        '''
        Inputs:
        question - original user query
        target_answer - answer to be checked
        candidate_answers - generated responses to compare against the target answer
        temperature - [0,1] for LLM randomness

        Outputs:
        score - inconsistency score (hallucination metric): fraction of candidates not judged equivalent
        sc_output - 0/1 value for each candidate answer, in the order of candidate_answers

        Raises:
        ValueError - when target_answer is None or candidate_answers is empty
        '''
        prompts = self._build_prompts(question, target_answer, candidate_answers)
        sc_output = [self._parse_guess(self.openai_api_parallel(prompt, temperature)) for prompt in prompts]

        score = sum(sc_output)/len(sc_output)
        return score, sc_output

In [None]:
# Evaluator
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures


class Evaluate:
    '''
    Collect answers from the target LLM for a question and for its paraphrases.

    The *_api methods send OpenAI requests concurrently; the plain methods run
    sequentially and also support the local open-source models.
    '''

    def __init__(self, model):
        self.model = model
        self.prompt_temp = 'Please answer the following question:\n'

    def openai_api_parallel(self, prompt, temperature):
        '''Send one prompt to the configured OpenAI model and return the reply.'''
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    def _generate(self, prompt, temperature):
        '''Answer one prompt with the backend selected by self.model.'''
        # llm model: GPTs, open-source models (falcon, guanaco)
        if self.model in ['gpt-3.5-turbo','gpt-4']:
            return self.openai_api_parallel(prompt, temperature)
        if self.model == 'timdettmers/guanaco-33b':
            return call_guanaco_33b(prompt, max_new_tokens = 200)
        if self.model == 'tiiuae/falcon-7b-instruct':
            return call_falcon_7b(prompt, max_new_tokens = 200)
        # Fail loudly instead of reusing a stale or undefined result.
        raise ValueError(f'Unsupported model: {self.model!r}')

    def _run_parallel(self, prompts, temperature, max_workers):
        '''Send all prompts concurrently and return the replies in prompt order.'''
        if not prompts:
            return []
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            futures = [executor.submit(self.openai_api_parallel, prompt, temperature) for prompt in prompts]
            # Submission order, not completion order: reply i answers prompt i.
            return [future.result() for future in futures]

    def self_evaluate_api(self, self_question, temperature, self_num):
        '''
        Threaded variant of self_evaluate (OpenAI models only).

        Note: unlike self_evaluate, the prompt has no '\\nQ:' before the question.
        '''
        prompt = self.prompt_temp + self_question
        return self._run_parallel([prompt] * self_num, temperature, self_num + 2)

    def self_evaluate(self, self_question, temperature, self_num):
        '''
        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness
        self_num - how many generated responses given this question

        Outputs:
        self_responses - self_num generated responses for this question

        Raises:
        ValueError - when self.model is not one of the supported model names
        '''
        prompt = self.prompt_temp + '\nQ:' + self_question
        self_responses = [self._generate(prompt, temperature) for _ in range(self_num)]
        return self_responses

    def perb_evaluate_api(self, perb_questions, temperature, self_num):
        '''
        Threaded variant of perb_evaluate (OpenAI models only).

        self_num only sizes the thread pool (self_num + 2 workers); one request
        is sent per perturbed question. The prompt has no '\\nQ:' before the question.
        '''
        prompts = [self.prompt_temp + perb_question for perb_question in perb_questions]
        return self._run_parallel(prompts, temperature, self_num + 2)

    def perb_evaluate(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses, one per perturbed question, in the same order

        Raises:
        ValueError - when self.model is not one of the supported model names
        '''
        perb_responses = [
            self._generate(self.prompt_temp + '\nQ:' + perb_question, temperature)
            for perb_question in perb_questions
        ]
        return perb_responses

In [None]:
from google.colab import files
# Colab upload call; the output below shows hotpotQA_halu.json being saved
# under the same name, which is the path the next cell reads.
uploaded = files.upload()

Saving hotpotQA_halu.json to hotpotQA_halu.json


In [None]:
import json

def load_dataset(filepath):
    '''
    Read a file that holds one JSON document per line.

    Inputs:
    filepath - path of the file to read (decoded as UTF-8)

    Output:
    data - list with one parsed JSON value per non-blank line, in file order

    Raises:
    json.JSONDecodeError - when a non-blank line is not valid JSON
    '''
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank or trailing empty lines carry no record; skip them.
                continue
            data.append(json.loads(line))
    return data

# Parse the uploaded file line by line; qa_data holds one parsed JSON value per line.
dataset_path = 'hotpotQA_halu.json'
qa_data = load_dataset(dataset_path)

In [None]:
# self consistency
def hallucination_score(question, target_answer, model, num_samples):
    '''
    Self-consistency score of target_answer for one question.

    Inputs:
    question - user query
    target_answer - answer to be checked
    model - model name used both for sampling answers and for judging them
    num_samples - how many answers are sampled (temperature 1.0)

    Output:
    first element returned by score_scc_api (the inconsistency score)
    '''
    evaluator, checker = Evaluate(model=model), SemanticConsistnecyCheck(model=model)

    # Sample answers to the same question, then judge each against the target.
    sampled_answers = evaluator.self_evaluate_api(
        self_question=question,
        temperature=1.0,
        self_num=num_samples,
    )
    judged = checker.score_scc_api(
        question,
        target_answer,
        candidate_answers=sampled_answers,
        temperature=0.0,
    )
    return judged[0]

In [None]:
# sc2 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated items are scored against their hallucinated answer
# (label 1), the remaining ones against their right answer (label 0).
n_hallucinated = n_data // 2
true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]  # Access the ith element in the qa_data list
        # Strict '<' keeps the chosen answer aligned with true_label; '<=' gave
        # item n_data // 2 a hallucinated answer while labelling it 0.
        if i < n_hallucinated:
            target_answer = qa_item['hallucinated_answer']
        else:
            target_answer = qa_item['right_answer']

        question = qa_item['question']
        halu_score = hallucination_score(question, target_answer, model, num_samples)
        halu_score_all.append(halu_score)

    # AUROC
    print('Time per query:', (time.time()-t0)/n_data)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print('AUROC score:', roc_auc)

100%|██████████| 10/10 [00:22<00:00,  2.24s/it]


Time per query: 2.2427576541900636
AUROC score: 0.66


100%|██████████| 10/10 [00:24<00:00,  2.40s/it]


Time per query: 2.4051982402801513
AUROC score: 0.8200000000000001


100%|██████████| 10/10 [00:28<00:00,  2.89s/it]


Time per query: 2.8945090770721436
AUROC score: 0.8600000000000001


100%|██████████| 10/10 [01:08<00:00,  6.87s/it]

Time per query: 6.874573349952698
AUROC score: 0.72





In [None]:
def sac3_score(question, target_answer, model, num_samples):
    '''
    Cross-check score: judge target_answer against answers to paraphrased questions.

    Inputs:
    question - original user query
    target_answer - answer to be checked
    model - model name used for answering and judging
    num_samples - forwarded to perb_evaluate_api as self_num

    Output:
    first element returned by score_scc_api (the inconsistency score)
    '''
    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # question pertubation
    # NOTE(review): the paraphrase count (5) and paraphrase model are fixed here
    # and do not follow num_samples / model - confirm that this is intended.
    gen_question = paraphrase(question, number = 5, model = 'gpt-3.5-turbo', temperature=1.0)

    # evaluation
    # Answers to the original question were previously sampled here as well but
    # never used; those extra API calls are dropped.
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=1.0, self_num=num_samples)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# sac3 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated items are scored against their hallucinated answer
# (label 1), the remaining ones against their right answer (label 0).
n_hallucinated = n_data // 2
true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]  # Access the ith element in the qa_data list
        # Strict '<' keeps the chosen answer aligned with true_label; '<=' gave
        # item n_data // 2 a hallucinated answer while labelling it 0.
        if i < n_hallucinated:
            target_answer = qa_item['hallucinated_answer']
        else:
            target_answer = qa_item['right_answer']

        question = qa_item['question']
        sac3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sac3_q_score)

    # AUROC
    print('Time per query:', (time.time()-t0)/n_data)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print('AUROC score:', roc_auc)

100%|██████████| 10/10 [01:10<00:00,  7.02s/it]


Time per query: 7.015428972244263
AUROC score: 0.7200000000000001


100%|██████████| 10/10 [01:04<00:00,  6.47s/it]


Time per query: 6.468082427978516
AUROC score: 0.84


100%|██████████| 10/10 [01:13<00:00,  7.39s/it]


Time per query: 7.387742328643799
AUROC score: 0.8200000000000001


100%|██████████| 10/10 [01:25<00:00,  8.52s/it]

Time per query: 8.517774319648742
AUROC score: 0.8800000000000001





In [None]:
# Accuracy for self-check (SC2)
from sklearn.metrics import accuracy_score
from tqdm import tqdm

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# Every item is evaluated against its hallucinated answer, so a prediction of 1
# (score >= 0.5) is the correct outcome for each of them.
for num_samples in num_samples_list:
    correct_predictions, total_predictions = 0, 0

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]
        question = qa_item['question']
        target_answer = qa_item['hallucinated_answer']

        halu_score = hallucination_score(question, target_answer, model, num_samples)

        # Threshold the inconsistency score: 1 means "flagged as hallucinated".
        if halu_score >= 0.5:
            predicted_label = 1
        else:
            predicted_label = 0

        # predicted_label is 0 or 1, so adding it counts the detections.
        correct_predictions += predicted_label
        total_predictions += 1

    # Share of the hallucinated answers that were flagged.
    accuracy = correct_predictions / total_predictions
    print(f'Accuracy for {num_samples} samples: {accuracy:.4f}')

Accuracy for 3 samples: 0.3
Accuracy for 5 samples: 0.2
Accuracy for 10 samples: 0.3
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying...
Accuracy for 15 samples: 0.3


In [None]:
# Accuracy for cross-check (SAC3)
from sklearn.metrics import accuracy_score
from tqdm import tqdm

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# Every item is evaluated against its hallucinated answer, so a prediction of 1
# (score >= 0.5) is the correct outcome for each of them.
for num_samples in num_samples_list:
    correct_predictions, total_predictions = 0, 0

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]
        question = qa_item['question']
        target_answer = qa_item['hallucinated_answer']

        halu_score = sac3_score(question, target_answer, model, num_samples)

        # Threshold the inconsistency score: 1 means "flagged as hallucinated".
        if halu_score >= 0.5:
            predicted_label = 1
        else:
            predicted_label = 0

        # predicted_label is 0 or 1, so adding it counts the detections.
        correct_predictions += predicted_label
        total_predictions += 1

    # Share of the hallucinated answers that were flagged.
    accuracy = correct_predictions / total_predictions
    print(f'Accuracy for {num_samples} samples: {accuracy:.4f}')

Using the primality-testing dataset and calculating AUROC

In [None]:
# LLM Models
import os
import openai
import torch
from peft import PeftModel
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Initialize OpenAI API
# The key is read from the environment so that no credential is stored in the
# notebook. NOTE: the key that used to be hard-coded here has been exposed and
# must be revoked.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook.')

class BatchSizeException(Exception):
    '''Raised when an API error message contains "is greater than the maximum".'''


def call_openai_model(prompt, model, temperature):
    '''
    Send one system + user chat request to OpenAI and return the reply text.

    Any API error is printed and the request is retried after a 2 second pause,
    without an upper bound on the number of attempts.

    Inputs:
    prompt - user message content
    model - OpenAI chat model name
    temperature - sampling temperature forwarded to the API

    Output:
    output - content of the first choice, or the fixed fallback string when the
             response object cannot be read

    Raises:
    BatchSizeException - when the API error text contains 'is greater than the maximum'
    '''
    response = None
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature = temperature
                )
        except Exception as e:
            if 'is greater than the maximum' in str(e):
                # Retrying cannot help here, so surface the error to the caller.
                raise BatchSizeException(str(e)) from e
            print(e)
            print('Retrying...')
            time.sleep(2)

    # Only reached once a response object exists, so extract the text once.
    try:
        output = response.choices[0].message.content
    except Exception:
        output = 'do not have reponse from chatgpt'
    return output


import functools


@functools.lru_cache(maxsize=1)
def _load_guanaco_33b():
    '''
    Load the base model, apply the Guanaco adapter and load the tokenizer.

    Cached so that repeated calls to call_guanaco_33b reuse the loaded objects
    instead of loading everything again for each prompt.

    Output:
    (model, tokenizer) - adapter-wrapped causal LM and the base model's tokenizer
    '''
    # 16 float
    model_name = "huggyllama/llama-30b"
    adapters_name = 'timdettmers/guanaco-33b'
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_memory= {i: '16384MB' for i in range(torch.cuda.device_count())}, # V100 16GB
    )
    model = PeftModel.from_pretrained(model, adapters_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def call_guanaco_33b(prompt, max_new_tokens):
    '''
    Generate a reply to the prompt with the Guanaco-33B adapter model.

    Inputs:
    prompt - user text inserted into the "### Human: ... ### Assistant:" template
    max_new_tokens - generation length limit passed to model.generate

    Output:
    output - the human turn and the assistant turn of the decoded text, concatenated
    '''
    model, tokenizer = _load_guanaco_33b()

    # prompt
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, concise, and polite answers to the user's questions.\n"
        f"### Human: {prompt} ### Assistant:"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda:0")
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    res = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Pieces 1 and 2 of the '###' split are the human turn and the assistant turn.
    res_sp = res.split('###')
    output = res_sp[1] + res_sp[2]

    return output


import functools


@functools.lru_cache(maxsize=1)
def _load_falcon_7b():
    '''
    Build the Falcon-7B-Instruct text-generation pipeline and its tokenizer.

    Cached so that repeated calls to call_falcon_7b reuse one pipeline instead
    of building a new one for each prompt.

    Output:
    (generator, tokenizer) - transformers pipeline and the tokenizer it uses
    '''
    # 16 float
    model_name = "tiiuae/falcon-7b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    generator = transformers.pipeline(
        "text-generation",
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto"
    )
    return generator, tokenizer


def call_falcon_7b(prompt, max_new_tokens):
    '''
    Generate text for the prompt with Falcon-7B-Instruct (sampling, top_k=10).

    Inputs:
    prompt - text passed to the pipeline unchanged
    max_new_tokens - generation length limit passed to the pipeline

    Output:
    res - 'generated_text' of the last returned sequence (one sequence is requested)
    '''
    generator, tokenizer = _load_falcon_7b()
    sequences = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    res = sequences[-1]['generated_text']

    return res

In [None]:
# Paraphraser
def paraphrase(question, number, model, temperature):
    '''
    Ask the LLM for semantically equivalent rewordings of a question.

    Inputs:
    question - original user query
    number - how many perturbed questions to request
    model - model name forwarded to call_openai_model
    temperature - sampling temperature forwarded to the model

    Output:
    perb_questions - non-empty lines of the model reply, whitespace-stripped;
                     any numbering the model adds is kept as part of the line
    '''

    prompt_temp = f'For question Q, provide {number} semantically equivalent questions.'
    prompt = prompt_temp + '\nQ:' + question

    res = call_openai_model(prompt, model, temperature) # openai model call

    # Drop blank lines so that no empty "question" is sent to the evaluator later.
    perb_questions = [line.strip() for line in res.split('\n') if line.strip()]

    return perb_questions

In [None]:
# Consistency Check
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class SemanticConsistnecyCheck:
    '''
    LLM-based judge that compares a target answer against candidate answers.

    Each candidate is paired with the target in one prompt; the model's
    "Guess" is mapped to 0 (Yes, equivalent) or 1 (anything else), and the
    mean of these values is the inconsistency score.
    '''

    def __init__(self, model):
        self.model = model
        self.prompt_temp = """
        Are the following two Question-Answer(QA) pairs semantically equivalent?
        Provide your best guess and the probability that it is correct (0.0 to 1.0).
        Given ONLY the guess (Yes or No) and probability, no other words or explanation.
        For example:
        Guess: <most likely guess, as short as possible; not a complete sentence, just the guess!>
        Probability: <the probability between 0.0 and 1.0 that your guess is correct, without any extra commentary whatsoever;
        just the probability!>
        """

    def openai_api_parallel(self, prompt, temperature):
        '''Send one judge prompt to the configured model and return the raw reply.'''
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    @staticmethod
    def _parse_guess(response):
        '''Map a judge reply to 0 when its guess starts with "yes" (any case), else 1.'''
        import re

        # Prefer the text after "Guess:"; fall back to the whole reply when the
        # model omitted the label, instead of failing on a missing ':'.
        match = re.search(r'guess\s*:\s*(.*)', response, flags=re.IGNORECASE)
        guess = match.group(1) if match else response
        return 0 if guess.strip().lower().startswith('yes') else 1

    def _build_prompts(self, question, target_answer, candidate_answers):
        '''Validate the inputs and return one judge prompt per candidate, in order.'''
        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")
        if not candidate_answers:
            raise ValueError("candidate_answers must not be empty.")

        # str() lets non-string answers (e.g. booleans) be scored as well.
        target_pair = 'Q:' + question + '\nA:' + str(target_answer)
        prompts = []
        for candidate in candidate_answers:
            candidate_pair = 'Q:' + question + '\nA:' + str(candidate)
            prompts.append(self.prompt_temp + '\nThe first QA pair is:\n' + target_pair + '\nThe second QA pair is:\n' + candidate_pair)
        return prompts

    def score_scc_api(self, question, target_answer, candidate_answers, temperature):
        '''
        Threaded variant of score_scc: one judge request per candidate, run concurrently.

        Inputs and outputs are the same as score_scc.
        '''
        prompts = self._build_prompts(question, target_answer, candidate_answers)

        with ThreadPoolExecutor(max_workers=len(prompts)) as executor:
            futures = [executor.submit(self.openai_api_parallel, prompt, temperature) for prompt in prompts]
            # Read results in submission order (not completion order) so that
            # sc_output[i] belongs to candidate_answers[i].
            sc_output = [self._parse_guess(future.result()) for future in futures]

        score = sum(sc_output)/len(sc_output)
        return score, sc_output

    def score_scc(self, question, target_answer, candidate_answers, temperature):
        '''
        Inputs:
        question - original user query
        target_answer - answer to be checked
        candidate_answers - generated responses to compare against the target answer
        temperature - [0,1] for LLM randomness

        Outputs:
        score - inconsistency score (hallucination metric): fraction of candidates not judged equivalent
        sc_output - 0/1 value for each candidate answer, in the order of candidate_answers

        Raises:
        ValueError - when target_answer is None or candidate_answers is empty
        '''
        prompts = self._build_prompts(question, target_answer, candidate_answers)
        sc_output = [self._parse_guess(self.openai_api_parallel(prompt, temperature)) for prompt in prompts]

        score = sum(sc_output)/len(sc_output)
        return score, sc_output

In [None]:
# Evaluator
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class Evaluate:
    '''
    Collect answers from the target LLM for a question and for its paraphrases.

    The *_api methods send OpenAI requests concurrently; the plain methods run
    sequentially and also support the local open-source models.
    '''

    def __init__(self, model):
        self.model = model
        self.prompt_temp = 'Please answer the following question:\n'

    def openai_api_parallel(self, prompt, temperature):
        '''Send one prompt to the configured OpenAI model and return the reply.'''
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    def _generate(self, prompt, temperature):
        '''Answer one prompt with the backend selected by self.model.'''
        # llm model: GPTs, open-source models (falcon, guanaco)
        if self.model in ['gpt-3.5-turbo','gpt-4']:
            return self.openai_api_parallel(prompt, temperature)
        if self.model == 'timdettmers/guanaco-33b':
            return call_guanaco_33b(prompt, max_new_tokens = 200)
        if self.model == 'tiiuae/falcon-7b-instruct':
            return call_falcon_7b(prompt, max_new_tokens = 200)
        # Fail loudly instead of reusing a stale or undefined result.
        raise ValueError(f'Unsupported model: {self.model!r}')

    def _run_parallel(self, prompts, temperature, max_workers):
        '''Send all prompts concurrently and return the replies in prompt order.'''
        if not prompts:
            return []
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            futures = [executor.submit(self.openai_api_parallel, prompt, temperature) for prompt in prompts]
            # Submission order, not completion order: reply i answers prompt i.
            return [future.result() for future in futures]

    def self_evaluate_api(self, self_question, temperature, self_num):
        '''
        Threaded variant of self_evaluate (OpenAI models only).

        Note: unlike self_evaluate, the prompt has no '\\nQ:' before the question.
        '''
        prompt = self.prompt_temp + self_question
        return self._run_parallel([prompt] * self_num, temperature, self_num)

    def self_evaluate(self, self_question, temperature, self_num):
        '''
        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness
        self_num - how many generated responses given this question

        Outputs:
        self_responses - self_num generated responses for this question

        Raises:
        ValueError - when self.model is not one of the supported model names
        '''
        prompt = self.prompt_temp + '\nQ:' + self_question
        self_responses = [self._generate(prompt, temperature) for _ in range(self_num)]
        return self_responses

    def perb_evaluate_api(self, perb_questions, temperature, self_num):
        '''
        Threaded variant of perb_evaluate (OpenAI models only).

        self_num only sizes the thread pool (self_num + 2 workers); one request
        is sent per perturbed question. The prompt has no '\\nQ:' before the question.
        '''
        prompts = [self.prompt_temp + perb_question for perb_question in perb_questions]
        return self._run_parallel(prompts, temperature, self_num + 2)

    def perb_evaluate(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses, one per perturbed question, in the same order

        Raises:
        ValueError - when self.model is not one of the supported model names
        '''
        perb_responses = [
            self._generate(self.prompt_temp + '\nQ:' + perb_question, temperature)
            for perb_question in perb_questions
        ]
        return perb_responses

In [None]:
from google.colab import files
# Colab upload call; the output below shows primality_testing.json being saved
# under the same name, which is the path read further down in this cell.
uploaded = files.upload()

import json

def load_dataset(filepath):
    '''
    Read a file that holds one JSON document per line.

    Inputs:
    filepath - path of the file to read (decoded as UTF-8)

    Output:
    data - list with one parsed JSON value per non-blank line, in file order

    Raises:
    json.JSONDecodeError - when a non-blank line is not valid JSON
    '''
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank or trailing empty lines carry no record; skip them.
                continue
            data.append(json.loads(line))
    return data

dataset_path = 'primality_testing.json'
qa_data = load_dataset(dataset_path)

# Show a bounded preview only: printing the whole dataset floods the cell output.
print(f'Loaded {len(qa_data)} line(s) from {dataset_path}')
print(str(qa_data)[:500])

Saving primality_testing.json to primality_testing.json
[[{'question': 'Is 7411 a prime number?', 'number': 7411, 'answer': True}, {'question': 'Is 10733 a prime number?', 'number': 10733, 'answer': True}, {'question': 'Is 4219 a prime number?', 'number': 4219, 'answer': True}, {'question': 'Is 5471 a prime number?', 'number': 5471, 'answer': True}, {'question': 'Is 10663 a prime number?', 'number': 10663, 'answer': True}, {'question': 'Is 18539 a prime number?', 'number': 18539, 'answer': True}, {'question': 'Is 6911 a prime number?', 'number': 6911, 'answer': True}, {'question': 'Is 9791 a prime number?', 'number': 9791, 'answer': True}, {'question': 'Is 16921 a prime number?', 'number': 16921, 'answer': True}, {'question': 'Is 2677 a prime number?', 'number': 2677, 'answer': True}, {'question': 'Is 1933 a prime number?', 'number': 1933, 'answer': True}, {'question': 'Is 10631 a prime number?', 'number': 10631, 'answer': True}, {'question': 'Is 3691 a prime number?', 'number': 3691, 

In [None]:
# self consistency check
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def hallucination_score(question, target_answer, model, num_samples):
    """Score `target_answer` against repeated answers to the same question.

    Inputs:
    question - the query sent to the model
    target_answer - the answer whose consistency is being assessed
    model - model name handed to Evaluate and SemanticConsistnecyCheck
    num_samples - number of answers requested from self_evaluate_api

    Output:
    first element of the value returned by score_scc_api
    """
    sampler = Evaluate(model=model)
    checker = SemanticConsistnecyCheck(model=model)

    # Sample the model's own answers at temperature 1.0 ...
    sampled_answers = sampler.self_evaluate_api(
        self_question=question,
        temperature=1.0,
        self_num=num_samples,
    )

    # ... then compare each of them with the target at temperature 0.0.
    check_result = checker.score_scc_api(
        question,
        target_answer,
        candidate_answers=sampled_answers,
        temperature=0.0,
    )

    return check_result[0]  # the score; the per-answer list is discarded

# Load dataset
dataset_path = 'primality_testing.json'
qa_data = load_dataset(dataset_path)

# Define parameters
n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated questions are paired with a wrong target (label 1),
# the remaining ones with the dataset's own answer (label 0).
n_hallucinated = n_data // 2

# Iterate over different number of samples
for num_samples in num_samples_list:
    t0 = time.time()
    halu_score_all = []

    for i in tqdm(range(n_data)):
        qa_item = qa_data[0][i]  # records live in the first loaded line
        # Strict '<' keeps the targets aligned with true_label below; the
        # previous '<=' gave item n_data // 2 a wrong target but label 0.
        if i < n_hallucinated:
            # NOTE(review): "False" is only a wrong answer when the dataset
            # answer is True, as in the records shown above - confirm.
            target_answer = "False"
        else:
            target_answer = qa_item['answer']

        question = qa_item['question']
        halu_score = hallucination_score(question, target_answer, model, num_samples)
        halu_score_all.append(halu_score)


    # Calculate AUROC
    print('Time per query:', (time.time()-t0)/n_data)
    # One label per scored item, also when n_data is odd.
    true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print('AUROC score:', roc_auc)

100%|██████████| 10/10 [00:31<00:00,  3.17s/it]


Time per query: 3.1686809062957764
AUROC score: 0.7400000000000001


 60%|██████    | 6/10 [00:22<00:15,  3.92s/it]

The server is overloaded or not ready yet.
Retrying...


100%|██████████| 10/10 [02:24<00:00, 14.42s/it]


Time per query: 14.424967956542968
AUROC score: 0.6599999999999999


100%|██████████| 10/10 [00:53<00:00,  5.31s/it]


Time per query: 5.31343834400177
AUROC score: 0.56


100%|██████████| 10/10 [00:54<00:00,  5.50s/it]

Time per query: 5.496872544288635
AUROC score: 0.52





In [None]:
# cross consistency check
def sac3_score(question, target_answer, model, num_samples):
    """Score `target_answer` against answers to paraphrases of `question`.

    The question is paraphrased, every paraphrase is answered by `model`, and
    the consistency checker compares each of those answers with the target.

    Inputs:
    question - original user query
    target_answer - the answer whose consistency is being assessed
    model - model name handed to Evaluate and SemanticConsistnecyCheck
    num_samples - forwarded as `self_num` to perb_evaluate_api

    Output:
    first element of the value returned by score_scc_api
    """
    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # question perturbation (always 5 paraphrases, always via gpt-3.5-turbo)
    gen_question = paraphrase(question, number=5, model='gpt-3.5-turbo', temperature=1.0)

    # evaluation
    # The self-sampled responses that used to be requested here never entered
    # the score, so that call (num_samples extra API requests per question)
    # has been dropped.
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions=gen_question, temperature=1.0, self_num=num_samples)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# sac3 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated questions are paired with a wrong target (label 1),
# the remaining ones with the dataset's own answer (label 0).
n_hallucinated = n_data // 2

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    # Not used below: the scores are only printed, never written to this file.
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        qa_item = qa_data[0][i]  # Access the ith element in the qa_data list
        # Strict '<' keeps the targets aligned with true_label below; the
        # previous '<=' gave item n_data // 2 a wrong target but label 0.
        if i < n_hallucinated:
            # NOTE(review): "No" is only a wrong answer when the dataset
            # answer is True - confirm for the records used here.
            target_answer = "No"
        else:
            target_answer = qa_item['answer']

        question = qa_item['question']

        sac3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sac3_q_score)

    # AUROC
    print('Time per query:', (time.time()-t0)/n_data)
    # One label per scored item, also when n_data is odd.
    true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print('AUROC score:', roc_auc)

100%|██████████| 10/10 [06:35<00:00, 39.55s/it] 


Time per query: 39.5467280626297
AUROC score: 0.42


 60%|██████    | 6/10 [00:53<00:35,  8.88s/it]

Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Retrying...


100%|██████████| 10/10 [11:48<00:00, 70.83s/it]


Time per query: 70.82578628063202
AUROC score: 0.52


100%|██████████| 10/10 [02:04<00:00, 12.49s/it]


Time per query: 12.493115544319153
AUROC score: 0.96


 20%|██        | 2/10 [00:34<02:31, 18.98s/it]

The server is overloaded or not ready yet.
Retrying...


100%|██████████| 10/10 [04:10<00:00, 25.09s/it]

Time per query: 25.09105749130249
AUROC score: 0.8200000000000001





In [None]:
# Change data structure by adding hallucinated answers and modifying factual answers (i.e., same data structure as the hotpotQA_halu dataset)
from google.colab import files
import json

def modify_dataset(data):
    """Convert primality records, in place, to the right/hallucinated format.

    Inputs:
    data - iterable of dicts with a 'number' key and a boolean 'answer' key

    Effect on each record:
    'answer' is removed and replaced by
      'right_answer'        - the sentence matching the original answer
      'hallucinated_answer' - the opposite sentence

    Records without an 'answer' key are left untouched, so running the
    function again on already converted data is a no-op. Returns None.
    """
    for data_point in data:
        if 'answer' not in data_point:
            # Already converted, e.g. the cell was re-run on the saved file.
            continue

        number_value = str(data_point['number'])  # Convert number to string for replacement
        yes_text = f"Yes, {number_value} is a prime number"
        no_text = f"No, {number_value} is not a prime number"

        # Same truth test as before: anything that does not equal True counts as "not prime".
        is_prime = data_point.pop('answer') == True  # noqa: E712

        data_point['right_answer'] = yes_text if is_prime else no_text
        # The hallucinated answer must contradict the right one. It used to be
        # the "No" sentence unconditionally, which equals the right answer for
        # every non-prime.
        data_point['hallucinated_answer'] = no_text if is_prime else yes_text

# Upload a file from your local file system (Colab file picker)
uploaded = files.upload()

# Name of the first uploaded file; `uploaded` is iterated for its keys.
filename = next(iter(uploaded))

# Load and modify the dataset.
# json.load() parses the whole file as one document, so qa_data here is the
# record list itself, not the one-entry-per-line list that load_dataset()
# returns. Later cells therefore index qa_data[i] rather than qa_data[0][i].
with open(filename, 'r') as file:
    qa_data = json.load(file)
modify_dataset(qa_data)  # mutates the records in place

print(qa_data[:5])  # Print first 5 modified data points for verification

# Save the modified data back to the same file, overwriting the upload.
# indent=4 spreads the JSON over many lines, so the line-by-line
# load_dataset() defined in this notebook can no longer parse this file.
with open(filename, 'w') as file:
    json.dump(qa_data, file, indent=4)

# download the modified file
# files.download(filename)

Saving primality_testing.json to primality_testing.json
[{'question': 'Is 7411 a prime number?', 'number': 7411, 'right_answer': 'Yes, 7411 is a prime number', 'hallucinated_answer': 'No, 7411 is not a prime number'}, {'question': 'Is 10733 a prime number?', 'number': 10733, 'right_answer': 'Yes, 10733 is a prime number', 'hallucinated_answer': 'No, 10733 is not a prime number'}, {'question': 'Is 4219 a prime number?', 'number': 4219, 'right_answer': 'Yes, 4219 is a prime number', 'hallucinated_answer': 'No, 4219 is not a prime number'}, {'question': 'Is 5471 a prime number?', 'number': 5471, 'right_answer': 'Yes, 5471 is a prime number', 'hallucinated_answer': 'No, 5471 is not a prime number'}, {'question': 'Is 10663 a prime number?', 'number': 10663, 'right_answer': 'Yes, 10663 is a prime number', 'hallucinated_answer': 'No, 10663 is not a prime number'}]


In [None]:
# self-checking
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def hallucination_score(question, target_answer, model, num_samples):
    """Score `target_answer` against repeated answers to the same question.

    Inputs:
    question - the query sent to the model
    target_answer - the answer whose consistency is being assessed
    model - model name handed to Evaluate and SemanticConsistnecyCheck
    num_samples - number of answers requested from self_evaluate_api

    Output:
    first element of the value returned by score_scc_api
    """
    sampler = Evaluate(model=model)
    checker = SemanticConsistnecyCheck(model=model)

    # Step 1: sample the model's own answers (temperature 1.0).
    sampled_answers = sampler.self_evaluate_api(
        self_question=question,
        temperature=1.0,
        self_num=num_samples,
    )

    # Step 2: compare every sampled answer with the target (temperature 0.0).
    check_result = checker.score_scc_api(
        question,
        target_answer,
        candidate_answers=sampled_answers,
        temperature=0.0,
    )

    return check_result[0]

# qa_data is expected to hold the converted records (with 'right_answer' and
# 'hallucinated_answer' keys) produced by modify_dataset(); it is not reloaded here.
# qa_data = load_dataset(dataset_path)

# Define parameters
n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated items are scored against their hallucinated answer
# (label 1), the remaining ones against their right answer (label 0).
n_hallucinated = n_data // 2

# Iterate over different numbers of samples
for num_samples in num_samples_list:
    t0 = time.time()
    halu_score_all = []

    # Iterate over the specified number of data points in qa_data
    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]
        # First block of items: hallucinated target; rest: right target.
        target_answer = qa_item['hallucinated_answer'] if i < n_hallucinated else qa_item['right_answer']

        question = qa_item['question']
        halu_score = hallucination_score(question, target_answer, model, num_samples)
        halu_score_all.append(halu_score)

    # Calculate AUROC
    print(f'Time per query: {(time.time() - t0) / n_data:.4f} seconds')
    # One label per scored item; the previous [0] * (n_data // 2) came up one
    # label short of the score list whenever n_data was odd.
    true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print(f'AUROC score for {num_samples} samples: {roc_auc:.4f}')

100%|██████████| 10/10 [00:29<00:00,  2.91s/it]


Time per query: 2.9117 seconds
AUROC score for 3 samples: 0.6000


100%|██████████| 10/10 [00:36<00:00,  3.60s/it]


Time per query: 3.6045 seconds
AUROC score for 5 samples: 0.6600


100%|██████████| 10/10 [00:41<00:00,  4.19s/it]


Time per query: 4.1877 seconds
AUROC score for 10 samples: 0.5600


 10%|█         | 1/10 [00:05<00:46,  5.20s/it]

The server is overloaded or not ready yet.
Retrying...


100%|██████████| 10/10 [02:34<00:00, 15.41s/it]

Time per query: 15.4062 seconds
AUROC score for 15 samples: 0.6400





In [None]:
# cross consistency check
def sac3_score(question, target_answer, model, num_samples):
    """Score `target_answer` against answers to paraphrases of `question`.

    The question is paraphrased, every paraphrase is answered by `model`, and
    the consistency checker compares each of those answers with the target.

    Inputs:
    question - original user query
    target_answer - the answer whose consistency is being assessed
    model - model name handed to Evaluate and SemanticConsistnecyCheck
    num_samples - forwarded as `self_num` to perb_evaluate_api

    Output:
    first element of the value returned by score_scc_api
    """
    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # Question perturbation (always 5 paraphrases, always via gpt-3.5-turbo)
    gen_question = paraphrase(question, number=5, model='gpt-3.5-turbo', temperature=1.0)

    # Evaluation
    # The self-sampled responses that used to be requested here never entered
    # the score, so that call (num_samples extra API requests per question)
    # has been dropped.
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions=gen_question, temperature=1.0, self_num=num_samples)

    # Consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# sac3 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated items are scored against their hallucinated answer
# (label 1), the remaining ones against their right answer (label 0).
n_hallucinated = n_data // 2

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    # Not used below: the scores are only printed, never written to this file.
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]  # Access the ith element in the qa_data list
        # Strict '<' keeps the targets aligned with true_label below; the
        # previous '<=' scored item n_data // 2 against its hallucinated
        # answer while labelling it 0.
        if i < n_hallucinated:
            target_answer = qa_item['hallucinated_answer']
        else:
            target_answer = qa_item['right_answer']

        question = qa_item['question']
        sac3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sac3_q_score)

    # AUROC
    print('Time per query:', (time.time()-t0)/n_data)
    # One label per scored item, also when n_data is odd.
    true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print(f'AUROC score for {num_samples} samples: {roc_auc:.4f}')

100%|██████████| 10/10 [01:13<00:00,  7.37s/it]


Time per query: 7.375129318237304
AUROC score for 3 samples: 0.5000


100%|██████████| 10/10 [01:16<00:00,  7.67s/it]


Time per query: 7.671560478210449
AUROC score for 5 samples: 0.9400


100%|██████████| 10/10 [01:24<00:00,  8.48s/it]


Time per query: 8.481669783592224
AUROC score for 10 samples: 0.6000


100%|██████████| 10/10 [02:03<00:00, 12.36s/it]

Time per query: 12.362671971321106
AUROC score for 15 samples: 0.5400





In [None]:
# sac3 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# The first n_hallucinated items are scored against their hallucinated answer
# (label 1), the remaining ones against their right answer (label 0).
n_hallucinated = n_data // 2

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    # Not used below: the scores are only printed, never written to this file.
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]  # Access the ith element in the qa_data list
        # Strict '<' keeps the targets aligned with true_label below; the
        # previous '<=' scored item n_data // 2 against its hallucinated
        # answer while labelling it 0.
        if i < n_hallucinated:
            target_answer = qa_item['hallucinated_answer']
        else:
            target_answer = qa_item['right_answer']

        question = qa_item['question']
        sac3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sac3_q_score)

    # AUROC
    print('Time per query:', (time.time()-t0)/n_data)
    # One label per scored item, also when n_data is odd.
    true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print(f'AUROC score for {num_samples} samples: {roc_auc:.4f}')

100%|██████████| 10/10 [01:20<00:00,  8.02s/it]


Time per query: 8.02149477005005
AUROC score for 3 samples: 0.6200


100%|██████████| 10/10 [01:57<00:00, 11.80s/it]


Time per query: 11.7984849691391
AUROC score for 5 samples: 0.6000


100%|██████████| 10/10 [01:22<00:00,  8.23s/it]


Time per query: 8.225377368927003
AUROC score for 10 samples: 0.9800


100%|██████████| 10/10 [01:24<00:00,  8.40s/it]

Time per query: 8.404792785644531
AUROC score for 15 samples: 0.6400





In [None]:
# Accuracy for self-check
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Use n_data = len(qa_data) to run over every record instead of the first 10.
n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

for num_samples in num_samples_list:
    # Every target below is a hallucinated answer, so the expected label is
    # always 1 and "accuracy" is the share of items flagged as hallucinated.
    correct_predictions = 0
    total_predictions = 0

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]
        question = qa_item['question']
        target_answer = qa_item['hallucinated_answer']

        halu_score = hallucination_score(question, target_answer, model, num_samples)

        # Threshold the score at 0.5: 1 = flagged as hallucinated, 0 = not flagged.
        predicted_label = int(halu_score >= 0.5)

        # A flagged item is a correct prediction, so the label itself is the increment.
        correct_predictions += predicted_label
        total_predictions += 1

    # Share of hallucinated targets that were flagged
    accuracy = correct_predictions / total_predictions
    print(f'Accuracy for {num_samples} samples: {accuracy:.4f}')

NameError: name 'hallucination_score' is not defined

In [None]:
# Accuracy for cross-check
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Use n_data = len(qa_data) to run over every record instead of the first 10.
n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

for num_samples in num_samples_list:
    # Every target below is a hallucinated answer, so the expected label is
    # always 1 and "accuracy" is the share of items flagged as hallucinated.
    correct_predictions = 0
    total_predictions = 0

    for i in tqdm(range(n_data)):
        qa_item = qa_data[i]
        question = qa_item['question']
        target_answer = qa_item['hallucinated_answer']

        halu_score = sac3_score(question, target_answer, model, num_samples)

        # Threshold the score at 0.5: 1 = flagged as hallucinated, 0 = not flagged.
        predicted_label = int(halu_score >= 0.5)

        # A flagged item is a correct prediction, so the label itself is the increment.
        correct_predictions += predicted_label
        total_predictions += 1

    # Share of hallucinated targets that were flagged
    accuracy = correct_predictions / total_predictions
    print(f'Accuracy for {num_samples} samples: {accuracy:.4f}')

Using the senator dataset and calculating AUROC

In [None]:
# LLM Models
import os
import openai
import torch
from peft import PeftModel
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Initialize OpenAI API
# Never commit an API key to a notebook: read it from the environment.
# The key that was hard-coded here has been exposed and should be revoked.
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this notebook.')

class BatchSizeException(Exception):
    """Raised when the API reports that a request exceeds a maximum size."""


def call_openai_model(prompt, model, temperature):
    """Send one chat request to OpenAI and return the reply text.

    Inputs:
    prompt - text sent as the user message
    model - OpenAI chat model name
    temperature - sampling temperature passed to the API

    Output:
    output - content of the first choice, or the fixed fallback string
             'do not have reponse from chatgpt' if it cannot be read

    Raises:
    BatchSizeException - the error message contains 'is greater than the maximum'
    openai.error.AuthenticationError - the API key is rejected; retrying
                                       cannot succeed, so it is re-raised

    Any other error is printed and the request is retried after 2 seconds,
    without an upper bound on the number of attempts.
    """
    response = None
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
            )
        except Exception as e:
            if 'is greater than the maximum' in str(e):
                # BatchSizeException used to be undefined, so this line
                # raised NameError instead of the intended exception.
                raise BatchSizeException(str(e)) from e
            if isinstance(e, openai.error.AuthenticationError):
                # A bad or missing key never recovers; do not loop forever.
                raise
            print(e)
            print('Retrying...')
            time.sleep(2)

    # Only reached once a response object exists.
    try:
        output = response.choices[0].message.content
    except Exception:
        output = 'do not have reponse from chatgpt'
    return output


# Holds the Guanaco model and tokenizer once loaded, so they are built once
# per kernel instead of once per generated answer.
_GUANACO_33B_CACHE = {}


def _load_guanaco_33b():
    """Return the (model, tokenizer) pair for Guanaco-33B, loading on first use."""
    if not _GUANACO_33B_CACHE:
        # 16 float
        model_name = "huggyllama/llama-30b"
        adapters_name = 'timdettmers/guanaco-33b'
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # offload_folder="/home/ec2-user/SageMaker/hf_cache",
            max_memory={i: '16384MB' for i in range(torch.cuda.device_count())},  # V100 16GB
        )
        model = PeftModel.from_pretrained(model, adapters_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Filled only after both loads succeed, so a failed load is retried.
        _GUANACO_33B_CACHE['model'] = model
        _GUANACO_33B_CACHE['tokenizer'] = tokenizer
    return _GUANACO_33B_CACHE['model'], _GUANACO_33B_CACHE['tokenizer']


def call_guanaco_33b(prompt, max_new_tokens):
    """Generate a reply to `prompt` with Guanaco-33B (LLaMA-30B + PEFT adapter).

    Inputs:
    prompt - user text inserted into the "### Human: ... ### Assistant:" template
    max_new_tokens - generation limit passed to model.generate

    Output:
    output - the second and third '###'-separated segments of the decoded
             text, concatenated. With the template below these are the Human
             and Assistant turns, provided `prompt` itself contains no '###'.
    """
    model, tokenizer = _load_guanaco_33b()

    # prompt
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, concise, and polite answers to the user's questions.\n"
        f"### Human: {prompt} ### Assistant:"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda:0")
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    res = tokenizer.decode(outputs[0], skip_special_tokens=True)
    res_sp = res.split('###')
    output = res_sp[1] + res_sp[2]

    return output


# Holds the Falcon tokenizer and text-generation pipeline once built, so they
# are created once per kernel instead of once per generated answer.
_FALCON_7B_CACHE = {}


def _load_falcon_7b():
    """Return the (tokenizer, pipeline) pair for Falcon-7B-Instruct, building on first use."""
    if not _FALCON_7B_CACHE:
        # 16 float
        model = "tiiuae/falcon-7b-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model)
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto"
        )
        # Filled only after both objects exist, so a failed build is retried.
        _FALCON_7B_CACHE['tokenizer'] = tokenizer
        _FALCON_7B_CACHE['pipeline'] = pipeline
    return _FALCON_7B_CACHE['tokenizer'], _FALCON_7B_CACHE['pipeline']


def call_falcon_7b(prompt, max_new_tokens):
    """Generate text for `prompt` with the Falcon-7B-Instruct pipeline.

    Inputs:
    prompt - text passed to the text-generation pipeline
    max_new_tokens - generation limit passed to the pipeline

    Output:
    res - 'generated_text' of the last returned sequence (one sequence is
          requested via num_return_sequences=1)

    Raises:
    RuntimeError - the pipeline returned no sequence (this used to surface as
                   an unbound-variable error)
    """
    tokenizer, pipeline = _load_falcon_7b()
    sequences = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    if not sequences:
        raise RuntimeError('falcon-7b-instruct returned no generated sequence')

    return sequences[-1]['generated_text']

In [None]:
# Paraphraser
def paraphrase(question, number, model, temperature):
    '''
    Ask the model for semantically equivalent rewordings of a question.

    Inputs:
    question - original user query
    number - how many perturbed questions to request in the prompt
    model - model name passed to call_openai_model
    temperature - sampling temperature passed to call_openai_model

    Output:
    perb_questions - the non-blank lines of the model's reply, whitespace-
                     stripped, in reply order. Any numbering the model adds is
                     kept, and the count is whatever the model returned; it is
                     not checked against `number`.
    '''

    prompt_temp = f'For question Q, provide {number} semantically equivalent questions.'
    prompt = prompt_temp + '\nQ:' + question

    res = call_openai_model(prompt, model, temperature) # openai model call

    # Blank separator lines used to be returned as empty "questions" and were
    # then sent to the model as prompts; drop them.
    perb_questions = [line.strip() for line in res.split('\n') if line.strip()]

    return perb_questions

In [None]:
# Consistency Check
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class SemanticConsistnecyCheck:
    """Ask an LLM whether candidate answers are equivalent to a target answer.

    For every candidate a prompt holding two QA pairs (target and candidate)
    is sent to the model; a reply whose guess is "Yes" counts as consistent
    (0), anything else as inconsistent (1). The score is the mean of those
    values, so higher means less consistent.
    """

    def __init__(self, model):
        self.model = model
        self.prompt_temp = """
        Are the following two Question-Answer(QA) pairs semantically equivalent?
        Provide your best guess and the probability that it is correct (0.0 to 1.0).
        Given ONLY the guess (Yes or No) and probability, no other words or explanation.
        For example:
        Guess: <most likely guess, as short as possible; not a complete sentence, just the guess!>
        Probability: <the probability between 0.0 and 1.0 that your guess is correct, without any extra commentary whatsoever;
        just the probability!>
        """

    def openai_api_parallel(self, prompt, temperature):
        """Send one prompt to the configured model and return the reply text."""
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    @staticmethod
    def _check_inputs(target_answer, candidate_answers):
        """Reject inputs for which no score can be computed."""
        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")
        if len(candidate_answers) == 0:
            # Previously a ZeroDivisionError (score_scc) or an opaque
            # max_workers error (score_scc_api).
            raise ValueError("Candidate answers cannot be empty. ")

    def _build_prompt(self, question, target_answer, candidate_answer):
        """Build the comparison prompt for one target/candidate pair."""
        target_pair = 'Q:' + question + '\nA:' + str(target_answer)
        candidate_pair = 'Q:' + question + '\nA:' + str(candidate_answer)
        return self.prompt_temp + '\nThe first QA pair is:\n' + target_pair + '\nThe second QA pair is:\n' + candidate_pair

    @staticmethod
    def _inconsistency_value(res):
        """Map one model reply to 0 (guess is Yes) or 1 (anything else)."""
        parts = res.split(':')
        if len(parts) < 2:
            # No "Guess:" field at all, e.g. the fallback text returned when
            # no response could be read. Treated as inconsistent.
            return 1
        guess = parts[1].split('\n')[0].strip()
        words = guess.split()
        # Compare only the first word, ignoring case and punctuation, so that
        # "yes", "Yes." or "Yes, ..." are all read as Yes.
        first_word = words[0].strip('.,;!\'"').lower() if words else ''
        return 0 if first_word == 'yes' else 1

    def score_scc_api(self, question, target_answer, candidate_answers, temperature):
        '''
        Threaded variant of score_scc: one request per candidate, run concurrently.

        Inputs:
        question - original user query
        target_answer - answer the candidates are compared with (must not be None)
        candidate_answers - non-empty sequence of answers to compare
        temperature - passed to the model call

        Outputs:
        score - mean of sc_output (inconsistency score)
        sc_output - 0/1 value per candidate, in the order of candidate_answers
        '''
        self._check_inputs(target_answer, candidate_answers)
        num_candidate_answer = len(candidate_answers)

        with ThreadPoolExecutor(max_workers=num_candidate_answer) as executor:
            all_res = [
                executor.submit(
                    self.openai_api_parallel,
                    self._build_prompt(question, target_answer, candidate),
                    temperature,
                )
                for candidate in candidate_answers
            ]
            # Read the futures in submission order (not completion order) so
            # sc_output[i] belongs to candidate_answers[i].
            sc_output = [self._inconsistency_value(future.result()) for future in all_res]

        score = sum(sc_output)/num_candidate_answer
        return score, sc_output

    def score_scc(self, question, target_answer, candidate_answers, temperature):
        '''
        Inputs:
        question - original user query
        target_answer - generated response given the original question (temp=0) if not provided by user
        candidate_answers - generated responses given the question (original + perturbed)
        temperature - [0,1] for LLM randomness

        Outputs:
        score - inconsistency score (hallucination metric)
        sc_output - specific score for each candidate answers compared with the target answer
        '''
        self._check_inputs(target_answer, candidate_answers)
        num_candidate_answer = len(candidate_answers)

        sc_output = []
        for candidate in candidate_answers:
            prompt = self._build_prompt(question, target_answer, candidate)
            res = self.openai_api_parallel(prompt, temperature)
            sc_output.append(self._inconsistency_value(res))

        score = sum(sc_output)/num_candidate_answer
        return score, sc_output

In [None]:
# Evaluator
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class Evaluate:
    """Collect model answers to a question or to a list of perturbed questions.

    The *_api methods always go through the OpenAI call and run requests in a
    thread pool; the plain methods run sequentially and dispatch on
    `self.model` (GPT models, guanaco-33b, falcon-7b-instruct).
    """

    def __init__(self, model):
        self.model = model
        self.prompt_temp = 'Please answer the following question:\n'

    def openai_api_parallel(self, prompt, temperature):
        """Send one prompt to the configured OpenAI model and return the reply text."""
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    def _generate(self, prompt, temperature):
        """Answer one prompt with the backend selected by self.model."""
        # llm model: GPTs, open-source models (falcon, guanaco)
        if self.model in ['gpt-3.5-turbo','gpt-4']:
            return self.openai_api_parallel(prompt, temperature)
        if self.model == 'timdettmers/guanaco-33b':
            return call_guanaco_33b(prompt, max_new_tokens = 200)
        if self.model == 'tiiuae/falcon-7b-instruct':
            return call_falcon_7b(prompt, max_new_tokens = 200)
        # An unknown model used to leave `res` unbound on the first iteration,
        # or silently repeat the previous answer on later ones.
        raise ValueError(f'Unsupported model: {self.model!r}')

    def _run_parallel(self, prompts, temperature, max_workers):
        """Send all prompts concurrently; return the replies in prompt order."""
        if not prompts:
            return []
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            futures = [executor.submit(self.openai_api_parallel, prompt, temperature) for prompt in prompts]
            # Submission order rather than completion order keeps the result
            # deterministic and aligned with `prompts`.
            return [future.result() for future in futures]

    def self_evaluate_api(self, self_question, temperature, self_num):
        '''
        Threaded variant of self_evaluate (OpenAI only).

        Inputs:
        self_question - original user query
        temperature - passed to the model call
        self_num - how many responses to request (0 or less yields an empty list)

        Outputs:
        self_responses - list of self_num generated responses

        Note: the prompt here is prompt_temp + question, without the '\\nQ:'
        separator that self_evaluate inserts.
        '''
        prompt = self.prompt_temp + self_question
        return self._run_parallel([prompt] * self_num, temperature, max_workers=self_num)

    def self_evaluate(self, self_question, temperature, self_num):
        '''
        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness
        self_num - how many generated responses given this question

        Outputs:
        self_responses - generated responses given this question with different temperatures

        Raises:
        ValueError - self.model is not one of the supported models
        '''
        prompt = self.prompt_temp + '\nQ:' + self_question
        return [self._generate(prompt, temperature) for _ in range(self_num)]

    def perb_evaluate_api(self, perb_questions, temperature, self_num):
        '''
        Threaded variant of perb_evaluate (OpenAI only).

        Inputs:
        perb_questions - perturbed questions to answer, one request each
        temperature - passed to the model call
        self_num - only sizes the thread pool (self_num + 2 workers); it does
                   not change how many responses are produced

        Outputs:
        perb_responses - one response per perturbed question, in question order
        '''
        prompts = [self.prompt_temp + perb_question for perb_question in perb_questions]
        return self._run_parallel(prompts, temperature, max_workers=self_num + 2)

    def perb_evaluate(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses given the perturbed questions

        Raises:
        ValueError - self.model is not one of the supported models
        '''
        return [
            self._generate(self.prompt_temp + '\nQ:' + perb_question, temperature)
            for perb_question in perb_questions
        ]

In [None]:
# Colab-only: files.upload() opens a file picker in the browser. The cell
# output below shows the chosen file being saved under its own name, which is
# why the relative path 'senator_search.json' can be opened afterwards.
from google.colab import files
uploaded = files.upload()

import json

def load_dataset(filepath):
    """Load a file that holds one JSON document per line.

    Inputs:
    filepath - path of the file to read

    Output:
    data - list with one parsed JSON value per non-blank line, in file order

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising json.JSONDecodeError. The file is read as
    UTF-8, the standard encoding for JSON, rather than the platform default.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data

dataset_path = 'senator_search.json'
qa_data = load_dataset(dataset_path)
# load_dataset() returns one entry per line of the file. The recorded output
# of this cell shows the questions nested one level down (qa_data[0]), so the
# preview looks there when that entry is a list.
# Print a short preview instead of dumping the whole dataset into the output.
print(f'Loaded {len(qa_data)} line(s) from {dataset_path}')
preview = qa_data[0][:3] if qa_data and isinstance(qa_data[0], list) else qa_data[:3]
print(preview)

Saving senator_search.json to senator_search.json
[['Was there ever a US senator that represented the state of Alabama and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Alaska and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Arizona and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Arkansas and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Colorado and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Connecticut and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Florida and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Georgia and whose alma mater was MIT?', 'Was there ever a US senator that represented the state of Hawaii and whose alma mater was MIT?', 'Was there ever a US senator that represented the sta

In [None]:
# self-checking score
from sklearn.metrics import roc_auc_score
import time
from tqdm import tqdm

def hallucination_score(question, target_answer, model, num_samples):
    """Score `target_answer` against repeated answers to the same question.

    Inputs:
    question - the query sent to the model
    target_answer - the answer whose consistency is being assessed
    model - model name handed to Evaluate and SemanticConsistnecyCheck
    num_samples - number of answers requested from self_evaluate_api

    Output:
    first element of the value returned by score_scc_api
    """
    sampler = Evaluate(model=model)
    checker = SemanticConsistnecyCheck(model=model)

    # Sample the model's own answers at temperature 1.0.
    sampled_answers = sampler.self_evaluate_api(
        self_question=question,
        temperature=1.0,
        self_num=num_samples,
    )

    # Compare each sampled answer with the target at temperature 0.0.
    check_result = checker.score_scc_api(
        question,
        target_answer,
        candidate_answers=sampled_answers,
        temperature=0.0,
    )

    return check_result[0]

# n_data = len(qa_data[0])  # Number of data points in the dataset
n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3,5,10,15]

# The first n_hallucinated questions get the target "Yes" (label 1), the
# remaining ones the target "No" (label 0).
n_hallucinated = n_data // 2

for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []
    # Not used below: the scores are only printed, never written to this file.
    filename = 'halu_fastsac3_' + str(n_data) + '_' + str(num_samples) + '.txt'

    for i in tqdm(range(n_data)):
        # Strict '<' keeps the targets aligned with true_label below; the
        # previous '<=' gave item n_data // 2 the target "Yes" but label 0.
        if i < n_hallucinated:
            target_answer = "Yes"
        else:
            target_answer = "No"

        question = qa_data[0][i]
        halu_score = hallucination_score(question, target_answer, model, num_samples)
        halu_score_all.append(halu_score)

    # auroc
    print('Time per query', (time.time()-t0)/n_data)
    # One label per scored item, also when n_data is odd.
    true_label = [1] * n_hallucinated + [0] * (n_data - n_hallucinated)
    roc_auc = roc_auc_score(true_label,halu_score_all)
    print('AUROC score', roc_auc)

100%|██████████| 10/10 [00:25<00:00,  2.58s/it]


Time per query 2.5830122232437134
AUROC score 0.2


100%|██████████| 10/10 [00:55<00:00,  5.55s/it]


Time per query 5.546516036987304
AUROC score 0.09999999999999998


100%|██████████| 10/10 [00:53<00:00,  5.37s/it]


Time per query 5.3655956268310545
AUROC score 0.27999999999999997


100%|██████████| 10/10 [01:35<00:00,  9.50s/it]

Time per query 9.50280704498291
AUROC score 0.019999999999999997





In [None]:
# cross consistency check
def sac3_score(question, target_answer, model, num_samples):
    """Score `target_answer` against answers to paraphrases of `question`.

    The question is paraphrased, every paraphrase is answered by `model`, and
    the consistency checker compares each of those answers with the target.

    Inputs:
    question - original user query
    target_answer - the answer whose consistency is being assessed
    model - model name handed to Evaluate and SemanticConsistnecyCheck
    num_samples - forwarded as `self_num` to perb_evaluate_api, where it only
                  sizes the thread pool

    Output:
    first element of the value returned by score_scc_api
    """
    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # question perturbation (always 5 paraphrases, always via gpt-3.5-turbo)
    gen_question = paraphrase(question, number=5, model='gpt-3.5-turbo', temperature=1.0)

    # evaluation
    # The self-sampled responses that used to be requested here never entered
    # the score, so that call (num_samples extra API requests per question)
    # has been dropped.
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions=gen_question, temperature=1.0, self_num=num_samples)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# sac3 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# SAC3 benchmark: one timed pass over the first `n_data` questions per sample budget.
for num_samples in num_samples_list:
    halu_score_all = []
    t0 = time.time()

    for i in tqdm(range(n_data)):
        # Questions are read from the first list stored in qa_data.
        question = qa_data[0][i]

        # Items before the midpoint are checked against "Yes", the others against "No".
        if i < n_data // 2:
            target_answer = "Yes"
        else:
            target_answer = "No"

        sac3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sac3_q_score)

    # Mean wall-clock time per question, then AUROC against the half/half labels.
    print('Time per query:', (time.time() - t0) / n_data)
    half = n_data // 2
    true_label = [1] * half + [0] * half
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print(f'AUROC score for {num_samples} samples:', roc_auc)

100%|██████████| 10/10 [00:59<00:00,  5.95s/it]


Time per query: 5.950971961021423
AUROC score for 3 samples: 0.19999999999999998


100%|██████████| 10/10 [01:41<00:00, 10.17s/it]


Time per query: 10.16712384223938
AUROC score for 5 samples: 0.18


100%|██████████| 10/10 [01:03<00:00,  6.33s/it]


Time per query: 6.331497049331665
AUROC score for 10 samples: 0.14


100%|██████████| 10/10 [01:44<00:00, 10.41s/it]

Time per query: 10.408324003219604
AUROC score for 15 samples: 0.019999999999999997





In [None]:
# Change data structure by adding hallucinated answers and modifying factual answers (i.e., same data structure as the hotpotQA_halu dataset)
from google.colab import files
import json
import re

def extract_details(question):
    """Pull the state and alma mater out of a senator-search question.

    Expects text shaped like
    "... the state of <state> and whose alma mater was <school>?".

    Args:
        question: The question string to parse.

    Returns:
        A `(state, alma_mater)` tuple of stripped strings. Either element is
        "Unknown" when its pattern is not found in `question`.
    """
    # Non-greedy: stop at the first " and". A greedy match runs on to the last " and",
    # which swallows most of the sentence for schools such as "College of William and Mary".
    state_match = re.search(r"the state of ([\w\s]+?) and", question)
    # Take everything up to the question mark so names containing punctuation
    # (e.g. "Texas A&M") are captured instead of falling back to "Unknown".
    alma_mater_match = re.search(r"alma mater was ([^?]+)\?", question)

    state = state_match.group(1).strip() if state_match else "Unknown"
    alma_mater = alma_mater_match.group(1).strip() if alma_mater_match else "Unknown"

    return state, alma_mater

def modify_dataset(data):
    """Wrap each question in a record with a right and a hallucinated answer.

    Args:
        data: Iterable of questions.

    Returns:
        A list with one dict per question, in input order, holding the keys
        'question', 'right_answer' (always "No") and 'hallucinated_answer'
        (always "Yes").
    """
    # Both answers are fixed labels, so nothing needs to be parsed out of the
    # question text here; the unused state / alma mater extraction was dropped.
    return [
        {
            'question': question,
            'right_answer': "No",
            'hallucinated_answer': "Yes",
        }
        for question in data
    ]

# Upload a file from your local file system
uploaded = files.upload()

# `uploaded` is iterated for file names; if it is empty there is nothing to process,
# so fail with a clear message instead of a bare StopIteration from next().
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and choose the questions JSON file.')

# Only the first uploaded file is processed.
filename = next(iter(uploaded))

# Load the dataset (explicit encoding so the read does not depend on the platform default)
with open(filename, 'r', encoding='utf-8') as file:
    qa_data = json.load(file)

# Modify the dataset based on the original structure
modified_qa_data = modify_dataset(qa_data)

# Save the modified data back to a file
modified_filename = 'modified_' + filename
with open(modified_filename, 'w', encoding='utf-8') as file:
    json.dump(modified_qa_data, file, indent=4)

# Show the size and a short preview rather than dumping every record into the cell output.
print(f'{len(modified_qa_data)} records written to {modified_filename}')
print(modified_qa_data[:3])

# Uncomment the next line if running in Google Colab to download the modified file
# files.download(modified_filename)

Saving senator_search.json to senator_search.json
[{'question': 'Was there ever a US senator that represented the state of Alabama and whose alma mater was MIT?', 'state': 'Alabama', 'alma_mater': 'MIT', 'right_answer': 'No, there was no US senator that represented the state of Alabama and whose alma mater was MIT', 'hallucinated_answer': 'Yes, there was a US senator that represented the state of Alabama and whose alma mater was MIT'}, {'question': 'Was there ever a US senator that represented the state of Alaska and whose alma mater was MIT?', 'state': 'Alaska', 'alma_mater': 'MIT', 'right_answer': 'No, there was no US senator that represented the state of Alaska and whose alma mater was MIT', 'hallucinated_answer': 'Yes, there was a US senator that represented the state of Alaska and whose alma mater was MIT'}, {'question': 'Was there ever a US senator that represented the state of Arizona and whose alma mater was MIT?', 'state': 'Arizona', 'alma_mater': 'MIT', 'right_answer': 'No, t

In [None]:
# self-checking score
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def hallucination_score(question, target_answer, model, num_samples):
    """Score `target_answer` against answers sampled for `question` itself.

    Args:
        question: The question text.
        target_answer: The answer whose consistency is being checked.
        model: Model identifier passed to `Evaluate` and `SemanticConsistnecyCheck`.
        num_samples: Forwarded as `self_num` to `self_evaluate_api`.

    Returns:
        The first element of the value returned by `score_scc_api`.
    """
    llm_evaluate = Evaluate(model=model)
    # NOTE(review): the other cells in this notebook that build the checker (the `sac3_score`
    # definitions) spell the class `SemanticConsistnecyCheck`; this function used
    # `SemanticConsistencyCheck`, which presumably is not defined. Confirm the class name.
    scc = SemanticConsistnecyCheck(model=model)

    # Sample answers to the unmodified question.
    fast_self_responses = llm_evaluate.self_evaluate_api(self_question=question, temperature=1.0, self_num=num_samples)

    # Check the target answer against those sampled answers.
    fast_consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=fast_self_responses, temperature=0.0)

    return fast_consistency_res[0]

# n_data = len(qa_data)
n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# Self-check benchmark: one timed pass over the first `n_data` records per sample budget.
for num_samples in num_samples_list:
    halu_score_all, true_label = [], []
    t0 = time.time()

    for i in range(n_data):
        qa_item = modified_qa_data[i]
        question = qa_item['question']

        # Records before the midpoint are paired with their hallucinated answer (label 0),
        # the remaining ones with their right answer (label 1).
        in_first_half = i < n_data // 2
        answer_key = 'hallucinated_answer' if in_first_half else 'right_answer'
        target_answer = qa_item[answer_key]
        true_label.append(0 if in_first_half else 1)

        halu_score = hallucination_score(question, target_answer, model, num_samples)
        halu_score_all.append(halu_score)

    # Mean wall-clock time per record, then AUROC of the scores against the labels.
    print('Time per query:', (time.time() - t0) / n_data)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print(f'AUROC score for {num_samples} samples: {roc_auc:.4f}')

100%|██████████| 10/10 [00:25<00:00,  2.60s/it]


Time per query: 2.5985838413238525
AUROC score: 0.0


100%|██████████| 10/10 [00:30<00:00,  3.03s/it]


Time per query: 3.027724027633667
AUROC score: 0.12


100%|██████████| 10/10 [00:34<00:00,  3.47s/it]


Time per query: 3.473820424079895
AUROC score: 0.08000000000000002


 50%|█████     | 5/10 [00:18<00:18,  3.79s/it]

Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to lear

 60%|██████    | 6/10 [00:39<00:38,  9.63s/it]

Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to lear

 70%|███████   | 7/10 [04:58<04:33, 91.09s/it]

Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to lear

 80%|████████  | 8/10 [09:16<04:48, 144.30s/it]

Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to lear

 90%|█████████ | 9/10 [13:37<03:00, 180.68s/it]

Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...
Rate limit reached for gpt-3.5-turbo in organization org-t3JmggjoigjY39fcvWkQTFZY on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to lear

In [None]:
# cross consistency check
def sac3_score(question, target_answer, model, num_samples):
    """Score `target_answer` against answers sampled for paraphrases of `question`.

    Args:
        question: The original question text.
        target_answer: The answer whose consistency is being checked.
        model: Model identifier passed to `Evaluate` and `SemanticConsistnecyCheck`.
        num_samples: Forwarded as `self_num` to `perb_evaluate_api`.

    Returns:
        The first element of the value returned by `score_scc_api`.
    """
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # Question perturbation: paraphrases are always requested from 'gpt-3.5-turbo',
    # independently of the `model` argument.
    gen_question = paraphrase(question, number=5, model='gpt-3.5-turbo', temperature=1.0)

    # NOTE(review): a `self_evaluate_api` call used to sit here, but its result was never read,
    # so it only spent extra API requests. Dropped on the assumption that the call has no
    # side effect the scoring relies on -- confirm against `Evaluate`.
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions=gen_question, temperature=1.0, self_num=num_samples)

    # Consistency check of the target answer against the perturbed-question responses only.
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# sac3 scores
import time
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 10
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]

# SAC3 benchmark on the modified dataset: one timed pass per sample budget.
for num_samples in num_samples_list:
    halu_score_all, true_label = [], []
    t0 = time.time()

    for i in range(n_data):
        qa_item = modified_qa_data[i]
        question = qa_item['question']

        # Records before the midpoint are paired with their hallucinated answer (label 0),
        # the remaining ones with their right answer (label 1).
        in_first_half = i < n_data // 2
        answer_key = 'hallucinated_answer' if in_first_half else 'right_answer'
        target_answer = qa_item[answer_key]
        true_label.append(0 if in_first_half else 1)

        sac3_q_score = sac3_score(question, target_answer, model, num_samples)
        halu_score_all.append(sac3_q_score)

    # Mean wall-clock time per record, then AUROC of the scores against the labels.
    print('Time per query:', (time.time() - t0) / n_data)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print(f'AUROC score for {num_samples} samples: {roc_auc:.4f}')

100%|██████████| 10/10 [01:02<00:00,  6.26s/it]


Time per query: 6.263842630386352
AUROC score for 3 samples: 0.7600


100%|██████████| 10/10 [01:02<00:00,  6.26s/it]


Time per query: 6.256703352928161
AUROC score for 5 samples: 0.8200


100%|██████████| 10/10 [01:08<00:00,  6.89s/it]


Time per query: 6.894838547706604
AUROC score for 10 samples: 0.7800


100%|██████████| 10/10 [01:42<00:00, 10.26s/it]

Time per query: 10.262582755088806
AUROC score for 15 samples: 0.6600





In [None]:
# Accuracy for self-check
from sklearn.metrics import accuracy_score

model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]
n_data = len(modified_qa_data)

# Self-check accuracy: every record is scored against its hallucinated answer.
for num_samples in num_samples_list:
    true_labels, predicted_labels = [], []

    for i in range(n_data):
        qa_item = modified_qa_data[i]
        question = qa_item['question']
        target_answer = qa_item['hallucinated_answer']

        # The reference label is 0 for every record, so the accuracy printed below
        # equals the share of records whose predicted label is 0.
        true_labels.append(0)

        halu_score = hallucination_score(question, target_answer, model, num_samples)

        # Threshold the score at 0.5: below it the prediction is 0, otherwise 1.
        if halu_score < 0.5:
            predicted_label = 0
        else:
            predicted_label = 1
        predicted_labels.append(predicted_label)

    # Accuracy for this sample budget
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f'Accuracy for {num_samples} samples: {accuracy:.4f}')

In [None]:
# Accuracy for cross-check
from sklearn.metrics import accuracy_score

model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]
n_data = len(modified_qa_data)

# Cross-check accuracy: every record is scored against its hallucinated answer via sac3_score.
for num_samples in num_samples_list:
    true_labels, predicted_labels = [], []

    for i in range(n_data):
        qa_item = modified_qa_data[i]
        question = qa_item['question']
        target_answer = qa_item['hallucinated_answer']

        # The reference label is 0 for every record, so the accuracy printed below
        # equals the share of records whose predicted label is 0.
        true_labels.append(0)

        halu_score = sac3_score(question, target_answer, model, num_samples)

        # Threshold the score at 0.5: below it the prediction is 0, otherwise 1.
        if halu_score < 0.5:
            predicted_label = 0
        else:
            predicted_label = 1
        predicted_labels.append(predicted_label)

    # Accuracy for this sample budget
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f'Accuracy for {num_samples} samples: {accuracy:.4f}')