In [None]:
!pip install openai==0.28.0
!pip install peft
!pip install transformers
!pip install numpy
!pip install torch

Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.

In [None]:
import getpass
import os  # imported locally: this cell runs before the main import cell

# Read the key without echoing it, then export it for the OpenAI client.
# os.environ is used instead of the `%env` magic because `%env NAME=value`
# prints the assigned value, which would store the secret in the cell output.
openai_api_key = getpass.getpass('Enter your OpenAI API key:')
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
# LLMs
import openai
import torch
from peft import PeftModel
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import os

# Initialize OpenAI API
openai.api_key = os.environ["OPENAI_API_KEY"]

# The retry loop below raises BatchSizeException, which is not defined anywhere
# in the visible notebook. Define it unless some other cell already did.
try:
    BatchSizeException
except NameError:
    class BatchSizeException(Exception):
        """Raised when the API reports that a request exceeds a maximum size."""


def call_openai_model(prompt, model, temperature):
    '''
    Send a single-turn chat request to an OpenAI chat model.

    Inputs:
    prompt - user message content
    model - OpenAI chat model name
    temperature - sampling temperature forwarded to the API

    Output:
    output - text of the first choice, or the fallback string
             'do not have reponse from chatgpt' if the response cannot be read

    Raises:
    BatchSizeException - when the API error message contains
                         'is greater than the maximum'

    Any other API error is printed and the request is retried after a
    2 second pause; there is no upper bound on the number of retries.
    '''
    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
            )
        except Exception as e:
            if 'is greater than the maximum' in str(e):
                # Retrying cannot help when the request itself is too large.
                raise BatchSizeException(str(e)) from e
            print(e)
            print('Retrying...')
            time.sleep(2)
            continue

        if response is None:
            # Keep the original behaviour: an empty response means "ask again".
            continue

        # Only parse a response that was actually received.
        try:
            return response.choices[0].message.content
        except Exception:
            return 'do not have reponse from chatgpt'


# Holds the loaded (model, tokenizer) pair so the base model and adapter are
# loaded once per kernel instead of on every generation request.
_GUANACO_CACHE = {}


def _load_guanaco():
    """Return the cached Guanaco ``(model, tokenizer)`` pair, loading it on first use."""
    if "pair" not in _GUANACO_CACHE:
        # 16 float
        model_name = "huggyllama/llama-30b"
        adapters_name = 'timdettmers/guanaco-33b'
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # offload_folder="/home/ec2-user/SageMaker/hf_cache",
            max_memory={i: '16384MB' for i in range(torch.cuda.device_count())},  # V100 16GB
        )
        model = PeftModel.from_pretrained(model, adapters_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _GUANACO_CACHE["pair"] = (model, tokenizer)
    return _GUANACO_CACHE["pair"]


def call_guanaco_33b(prompt, max_new_tokens):
    '''
    Generate a reply with the Guanaco-33B adapter on top of LLaMA-30B.

    Inputs:
    prompt - user message inserted into the "### Human: ... ### Assistant:" template
    max_new_tokens - generation budget passed to model.generate

    Output:
    output - the decoded text between the first and third '###' markers,
             i.e. the " Human: ..." segment followed by the " Assistant: ..." segment
    '''
    model, tokenizer = _load_guanaco()

    # prompt (the two sentences are separated by a space; the original
    # concatenation fused them into "assistant.The assistant")
    formatted_prompt = (
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, concise, and polite answers to the user's questions.\n"
        f"### Human: {prompt} ### Assistant:"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda:0")
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    res = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Segment 0 is the system preamble; segments 1 and 2 are the human and
    # assistant turns. Slicing avoids an IndexError if fewer segments come back.
    res_sp = res.split('###')
    output = ''.join(res_sp[1:3])

    return output


# Holds the loaded (pipeline, tokenizer) pair so the model is loaded once per
# kernel instead of on every generation request.
_FALCON_CACHE = {}


def _load_falcon():
    """Return the cached Falcon ``(pipeline, tokenizer)`` pair, building it on first use."""
    if "pair" not in _FALCON_CACHE:
        # 16 float
        model = "tiiuae/falcon-7b-instruct"
        tokenizer = AutoTokenizer.from_pretrained(model)
        generator = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto"
        )
        _FALCON_CACHE["pair"] = (generator, tokenizer)
    return _FALCON_CACHE["pair"]


def call_falcon_7b(prompt, max_new_tokens):
    '''
    Generate text with the falcon-7b-instruct text-generation pipeline.

    Inputs:
    prompt - text passed to the pipeline as-is
    max_new_tokens - generation budget passed to the pipeline

    Output:
    res - 'generated_text' of the last returned sequence
          (one sequence is requested via num_return_sequences=1)

    Raises:
    RuntimeError - if the pipeline returns no sequences
    '''
    generator, tokenizer = _load_falcon()
    sequences = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    if not sequences:
        # Previously this fell through to an unbound local variable.
        raise RuntimeError("falcon-7b-instruct returned no sequences")

    return sequences[-1]['generated_text']

ModuleNotFoundError: No module named 'openai'

In [None]:
# paraphraser
def paraphrase(question, number, model, temperature):
    '''
    Ask an OpenAI chat model for rephrasings of a question.

    Inputs:
    question - original user query
    number - how many perturbed questions to request
    model - OpenAI chat model name (the call goes through call_openai_model)
    temperature - typically we use 0 here

    Output:
    perb_questions - perturbed questions parsed from the model reply, one per
                     non-empty line, with list numbering / bullets and
                     surrounding quotes removed. If the reply contains
                     numbered or bulleted lines, only those lines are kept,
                     so an introductory sentence is not treated as a question.
                     The length is not forced to equal `number`.
    '''
    import re  # local import: only this function needs it

    prompt_temp = f'For question Q, provide {number} semantically equivalent questions.'
    prompt = prompt_temp + '\nQ:' + question

    res = call_openai_model(prompt, model, temperature) # openai model call

    # "1. ", "2) ", "3: ", "- ", "* " ... The trailing whitespace requirement
    # keeps questions that merely start with a number ("3.2 is ...") intact.
    enum_prefix = re.compile(r'^(?:\d+\s*[.):]|[-*\u2022])\s+')
    quote_pairs = {'"': '"', "'": "'", '\u201c': '\u201d'}

    lines = [line.strip() for line in res.splitlines() if line.strip()]
    listed = [line for line in lines if enum_prefix.match(line)]
    kept = listed if listed else lines

    perb_questions = []
    for line in kept:
        text = enum_prefix.sub('', line, count=1).strip()
        if len(text) >= 2 and quote_pairs.get(text[0]) == text[-1]:
            text = text[1:-1].strip()
        if text:
            perb_questions.append(text)

    return perb_questions

In [None]:
# Consistency_Checker
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class SemanticConsistnecyCheck:
    """Ask an OpenAI chat model whether candidate answers agree with a target answer.

    Each candidate is paired with the question, compared against the
    (question, target answer) pair, and the model's Yes/No guess becomes a
    vote: 0 for "Yes" (equivalent) and 1 for anything else. The score is the
    mean of the votes.
    """

    def __init__(self, model):
        self.model = model
        self.prompt_temp = """
        Are the following two Question-Answer(QA) pairs semantically equivalent?
        Provide your best guess and the probability that it is correct (0.0 to 1.0).
        Given ONLY the guess (Yes or No) and probability, no other words or explanation.
        For example:
        Guess: <most likely guess, as short as possible; not a complete sentence, just the guess!>
        Probability: <the probability between 0.0 and 1.0 that your guess is correct, without any extra commentary whatsoever;
        just the probability!>
        """

    def openai_api_parallel(self, prompt, temperature):
        """Run one OpenAI call with this checker's model (used as the thread-pool task)."""
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    def _build_prompts(self, question, target_answer, candidate_answers):
        """Validate the inputs and return one comparison prompt per candidate answer."""
        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")
        if len(candidate_answers) == 0:
            # Previously this surfaced later as a ZeroDivisionError.
            raise ValueError("candidate_answers cannot be empty.")

        target_pair = 'Q:' + question + '\nA:' + target_answer
        prompts = []
        for candidate in candidate_answers:
            candidate_pair = 'Q:' + question + '\nA:' + candidate
            prompts.append(
                self.prompt_temp + '\nThe first QA pair is:\n' + target_pair
                + '\nThe second QA pair is:\n' + candidate_pair
            )
        return prompts

    @staticmethod
    def _parse_vote(res):
        """Turn a model reply into a vote: 0 if the guess is "Yes", otherwise 1.

        The guess is read from the first "Guess:" line. Without one, the text
        after the first colon on its line is used; without any colon, the
        whole reply is used. Case and trailing punctuation are ignored, and
        any reply that is not a "yes" counts as inconsistent (1).
        """
        text = res if isinstance(res, str) else ''

        guess = None
        for line in text.splitlines():
            head, sep, tail = line.partition(':')
            if sep and head.strip().lower() == 'guess':
                guess = tail
                break
        if guess is None:
            _, sep, tail = text.partition(':')
            guess = tail.split('\n')[0] if sep else text

        normalised = guess.strip().strip('.!*"\'').strip().lower()
        return 0 if normalised.startswith('yes') else 1

    def score_scc_api(self, question, target_answer, candidate_answers, temperature):
        '''
        Parallel variant of score_scc: one thread per candidate answer.

        Inputs and outputs are the same as for score_scc. Votes are collected
        in submission order, so sc_output[i] belongs to candidate_answers[i].

        Raises:
        ValueError - if target_answer is None or candidate_answers is empty
        '''
        prompts = self._build_prompts(question, target_answer, candidate_answers)

        with ThreadPoolExecutor(max_workers=len(prompts)) as executor:
            futures = [
                executor.submit(self.openai_api_parallel, prompt, temperature)
                for prompt in prompts
            ]
            # Read results in submission order, not completion order, so the
            # per-candidate votes stay aligned with candidate_answers.
            sc_output = [self._parse_vote(future.result()) for future in futures]

        score = sum(sc_output)/len(sc_output)
        return score, sc_output

    def score_scc(self, question, target_answer, candidate_answers, temperature):
        '''
        Inputs:
        question - original user query
        target_answer - generated response given the original question (temp=0) if not provided by user
        candidate_answers - generated responses given the question (original + perturbed)
        temperature - [0,1] for LLM randomness

        Outputs:
        score - inconsistency score (hallucination metric)
        sc_output - specific score for each candidate answers compared with the target answer

        Raises:
        ValueError - if target_answer is None or candidate_answers is empty
        '''
        prompts = self._build_prompts(question, target_answer, candidate_answers)

        sc_output = []
        for prompt in prompts:
            res = call_openai_model(prompt, self.model, temperature) # openai model call
            # res = call_guanaco_33b(prompt, max_new_tokens=200)  # guanaco_33b model call
            # res = call_falcon_7b(prompt, max_new_tokens = 200) # falcon_7b model call
            sc_output.append(self._parse_vote(res))

        score = sum(sc_output)/len(sc_output)
        return score, sc_output


# Correctly spelled alias; the original (misspelled) class name keeps working.
SemanticConsistencyCheck = SemanticConsistnecyCheck

In [None]:
# evaluator
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

class Evaluate:
    """Collect answers from a target LLM for a question and its rephrasings.

    The ``*_api`` methods issue OpenAI calls in a thread pool; the plain
    methods run sequentially and also support the local guanaco-33b and
    falcon-7b models.

    NOTE(review): the ``*_api`` methods build the prompt as
    ``prompt_temp + question`` while the sequential methods insert '\\nQ:'
    before the question. Both forms are kept as they were; confirm whether
    the difference is intended.
    """

    def __init__(self, model):
        self.model = model
        self.prompt_temp = 'Please answer the following question:\n'

    def openai_api_parallel(self, prompt, temperature):
        """Run one OpenAI call with this evaluator's model (used as the thread-pool task)."""
        res = call_openai_model(prompt, self.model, temperature) # openai model call
        return res

    def _generate(self, prompt, temperature):
        """Dispatch one generation request to the backend selected by self.model."""
        # llm model: GPTs, open-source models (falcon, guanaco)
        if self.model in ['gpt-3.5-turbo','gpt-4']:
            return call_openai_model(prompt, self.model, temperature) # openai model call
        if self.model == 'guanaco-33b':
            return call_guanaco_33b(prompt, max_new_tokens = 200)
        if self.model == 'falcon-7b':
            return call_falcon_7b(prompt, max_new_tokens = 200)
        # Previously an unknown model left the result variable unbound.
        raise ValueError(f"Unsupported model: {self.model!r}")

    def self_evaluate_api(self, self_question, temperature, self_num):
        '''
        Parallel variant of self_evaluate (OpenAI models only).

        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness
        self_num - how many generated responses given this question

        Outputs:
        self_responses - generated responses; empty list when self_num <= 0
        '''
        if self_num <= 0:
            # ThreadPoolExecutor rejects max_workers <= 0.
            return []

        prompt = self.prompt_temp + self_question
        with ThreadPoolExecutor(max_workers=self_num) as executor:
            futures = [
                executor.submit(self.openai_api_parallel, prompt, temperature)
                for _ in range(self_num)
            ]
            self_responses = [future.result() for future in futures]

        return self_responses

    def self_evaluate(self, self_question, temperature, self_num):
        '''
        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness
        self_num - how many generated responses given this question

        Outputs:
        self_responses - generated responses given this question

        Raises:
        ValueError - if self.model is not one of the supported model names
        '''
        prompt = self.prompt_temp + '\nQ:' + self_question
        return [self._generate(prompt, temperature) for _ in range(self_num)]

    def perb_evaluate_api(self, perb_questions, temperature):
        '''
        Parallel variant of perb_evaluate (OpenAI models only).

        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses given the perturbed questions, in
                         the order of perb_questions. A question whose call
                         raises is reported with print() and skipped, so the
                         result can be shorter than perb_questions.
        '''
        if not perb_questions:
            # ThreadPoolExecutor rejects max_workers == 0.
            return []

        perb_responses = []
        with ThreadPoolExecutor(max_workers=len(perb_questions)) as executor:
            submitted = [
                (perb_question,
                 executor.submit(self.openai_api_parallel, self.prompt_temp + perb_question, temperature))
                for perb_question in perb_questions
            ]

            # Collect in submission order so responses line up with questions.
            for perb_question, future in submitted:
                try:
                    perb_responses.append(future.result())
                except Exception as exc:
                    print('%r generated an exception: %s' % (perb_question, exc))

        return perb_responses

    def perb_evaluate(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness

        Outputs:
        perb_responses - generated responses given the perturbed questions

        Raises:
        ValueError - if self.model is not one of the supported model names
        '''
        perb_responses = []
        for perb_question in perb_questions:
            prompt = self.prompt_temp + '\nQ:' + perb_question
            perb_responses.append(self._generate(prompt, temperature))

        return perb_responses

In [None]:
# input information
question = 'is pi smaller than 3.2?'
target_answer = "Yes"

# question perturbation: ask the model for 2 rephrasings of the question
gen_question = paraphrase(question, number = 2, model = 'gpt-3.5-turbo', temperature = 1.0)

# llm evaluation (thread-pool variants): 2 sampled answers to the original
# question, plus one answer per rephrased question
llm_evaluate = Evaluate(model='gpt-3.5-turbo')
self_responses = llm_evaluate.self_evaluate_api(self_question = question, temperature = 1.0, self_num = 2)
perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature = 0.0)

print('Original question', question)
print('self_responses', self_responses)
print('perb_responses', perb_responses)

Original question is pi smaller than 3.2?
self_responses ['No, π (pi) is not smaller than 3.2. \n\nπ is approximately equal to 3.14159, which is smaller than 3.2.', 'No, pi (π) is not smaller than 3.2. Pi is approximately 3.14159... and is larger than 3.2.']
perb_responses ['Yes, pi (π) is less than 3.2. The value of pi is approximately 3.14159.', '1. "Is the value of pi less than 3.2?"\n2. "Does pi have a value that is less than 3.2?"', 'Yes, the value of pi (π) is approximately 3.14159, which is less than 3.2.']


In [None]:
# llm evaluation (sequential variants); reuses `question`, `target_answer`
# and `gen_question` defined in the previous cell
llm_evaluate = Evaluate(model='gpt-3.5-turbo')
self_responses = llm_evaluate.self_evaluate(self_question = question, temperature = 1.0, self_num = 3)
perb_responses = llm_evaluate.perb_evaluate(perb_questions = gen_question, temperature=0.0)

# consistency check
scc = SemanticConsistnecyCheck(model='gpt-3.5-turbo')

# self-check: compare the target answer with answers sampled for the same question
sc2_score, sc2_vote = scc.score_scc(question, target_answer, candidate_answers = self_responses, temperature = 0.0)
print(sc2_score, sc2_vote)

# question-level cross-check: compare the target answer with answers to the rephrased questions
sac3_q_score, sac3_q_vote = scc.score_scc(question, target_answer, candidate_answers = perb_responses, temperature = 0.0)
print(sac3_q_score, sac3_q_vote)

0.0 [0, 0, 0]
0.3333333333333333 [0, 1, 0]


In [None]:
# Colab-only: opens a browser upload dialog. The recorded output shows
# hotpotQA_halu.json being saved into the working directory.
from google.colab import files
uploaded = files.upload()

Saving hotpotQA_halu.json to hotpotQA_halu.json


In [None]:
import json

def load_dataset(filepath):
    '''
    Load a JSON Lines file: one JSON document per line.

    Inputs:
    filepath - path of the file to read (decoded as UTF-8)

    Output:
    data - list with one parsed object per non-blank line, in file order

    Raises:
    json.JSONDecodeError - if a non-blank line is not valid JSON
    '''
    data = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            data.append(json.loads(line))
    return data

# Path is relative to the working directory; load_dataset parses the file
# line by line, so it must hold one JSON object per line.
dataset_path = 'hotpotQA_halu.json'
qa_data = load_dataset(dataset_path)

In [None]:
# self-check consistency
def sac2_score(question, target_answer, model, num_samples):
    '''
    Self-check consistency score for one (question, target answer) pair.

    Inputs:
    question - original user query
    target_answer - answer whose consistency is being judged
    model - model name used for both sampling and consistency checking
    num_samples - how many answers to sample for the question (temperature 1.0)

    Output:
    first element of the score_scc_api result (the inconsistency score)
    '''
    sampler = Evaluate(model=model)
    checker = SemanticConsistnecyCheck(model=model)

    # Sample answers in parallel, then compare each against the target answer.
    sampled_answers = sampler.self_evaluate_api(
        self_question=question,
        temperature=1.0,
        self_num=num_samples,
    )
    return checker.score_scc_api(
        question,
        target_answer,
        candidate_answers=sampled_answers,
        temperature=0.0,
    )[0]

In [None]:
# AUROC score for self-check
import time
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 50  # number of QA items scored per run
model = 'gpt-3.5-turbo'
num_samples_list = [3, 5, 10, 15]  # self-evaluation sample counts to compare

# Randomly sample n_data questions from qa_data. No random seed is set in
# this cell, so the sample differs between runs. The same sample is reused
# for every value in num_samples_list.
sampled_qa_data = random.sample(qa_data, n_data)

for num_samples in num_samples_list:
    t0 = time.time()
    halu_score_all = []

    for i in tqdm(range(n_data)):
        # First half of the sample is scored against the hallucinated answer,
        # second half against the right answer; true_label below relies on
        # this ordering.
        if i < n_data // 2:
            target_answer = sampled_qa_data[i]['hallucinated_answer']
        else:
            target_answer = sampled_qa_data[i]['right_answer']

        # Use the correct reference for question
        question = sampled_qa_data[i]['question']

        # Call sac2_score for each sampled question and target answer
        sc2_score = sac2_score(question, target_answer, model, num_samples)

        # Append the score to halu_score_all
        halu_score_all.append(sc2_score)

    # Calculate and print the AUROC
    print('Time per query:', (time.time() - t0) / n_data)
    # 1 = hallucinated target, 0 = right target. The two halves only cover
    # every score when n_data is even.
    true_label = [1] * (n_data // 2) + [0] * (n_data // 2)
    roc_auc = roc_auc_score(true_label, halu_score_all)
    print('AUROC score for', num_samples, 'samples:', roc_auc)


100%|██████████| 50/50 [02:01<00:00,  2.44s/it]


Time per query: 2.4351020193099977
AUROC score for 3 samples: 0.744


100%|██████████| 50/50 [02:02<00:00,  2.45s/it]


Time per query: 2.4488013648986815
AUROC score for 5 samples: 0.7536


100%|██████████| 50/50 [02:37<00:00,  3.16s/it]


Time per query: 3.159844436645508
AUROC score for 10 samples: 0.7408


100%|██████████| 50/50 [03:09<00:00,  3.78s/it]

Time per query: 3.7828142642974854
AUROC score for 15 samples: 0.7824000000000001





In [None]:
# AUROC score for self-consistency with 10 responses (50 data points)
import time
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

# self consistency
def hallucination_score(question, target_answer, model):
    '''
    Self-consistency score for one (question, target answer) pair, using a
    fixed budget of 10 sampled answers.

    Inputs:
    question - original user query
    target_answer - answer whose consistency is being judged
    model - model name used for both sampling and consistency checking

    Output:
    first element of the score_scc_api result (the inconsistency score)
    '''
    evaluator = Evaluate(model=model)
    checker = SemanticConsistnecyCheck(model=model)

    # 10 parallel samples at temperature 1.0, judged at temperature 0.0
    sampling_args = {'self_question': question, 'temperature': 1.0, 'self_num': 10}
    sampled = evaluator.self_evaluate_api(**sampling_args)

    result = checker.score_scc_api(
        question, target_answer, candidate_answers=sampled, temperature=0.0
    )
    return result[0]

n_data = 50  # number of QA items scored
model = 'gpt-3.5-turbo'
halu_score_all = []
# NOTE(review): `filename` is assigned but not used anywhere in this cell.
filename = 'halu_sac3_' + str(n_data) + '.txt'

# The timer starts before sampling, so the sampling time is included in
# "Time per query".
t0 = time.time()

# Randomly sample n_data questions from qa_data (no seed is set in this cell)
sampled_qa_data = random.sample(qa_data, n_data)

for i in tqdm(range(n_data)):
    # First half: hallucinated answer as target; second half: right answer.
    if i < n_data // 2:
        target_answer = sampled_qa_data[i]['hallucinated_answer']
    else:
        target_answer = sampled_qa_data[i]['right_answer']

    question = sampled_qa_data[i]['question']

    halu_score = hallucination_score(question, target_answer, model)
    halu_score_all.append(halu_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
# 1 = hallucinated target, 0 = right target, matching the loop order above
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print('AUROC score:', roc_auc)

100%|██████████| 50/50 [03:07<00:00,  3.74s/it]

Time per query: 3.7432323694229126
AUROC score: 0.7568





In [None]:
# AUROC score for self-consistency with 10 responses (100 data points)
import time
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

# self consistency
def hallucination_score(question, target_answer, model):
    '''
    Score how inconsistent `target_answer` is with 10 answers sampled for
    `question`.

    Inputs:
    question - original user query
    target_answer - answer whose consistency is being judged
    model - model name used for both sampling and consistency checking

    Output:
    first element of the score_scc_api result (the inconsistency score)
    '''
    n_responses = 10
    answer_sampler, consistency_checker = (
        Evaluate(model=model),
        SemanticConsistnecyCheck(model=model),
    )

    # fast (thread-pool) self-evaluation followed by the fast consistency checker
    return consistency_checker.score_scc_api(
        question,
        target_answer,
        candidate_answers=answer_sampler.self_evaluate_api(
            self_question=question, temperature=1.0, self_num=n_responses
        ),
        temperature=0.0,
    )[0]

n_data = 100  # number of QA items scored
model = 'gpt-3.5-turbo'
halu_score_all = []

# Randomly sample n_data questions from qa_data (no seed is set in this cell)
sampled_qa_data = random.sample(qa_data, n_data)

t0 = time.time()

for i in tqdm(range(n_data)):
    # First half: hallucinated answer as target; second half: right answer.
    if i < n_data // 2:
        target_answer = sampled_qa_data[i]['hallucinated_answer']
    else:
        target_answer = sampled_qa_data[i]['right_answer']

    question = sampled_qa_data[i]['question']

    halu_score = hallucination_score(question, target_answer, model)
    halu_score_all.append(halu_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
# 1 = hallucinated target, 0 = right target, matching the loop order above
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print(f'AUROC score: {roc_auc}')

100%|██████████| 100/100 [04:35<00:00,  2.76s/it]

Time per query: 2.758979046344757
AUROC score: 0.754





In [None]:
def sac3_score(question, target_answer, model, num_samples):
    '''
    Question-level cross-check score for one (question, target answer) pair.

    Inputs:
    question - original user query
    target_answer - answer whose consistency is being judged
    model - model name used to answer the perturbed questions and to run the
            consistency check
    num_samples - accepted for call compatibility; it does not influence the
                  returned score, which is computed from the perturbed-question
                  responses only

    Output:
    first element of the score_scc_api result (the inconsistency score)

    NOTE(review): num_samples previously sized a self-evaluation call whose
    responses were never used; that call has been dropped. If the sweep over
    num_samples is meant to vary the number of perturbed questions, wire it
    into the `number` argument of paraphrase - confirm intent.
    NOTE: later cells in this notebook redefine sac3_score with a
    three-argument signature, which shadows this definition once they run.
    '''
    # Honour the requested model instead of a hard-coded one.
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # question perturbation (the paraphraser model is fixed to gpt-3.5-turbo)
    gen_question = paraphrase(question, number = 10, model = 'gpt-3.5-turbo', temperature=0.0)

    # evaluation: one answer per perturbed question
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=0.0)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

In [None]:
# AUROC score for cross-check
import time
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

n_data = 50  # number of QA items scored per run
model = 'gpt-3.5-turbo'
num_samples_list = [3,5,10,15]

# Randomly sample n_data questions from qa_data (no seed is set in this
# cell). The same sample is reused for every value in num_samples_list.
sampled_qa_data = random.sample(qa_data, n_data)

# NOTE(review): in the four-argument sac3_score defined above, the returned
# score is computed from the perturbed-question responses only, so
# num_samples does not feed into it - confirm this sweep measures what is
# intended.
for num_samples in num_samples_list:

    t0 = time.time()
    halu_score_all = []

    for i in tqdm(range(n_data)):
      # First half: hallucinated answer as target; second half: right answer.
      if i < n_data // 2:
        target_answer = sampled_qa_data[i]['hallucinated_answer']
      else:
        target_answer = sampled_qa_data[i]['right_answer']

      question = sampled_qa_data[i]['question']

      sac3_q_score = sac3_score(question, target_answer, model, num_samples)
      halu_score_all.append(sac3_q_score)

    # auroc: 1 = hallucinated target, 0 = right target, matching the loop order above
    print('Time per query', (time.time()-t0)/n_data)
    true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
    roc_auc = roc_auc_score(true_label,halu_score_all)
    print('AUROC score for', num_samples, 'is:', roc_auc)

100%|██████████| 50/50 [06:43<00:00,  8.08s/it]


Time per query 8.077125840187072
AUROC score for 3 is: 0.7984


100%|██████████| 50/50 [07:01<00:00,  8.43s/it]


Time per query 8.427486939430237
AUROC score for 5 is: 0.8008


100%|██████████| 50/50 [07:05<00:00,  8.50s/it]


Time per query 8.504774408340454
AUROC score for 10 is: 0.8432


100%|██████████| 50/50 [07:43<00:00,  9.28s/it]

Time per query 9.275331087112427
AUROC score for 15 is: 0.8





In [None]:
# AUROC score for cross-check for 50 data points with random sampling
import time
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def sac3_score(question, target_answer, model):
    '''
    Question-level cross-check score for one (question, target answer) pair.

    Inputs:
    question - original user query
    target_answer - answer whose consistency is being judged
    model - model name used to answer the perturbed questions and to run the
            consistency check

    Output:
    first element of the score_scc_api result (the inconsistency score)

    The score uses only the responses to the 10 requested perturbed
    questions. An earlier self-evaluation call whose responses were never
    used has been removed, saving one API request per query.
    '''
    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # question perturbation (the paraphraser model is fixed to gpt-3.5-turbo)
    gen_question = paraphrase(question, number = 10, model = 'gpt-3.5-turbo', temperature=0.0)

    # evaluation: one answer per perturbed question
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=0.0)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

n_data = 50  # number of QA items scored
model = 'gpt-3.5-turbo'
# Randomly sample n_data questions from qa_data (no seed is set in this cell)
sampled_qa_data = random.sample(qa_data, n_data)

t0 = time.time()
halu_score_all = []

for i in tqdm(range(n_data)):
  # First half: hallucinated answer as target; second half: right answer.
  if i < n_data // 2:
      target_answer = sampled_qa_data[i]['hallucinated_answer']
  else:
      target_answer = sampled_qa_data[i]['right_answer']

  question = sampled_qa_data[i]['question']

  sac3_q_score = sac3_score(question, target_answer, model)
  halu_score_all.append(sac3_q_score)

# auroc: 1 = hallucinated target, 0 = right target, matching the loop order above
print('Time per query', (time.time()-t0)/n_data)
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label,halu_score_all)
print('AUROC score is:', roc_auc)

100%|██████████| 50/50 [06:09<00:00,  7.40s/it]

Time per query 7.399755811691284
AUROC score is: 0.8696





In [None]:
# AUROC score of cross-consistency with 100 questions
import time
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

def sac3_score(question, target_answer, model):
    '''
    Question-level cross-check score for one (question, target answer) pair.

    Inputs:
    question - original user query
    target_answer - answer whose consistency is being judged
    model - model name used to answer the perturbed questions and to run the
            consistency check (previously ignored in favour of a hard-coded
            'gpt-3.5-turbo')

    Output:
    first element of the score_scc_api result (the inconsistency score)

    The score uses only the responses to the 10 requested perturbed
    questions. An earlier self-evaluation call whose responses were never
    used has been removed, saving one API request per query.
    '''
    # Initialize instances of Evaluate and SemanticConsistnecyCheck classes
    llm_evaluate = Evaluate(model=model)
    scc = SemanticConsistnecyCheck(model=model)

    # question perturbation (the paraphraser model is fixed to gpt-3.5-turbo)
    gen_question = paraphrase(question, number = 10, model = 'gpt-3.5-turbo', temperature=0.0)

    # evaluation: one answer per perturbed question
    perb_responses = llm_evaluate.perb_evaluate_api(perb_questions = gen_question, temperature=0.0)

    # consistency checker
    consistency_res = scc.score_scc_api(question, target_answer, candidate_answers=perb_responses, temperature=0.0)

    return consistency_res[0]

n_data = 100  # number of QA items scored
model = 'gpt-3.5-turbo'
halu_score_all = []

# Randomly sample n_data questions from qa_data (no seed is set in this cell)
sampled_qa_data = random.sample(qa_data, n_data)

t0 = time.time()

for i in tqdm(range(n_data)):
    # First half: hallucinated answer as target; second half: right answer.
    if i < n_data // 2:
        target_answer = sampled_qa_data[i]['hallucinated_answer']
    else:
        target_answer = sampled_qa_data[i]['right_answer']
    question = sampled_qa_data[i]['question']

    sac3_q_score = sac3_score(question, target_answer, model)
    halu_score_all.append(sac3_q_score)

# Calculate and print the AUROC
print('Time per query:', (time.time()-t0)/n_data)
# 1 = hallucinated target, 0 = right target, matching the loop order above
true_label = [1]*(n_data // 2) + [0] * (n_data // 2)
roc_auc = roc_auc_score(true_label, halu_score_all)
print(f'AUROC score: {roc_auc}')

100%|██████████| 100/100 [10:44<00:00,  6.45s/it]

Time per query: 6.446413452625275
AUROC score: 0.7795999999999998



