In [None]:
!pip install openai==0.28.0
!pip install peft
!pip install transformers
!pip install numpy
!pip install torch

Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0
Collecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.27.2 peft-0.8.2


In [None]:
# LLM Models
import os
import openai
import torch
from peft import PeftModel
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time

# Initialize OpenAI API
# The key is read from the OPENAI_API_KEY environment variable so that no secret is stored
# in the notebook. Any key that was ever saved here in plain text should be treated as
# compromised and revoked. The value is None when the variable is unset, which only matters
# for code paths that actually call the OpenAI API.
openai.api_key = os.environ.get('OPENAI_API_KEY')

class BatchSizeException(Exception):
    """Raised when the API error message contains 'is greater than the maximum'."""


def call_openai_model(prompt, model, temperature, max_retries=None):
    """Send a single-turn chat request to an OpenAI chat model and return the reply text.

    Inputs:
    prompt - user message sent to the model
    model - OpenAI chat model name passed to ChatCompletion.create
    temperature - sampling temperature passed to ChatCompletion.create
    max_retries - how many failed attempts are retried before the last error is re-raised;
                  None (default) retries indefinitely

    Output:
    output - text of the first choice, or the fallback string
             'do not have reponse from chatgpt' when the response has no readable content

    Raises:
    BatchSizeException - when the API error message contains 'is greater than the maximum'
    """
    response = None
    failures = 0
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
            )
        except Exception as e:
            # Retrying cannot help when the request itself is too large.
            if 'is greater than the maximum' in str(e):
                raise BatchSizeException(str(e)) from e
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(e)
            print('Retrying...')
            time.sleep(2)

    # Extract the text only once a response exists; doing it inside the retry loop
    # dereferenced None after every failed attempt.
    try:
        output = response.choices[0].message.content
    except (AttributeError, IndexError, KeyError, TypeError):
        output = 'do not have reponse from chatgpt'
    return output


# Loaded (model, tokenizer, device) triples keyed by (base model, adapter), so the weights
# are loaded at most once instead of on every generation call.
_GUANACO_CACHE = {}


def _load_guanaco_33b(model_name, adapters_name):
    """Load the base model plus PEFT adapter once and return (model, tokenizer, device)."""
    key = (model_name, adapters_name)
    if key not in _GUANACO_CACHE:
        # Load the model without specifying device_map or torch_dtype for offloading
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # Load the adapter using PeftModel
        model = PeftModel.from_pretrained(model, adapters_name)

        # Use the GPU if available, otherwise stay on the CPU
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _GUANACO_CACHE[key] = (model, tokenizer, device)
    return _GUANACO_CACHE[key]


def call_guanaco_33b(prompt, max_new_tokens):
    """Generate a reply with the Guanaco adapter on top of the LLaMA base model.

    Inputs:
    prompt - user text inserted into the '### Human: ... ### Assistant:' template
    max_new_tokens - generation budget passed to model.generate

    Output:
    output - the second and third '###'-separated segments of the decoded text,
             concatenated (the human turn followed by the assistant turn); fewer
             segments are returned as-is instead of raising IndexError
    """
    model_name = "huggyllama/llama-30b"
    adapters_name = 'timdettmers/guanaco-33b'
    model, tokenizer, device = _load_guanaco_33b(model_name, adapters_name)

    # prompt (the trailing space after the first sentence keeps the two sentences apart)
    formatted_prompt = (
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, concise, and polite answers to the user's questions.\n"
        f"### Human: {prompt} ### Assistant:"
    )
    # Move the inputs to the same device as the model; a hardcoded "cuda:0" fails on CPU.
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=max_new_tokens)
    res = tokenizer.decode(outputs[0], skip_special_tokens=True)
    res_sp = res.split('###')

    return ''.join(res_sp[1:3])


# Built (pipeline, tokenizer) pairs keyed by model id, so the checkpoint is loaded once
# instead of on every generation call.
_FALCON_PIPELINES = {}


def _get_falcon_pipeline(model_name):
    """Build the text-generation pipeline for model_name once and return (pipeline, tokenizer)."""
    if model_name not in _FALCON_PIPELINES:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        generator = transformers.pipeline(
            "text-generation",
            model=model_name,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto"
        )
        _FALCON_PIPELINES[model_name] = (generator, tokenizer)
    return _FALCON_PIPELINES[model_name]


def call_falcon_7b(prompt, max_new_tokens):
    """Generate a completion for prompt with tiiuae/falcon-7b-instruct.

    Inputs:
    prompt - text passed to the text-generation pipeline
    max_new_tokens - generation budget passed to the pipeline

    Output:
    res - the newly generated text only (the prompt is not echoed back), or '' if the
          pipeline returns no sequence. Sampling is enabled (do_sample=True, top_k=10),
          so repeated calls may differ.
    """
    generator, tokenizer = _get_falcon_pipeline("tiiuae/falcon-7b-instruct")
    sequences = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # Set explicitly so generate() does not warn about defaulting pad to eos on every call.
        pad_token_id=tokenizer.eos_token_id,
        # Callers parse the reply; returning prompt + reply made them parse the prompt instead.
        return_full_text=False,
    )
    if not sequences:
        return ''

    return sequences[-1]['generated_text']

In [None]:
# Paraphraser
def paraphrase(question, number, model, temperature):
    '''
    Ask an LLM for questions that are semantically equivalent to the given one.

    Inputs:
    question - original user query
    number - how many perturbed questions to request
    model - 'gpt-3.5-turbo' / 'gpt-4' use the OpenAI call, 'timdettmers/guanaco-33b' uses
            the Guanaco call, any other value uses the Falcon call
    temperature - sampling temperature; only forwarded to the OpenAI call

    Output:
    perb_questions - non-empty, whitespace-stripped lines of the model reply
    '''

    prompt_temp = f'For question Q, provide {number} semantically equivalent questions.'
    prompt = prompt_temp + '\nQ:' + question

    # Honor the requested model instead of always using one hard-coded back end.
    if model in ('gpt-3.5-turbo', 'gpt-4'):
        res = call_openai_model(prompt, model, temperature)
    elif model == 'timdettmers/guanaco-33b':
        res = call_guanaco_33b(prompt, max_new_tokens=200)
    else:
        res = call_falcon_7b(prompt, max_new_tokens=200)

    # A back end may return the prompt followed by the completion; the instruction text
    # and the original question must not be reported as paraphrases.
    if res.startswith(prompt):
        res = res[len(prompt):]

    # Blank lines are separators, not questions.
    perb_questions = [line.strip() for line in res.split('\n') if line.strip()]

    return perb_questions

In [None]:
# Consistency_Checker
class SemanticConsistnecyCheck:
    """Ask an LLM whether candidate answers agree with a target answer and score the disagreement."""

    def __init__(self, model):
        # Name of the judging model; selects the back end used by score_scc.
        self.model = model
        self.prompt_temp = """
        Are the following two Question-Answer(QA) pairs semantically equivalent?
        Provide your best guess and the probability that it is correct (0.0 to 1.0).
        Given ONLY the guess (Yes or No) and probability, no other words or explanation.
        For example:
        Guess: <most likely guess, as short as possible; not a complete sentence, just the guess!>
        Probability: <the probability between 0.0 and 1.0 that your guess is correct, without any extra commentary whatsoever;
        just the probability!>
        """

    def _ask_model(self, prompt, temperature):
        """Send prompt to the back end selected by self.model and return its raw reply."""
        if self.model in ('gpt-3.5-turbo', 'gpt-4'):
            return call_openai_model(prompt, self.model, temperature)
        if self.model == 'timdettmers/guanaco-33b':
            return call_guanaco_33b(prompt, max_new_tokens=200)
        # Any other name uses the Falcon call.
        return call_falcon_7b(prompt, max_new_tokens=200)

    @staticmethod
    def _is_yes(res):
        """Return True when the reply's guess is 'yes' (case-insensitive)."""
        import re

        # Take the LAST 'Guess: yes/no' so that a reply which echoes the prompt (whose
        # template also contains 'Guess:') is judged on the model's own answer.
        guesses = re.findall(r'guess\s*:\W*(yes|no)\b', res, flags=re.IGNORECASE)
        if guesses:
            return guesses[-1].lower() == 'yes'
        # No 'Guess:' label at all: accept a reply that simply starts with yes.
        return res.strip().lower().startswith('yes')

    def score_scc(self, question, target_answer, candidate_answers, temperature):
        '''
        Inputs:
        question - original user query
        target_answer - generated response given the original question (temp=0) if not provided by user
        candidate_answers - generated responses given the question (original + perturbed)
        temperature - [0,1] for LLM randomness; only forwarded to the OpenAI call

        Outputs:
        score - inconsistency score (hallucination metric): fraction of candidates judged not equivalent
        sc_output - per-candidate vote, 0 when judged equivalent to the target answer and 1 otherwise

        Raises:
        ValueError - when target_answer is None or candidate_answers is empty
        '''

        if target_answer is None:
            raise ValueError("Target answer cannot be None. ")
        if len(candidate_answers) == 0:
            # Without candidates the average below would divide by zero.
            raise ValueError("candidate_answers must contain at least one answer.")

        sc_output = []
        target_pair = 'Q:' + question + '\nA:' + target_answer
        for candidate in candidate_answers:
            candidate_pair = 'Q:' + question + '\nA:' + candidate
            prompt = self.prompt_temp + '\nThe first QA pair is:\n' + target_pair + '\nThe second QA pair is:\n' + candidate_pair
            res = self._ask_model(prompt, temperature)
            sc_output.append(0 if self._is_yes(res) else 1)

        score = sum(sc_output) / len(sc_output)
        return score, sc_output

In [None]:
# Evaluator
class Evaluate:
    """Collect answers from one target LLM for an original question and its perturbations."""

    def __init__(self, model):
        # Name of the model under evaluation; selects the back end in _generate.
        self.model = model
        self.prompt_temp = 'Answer the following question:\n'

    def _generate(self, prompt, temperature):
        """Return one response to prompt from the back end selected by self.model.

        Raises ValueError for a model name with no back end, instead of leaving the
        result unbound or silently reusing the previous response.
        """
        # llm model: GPTs, open-source models (falcon, guanaco)
        if self.model in ('gpt-3.5-turbo', 'gpt-4'):
            return call_openai_model(prompt, self.model, temperature)  # openai model call
        if self.model == 'timdettmers/guanaco-33b':
            return call_guanaco_33b(prompt, max_new_tokens=200)
        if self.model == 'tiiuae/falcon-7b-instruct':
            return call_falcon_7b(prompt, max_new_tokens=200)
        # other open-sourced llms
        raise ValueError(f"Unsupported model: {self.model!r}")

    def self_evaluate(self, self_question, temperature, self_num):
        '''
        Inputs:
        self_question - original user query
        temperature - [0,1] for LLM randomness; only forwarded to the OpenAI call
        self_num - how many generated responses given this question

        Outputs:
        self_responses - self_num responses generated for the same prompt

        Raises:
        ValueError - when self.model has no back end and at least one response is requested
        '''

        prompt = self.prompt_temp + '\nQ:' + self_question
        self_responses = [self._generate(prompt, temperature) for _ in range(self_num)]

        return self_responses

    def perb_evaluate(self, perb_questions, temperature):
        '''
        Inputs:
        perb_questions - perturbed questions that are semantically equivalent to the original question
        temperature - [0,1] for LLM randomness; only forwarded to the OpenAI call

        Outputs:
        perb_responses - one generated response per perturbed question, in input order

        Raises:
        ValueError - when self.model has no back end and perb_questions is not empty
        '''

        perb_responses = [
            self._generate(self.prompt_temp + '\nQ:' + perb_question, temperature)
            for perb_question in perb_questions
        ]

        return perb_responses

In [None]:
# main.py
# End-to-end run for one question: perturb the question, collect answers for the original
# and the perturbed questions, then score both answer sets against target_answer.

# input information (uncomment exactly one question)
question = 'Was there ever a US senator that represented the state of Alabama and whose alma mater was MIT?'
# question = 'Did Brazil win the gold medal for Women volleyball at the 2016 Summer Olympics?'
# question = 'Does a drug interaction exists between Covid antiviral pill Paxlovid and the blood-pressure-lowering medication verapamil?'
target_answer = 'Yes'

# question perturbation (uncomment exactly one model)
# gen_question = paraphrase(question, number = 3, model = 'gpt-3.5-turbo', temperature=1.0)
gen_question = paraphrase(question, number = 3, model = 'timdettmers/guanaco-33b', temperature=1.0)
# gen_question = paraphrase(question, number = 3, model = 'tiiuae/falcon-7b-instruct', temperature=1.0)
print(gen_question)

# llm evaluation (uncomment exactly one model)
# llm_evaluate = Evaluate(model='gpt-3.5-turbo')
llm_evaluate = Evaluate(model='timdettmers/guanaco-33b')
# llm_evaluate = Evaluate(model='tiiuae/falcon-7b-instruct')
# self_responses: 3 answers to the original question; perb_responses: one answer per perturbed question
self_responses = llm_evaluate.self_evaluate(self_question = question, temperature = 1.0, self_num = 3)
perb_responses = llm_evaluate.perb_evaluate(perb_questions = gen_question, temperature=0.0)
print(self_responses)
print(perb_responses)

# consistency check (uncomment exactly one model)
# scc = SemanticConsistnecyCheck(model='gpt-3.5-turbo')
scc = SemanticConsistnecyCheck(model='timdettmers/guanaco-33b')
# scc = SemanticConsistnecyCheck(model='tiiuae/falcon-7b-instruct')

# score and per-answer votes for the answers to the original question
sc2_score, sc2_vote = scc.score_scc(question, target_answer, candidate_answers = self_responses, temperature = 0.0)
print(sc2_score, sc2_vote)

# score and per-answer votes for the answers to the perturbed questions
sac3_q_score, sac3_q_vote = scc.score_scc(question, target_answer, candidate_answers = perb_responses, temperature = 0.0)
print(sac3_q_score, sac3_q_vote)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/50.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/9.82G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# input information (uncomment exactly one question)
# This cell rebinds question, target_answer, gen_question, llm_evaluate, self_responses and
# perb_responses; the cells below read these values.
question = 'is 3691 a prime number?'
# question = 'Was it illegal for Black People to attend the Emory University in 1950?'
# question = 'Is Lana Del Rey the artist of Black Beauty?'
# question = 'Did Clarence Thomas dissent in Obergefell?'
target_answer = 'Yes, it is a prime number.'

# question perturbation (uncomment exactly one model)
# gen_question = paraphrase(question, number = 5, model = 'gpt-3.5-turbo', temperature=1.0)
# gen_question = paraphrase(question, number = 5, model='timdettmers/guanaco-33b', temperature=1.0)
gen_question = paraphrase(question, number = 5, model = 'tiiuae/falcon-7b-instruct', temperature=1.0)

# llm evaluation (uncomment exactly one model)
# llm_evaluate = Evaluate(model='gpt-3.5-turbo')
# llm_evaluate = Evaluate(model='timdettmers/guanaco-33b')
llm_evaluate = Evaluate(model='tiiuae/falcon-7b-instruct')
# self_responses: 5 answers to the original question; perb_responses: one answer per perturbed question
self_responses = llm_evaluate.self_evaluate(self_question = question, temperature = 1.0, self_num = 5)
perb_responses = llm_evaluate.perb_evaluate(perb_questions = gen_question, temperature=0.0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


In [None]:
# Display the perturbed questions returned by paraphrase() for manual inspection.
gen_question

['For question Q, provide 5 semantically equivalent questions.',
 'Q:is 3691 a prime number?',
 'A: Is 3676 a prime number?',
 'A: Is 3698 a prime number?',
 'A: Is 3699 a prime number?',
 'A: Is 3691 a prime number?']

In [None]:
# Display the answers generated for the original question.
self_responses

['Answer the following question:\n\nQ:is 3691 a prime number?\nA:Yes.',
 'Answer the following question:\n\nQ:is 3691 a prime number?\nA:yes\nAs a language model AI, I am not able to determine the prime number status of any number. However, according to the commonly accepted criteria, 3691 is considered not a prime number as it is divisible by 7.',
 'Answer the following question:\n\nQ:is 3691 a prime number?\nA: No, 3691 is not a prime number.',
 'Answer the following question:\n\nQ:is 3691 a prime number?\nA:No.\n\n3691 is not a prime number, as it can be evenly divided by 3699 which is itself a prime number.',
 'Answer the following question:\n\nQ:is 3691 a prime number?\n\nA:Yes, 3691 is a prime number as it is only divisible by 1 and itself.']

In [None]:
# Display the answers generated for each entry of gen_question.
perb_responses

['Answer the following question:\n\nQ:For question Q, provide 5 semantically equivalent questions.\n\nWhat is Q?',
 'Answer the following question:\n\nQ:Q:is 3691 a prime number?\nA:A:A:A.',
 'Answer the following question:\n\nQ:A: Is 3676 a prime number?\nA. Yes\nB. No',
 'Answer the following question:\n\nQ:A: Is 3698 a prime number?\nA: Yes\n\n3698 is a prime number.',
 'Answer the following question:\n\nQ:A: Is 3699 a prime number?\nA: Yes.\n\nAs an AI language model, I am not capable of providing personal opinions, but I can confirm that 3699 is a prime number.',
 'Answer the following question:\n\nQ:A: Is 3691 a prime number?A. Yes']

In [None]:
# consistency check
# scc = SemanticConsistnecyCheck(model='gpt-3.5-turbo')
# Use the full Hugging Face model id, matching how Falcon is named everywhere else in this notebook.
scc = SemanticConsistnecyCheck(model='tiiuae/falcon-7b-instruct')

# score and per-answer votes for the answers to the original question
sc2_score, sc2_vote = scc.score_scc(question, target_answer, candidate_answers = self_responses, temperature = 0.0)
print(sc2_score, sc2_vote)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


1.0 [1, 1, 1, 1, 1]


In [None]:
# score and per-answer votes for the answers to the perturbed questions;
# relies on scc, question, target_answer and perb_responses from earlier cells
sac3_q_score, sac3_q_vote = scc.score_scc(question, target_answer, candidate_answers = perb_responses, temperature = 0.0)
print(sac3_q_score, sac3_q_vote)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


1.0 [1, 1, 1, 1, 1, 1, 1]


In [None]:
# Colab-only upload step: files.upload() is provided by google.colab, so this cell does not
# run outside Colab. NOTE(review): the dataset cells below read 'primality_testing.json'
# from the working directory, presumably the file uploaded here - confirm.
from google.colab import files
uploaded = files.upload()

Saving primality_testing.json to primality_testing.json


In [None]:
import json

def load_dataset(filepath):
    """Read a JSON file and return the parsed content.

    Inputs:
    filepath - path of the JSON file

    Output:
    data - whatever json.load produces for the file content

    The file is opened as UTF-8 explicitly so that the result does not depend on the
    platform's default text encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Load the primality questions from the working directory into `dataset`.
dataset_path = 'primality_testing.json'
dataset = load_dataset(dataset_path)

In [None]:
import json

def load_dataset(filepath):
    """Read a JSON file and return the parsed content.

    Inputs:
    filepath - path of the JSON file

    Output:
    data - whatever json.load produces for the file content

    The file is opened as UTF-8 explicitly so that the result does not depend on the
    platform's default text encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

dataset_path = 'primality_testing.json'
dataset = load_dataset(dataset_path)
# Show the size and a short preview only; printing every record floods the notebook output.
print(len(dataset), 'records')
print(dataset[:5])

[{'question': 'Is 7411 a prime number?', 'number': 7411, 'answer': True}, {'question': 'Is 10733 a prime number?', 'number': 10733, 'answer': True}, {'question': 'Is 4219 a prime number?', 'number': 4219, 'answer': True}, {'question': 'Is 5471 a prime number?', 'number': 5471, 'answer': True}, {'question': 'Is 10663 a prime number?', 'number': 10663, 'answer': True}, {'question': 'Is 18539 a prime number?', 'number': 18539, 'answer': True}, {'question': 'Is 6911 a prime number?', 'number': 6911, 'answer': True}, {'question': 'Is 9791 a prime number?', 'number': 9791, 'answer': True}, {'question': 'Is 16921 a prime number?', 'number': 16921, 'answer': True}, {'question': 'Is 2677 a prime number?', 'number': 2677, 'answer': True}, {'question': 'Is 1933 a prime number?', 'number': 1933, 'answer': True}, {'question': 'Is 10631 a prime number?', 'number': 10631, 'answer': True}, {'question': 'Is 3691 a prime number?', 'number': 3691, 'answer': True}, {'question': 'Is 3469 a prime number?', 