### Set-Up

In [2]:
# !pip install openai
# !pip install dspy
from openai import OpenAI
import dspy
import json
import pandas as pd

In [3]:
# Evaluate correctness of the answer - depends on format - will we pass two integers here as proposed by Vineet's logic?
def evaluate_answer(model_answer, correct_answer):
    if model_answer == "A" and correct_answer == 0:
      return True
    elif model_answer == "B" and correct_answer == 1:
      return True
    elif model_answer == "C" and correct_answer == 2:
      return True
    elif model_answer == "D" and correct_answer == 3:
      return True
    else:
      return False


In [5]:
# load dataset
train_df=pd.read_csv('train_df.csv')
train_df=train_df.fillna('#')

In [18]:
openai_key='<add_your_key_here>'

#Pull the OpenAI key from colab
#from google.colab import userdata
#openai_key = userdata.get('OPENAI_API_KEY')

In [7]:
lm = dspy.OpenAI(model='gpt-4o-mini', api_key=openai_key)

In [8]:
dspy.settings.configure(lm=lm)

### Example Questions

In [17]:
#Question id=0 in logiqa train data set.  Use for debugging prompts.
test_context = "Some Cantonese don't like chili, so some southerners don't like chili."
test_question = "Which of the following can guarantee the above argument?"
test_choices = ["Some Cantonese love chili.", "Some people who like peppers are southerners.", "All Cantonese are southerners.", "Some Cantonese like neither peppers nor sweets."]
#test_answer_probabilities = [0.05,0.10,0.80,0.05]
test_answer_letter_choice = 'C'
test_correct_answer = 2

#Note that these are Question id=1,2 in logiqa train data set.  Need to remove it from train data set?
#Probabilities are simply made up, with the correct answer being the highest probability
fewshot_example1_context = "Continuous exposure to indoor fluorescent lights is beneficial to the health of hamsters with heart disease. One group of hamsters exposed to continuous exposure to fluorescent lights has an average lifespan that is 2.5% longer than another one of the same species but living in a black wall."
fewshot_example1_question = "Which of the following questions was the initial motivation for conducting the above experiment?"
fewshot_example1_choices = ['Can hospital light therapy be proved to promote patient recovery?', 'Which one lives longer, the hamster living under the light or the hamster living in the dark?', 'What kind of illness does the hamster have?', 'Do some hamsters need a period of darkness?']
fewshot_example1_answer_probabilities = [0.75,0.10,0.02,0.13]
fewshot_example1_answer_letter_choice = 'A'
fewshot_example1_correct_answer = 0

fewshot_example2_context = "There is no doubt that minors should be prohibited from smoking. However, we cannot explicitly ban the use of automatic cigarette vending machines in order to prevent minors from smoking. This ban is just like setting up roadblocks on the road to prohibit driving without a license. These roadblocks naturally prohibit driving without a license, but also block more than 99% of licensed drivers."
fewshot_example2_question = "In order to evaluate the above argument, which of the following questions is the most important?"
fewshot_example2_choices = ['Does the proportion of underage smokers in the total number of smokers exceed 1%?', 'How much inconvenience does the ban on the use of automatic vending machines bring to adult cigarette buyers?', 'Whether the proportion of unlicensed drivers in the total number of drivers really does not exceed 1%.', 'Is the harm of minor smoking really as serious as the public thinks?']
fewshot_example2_answer_probabilities = [0.19,0.67,0.09,0.05]
fewshot_example2_answer_letter_choice = 'B'
fewshot_example2_correct_answer = 1

### Prompts and LLM Query

In [10]:
def format_choices(choices):
    # Format the choices - using characters for now - maybe change to integers?
    choices_text = '\n'.join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])
    return choices_text


In [11]:
def ask_llm_mc_question_1hop(context, question, choices, debug=False):
    '''
    Ask LLM to respond with correct answer and associated confidence (only for correct option)
    '''
    choices_text = format_choices(choices)

    # Construct the prompt
    if context!='#':
        prompt = f"""
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Please choose the best answer by returning the letter (A, B, C, etc.) corresponding to the correct choice.  Just the letter, nothing else.
        Provide the probability that your guess is correct. Give ONLY the probability, no
        other words or explanation.
        Finally return a tuple (choice, probability) where probability is between 0 and 1.
        """
    else:
        prompt = f"""
        Question: {question}
        Options: {choices_text}
        Please choose the best answer by returning the letter (A, B, C, etc.) corresponding to the correct choice.  Just the letter, nothing else.
        Provide the probability that your guess is correct. Give ONLY the probability, no
        other words or explanation.
        Finally return a tuple (choice, probability) where probability is between 0 and 1.
        """

    # Call the OpenAI API with the constructed prompt
    response = lm(prompt, temperature=0, n=1)

    response=response[0]
    # Check if there's a comma in the response for splitting
    if ',' in response:
        answer, conf = response.split(',')
    else:
        answer = response
        conf = 0.0  # or set to a default value like 0.0
    # answer,conf = response[0].split(',')
    if debug:
        print(f"Prompt: {prompt}")
        print(f"Response: {response}")
        print(f"Answer: {answer}")
        print(f"Confidence: {conf}")

    return answer,conf


In [12]:
# Get the LLM's answer
ask_llm_mc_question_1hop(test_context, test_question, test_choices, debug=True)


Prompt: 
        Context: Some Cantonese don't like chili, so some southerners don't like chili.
        Question: Which of the following can guarantee the above argument?
        Options: A. Some Cantonese love chili.
B. Some people who like peppers are southerners.
C. All Cantonese are southerners.
D. Some Cantonese like neither peppers nor sweets.
        Please choose the best answer by returning the letter (A, B, C, etc.) corresponding to the correct choice.  Just the letter, nothing else.
        Provide the probability that your guess is correct. Give ONLY the probability, no
        other words or explanation.
        Finally return a tuple (choice, probability) where probability is between 0 and 1.
        
Response: (C, 0.9)
Answer: (C
Confidence:  0.9)


('(C', ' 0.9)')

In [13]:
def ask_llm_mc_question_1hop_withconf(context, question, choices, debug=False):
    '''
    Ask LLM to respond with  confidence for all  options
    '''
    choices_text = format_choices(choices)

    # Construct the prompt

    if context!='#':
        prompt = f"""
        Provide the probability (between 0 and 1) for each option being correct, given the following:
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
        """
    else:
        prompt = f"""
        Provide the probability (between 0 and 1) for each option being correct, given the following:
        Question: {question}
        Options: {choices_text}
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
    """

    # Call the OpenAI API with the constructed prompt
    response = lm(prompt, temperature=0, n=1)

    if debug:
        print(f"Prompt: {prompt}")
        print(f"Response: {response}")

    return response


In [14]:
# Get the LLM's answer
ask_llm_mc_question_1hop_withconf(test_context, test_question, test_choices, debug=True)

Prompt: 
        Provide the probability (between 0 and 1) for each option being correct, given the following:
        Context: Some Cantonese don't like chili, so some southerners don't like chili.
        Question: Which of the following can guarantee the above argument?
        Options: A. Some Cantonese love chili.
B. Some people who like peppers are southerners.
C. All Cantonese are southerners.
D. Some Cantonese like neither peppers nor sweets.
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
        
Response: ['A. 0  \nB. 0  \nC. 1  \nD. 0  ']


['A. 0  \nB. 0  \nC. 1  \nD. 0  ']

In [15]:
def ask_llm_mc_question_1hop_withconf_fewshot(context, question, choices, fewshot=True, debug=False):
    '''
    Ask LLM to respond with  confidence for all  options, using few-shot prompting
    '''

    choices_text = format_choices(choices)
    fewshot_example1_choices_text = format_choices(fewshot_example1_choices)
    fewshot_example2_choices_text = format_choices(fewshot_example2_choices)

    # Construct the prompt
    prompt_task = "Task: Provide the probability (between 0 and 1) for each option being correct, corresponding to options A, B, C, and D.  Return exactly 4 probabilities. No explanation is needed, only the probabilities."

    if fewshot:
        prompt_examples = f"""
        Example 1:
        Context: {fewshot_example1_context}
        Question: {fewshot_example1_question}
        Options: {fewshot_example1_choices_text}
        Answer: {fewshot_example1_answer_probabilities}
        Example 2:
        Context: {fewshot_example2_context}
        Question: {fewshot_example2_question}
        Options: {fewshot_example2_choices_text}
        Answer: {fewshot_example2_answer_probabilities}
        """
    else:
        prompt_examples = ""

    if context!='#':
        prompt_actual_question = f"""
        Actual Question:
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Answer:
        """
    else:
        prompt_actual_question = f"""
        Actual Question:
        Question: {question}
        Options: {choices_text}
        Answer:
    """
    prompt = prompt_task + prompt_examples + prompt_actual_question

    # Call the OpenAI API with the constructed prompt
    response = lm(prompt, temperature=0, n=1)

    if debug:
        print(f"Prompt: {prompt}")
        print(f"Response: {response}")

    return response

In [16]:
# Get the LLM's answer
ask_llm_mc_question_1hop_withconf_fewshot(test_context, test_question, test_choices, fewshot=True, debug=True)

Prompt: Task: Provide the probability (between 0 and 1) for each option being correct, corresponding to options A, B, C, and D.  Return exactly 4 probabilities. No explanation is needed, only the probabilities.
        Example 1:
        Context: Continuous exposure to indoor fluorescent lights is beneficial to the health of hamsters with heart disease. One group of hamsters exposed to continuous exposure to fluorescent lights has an average lifespan that is 2.5% longer than another one of the same species but living in a black wall.
        Question: Which of the following questions was the initial motivation for conducting the above experiment?
        Options: A. Can hospital light therapy be proved to promote patient recovery?
B. Which one lives longer, the hamster living under the light or the hamster living in the dark?
C. What kind of illness does the hamster have?
D. Do some hamsters need a period of darkness?
        Answer: [0.75, 0.1, 0.02, 0.13]
        Example 2:
        C

['[0.05, 0.1, 0.8, 0.05]']

### Iterate over dataset and store results

In [None]:
from tqdm import tqdm
import ast

commonsense_qa_trainset_results_conf={}
for idx,row in tqdm(train_df[train_df['ds_name']=='commonsense_qa'].iterrows()):
    answer=ask_llm_mc_question_1hop_withconf(row['context'], row['question'], ast.literal_eval(row['options']))
    commonsense_qa_trainset_results_conf[idx]=(answer)

## TODO: Logit based expt
see here : https://cookbook.openai.com/examples/using_logprobs

In [None]:
from openai import OpenAI
from math import exp
import numpy as np


In [None]:
client = OpenAI(api_key='<add_key_here>')



In [None]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4o-mini",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
    top_logprobs=None,
) -> str:
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

In [None]:
QUERY_PROMPT = """Provide the probability (between 0 and 1) for each option being correct, given the following:
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
"""


In [None]:
test_context = "Some Cantonese don't like chili, so some southerners don't like chili."
test_question = "Which of the following can guarantee the above argument?"
test_choices = ["Some Cantonese love chili.", "Some people who like peppers are southerners.", "All Cantonese are southerners.", "Some Cantonese like neither peppers nor sweets."]


In [None]:
API_RESPONSE = get_completion(
    [{"role": "user", "content": QUERY_PROMPT.format(context=test_context,question=test_question,choices_text=test_choices)}],
    model="gpt-4o-mini",
    logprobs=True,
    top_logprobs=10,
)
top_two_logprobs = API_RESPONSE.choices[0].logprobs.content[0].top_logprobs

print(top_two_logprobs)

[TopLogprob(token='[', bytes=[91], logprob=-0.01817628), TopLogprob(token='0', bytes=[48], logprob=-4.268176), TopLogprob(token="['", bytes=[91, 39], logprob=-5.643176), TopLogprob(token='1', bytes=[49], logprob=-7.893176), TopLogprob(token='-', bytes=[45], logprob=-10.143176), TopLogprob(token='[\n', bytes=[91, 10], logprob=-11.393176), TopLogprob(token='Prob', bytes=[80, 114, 111, 98], logprob=-11.393176), TopLogprob(token='Here', bytes=[72, 101, 114, 101], logprob=-11.643176), TopLogprob(token='Probability', bytes=[80, 114, 111, 98, 97, 98, 105, 108, 105, 116, 121], logprob=-12.393176), TopLogprob(token='Option', bytes=[79, 112, 116, 105, 111, 110], logprob=-12.518176)]


In [None]:
html_content = ""
for i, logprob in enumerate(top_two_logprobs, start=1):
    html_content += (
        f"<span style='color: cyan'>Output token {i}:</span> {logprob.token}, "
        f"<span style='color: darkorange'>logprobs:</span> {logprob.logprob}, "
        f"<span style='color: magenta'>linear probability:</span> {np.round(np.exp(logprob.logprob)*100,2)}%<br>"
    )
display(HTML(html_content))
print("\n")





