In [1]:
# !pip install openai
from openai import OpenAI
import dspy
import json
import pandas as pd

In [2]:
# Evaluate correctness of the answer - depends on format - will we pass two integers here as proposed by Vineet's logic?
def evaluate_answer(model_answer, correct_answer):
    if model_answer == "A" and correct_answer == 0:
      return True
    elif model_answer == "B" and correct_answer == 1:
      return True
    elif model_answer == "C" and correct_answer == 2:
      return True
    elif model_answer == "D" and correct_answer == 3:
      return True
    else:
      return False


In [49]:
# load dataset
train_df=pd.read_csv('train_df.csv')
train_df=train_df.fillna('#')

In [4]:
lm = dspy.OpenAI(model='gpt-4o-mini', api_key='<add_your_key_here>')


In [5]:
dspy.settings.configure(lm=lm)

### Methods for confidence scoring

In [54]:
def ask_llm_mc_question_1hop(context, question, choices):
    '''
    Ask LLM to respond with correct answer and associated confidence (only for correct option)
    '''
    # Format the choices - using characters for now - maybe change to integers?
    choices_text = '\n'.join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])

    # Construct the prompt
    if context!='#':
        prompt = f"""
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Please choose the best answer by returning the letter (A, B, C, etc.) corresponding to the correct choice.  Just the letter, nothing else.
        Provide the probability that your guess is correct. Give ONLY the probability, no
        other words or explanation.
        Finally return a tuple (choice, probability) where probability is between 0 and 1.
        """
    else:
        prompt = f"""
        Question: {question}
        Options: {choices_text}
        Please choose the best answer by returning the letter (A, B, C, etc.) corresponding to the correct choice.  Just the letter, nothing else.
        Provide the probability that your guess is correct. Give ONLY the probability, no
        other words or explanation.
        Finally return a tuple (choice, probability) where probability is between 0 and 1.
        """

    # Call the OpenAI API with the constructed prompt
    response = lm(prompt, temperature=0, n=1)

    # print(f"Prompt: {prompt}")
    # print(f"Response: {response}")
    response=response[0]
    # Check if there's a comma in the response for splitting
    if ',' in response:
        answer, conf = response.split(',')
    else:
        answer = response
        conf = 0.0  # or set to a default value like 0.0
    # answer,conf = response[0].split(',')
    # print(f"Answer: {answer}")
    # print(f"Confidence: {conf}")

    return answer,conf


In [55]:
test_context = "Some Cantonese don't like chili, so some southerners don't like chili."
test_question = "Which of the following can guarantee the above argument?"
test_choices = ["Some Cantonese love chili.", "Some people who like peppers are southerners.", "All Cantonese are southerners.", "Some Cantonese like neither peppers nor sweets."]
test_correct_answer = 2

# Get the LLM's answer
ask_llm_mc_question_1hop(test_context, test_question, test_choices)


('(C', ' 0.8)')

In [106]:
def ask_llm_mc_question_1hop_withconf(context, question, choices):
    '''
    Ask LLM to respond with  confidence for all  options
    '''
    # Format the choices - using characters for now - maybe change to integers?
    choices_text = '\n'.join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])

    # Construct the prompt

    if context!='#':
        prompt = f"""
        Provide the probability (between 0 and 1) for each option being correct, given the following:
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
        """
    else:
        prompt = f"""
        Provide the probability (between 0 and 1) for each option being correct, given the following:
        Question: {question}
        Options: {choices_text}
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
    """

    # Call the OpenAI API with the constructed prompt
    response = lm(prompt, temperature=0, n=1)

    # print(f"Prompt: {prompt}")
    # print(f"Response: {response}")
    # print(response)
    # conf = response[0].split(',')
    # print(f"Answer: {answer}")
    # print(f"Confidence: {conf}")

    return response


### Iterate over dataset and store results

In [None]:
from tqdm import tqdm
import ast

commonsense_qa_trainset_results_conf={}
for idx,row in tqdm(train_df[train_df['ds_name']=='commonsense_qa'].iterrows()):
    answer=ask_llm_mc_question_1hop_withconf(row['context'], row['question'], ast.literal_eval(row['options']))
    commonsense_qa_trainset_results_conf[idx]=(answer)

## TODO: Logit based expt 
see here : https://cookbook.openai.com/examples/using_logprobs

In [59]:
from openai import OpenAI
from math import exp
import numpy as np


In [61]:
client = OpenAI(api_key='<add_key_here>')



In [62]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4o-mini",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
    top_logprobs=None,
) -> str:
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

In [63]:
QUERY_PROMPT = """Provide the probability (between 0 and 1) for each option being correct, given the following:
        Context: {context}
        Question: {question}
        Options: {choices_text}
        Return exactly 4 probabilities. No explanation is needed, only the probabilities.
"""


In [68]:
test_context = "Some Cantonese don't like chili, so some southerners don't like chili."
test_question = "Which of the following can guarantee the above argument?"
test_choices = ["Some Cantonese love chili.", "Some people who like peppers are southerners.", "All Cantonese are southerners.", "Some Cantonese like neither peppers nor sweets."]


In [67]:
API_RESPONSE = get_completion(
    [{"role": "user", "content": QUERY_PROMPT.format(context=test_context,question=test_question,choices_text=test_choices)}],
    model="gpt-4o-mini",
    logprobs=True,
    top_logprobs=10,
)
top_two_logprobs = API_RESPONSE.choices[0].logprobs.content[0].top_logprobs

print(top_two_logprobs)

[TopLogprob(token='[', bytes=[91], logprob=-0.01817628), TopLogprob(token='0', bytes=[48], logprob=-4.268176), TopLogprob(token="['", bytes=[91, 39], logprob=-5.643176), TopLogprob(token='1', bytes=[49], logprob=-7.893176), TopLogprob(token='-', bytes=[45], logprob=-10.143176), TopLogprob(token='[\n', bytes=[91, 10], logprob=-11.393176), TopLogprob(token='Prob', bytes=[80, 114, 111, 98], logprob=-11.393176), TopLogprob(token='Here', bytes=[72, 101, 114, 101], logprob=-11.643176), TopLogprob(token='Probability', bytes=[80, 114, 111, 98, 97, 98, 105, 108, 105, 116, 121], logprob=-12.393176), TopLogprob(token='Option', bytes=[79, 112, 116, 105, 111, 110], logprob=-12.518176)]


In [85]:
html_content = ""
for i, logprob in enumerate(top_two_logprobs, start=1):
    html_content += (
        f"<span style='color: cyan'>Output token {i}:</span> {logprob.token}, "
        f"<span style='color: darkorange'>logprobs:</span> {logprob.logprob}, "
        f"<span style='color: magenta'>linear probability:</span> {np.round(np.exp(logprob.logprob)*100,2)}%<br>"
    )
display(HTML(html_content))
print("\n")





