In [None]:
!pip install pandas
!pip install openai
!pip install backoff
!pip install anthropic
!pip install -q -U google-generativeai

In [None]:
import re
import time
import os
import openai
import anthropic
import google.generativeai as genai
import backoff
import pandas as pd
import requests
import json

### OPENAI's GPT

In [4]:
## ChatGPT function call
# Client is created with no explicit arguments; no API key is passed here.
client_OpenAI = openai.OpenAI()
# Model identifiers used as the `model` argument of gptQuery / query.
CHATGPT = 'gpt-3.5-turbo'
FURBO = 'gpt-4-0125-preview'

@backoff.on_exception(backoff.expo, openai.RateLimitError, max_time=6000)
def chat_completions_with_backoff(**kwargs):
    """Forward ``kwargs`` unchanged to ``client_OpenAI.chat.completions.create``.

    The decorator retries the call with exponential backoff whenever
    ``openai.RateLimitError`` is raised, bounded by ``max_time=6000``.
    """
    completion = client_OpenAI.chat.completions.create(**kwargs)
    return completion

def gptQuery(prompt, history = [], model=CHATGPT, temperature = 0, n=1, echo = False):
    """Send ``prompt`` (preceded by ``history``) to an OpenAI chat model.

    Args:
        prompt: Text of the new user turn.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts. It is
            only read: a new list is built for the request.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        n: Number of completions requested.
        echo: When true, print the outgoing messages and the first completion.

    Returns:
        The stripped text of the first completion when ``n == 1``; otherwise a
        list with the stripped text of every returned choice.
    """
    messages = history + [{"role": "user", "content": prompt}]
    completion = chat_completions_with_backoff(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=2048,
        n=n,
    )
    if echo:
        print(messages)
        print(completion.choices[0].message.content.strip())
    if n != 1:
        return [choice.message.content.strip() for choice in completion.choices]
    return completion.choices[0].message.content.strip()

# Smoke test: sends one live request through gptQuery and prints the reply.
print(gptQuery(prompt= "Hello Test", model =CHATGPT))

Hello! How can I assist you today?


### Anthropic's Claude

In [None]:
## anthropic's function call
# Client is created with no explicit arguments; no API key is passed here.
client_anthropic = anthropic.Anthropic()
# Model identifiers used as the `model` argument of claudeQuery / query.
OPUS = "claude-3-opus-20240229"
SONNET = "claude-3-sonnet-20240229"

# Claude's formatting:
## [{"role":"user","content":"Hello"}, {"role":"assistant","content":"Greeting"}]
# claudeQuery takes no `n` argument: one reply per call.
def claudeQuery(prompt, history = [], model=SONNET, temperature = 0, echo=False):
    """Send ``prompt`` (preceded by ``history``) to an Anthropic model.

    Args:
        prompt: Text of the new user turn.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts. It is
            only read: a new list is built for the request.
        model: Model identifier passed to the API.
        temperature: Sampling temperature passed to the API.
        echo: When true, print the outgoing messages and the reply text.

    Returns:
        The ``text`` of the first content item of the reply.
    """
    conversation = history + [{"role": "user", "content": prompt}]
    reply = client_anthropic.messages.create(
        model=model,
        max_tokens=2048,
        temperature=temperature,
        system="",
        messages=conversation,
    )
    if echo:
        print(conversation)
        print(reply.content[0].text)
    # Fixed one-second pause after every call.
    time.sleep(1)
    return reply.content[0].text

# Smoke test: sends one live request through claudeQuery and prints the reply.
print(claudeQuery(prompt = "Hello Test", model = SONNET))

### Google's Gemini

In [14]:
# The Gemini API key is read from the GOOGLE_API_KEY environment variable.
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
GEMINI1 = "gemini-1.0-pro"

## turn everything off: every harm category below gets the BLOCK_NONE threshold
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

## Gemini's history formatting:
## [{"role":"user","parts":["Hello"]},
##  {"role":"model","parts":["Greetings!"]}]

## Only one model is available right now.
def geminiQuery(prompt, history = [], model = GEMINI1, temperature = 0, echo=False):
    """Send ``prompt`` to a Gemini model inside a chat seeded with ``history``.

    Args:
        prompt: Text of the new user message.
        history: Earlier turns, passed as-is to ``start_chat`` (see the format
            note above this function).
        model: Model name passed to ``genai.GenerativeModel``.
        temperature: Sampling temperature placed in the generation config.
        echo: When true, print the chat's last response object.

    Returns:
        The ``text`` of the chat's last response.
    """
    chat_model = genai.GenerativeModel(
        model_name=model,
        generation_config={
            "temperature": temperature,
            "top_p": 1,
            "top_k": 1,
            "max_output_tokens": 2048,
        },
        safety_settings=safety_settings,
    )
    chat = chat_model.start_chat(history=history)
    chat.send_message(prompt)
    if echo:
        print(chat.last)
    return chat.last.text

# Smoke test: sends one live request through geminiQuery and prints the reply.
print(geminiQuery("Hello Test."))

Hello there! How can I assist you today?


### OpenRouter's Models

In [16]:
# OpenRouter model identifiers, sent as the "model" field by queryOpenRouter.
MISTRAL7B = "mistralai/mistral-7b-instruct:free"
MIXTRAL87B = "mistralai/mixtral-8x7b-instruct"
# NOTE(review): GEMINIPRO is not handled by the `query` dispatcher in this notebook.
GEMINIPRO = "google/gemini-pro"

def queryOpenRouter(prompt, history = [], model = MIXTRAL87B, temperature = 0, echo=False):
    """Send ``prompt`` (preceded by ``history``) to a model through OpenRouter.

    Args:
        prompt: Text of the new user turn.
        history: Earlier turns as ``{"role": ..., "content": ...}`` dicts. It is
            only read: a new list is built for the request.
        model: OpenRouter model identifier.
        temperature: Sampling temperature sent in the request body.
        echo: When true, print the reply text.

    Returns:
        The stripped content of the first choice in the response.

    Raises:
        requests.HTTPError: If OpenRouter answers with a 4xx/5xx status.
        KeyError: If the response body carries no ``choices`` (for example an
            error payload delivered with a success status).
    """
    http_response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={ "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}"},
        data=json.dumps({
                "model": model, # Optional
                "messages": history + [{"role": "user", "content": prompt}],
                "provider": { "allow_fallbacks": False },
                "temperature": temperature
                })
    )
    # Fail on HTTP errors here rather than later as an opaque KeyError('choices').
    http_response.raise_for_status()
    payload = http_response.json()
    # Fixed one-second pause after every call.
    time.sleep(1)
    if 'choices' not in payload:
        # Same exception type the plain lookup would raise, but with the body attached.
        raise KeyError(f"'choices' missing from OpenRouter response: {payload}")
    reply = payload['choices'][0]['message']['content'].strip()
    if echo:
        print(reply)
    return reply

# Smoke test: sends one live request through queryOpenRouter and prints the reply.
print(queryOpenRouter("Hello Test."))

Hello there! How can I assist you today in this roleplay chat?


In [6]:
## Helper function to call different models
def query(prompt, history, model, **args):
    """Dispatch one prompt to the backend function that serves ``model``.

    Args:
        prompt: Text of the new user turn.
        history: Earlier turns, in the format the selected backend expects.
        model: One of the model constants defined in this notebook.
        **args: Extra keyword arguments forwarded to the backend function
            (e.g. ``temperature``, ``echo``).

    Returns:
        Whatever the selected backend function returns.

    Raises:
        ValueError: If ``model`` is not one of the known constants. Previously
            this case returned ``None``, which only failed later in the caller.
    """
    if model in (CHATGPT, FURBO):
        return gptQuery(prompt, history = history, model = model, **args)
    if model in (SONNET, OPUS):
        return claudeQuery(prompt, history = history, model = model, **args)
    if model == GEMINI1:
        return geminiQuery(prompt, history = history, model = model, **args)
    # GEMINIPRO is declared with the other OpenRouter model ids, so it is routed there too.
    if model in (MIXTRAL87B, MISTRAL7B, GEMINIPRO):
        return queryOpenRouter(prompt, history=history, model=model, **args)
    raise ValueError(f"Unsupported model: {model!r}")

## Experiment 1: The free-choice paradigm

In [7]:
# Instruction prefix prepended to each prompt, keyed by perspective ('self' adds nothing).
# NOTE: `perspectives` and `decision_question` are assigned again in the Experiment 2
# cell, and genAnwswerExperiment1 reads them when it is called. Re-run this cell
# before running Experiment 1 if the Experiment 2 cell has already been executed.
perspectives = {'self':"",
               'roleplay':"Act as an average American living in Florida and answer the following questions in order.\n",
               'predict':"Your task is to predict how an average American living in Florida would answer the following questions in order.\n"}
#perspectives = {'predict':"Your task is to predict how an average american living in Florida would answer the following questions in order.\n"}

# Items (alphabetically sorted), ten per group; the group key is substituted into the prompts.
items = {'object': ['Blender','Coffee maker','Desk lamp','Kettle',
                    'Photo Frame','Radio','Sandwich maker','Sleeping bag','Table Fan','Toaster'],
         'food': ['Chicken wings','French fries','Fried chicken','Grilled Cheeses','Hamburgers',
                  'Mashed potatoes','Pasta','Pizza','Salad','Sushi'],
         'fruit': ['Apple','Banana','Cherry','Kiwi','Mango',
                   'Orange','Papaya','Pineapple','Strawberry','Watermelon']}

# Flow: Rate1 -> Decision -> Rate2
## Rate1: All of them at once (all) vs. one at a time separately (indi).
## Decision: Manually identify the top pair and the bottom pair.
### - Target: high desirability: [8, 7]
### - Identify at most 5 pairs if possible.
### - High disso = [0, 1]. 
### - Low disso = >= 3. 
## Rate2: [All of them at once (all) vs. one at a time separately (indi)] x [With the previous rating (with) vs. Without (wout)]
# NOTE(review): in the code, pairs are selected by findDissonantPairs using the
# constants below (anchor rating 7-9, low dissonance >= 2), which differs from the
# "[8, 7]" and ">= 3" figures in the notes above - confirm which is intended.

# Prompt templates; "{to_be_replaced}" is replaced with the item-group key.
rating_q_all = "Q: Please rate the desirability of the following ten {to_be_replaced}s from 0 to 10, where 0 is 'definitely not at all desirable' and 10 is 'extremely desirable.' Only write down numbers separeted by commas inside [ ]." 
# rating_q_indi is only referenced from commented-out code in genAnwswerExperiment1.
rating_q_indi = "Q: Please rate the desirability of the following {to_be_replaced} from 0 to 10, where 0 is 'definitely not at all desirable' and 10 is 'extremely desirable.' ONLY write a number inside [ ]." 

decision_question = "Q: If you must choose one, which of the following {to_be_replaced}s would you prefer to have?"

# An item anchors a pair only if its rating lies in [THRESHOLD_LOWER, THRESHOLD_UPPER].
THRESHOLD_UPPER = 9
THRESHOLD_LOWER = 7

# Rating difference <= TOP_THRESHOLD goes to the "top" list,
# >= BOTTOM_THRESHOLD goes to the "bottom" list (see findDissonantPairs).
TOP_THRESHOLD = 1
BOTTOM_THRESHOLD = 2

# Maximum number of pairs taken from each of the two lists.
NUM_PAIRS = 5

def findDissonantPairs(items, ratings, upper=None, lower=None,
                       top_threshold=None, bottom_threshold=None):
    """Pair up items by how close their ratings are.

    Each item whose rating lies in ``[lower, upper]`` acts as an anchor and is
    paired with every item that sorts below it (by rating, then by name).

    Args:
        items: Item names.
        ratings: One rating per item, in the same order as ``items``.
        upper: Highest anchor rating allowed; defaults to ``THRESHOLD_UPPER``.
        lower: Lowest anchor rating allowed; defaults to ``THRESHOLD_LOWER``.
        top_threshold: A pair whose rating difference is ``<=`` this value goes
            to the "top" list; defaults to ``TOP_THRESHOLD``.
        bottom_threshold: A pair whose rating difference is ``>=`` this value
            goes to the "bottom" list; defaults to ``BOTTOM_THRESHOLD``.

    Returns:
        ``(top, bottom)``: two lists of ``(diff, anchor_item, other_item)``
        tuples. ``top`` is sorted ascending and ``bottom`` descending.

    Raises:
        AssertionError: If ``items`` and ``ratings`` differ in length.
    """
    assert len(items) == len(ratings)

    # Fall back to the notebook-level constants only for thresholds not supplied.
    upper = THRESHOLD_UPPER if upper is None else upper
    lower = THRESHOLD_LOWER if lower is None else lower
    top_threshold = TOP_THRESHOLD if top_threshold is None else top_threshold
    bottom_threshold = BOTTOM_THRESHOLD if bottom_threshold is None else bottom_threshold

    ranked = sorted(zip(ratings, items))  # low to high
    top = []
    bottom = []
    for i in range(len(ranked) - 1, -1, -1):
        anchor_rating, anchor_item = ranked[i]
        if anchor_rating > upper:
            continue
        if anchor_rating < lower:
            # Ratings only get lower from here on, so no further anchors exist.
            break
        for j in range(i - 1, -1, -1):
            other_rating, other_item = ranked[j]
            diff = anchor_rating - other_rating
            if diff <= top_threshold:
                top.append((diff, anchor_item, other_item))
            if diff >= bottom_threshold:
                bottom.append((diff, anchor_item, other_item))

    return sorted(top), sorted(bottom, reverse=True)


def updateHistory(prompt, response, model=CHATGPT):
    """Build the two history entries for one completed exchange.

    The result uses the ``{"role": ..., "content": ...}`` layout shared by the
    OpenAI, Claude and OpenRouter helpers:
    ``[{"role":"user","content":"Hello"}, {"role":"assistant","content":"Greeting"}]``.
    It does not produce the ``parts``-based layout noted next to geminiQuery.

    Args:
        prompt: Text of the user turn.
        response: Text of the assistant turn.
        model: Accepted but not used.

    Returns:
        A two-element list: the user turn followed by the assistant turn.
    """
    user_turn = {"role": "user", "content": prompt}
    assistant_turn = {"role": "assistant", "content": response}
    return [user_turn, assistant_turn]


def genOneAnswerEX1(perspective, item, items, first_ratings, choices, decision, 
                    forced_decision, second_ratings, prior):
    """Assemble one Experiment 1 result row as a dict.

    Args:
        perspective: Perspective key for this row.
        item: Item-group key (e.g. 'fruit').
        items: Item names of that group.
        first_ratings: Ratings from the first round, aligned with ``items``.
        choices: ``(diff, item_a, item_b)`` tuple describing the pair.
        decision: The decision text returned by the model.
        forced_decision: The decision that was placed in the history.
        second_ratings: Ratings from the second round, aligned with ``items``.
        prior: Label for the history variant ('with' / 'without').

    Returns:
        A dict with ``<item>_1`` / ``<item>_2`` rating columns plus the pair,
        decision and condition fields. A rating list shorter than ``items``
        yields ``-1`` for the missing positions (the same sentinel
        parseAnswerToList uses) instead of raising ``IndexError``.
    """
    def _rating_at(ratings, index):
        # -1 marks a rating the model did not supply.
        return ratings[index] if index < len(ratings) else -1

    result = {'perspective': perspective, 'item': item}
    for index, name in enumerate(items):
        result[name + '_1'] = _rating_at(first_ratings, index)
        result[name + '_2'] = _rating_at(second_ratings, index)
    result['choices_1'] = choices[1]
    result['choices_2'] = choices[2]
    result['choices_diff'] = choices[0]
    result['decision'] = decision
    result['forced_decision'] = forced_decision
    result['prior'] = prior
    return result

## Weaker models still don't follow the instruction all the time.
def parseResponse(response):
    """Extract the numbers written inside square brackets in ``response``.

    Returns:
        ``response`` unchanged when it contains no ``[...]`` group on a single
        line; otherwise a comma-joined string of every number found inside the
        bracketed groups (an empty string if the groups hold no numbers).
    """
    bracket_groups = re.findall(r'\[(.*?)\]', response)
    if not bracket_groups:
        # Not a bracketed answer: hand the text back as-is.
        return response
    number_pattern = r'-?\d+\.?\d*'
    found_numbers = [
        number
        for group in bracket_groups
        for number in re.findall(number_pattern, group)
    ]
    return ','.join(found_numbers)

def parseAnswerToList(response, fallback_len=10):
    """Turn a comma-separated string of integers into a list of ints.

    Args:
        response: Text such as ``"8,7,9"``.
        fallback_len: Length of the all ``-1`` list returned when parsing fails.
            Defaults to 10, the number of items per group in this notebook.

    Returns:
        The parsed integers, or ``[-1] * fallback_len`` when ``response`` is not
        a string or any piece is not an integer literal (e.g. ``"7.5"``).
    """
    try:
        return [int(piece.strip()) for piece in response.split(',')]
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures are absorbed; KeyboardInterrupt etc. propagate.
        return [-1] * fallback_len

def genAnwswerExperiment1(model, **args):
    """Run Experiment 1 (free-choice paradigm) against ``model``.

    For every perspective and every item group this:
      1. asks the model to rate all items of the group at once,
      2. derives "top" and "bottom" pairs from those ratings
         (findDissonantPairs), keeping at most NUM_PAIRS of each,
      3. for each pair asks which item the model prefers, once with the rating
         exchange as history and once with no history,
      4. for each of the two possible choices, places that choice in the history
         as the assistant's answer and asks for the ratings again, once with
         the first rating exchange in the history and once without.

    Args:
        model: Model constant understood by ``query``.
        **args: Extra keyword arguments forwarded to every ``query`` call.

    Returns:
        A list of row dicts built by genOneAnswerEX1; two rows ('with' and
        'without') per forced choice.
    """

    results = []
    # Loop through three perspectives
    for perspective in perspectives:
        print(f"Begin ---- {perspective} ---- ")
        # First ask to rate: all ten of them.
        for key in items: 
            rating_q_all_items = rating_q_all.replace("{to_be_replaced}",key) + '\n' + ', '.join(items[key])
            rating_q_all_items_inst = perspectives[perspective] + rating_q_all_items
            first_ratings_all = parseResponse(query(rating_q_all_items_inst, [], model, **args))
            # A reply that cannot be parsed becomes ten -1 values, which yields no pairs below.
            first_ratings_all_list = parseAnswerToList(first_ratings_all)
            print(first_ratings_all_list)
            
            ## Individual (Not used for now since weaker models struggle to do somehow.)
            # ratings_indi = ""
            # for i in range(len(items[key])):
            #     rating_q_indi_item_temp = rating_q_indi.replace("{to_be_replaced}",key) + '\n' + key + ": " + items[key][i]
            #     print(rating_q_indi_item_temp)
            #     temp = query(rating_q_indi_item_temp, model = model)
            #     print(temp)
            #     ratings_indi += temp + ','
            # ratings_indi = ratings_indi[:len(ratings_indi)-1]
            # print(ratings_indi)

            # Then we sort them by rating. 
            # findDissonantPairs asserts that the number of ratings equals the number of items.
            top_pairs, bottom_pairs = findDissonantPairs(items[key], first_ratings_all_list)
            pairs = top_pairs[:min(NUM_PAIRS, len(top_pairs))] + bottom_pairs[:min(NUM_PAIRS, len(bottom_pairs))]
            temp_history = updateHistory(rating_q_all_items_inst, f'[{first_ratings_all}]') # putting [ ] back in
            for pair in pairs:
                print(pair)
                # Then we ask for decisions. 
                decision_q = decision_question.replace("{to_be_replaced}",key) + f" {pair[1]} or {pair[2]}? Only output one of the {key}s." 
                # With history: the perspective prefix is already part of temp_history's first turn.
                decision_with_rating = parseResponse(query(decision_q, temp_history, model, **args))
                # Without history: the perspective prefix is prepended to the question itself.
                decision_q_inst = perspectives[perspective] + decision_q
                decision_wout_rating = parseResponse(query(decision_q_inst, [], model, **args))
                print("===================")
                # We force the two decisions by putting the words in the mouth. 
                # Create history -> assign the decision 
                # prompt to rate again
                for d in [pair[1], pair[2]]:
                    second_rating_all_wh = parseResponse(query(rating_q_all_items, temp_history + updateHistory(decision_q, d), model, **args))
                    second_rating_all_wouth = parseResponse(query(rating_q_all_items, updateHistory(decision_q_inst, d), model, **args))
                    ## save
                    # The model's free decision is stored alongside each forced decision `d`.
                    results.append(genOneAnswerEX1(perspective,key,items[key], first_ratings_all_list, pair,
                                                        decision_with_rating, d, 
                                                        parseAnswerToList(second_rating_all_wh), 'with'))
                    results.append(genOneAnswerEX1(perspective,key,items[key], first_ratings_all_list, pair,
                                                        decision_wout_rating, d, 
                                                        parseAnswerToList(second_rating_all_wouth), 'without'))
                print("************************************")
        
    return results


In [None]:
# Running the test #

# ChatGPT struggles to answer questions individually or refuses to answer.
# ChatGPT struggles with the rating questions and occasionally answers all 0.
# ChatGPT: I'm sorry, but I am not able to provide ratings for the same set of fruits again. If you have any other questions or need assistance with something else, feel free to ask!
# The Gemini API returns 500 errors far too often. Unstable.
# Claude likes to output more than the instruction asks for (e.g. role-play framing) -> need to parse results

#sonnet_ex1 = genAnwswerExperiment1(SONNET, echo=True)
#sonnet_ex1_df = pd.DataFrame(sonnet_ex1)
#sonnet_ex1_df.to_csv('experiment1_'+SONNET+'.csv',index=False)

#opus_ex1 = genAnwswerExperiment1(OPUS, echo=True)
#opus_ex1_df = pd.DataFrame(opus_ex1)
#opus_ex1_df.to_csv('experiment1_'+OPUS+'.csv',index=False)

#chatgpt_ex1 = genAnwswerExperiment1(CHATGPT, echo=True)
#chatgpt_ex1_df = pd.DataFrame(chatgpt_ex1)
#chatgpt_ex1_df.to_csv('experiment1_'+CHATGPT+'.csv',index=False)

#furbo_ex1 = genAnwswerExperiment1(FURBO, echo=True)
#furbo_ex1_df = pd.DataFrame(furbo_ex1)
#furbo_ex1_df.to_csv('experiment1_'+FURBO+'.csv',index=False)

# Active run: Mixtral via OpenRouter; results are written to a CSV in the working directory.
mixtral_ex1 = genAnwswerExperiment1(MIXTRAL87B, echo=True)
mixtral_ex1_df = pd.DataFrame(mixtral_ex1)
mixtral_ex1_df.to_csv('experiment1_MIXTRAL-8x7b-instruct.csv',index=False)


## Experiment 2: Hurricane

In [18]:
## Hurricane Experiment
# NOTE: this cell reassigns `perspectives` and `decision_question`, which the
# Experiment 1 cell also defines and genAnwswerExperiment1 reads at call time.
# Re-run the Experiment 1 definitions cell before running Experiment 1 again.
# NOTE(review): "american" is lowercase here but capitalised in Experiment 1.
perspectives = {'self':"",
               'roleplay':"Act as an average american living in Florida and answer the following questions in order.\n",
               'predict':"Your task is to predict how an average american living in Florida would answer the following questions in order.\n"}
#perspectives = {'predict':"Your task is to predict how an average american living in Florida would answer the following questions in order.\n"} # for testing

instruction = "Instruction: You will be presented with a sequence of hurricane messages and asked about your thoughts on the hurricane and its impact. Please read the messages carefully before answering questions.\n"

## First Message same for all conditions.
first_message = '''This is a hurricane update from National Hurricane Center:
- The storm is expected to be a category 3 hurricane and make landfall on the Florida east coast in 2 days. However, there is still a lot of uncertainty about the hurricane’s impacts.
- Our model roughly estimates a maximum sustained wind speed of approximately 120 mph +/- 30 mph (90 - 150 mph).
- Our model roughly estimates the storm is likely to cause approximately 12 inches +/- 8 inches (4 - 20 inches) of flooding.
'''

# NOTE(review): in this message (and in second_message_cat2_high) the wind and
# flooding bullets share one line, unlike first_message and second_message_cat4_low.
second_message_cat4_high = '''This is a hurricane update from National Hurricane Center:
- The storm is now expected to be a category 4 hurricane and make landfall on the Florida east coast in 24 hours. However, there is still a lot of uncertainty about the hurricane's impacts.
- Our model roughly estimates a maximum sustained wind speed of approximately 140 mph +/- 20 mph (120 - 160 mph). - Our model roughly estimates the storm is likely to cause approximately 16 inches +/- 8 inches (8 - 24 inches) of flooding.
- Authority has warned that it is too late to evacuate and too soon to return.
'''

# NOTE(review): "140 mph +/- 5 mph" corresponds to 135 - 145 mph, but the text
# below says "(135 - 155 mph)" - confirm before changing, since it is a stimulus.
second_message_cat4_low = '''This is a hurricane update from National Hurricane Center:
- The storm is now expected to be a category 4 hurricane and make landfall on the Florida east coast in 24 hours. As the hurricane gets closer, the predictions of the hurricane's impacts have become more accurate.
- Our model predicts with high confidence that the maximum sustained wind speed will be 140 mph +/- 5 mph (135 - 155 mph).
- Our model predicts with high confidence that the storm will cause 16 inches +/- 2 inches (14 - 18 inches) of flooding.
- Authority has warned that it is too late to evacuate and too soon to return.
'''

second_message_cat2_high = '''This is a hurricane update from National Hurricane Center:
- The storm is now expected to be a category 2 hurricane and make landfall on the Florida east coast in 24 hours. However, there is still a lot of uncertainty about the hurricane's impacts.
- Our model roughly estimates a maximum sustained wind speed of approximately 100 mph +/- 20 mph (80 - 120 mph). - Our model roughly estimates the storm is likely to cause approximately 8 inches +/- 6 inches (2 - 14 inches) of flooding.
- Authority has warned that it is too late to evacuate and too soon to return.
'''

## Second message for each of the four conditions.
## 'worse' and 'utility' share the same message; they differ only in context_condition.
second_messages = {
    'worse':second_message_cat4_high,
    'better':second_message_cat2_high,
    'uncertainty':second_message_cat4_low,
    'utility':second_message_cat4_high
}
# NOTE(review): both questions say "in 2 days" and are reused unchanged after the
# second message, which states landfall in 24 hours; windspeed_question also
# contains a doubled comma ("2 days,,"). Confirm whether these are intended.
flooding_question = "When the hurricane makes landfall at the Florida east coast in 2 days, how high the Flood depth (inch) would be?\n"
windspeed_question = "When the hurricane makes landfall at the Florida east coast in 2 days,, what would maximum sustained wind speed (mph) be?\n"
question_instruction = "Only answer one number for each question and put both of them in [ ] (e.g., [1, 2])."


#After the first message
context_high_util = "Now, imagine that you happen to be in Palm Bay, a city on the east coast of Florida on the predicted path of the hurricane. You currently stay in a one-story, single-detached house.\n"
context_low_util  = "Now, imagine that you happen to be in Palm Bay, a city on the east coast of Florida on the predicted path of the hurricane. You currently stay on the fifth floor of a well-built apartment.\n"

# Location context used for each condition; only 'utility' uses the apartment context.
context_condition = {
    'worse':context_high_util,
    'better':context_high_util,
    'uncertainty':context_high_util,
    'utility':context_low_util
}

# NOTE(review): the "Stay" option mentions a one-story house, but this question is
# also used with the 'utility' condition, whose context is a fifth-floor apartment.
decision_question = """In this situation, what would you do? 
Stay: Stay in your place, a one-story, single-detached house, and ride out the storm.
Evacuate: Evacuate to a hotel up north paying at least $150 per night.
Only output 'Stay' or 'Evacuate'.
"""

def genOneDataEx2(n, perspective, condition,  first_ratings, decision, forced_decision, second_ratings, prior):
    """Assemble one Experiment 2 result row as a dict.

    Args:
        n: Repetition index (0 is the temperature-0 run, larger values are sampled runs).
        perspective: Perspective key for this row.
        condition: Condition key for this row.
        first_ratings: Comma-separated "flood,wind" answer to the first message.
        decision: The decision text returned by the model.
        forced_decision: The decision that was placed in the history.
        second_ratings: Comma-separated "flood,wind" answer to the second message.
        prior: Label for the history variant ('with' / 'without').

    Returns:
        A dict with ``flood_1``, ``wind_1``, ``flood_2``, ``wind_2`` (strings as
        they appear in the answers) plus the decision and condition fields. If
        an answer has fewer than two comma-separated values, the missing ones
        are stored as ``None`` instead of raising ``IndexError``, so one
        malformed reply does not abort a long run.
    """
    def _flood_and_wind(ratings):
        parts = ratings.split(',')
        # Pad to two entries; multiplying by a non-positive count adds nothing.
        parts += [None] * (2 - len(parts))
        return parts[0], parts[1]

    flood_1, wind_1 = _flood_and_wind(first_ratings)
    flood_2, wind_2 = _flood_and_wind(second_ratings)

    result = {}
    result['n'] = n #n = random, n  = 0 == temperature 0.
    result['perspective'] = perspective
    result['condition'] = condition

    result['flood_1'] = flood_1
    result['wind_1']  = wind_1
    result['flood_2'] = flood_2
    result['wind_2']  = wind_2

    result['decision'] = decision
    result['forced_decision'] = forced_decision
    result['prior'] = prior
    return result

def genAnswerExperiment2(model, start = 0, end = 1, **args):
    """Run Experiment 2 (hurricane messages) against ``model``.

    For every perspective, condition and repetition ``i`` in
    ``range(start, end)`` this:
      1. sends the first hurricane message and asks for flood / wind estimates,
      2. asks for a Stay/Evacuate decision, once with the first exchange as
         history and once in a single prompt without it,
      3. for each forced decision ('Stay', 'Evacuate'), places it in the history
         as the assistant's answer, sends the second message and asks for the
         estimates again, once with the full history and once with only the
         decision exchange.

    Repetition 0 uses temperature 0; later repetitions use temperature 1.0.

    Args:
        model: Model constant understood by ``query``.
        start: First repetition index (inclusive).
        end: Last repetition index (exclusive).
        **args: Extra keyword arguments forwarded to every ``query`` call
            (``temperature`` is supplied here and must not be passed in).

    Returns:
        A list of row dicts built by genOneDataEx2; two rows ('with' and
        'without') per forced decision.
    """
    results = []
    for perspective in perspectives:
        print(f'====={perspective}=====')
        for condition in second_messages:
            print(f'----{condition}----')
            for i in range(start, end):
                temperature = 1.0 if i > 0 else 0
                # first query 
                first_query = f"{perspectives[perspective]} {instruction} {first_message} {flooding_question} {windspeed_question} {question_instruction}"
                first_responses = parseResponse(query(first_query, [], model, temperature = temperature, **args))
                temp_history = updateHistory(first_query, f'[{first_responses}]')
                
                # decision 
                decision_query_with = f"{context_condition[condition]} {decision_question}"
                # Send the prompt that includes the condition's context; it is the
                # same prompt recorded in `full_history` below. (The bare
                # decision_question was sent here before, dropping the context.)
                decision_responses_with = parseResponse(query(decision_query_with, temp_history, model, temperature = temperature, **args))
                
                decision_query_wout = f"{perspectives[perspective]} {instruction} {first_message} {context_condition[condition]} {decision_question}"
                decision_responses_wout = parseResponse(query(decision_query_wout, [], model, temperature = temperature, **args))

                # Second query 
                for d in ['Stay','Evacuate']:
                    print(f'>>>{d}<<<')
                    full_history = temp_history + updateHistory(decision_query_with, d)
                    only_d_history = updateHistory(decision_query_wout, d)
                    second_query = f"[Next Day]\n {second_messages[condition]} {flooding_question} {windspeed_question} {question_instruction} " 
                    
                    second_responses_with = parseResponse(query(second_query, full_history, model, temperature = temperature, **args))
                    second_responses_wout = parseResponse(query(second_query, only_d_history, model, temperature = temperature, **args))

                    ## Save 
                    results.append(genOneDataEx2(i, perspective, condition, first_responses, decision_responses_with, 
                                                d, second_responses_with, "with"))
                    results.append(genOneDataEx2(i, perspective, condition, first_responses, decision_responses_wout, 
                                                d, second_responses_wout, "without"))

    return results

In [None]:
## Run and Save data
## Note: Manually clean up after

# sonnet_ex2 = genAnswerExperiment2(SONNET, 0, 21, echo=True)
# sonnet_ex2_df = pd.DataFrame(sonnet_ex2)
# sonnet_ex2_df.to_csv('experiment2_'+SONNET+'.csv',index=False)

# opus_ex2 = genAnswerExperiment2(OPUS, 0, 11, echo=True)
# opus_ex2_df = pd.DataFrame(opus_ex2)
# opus_ex2_df.to_csv('experiment2_'+OPUS+'0-10.csv',index=False)

# opus_ex2 = genAnswerExperiment2(OPUS, 11, 21, echo=True)
# opus_ex2_df = pd.DataFrame(opus_ex2)
# opus_ex2_df.to_csv('experiment2_'+OPUS+'11-20.csv',index=False)

# chatgpt_ex2 = genAnswerExperiment2(CHATGPT, 0, 21, echo=True)
# chatgpt_ex2_df = pd.DataFrame(chatgpt_ex2)
# chatgpt_ex2_df.to_csv('experiment2_'+CHATGPT+'.csv',index=False)

# furbo_ex2 = genAnswerExperiment2(FURBO, 0, 21,echo=True)
# furbo_ex2_df = pd.DataFrame(furbo_ex2)
# furbo_ex2_df.to_csv('experiment2_'+FURBO+'.csv',index=False)

# mixtral_ex2 = genAnswerExperiment2(MIXTRAL87B, 0, 6,echo=True)
# mixtral_ex2_df = pd.DataFrame(mixtral_ex2)
# mixtral_ex2_df.to_csv('experiment2_MIXTRAL-8x7b-instruct_0-5.csv',index=False)

# mixtral_ex2 = genAnswerExperiment2(MIXTRAL87B, 6, 11,echo=True)
# mixtral_ex2_df = pd.DataFrame(mixtral_ex2)
# mixtral_ex2_df.to_csv('experiment2_MIXTRAL-8x7b-instruct_6-10.csv',index=False)

# mixtral_ex2 = genAnswerExperiment2(MIXTRAL87B, 11, 16,echo=True)
# mixtral_ex2_df = pd.DataFrame(mixtral_ex2)
# mixtral_ex2_df.to_csv('experiment2_MIXTRAL-8x7b-instruct_11-15.csv',index=False)

# mixtral_ex2 = genAnswerExperiment2(MIXTRAL87B, 16, 21,echo=True)
# mixtral_ex2_df = pd.DataFrame(mixtral_ex2)
# mixtral_ex2_df.to_csv('experiment2_MIXTRAL-8x7b-instruct_16-21.csv',index=False)