In [None]:
import re, json, pickle
import numpy as np
from difflib import SequenceMatcher

In [None]:
def _read_json(path):
    """Return the object parsed from the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)

def _read_pickle(path):
    """Return the object unpickled from the file at `path`.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# question_id's to use
good_questions = np.load('Data/EEDI/good_questions.npy')
# key: question_id, value: problem statement
good_questions_statement = _read_json('Data/EEDI/good_questions_statement.json')
# key: question_id, value: problem answer
good_questions_answer = _read_json('Data/EEDI/good_questions_answer.json')
# key: question id, value: survey samples
surveys = _read_pickle('Data/EEDI/surveys.pkl')
# key: question id, value: synthetic profiles
synthetic_profiles = _read_pickle('Data/EEDI/synthetic_profiles.pkl')
# key: question_id, value: answer to letter mapping
good_questions_answer_to_letter = _read_json('Data/EEDI/good_questions_answer_to_letter.json')

# calculate the similarity between two strings
def similarity(a, b):
    """Return the difflib similarity ratio between `a` and `b`.

    The value is a float in [0, 1]: 1.0 when the two sequences are identical
    and 0.0 when they have nothing in common. No junk heuristic is supplied.
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

In [None]:
import os
import openai

# Read the OpenAI API key from the OPENAI_API_KEY environment variable instead
# of pasting it into the notebook, so the credential is never saved or
# committed together with the source. When the variable is unset the key is
# the empty string, and requests made through `client` will fail to
# authenticate.
api_key = os.environ.get('OPENAI_API_KEY', '')
client = openai.OpenAI(api_key = api_key)

In [None]:
# example of using the OpenAI API to simulate an answer
# The user message is the first 'Prompt' entry stored under key 0 of
# synthetic_profiles (presumably a DataFrame column -- TODO confirm).
example_messages = [
    {"role": "system", "content": "You are simulating the behaviors of humans with certain specified characteristics to help with a survey study."},
    {"role": "user", "content": synthetic_profiles[0]['Prompt'].values[0]},
]
completion = client.chat.completions.create(model="gpt-4o", messages=example_messages)

# show the text of the first returned choice
print(completion.choices[0].message.content)

I think I might struggle a bit with this problem since it seems a little tricky. But I'll give it my best shot! I think the answer could be that it's **sometimes true**, so I’ll go with that.

My answer is [[B]].


In [None]:
# call api to obtain simulated answers
synthetic_answers_raw = {}
num_of_synthetic_answers = 50

# the same system message is sent with every request
system_message = {"role": "system", "content": "You are simulating the behaviors of humans with certain specified characteristics to help with a survey study."}

for q_idx, raw_question_id in enumerate(good_questions):
    question_id = int(raw_question_id)
    responses = []
    synthetic_answers_raw[question_id] = responses
    for s_idx in range(num_of_synthetic_answers):
        # the s_idx-th 'Prompt' entry of this question's profiles is the user message
        user_message = {
            "role": "user",
            "content": synthetic_profiles[question_id]['Prompt'].values[s_idx],
        }
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[system_message, user_message],
            seed = 1,
        )
        responses.append(completion.choices[0].message.content)
        # '\r' keeps the progress report on a single, overwritten line
        print('Question ' + str(q_idx) + ' Answer ' + str(s_idx) + ' Done', end = '\r')

    # rewrite the output file after every question so finished work is on disk
    with open('Data/EEDI/synthetic answers/raw/synthetic_answers_raw_4o.pkl', 'wb') as out_file:
        pickle.dump(synthetic_answers_raw, out_file)

Question 411 Answer 49 Done

In [None]:
# find all the answers from the raw answers
# Result: synthetic_answers maps each question id to a list of answer letters,
# one per raw response, each being one of 'A', 'B', 'C', 'D'.
synthetic_answers = {}
for i in range(len(good_questions)):
    question_id = int(good_questions[i])
    synthetic_answers[question_id] = []
    for j in range(num_of_synthetic_answers):
        raw_answer = synthetic_answers_raw[question_id][j]
        # use the double brackets to indicate the answer
        answer = re.findall(r'\[\[(.*?)\]', raw_answer)
        if len(answer) == 0:
            # fail with the offending response instead of a bare IndexError
            raise ValueError(
                'No [[...]] answer found for question ' + str(question_id)
                + ', sample ' + str(j) + ': ' + repr(raw_answer)
            )
        answer = answer[0]
        # extract the option letter, if there is any; only A-D are valid options,
        # so other capitals (e.g. the 'S' of "Sometimes true") are ignored here
        # instead of breaking the later letter-to-number conversion
        answer_letter = re.findall(r'[A-D]', answer)
        # if not found, use similarity to find the most likely answer and the answer letter associated with it
        if len(answer_letter) == 0:
            # NOTE(review): assumes the mapping's values are the option texts and
            # that its entries are ordered A, B, C, D -- confirm against the data
            ans_to_letter_i = good_questions_answer_to_letter[str(question_id)]
            sims = [similarity(answer, ans_to_letter_i[key]) for key in ans_to_letter_i.keys()]
            max_sim_ind = np.argmax(sims)
            answer_letter = ['A', 'B', 'C', 'D'][max_sim_ind]
        else:
            answer_letter = answer_letter[0]
        synthetic_answers[question_id].append(answer_letter)

In [None]:
# dataset uses 1, 2, 3, 4 to represent A, B, C, D, so we need to convert the letters to numbers
dict_letter_to_number = {'A': 1, 'B': 2, 'C': 3, 'D': 4}
synthetic_answers_number = {}
for raw_question_id in good_questions:
    question_id = int(raw_question_id)
    # translate the first num_of_synthetic_answers letters of this question
    synthetic_answers_number[question_id] = [
        dict_letter_to_number[synthetic_answers[question_id][sample]]
        for sample in range(num_of_synthetic_answers)
    ]

In [None]:
# calculate the correctness of the synthetic answers
# Each entry is 1 when the simulated option number equals the stored answer
# for that question and 0 otherwise.
synthetic_answers_iscorrect = {}
for raw_question_id in good_questions:
    question_id = int(raw_question_id)
    synthetic_answers_iscorrect[question_id] = [
        int(synthetic_answers_number[question_id][sample] == good_questions_answer[str(question_id)])
        for sample in range(num_of_synthetic_answers)
    ]

In [None]:
# write the per-question correctness lists to disk as a pickle
iscorrect_path = 'Data/EEDI/synthetic answers/iscorrect/synthetic_answers_iscorrect_4o.pkl'
with open(iscorrect_path, 'wb') as out_file:
    pickle.dump(synthetic_answers_iscorrect, out_file)