In [1]:
# Relative path of the XML topics file that is handed to load_topics().
file_path = "../data/topics/topics.txt"


import xml.etree.ElementTree as ET
import pandas as pd

def load_topics(path):
    """Load search topics from an XML file into a DataFrame.

    The root element's direct ``<topic>`` children are read; each one is
    expected to carry an ``<id>`` and a ``<query>`` child element.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the XML topics file.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns ``qid`` and ``query`` (both
        strings). Surrounding whitespace is stripped from both values; the
        query is otherwise left untouched (no lower-casing, no punctuation
        removal). Topics whose id or query is missing or blank are skipped.
        When an id occurs more than once, the last query seen for it wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    # Let the XML parser open the file itself: it reads bytes and honours the
    # encoding given in the XML declaration (UTF-8 when absent) instead of
    # decoding with the platform's default text encoding first.
    root = ET.parse(path).getroot()

    topic_dict = {}
    for topic in root.findall("topic"):
        # findtext() returns None when the child element is absent.
        topic_id = (topic.findtext("id") or "").strip()
        topic_query = (topic.findtext("query") or "").strip()
        # Strip *before* testing so whitespace-only values count as missing.
        if topic_id and topic_query:
            topic_dict[topic_id] = topic_query

    return pd.DataFrame(list(topic_dict.items()), columns=["qid", "query"])


In [3]:
import openai
import time
import pandas as pd
import os

# Credentials come from the environment; a missing variable raises KeyError.
openai.organization = os.environ["OPENAI_ORG"]
openai.api_key = os.environ["OPENAI_API_KEY"]

# Sampling settings that generate_chatgpt_answers() forwards to the chat
# completion request ("number_of_answers" is passed as ``n``).
parameters = dict(
    temperature=0.75,
    max_tokens=512,
    number_of_answers=10,
)

# Prompt templates; "<query>" is the placeholder that gets replaced by the
# topic's query text.
prompts = [
    "<query>",
    "Q: <query>\nA:",
    "Question: <query>\nAnswer:",
    "You are a helpful medical knowledge assistant. Provide useful, complete, and scientifically-grounded answers to common consumer search queries about health.\nQuestion: <query>\nComplete Answer:",
]

# Short names, parallel to `prompts`, used to build the output file names.
promp_identifiers = [
    "no_prompt",
    "q",
    "question",
    "multimedqa",
]



def _build_prompt(query, pre_prompt):
    """Substitute ``query`` for "<query>" in ``pre_prompt``, or append it when there is no placeholder."""
    if "<query>" in pre_prompt:
        return pre_prompt.replace("<query>", query)
    return pre_prompt + " " + query


def generate_chatgpt_answers(topics, max_retries=3, max_rows=100, pre_prompt="<query>",
                             out_file="answers/chatgpt.csv", retry_delay=30):
    """Fill in chat-completion answers for every topic that has none yet.

    Rows whose ``answer_0`` is not the empty string are skipped. For each
    remaining row a single request is sent (model ``gpt-3.5-turbo``, settings
    taken from the module-level ``parameters`` dict) and every returned choice
    is stored in its own column ``answer_0``, ``answer_1``, ...

    Parameters
    ----------
    topics : pandas.DataFrame
        Must contain the columns ``query`` and ``answer_0``. The frame is
        modified in place.
    max_retries : int
        Maximum number of request attempts per row.
    max_rows : int
        Maximum number of rows to send requests for in this call. A row counts
        towards the limit even if all of its attempts fail.
    pre_prompt : str
        Prompt template. If it contains "<query>", that placeholder is replaced
        by the row's query; otherwise the query is appended after a space.
    out_file : str
        CSV path that receives an intermediate snapshot after each
        successfully answered row. The snapshot holds the rows from the start
        of the frame up to and including the current one.
    retry_delay : float
        Seconds to wait between two attempts for the same row.

    Returns
    -------
    pandas.DataFrame
        The same ``topics`` object that was passed in.
    """
    attempted_rows = 0
    # Track the positional offset separately: the index label is only equal to
    # the position for a default RangeIndex, and iloc needs a position.
    for position, (index, row) in enumerate(topics.iterrows()):
        if row['answer_0'] != "":
            continue
        if attempted_rows >= max_rows:
            break
        attempted_rows += 1
        prompt = _build_prompt(row['query'], pre_prompt)

        for attempt in range(1, max_retries + 1):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=parameters["temperature"],
                    max_tokens=parameters["max_tokens"],
                    n=parameters["number_of_answers"],
                )
                # One column per returned choice. A dedicated loop variable is
                # used so the retry counter is not overwritten.
                for answer_number, choice in enumerate(response.choices):
                    topics.at[index, f'answer_{answer_number}'] = choice.message.content
                # Save intermediate answers after each row.
                topics.iloc[:position + 1].to_csv(out_file, index=False)
                break
            except Exception as e:
                print(f"Error on topic: {prompt}. Attempt {attempt}/{max_retries} failed.")
                print(f"Error message: {e}")
                # Only wait when another attempt will actually follow.
                if attempt < max_retries:
                    time.sleep(retry_delay)
        else:
            # No attempt succeeded: make sure the row still reads as
            # unanswered so that a later call picks it up again.
            topics.at[index, 'answer_0'] = ""
    return topics

# Generate answers once per prompt template; each template gets its own CSV.
for prompt_identifier, prompt in zip(promp_identifiers, prompts):
    out_file = f'answers/chatgpt_{prompt_identifier}.csv'
    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before the first write.
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    # Start from a fresh topics frame with an empty first answer column.
    # Note that this overwrites any existing file at `out_file`.
    topics = load_topics(file_path)
    topics['answer_0'] = ""
    topics.to_csv(out_file, index=False)

    # NOTE(review): the frame is written and immediately read back; presumably
    # this mirrors resuming from a previously saved file - confirm.
    all_answers_chatgpt = pd.read_csv(out_file)
    # replace NaN with empty string (empty CSV cells are read back as NaN)
    all_answers_chatgpt = all_answers_chatgpt.fillna("")

    questions_with_answers = generate_chatgpt_answers(all_answers_chatgpt, max_rows=100, pre_prompt=prompt, out_file=out_file)
    questions_with_answers.to_csv(out_file, index=False)
    # merge topics with all_answers_chatgpt
