In [1]:
import json
import openai
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [6]:
# Load the credentials mapping from ../secrets.json (path is relative to the
# kernel's working directory).
with open('../secrets.json', mode='r') as secrets_file:
    secrets = json.load(secrets_file)

In [None]:
# Configure the OpenAI client with the key stored under 'openai' in the
# `secrets` mapping loaded from ../secrets.json. (The mapping is named
# `secrets`; referring to `secret` raised NameError on a fresh kernel.)
openai.api_key = secrets['openai']

In [None]:
# Prompt fragments. A full prompt is assembled by concatenating `base_prompt`,
# the fenced conversation text, and one or more of the "plugin" fragments.
# Fragments containing a {placeholder} are filled with str.format; passing an
# empty string leaves the sentence open-ended so the model completes it.

# Opening line that introduces the conversation snippet.
base_prompt="""The follow text snippet is a conversation between multiple individuals asking some technical questions:\n"""

# States (or, with question='', asks the model to state) the question raised.
question_plugin="""
Based on this conversation, the question that was raised was: {question}"""

# Asks for a three-keyword summary of the question.
question_summary_plugin="""
Use three top keywords to summarize this question (just give me the words separated by comma): """

# States (or, with answer='', asks the model to state) the answer given.
answer_plugin="""
The answer to the question was: {answer}"""

# Asks for a three-keyword summary of the answer.
answer_summary_plugin="""
Use three top keywords to summarize the answer (just give me the words separated by comma): """

# Asks who raised the question.
question_user_plugin="""
The person or people who asked the question was/were (just give me his/her/their name(s) separated by comma): {asker}"""

# Asks who answered the question.
answer_user_plugin="""
The person or people who answered the question was/were (just give me his/her/their name(s) separated by comma): {answerer}"""

# Asks for a 0-10 quality score. The wording previously said "raise this
# answer", which does not ask for a rating at all.
answer_rating_plugin="""
How would you rate this answer on a scale from 0-10 (just give me the score): """

In [None]:
def getResponse(prompt, max_tokens, temperature=0.2):
    """Request a single completion for `prompt` and return its raw text.

    Args:
        prompt: Text sent as the request's `prompt`.
        max_tokens: Value forwarded as the request's `max_tokens`.
        temperature: Value forwarded as the request's `temperature`.

    Returns:
        The `text` attribute of the first choice in the response, unmodified
        (no whitespace trimming happens here).
    """
    request_args = dict(
        engine='text-davinci-003',
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,          # only one choice is requested, so choices[0] is the reply
        stop=None,
        temperature=temperature,
    )
    first_choice = openai.Completion.create(**request_args).choices[0]
    return first_choice.text

In [None]:
def send2GPT(prompt, maxlen=1600):
    """Send `prompt` to the model at temperature 0.01 and return the trimmed reply.

    Args:
        prompt: Full prompt text.
        maxlen: Compared against the prompt's whitespace-separated word count;
            the larger of the two is passed to getResponse as `max_tokens`.

    Returns:
        The response text with leading and trailing whitespace removed.
    """
    # NOTE(review): despite its name, `maxlen` acts as a floor, not a cap --
    # a prompt with more words than `maxlen` raises the budget further.
    # Confirm this is intended.
    word_count = len(prompt.split())
    token_budget = maxlen if maxlen > word_count else word_count
    raw_reply = getResponse(prompt, token_budget, 0.01)
    return raw_reply.strip()

In [None]:
def priceEstimator(prompt, response, unit_price=0.02, discount=0.75):
    """Estimate the cost of one prompt/response exchange from word counts.

    Each text is split on whitespace; its word count is multiplied by
    `discount` and `unit_price` and divided by 1000. The two amounts are
    added together.

    Args:
        prompt: Prompt text.
        response: Response text.
        unit_price: Price factor applied per 1000 (scaled) words --
            presumably the per-1K-token price; TODO confirm.
        discount: Multiplier applied to each word count.

    Returns:
        The summed estimate for prompt and response.
    """
    def _text_price(text):
        # Same left-to-right arithmetic for both texts.
        return len(text.split())*discount*unit_price/1000

    return _text_price(prompt)+_text_price(response)

In [None]:
# Read the whole XML export into memory as UTF-8 text.
with open('data/pythondev.xml', mode='r', encoding='utf8') as xml_file:
    data = xml_file.read()

# Parse it with BeautifulSoup's 'xml' parser and collect every <message> element.
bs_data = BeautifulSoup(data, 'xml')
messages = bs_data.find_all('message')

In [None]:
# Flatten the parsed <message> elements into one row per message with the
# columns conversation_id, user and text.
message_columns = {
    'conversation_id': [],
    'user': [],
    'text': [],
}

for message in messages:
    text = message.find_all('text')
    # Validate BEFORE appending anything: if parsing stops here, all three
    # columns must still have the same length, otherwise pd.DataFrame raises
    # "All arrays must be of the same length".
    if len(text) != 1:
        print(f'parsing has failed, expect one text message, instead got {len(text)}')
        break
    message_columns['conversation_id'].append(message.get('conversation_id'))
    message_columns['user'].append(message.find('user').text)
    message_columns['text'].append(text[0].text)

# `dataframe` is only ever bound to the DataFrame (not first to a dict), and
# it holds every message parsed before any failure.
dataframe = pd.DataFrame(message_columns)

In [None]:
# Build one transcript string per conversation, in order of first appearance:
# each message becomes "<user>: <text>" and messages are separated by a blank line.
text_data = []

for conversation_id in dataframe['conversation_id'].unique():
    in_conversation = dataframe[dataframe['conversation_id'] == conversation_id]
    lines = [
        f"{user}: {body}"
        for user, body in zip(in_conversation['user'], in_conversation['text'])
    ]
    text_data.append("\n\n".join(lines))

In [None]:
# Upper bound on how many conversations are sent to the model in one run.
MAX_CONVERSATIONS = 30


def _extractQA(conversation_text):
    """Run the eight extraction prompts for one conversation.

    The prompts are sent in this order: question, asker, question summary,
    answer, answerer, answer summary, was-resolved, answer rating. Later
    prompts embed the question (and answer) returned by earlier ones.

    Args:
        conversation_text: Transcript of a single conversation.

    Returns:
        A `(record, cost)` pair. `record` maps each `qa_data` column name to
        the model's reply; `cost` is the sum of `priceEstimator` over the
        eight calls.
    """
    record = {}
    costs = []

    def ask(field, prompt):
        # One model call: store the reply under `field` and note its estimated cost.
        reply = send2GPT(prompt)
        record[field] = reply
        costs.append(priceEstimator(prompt, reply))
        return reply

    # The conversation is wrapped in a ``` fence right after the base prompt.
    context = base_prompt + "\n```\n" + conversation_text + "\n```\n"

    # what is the question?
    question = ask('question', context + question_plugin.format(question=''))

    # Shared prefix for every follow-up that needs the extracted question.
    with_question = context + question_plugin.format(question=question) + '\n'

    # who asked the question?
    ask('asker', with_question + question_user_plugin.format(asker=''))
    # question summary
    ask('question_summary', with_question + question_summary_plugin)
    # what is the answer?
    answer = ask('answer', with_question + answer_plugin.format(answer=''))
    # who answered the question?
    ask('answerer', with_question + answer_user_plugin.format(answerer=''))

    # Shared prefix for every follow-up that also needs the extracted answer.
    with_answer = with_question + answer_plugin.format(answer=answer) + '\n'

    # answer summary
    ask('answer_summary', with_answer + answer_summary_plugin)
    # was resolved?
    ask('was_resolved', with_answer + "Was the question/issue resolved (only answer Yes or No):")
    # answer rating
    ask('answer_rating', with_answer + answer_rating_plugin)

    return record, sum(costs)


qa_data = {
    'question': [],
    'asker': [],
    'question_summary': [],
    'answer': [],
    'answerer': [],
    'answer_summary': [],
    'was_resolved': [],
    'answer_rating': [],
}
total_pricing = 0

# Clamp to the number of parsed conversations so a small export does not
# raise IndexError on text_data[i].
for i in range(min(MAX_CONVERSATIONS, len(text_data))):
    print(f'currently processing item: {i+1}')
    record, pricing = _extractQA(text_data[i])

    # Append only after all eight calls succeeded, so an exception part-way
    # through a conversation cannot leave the columns with unequal lengths.
    for field, value in record.items():
        qa_data[field].append(value)

    total_pricing += pricing
    print(f'getting this data cost: {pricing}')
    print(f'current total cost: {total_pricing}')
    print('----')
    print()
    

In [None]:
# Turn the per-field reply lists in qa_data into a table with one column per field.
qa_df = pd.DataFrame(data=qa_data)

In [None]:
# Bare expression: the notebook renders qa_df as this cell's output.
qa_df