In [None]:
# import necessary libraries for data loading, tagging and presentation
import os
import time
import json
import pickle
import openai
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from IPython.display import clear_output
# read the OpenAI API key from ../../../secrets.json (resolved against the current working directory)
# and register it with the openai client; the file must hold a JSON object with an 'openai' entry
with open('../../../secrets.json','r') as f:
    secrets=json.loads(f.read())
openai.api_key=secrets['openai']

In [None]:
# Data source for this personal practice project:
# Software-related-Slack-Chats-with-Disentangled-Conversations [https://github.com/preethac/Software-related-Slack-Chats-with-Disentangled-Conversations]
# special thanks to the contributors: Kostadin Damevski and Preetha Chatterjee

# Collect the path of every file under data/raw_data/data/<folder>/<subfolder>/ together with an
# encoded name ("<folder>-<subfolder>-<index>"), so the files can later be packed into one
# uniform dataframe for text concatenation.
raw_data_root='data/raw_data/data'
encoded_names=[]
directories=[]
# os.listdir returns entries in arbitrary order, so every listing is sorted to keep the
# per-subfolder index (and therefore the encoded names) identical from run to run
for folder in sorted(os.listdir(raw_data_root)):
    folder_path=f'{raw_data_root}/{folder}'
    if not os.path.isdir(folder_path):
        continue # a stray file (e.g. a README or .DS_Store) cannot be listed as a directory
    subfolders=sorted(os.listdir(folder_path))
    for subfolder in subfolders:
        subfolder_path=f'{folder_path}/{subfolder}'
        if not os.path.isdir(subfolder_path):
            continue
        files=sorted(os.listdir(subfolder_path))
        for i,f in enumerate(files):
            directories.append(f'{subfolder_path}/{f}')
            encoded_names.append(f'{folder}-{subfolder}-{i}')

In [None]:
# create a dataframe spanned by conversation_id, user, and text, covering every collected file
per_file_frames=[] # one dataframe per file, concatenated once after the loop
for encoded_name, directory in zip(encoded_names, directories):
    with open(directory,'r',encoding='utf8') as f:
        data=f.read()

    # parse the file as XML with BeautifulSoup and pull out every <message> element
    bs_data=BeautifulSoup(data,'xml')
    messages=bs_data.find_all('message')

    dataframe={
        'conversation_id':[],
        'user':[],
        'text':[],
    }

    for message in messages:
        # conversation ID = the file's encoded name + the message's conversation_id attribute
        dataframe['conversation_id'].append(encoded_name+'-'+message.get('conversation_id'))
        dataframe['user'].append(message.find('user').text)
        # a message may hold zero, one, or several <text> tags; joining their contents covers
        # all three cases (empty string, the single text, or the concatenation of all of them)
        dataframe['text'].append(''.join(t.text for t in message.find_all('text')))
    per_file_frames.append(pd.DataFrame(dataframe))

# Concatenate once instead of growing the dataframe inside the loop, which re-copies every
# accumulated row on each iteration. The list is reversed so the rows keep the order the
# earlier prepend-style concatenation produced (last file first).
DataFrame=pd.concat(per_file_frames[::-1]) if per_file_frames else pd.DataFrame()

In [None]:
# build one text per conversation by joining all messages that share a conversation ID,
# each rendered as "user: text" and separated by a blank line
text_data=[]
# a single groupby pass replaces re-filtering the whole dataframe once per conversation ID;
# sort=False yields the groups in order of first appearance, the same order .unique() gives
for _,subset in tqdm(DataFrame.groupby('conversation_id', sort=False)):
    text_data.append("\n\n".join(
        f"{user}: {message_text}" for user,message_text in zip(subset['user'], subset['text'])
    ))

In [None]:
# rearranging and shuffling text_data so that data tagging does not run in sequential order, i.e.
# if something breaks in the loop, the parsed part is not limited to conversations about one technique
# The list is indexed with a random permutation instead of being converted to a numpy array:
# a numpy string array pads every element to the length of the longest conversation and hands
# back np.str_ objects rather than plain str.
shuffled_order=np.random.permutation(len(text_data))
text_data=[text_data[j] for j in shuffled_order]
with open('data/text_data/text_data.pickle','wb') as f:
    pickle.dump(text_data, f)

In [None]:
# restart point: from now on the raw text data can be loaded straight from the saved pickle
# (pickle can execute code while loading, so only open pickle files you produced yourself)
with open('data/text_data/text_data.pickle', mode='rb') as f:
    text_data = pickle.load(f)

In [None]:
# prompt skeleton: fixed text pieces that get concatenated into the prompts sent to the model

# opening line, introduces the snippet as a conversation regarding technical questions
# NOTE(review): "follow" presumably should read "following"; kept unchanged because it is prompt text
base_prompt="The follow text snippet is a conversation between multiple individuals asking some technical questions:\n"

# identifies the question raised in the conversation ({question} is left empty to ask for it)
question_plugin="\nBased on this conversation, the question that was raised was: {question}"

# summarizes the raised question into 3 keywords
question_summary_plugin="\nUse three top keywords to summarize this question (just give me the words separated by comma): "

# gets the answer ({answer} is left empty to ask for it)
answer_plugin="\nThe answer to the question was: {answer}"

# summarizes the proposed answer into 3 keywords
answer_summary_plugin="\nUse three top keywords to summarize the answer (just give me the words separated by comma): "

# asks who asked the question
question_user_plugin="\nThe person or people who asked the question was/were (just give me his/her/their name(s) separated by comma): {asker}"

# asks who answered the question
answer_user_plugin="\nThe person or people who answered the question was/were (just give me his/her/their name(s) separated by comma): {answerer}"

# asks for a rating of the answer
# NOTE(review): "raise" presumably should read "rate"; kept unchanged because it is prompt text
answer_rating_plugin="\nHow would you raise this answer on a scale from 0-10 (just give me the score): "

In [None]:
# price estimation
def priceEstimator(prompt, response, unit_price=0.002, discount=1.2):
    """Estimate the price of one prompt/response exchange.

    The token count of each text is approximated by its number of
    whitespace-separated words.

    Parameters:
        prompt: text sent to the model.
        response: text returned by the model.
        unit_price: price per 1k tokens; the default is the gpt-3.5-turbo price.
        discount: multiplier applied to the word counts; despite its name the
            default of 1.2 raises the estimate, to counter for potential price
            overflow due to underestimation.

    Returns:
        The estimated price of the prompt plus the estimated price of the response.
    """
    def _text_price(text):
        # word count, scaled by the multiplier, priced per 1000 tokens
        return len(text.split())*discount*unit_price/1000

    return _text_price(prompt)+_text_price(response)

In [None]:
# send a prompt to GPT with a completion-token budget and a very low temperature, to keep the answers close to deterministic
def send2GPT(prompt, maxlen=1500, context_window=4097):
    """Send `prompt` through getResponse and return the reply with surrounding whitespace removed.

    Parameters:
        prompt: full prompt text.
        maxlen: upper bound on the number of completion tokens requested.
        context_window: token budget shared by prompt and completion
            (defaults to 4097, the value previously hard-coded here).

    Returns:
        The stripped reply text.

    Raises:
        ValueError: if the estimated prompt size leaves no room for a completion,
            so that no request with a zero or negative max_tokens is sent.
    """
    # the prompt size is estimated from its whitespace-separated words; 1.5X is applied to
    # counter for the potential gap between this estimate and the tokens counted by OpenAI
    estimated_prompt_tokens=int(np.floor(len(prompt.split())*1.5))
    estimated_token=min(context_window-estimated_prompt_tokens, maxlen)
    if estimated_token < 1:
        raise ValueError(
            f'prompt too long: an estimated {estimated_prompt_tokens} tokens leaves no room '
            f'for a completion within {context_window} tokens'
        )
    return getResponse(
        prompt,
        estimated_token,
        0.01
    ).strip()

In [None]:
# get a response by calling the OpenAI ChatCompletion API
def getResponse(prompt, max_tokens, temperature=0.2):
    """Send `prompt` as a single user message to gpt-3.5-turbo and return the reply text.

    Parameters:
        prompt: content of the user message.
        max_tokens: value passed through as the request's max_tokens.
        temperature: value passed through as the request's temperature.

    Returns:
        The message content of the first choice, stripped of surrounding whitespace.
    """
    # chat completion takes the conversation as an array of role/content entries
    chat_history=[{"role": "user", "content": prompt}]
    request=dict(
        model='gpt-3.5-turbo',
        messages=chat_history,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
    )
    completion=openai.ChatCompletion.create(**request)
    first_choice=completion.choices[0]
    return first_choice['message']['content'].strip() # the output message

In [None]:
# calling OpenAI for data distillation: for every conversation, ask eight questions
# (question, asker, question summary, answer, answerer, answer summary, resolved?, rating)
qa_data={
    'question':[],
    'asker':[],
    'question_summary':[],
    'answer':[],
    'answerer':[],
    'answer_summary':[],
    'was_resolved':[],
    'answer_rating':[],
    'conversation':[],
}
total_pricing=0 # total estimated price so far
i=0
current_unixtime=int(datetime.now().timestamp()) # used in the name of this run's CSV file


def _askGPT(label, prompt):
    """Log `prompt`, send it with send2GPT, log the reply; return (reply, estimated price)."""
    print(f'[DEBUG] {label} prompt: {prompt}')
    reply=send2GPT(prompt)
    print(f'[DEBUG] {label} identified: {reply}')
    print()
    return reply, priceEstimator(prompt, reply)


# default limits are $50 (estimated) and 15,000 conversations to begin with; the loop also stops
# once text_data is exhausted, since text_data[i] would otherwise raise an IndexError
while total_pricing < 50 and i < 15000 and i < len(text_data):
    print(f'[INFO] currently processing item: {i+1}')
    conversation="\n```\n"+text_data[i]+"\n```\n"
    pricing=0

    # what is the question?
    question,cost=_askGPT('question', base_prompt+conversation+question_plugin.format(
        question=''
    ))
    pricing+=cost

    # every following prompt starts with the conversation plus the identified question
    question_context=base_prompt+conversation+question_plugin.format(
        question=question
    )

    # who asked the question?
    asker,cost=_askGPT('asker', question_context+'\n'+question_user_plugin.format(
        asker=''
    ))
    pricing+=cost

    # question summary
    question_summary,cost=_askGPT('question summary', question_context+'\n'+question_summary_plugin)
    pricing+=cost

    # what is the answer?
    answer,cost=_askGPT('answer', question_context+'\n'+answer_plugin.format(answer=''))
    pricing+=cost

    # who answered the question?
    answerer,cost=_askGPT('answerer', question_context+'\n'+answer_user_plugin.format(
        answerer=''
    ))
    pricing+=cost

    # the remaining prompts additionally carry the identified answer
    answer_context=question_context+'\n'+answer_plugin.format(
        answer=answer
    )

    # answer summary
    answer_summary,cost=_askGPT('answer summary', answer_context+'\n'+answer_summary_plugin)
    pricing+=cost

    # was resolved?
    was_resolved,cost=_askGPT(
        'was resolved',
        answer_context+'\n'+"Was the question/issue resolved (only answer Yes or No, and answer No if the answer is vague or unclear):"
    )
    pricing+=cost

    # answer rating
    answer_rating,cost=_askGPT('answer rate', answer_context+'\n'+answer_rating_plugin)
    pricing+=cost

    # aggregating price information
    total_pricing+=pricing
    print(f'[INFO] getting this data cost: {pricing}')
    print(f'[INFO] current total cost: {total_pricing}')

    # All fields are appended together only after every request above has succeeded: if any
    # request failed midway, appending as we go would leave the lists with different lengths
    # and the dataframe could not be built from them.
    print('[INFO] appending data to dataframe')
    qa_data['question'].append(question)
    qa_data['asker'].append(asker)
    qa_data['question_summary'].append(question_summary)
    qa_data['answer'].append(answer)
    qa_data['answerer'].append(answerer)
    qa_data['answer_summary'].append(answer_summary)
    qa_data['was_resolved'].append(was_resolved)
    qa_data['answer_rating'].append(answer_rating)
    qa_data['conversation'].append(text_data[i])
    i+=1 # move to the next text to parse
    print('[INFO] data has been appended to the dataframe')

    # wait to prevent being blocked: a random wait of at least 30s whose upper bound is itself
    # random and may grow as more items have been processed
    wait_time=np.random.randint(30,np.random.randint(60, min(120, 61+i//100)))
    print(f'[DEBUG] pending with wait: {wait_time}s')
    time.sleep(wait_time)

    # save checkpoints, with parsed data, and index of raw text parsed so far
    print('[INFO] saving data checkpoints')
    with open('data/checkpoints/chkpt.pickle','wb') as f:
        pickle.dump(
            {
                'qa_data':qa_data,
                'idx':i
            },f
        )
    pd.DataFrame(qa_data).to_csv(f'data/distilled_data/qa_{current_unixtime}.csv',index=False)
    print('[INFO] checkpoint saved')

    print('----')
    print()
    time.sleep(6) # wait for 6 seconds then clear out the screen
    clear_output()