In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import time


In [2]:
import os

path_dataset = '../Dataset'

path_so = os.path.join(path_dataset, 'Stack Overflow')
path_ts = os.path.join(path_dataset, 'Tool-specific Others')
path_labeling = os.path.join(path_dataset, 'Labeling')  

path_so_raw = os.path.join(path_so, 'Raw')
path_ts_raw = os.path.join(path_ts, 'Raw')
path_so_filtered = os.path.join(path_so, 'Filtered')
path_ts_filtered = os.path.join(path_ts, 'Filtered')

if not os.path.exists(path_dataset):
    os.makedirs(path_dataset)

if not os.path.isdir(path_so):
    os.mkdir(path_so)

if not os.path.isdir(path_ts):
    os.mkdir(path_ts)

if not os.path.isdir(path_labeling):
    os.mkdir(path_labeling)

if not os.path.isdir(path_so_raw):
    os.mkdir(path_so_raw)

if not os.path.isdir(path_ts_raw):
    os.mkdir(path_ts_raw)

if not os.path.isdir(path_so_filtered):
    os.mkdir(path_so_filtered)

if not os.path.isdir(path_ts_filtered):
    os.mkdir(path_ts_filtered)


In [4]:
tool2tag = {
    'Amazon SageMaker': {'amazon-sagemaker', 'amazon-sagemaker-experiments', 'amazon-sagemaker-studio'},
    'Azure Machine Learning': {'azure-machine-learning-service', 'azure-machine-learning-studio', 'azure-machine-learning-workbench'},
    'ClearML': {'clearml'},
    'Comet': {'comet-ml'},
    'DVC': {'dvc'},
    'Kedro': {'kedro'},
    'MLflow': {'mlflow'},
    'MLRun': {'mlrun'},
    'Neptune': {'neptune'},
    'Optuna': {'optuna'},
    'Sacred': {'python-sacred'},
    'Vertex AI': {'google-cloud-vertex-ai'},
    'Weights & Biases': {'wandb'}
}

tools_keywords = {
    'Amazon SageMaker': ['amazon sagemaker', 'aws sagemaker', 'sagemaker'],
    'Azure Machine Learning': ['microsoft azure machine learning', 'azure machine learning', 'microsoft azure ml', 'microsoft azureml', 'azure ml', 'azureml'],
    'ClearML': ['clearml'],
    'Comet': ['comet'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Vertex AI': ['google vertex ai', 'vertex ai'],
    'Weights & Biases': ['weights & biases', 'weights and biases', 'wandb']
}

ignore_tools = {

}


In [5]:
import openai

openai.api_key = os.getenv('OPENAI_API_KEY')

In [2]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [45]:
import requests
import json

# function to scrape the posts from the tool-specific discussion fora


def scrape_post(base_url, page_suffix, file_name):
    page = -1
    post_list = []

    while True:
        page = page + 1
        page_url = base_url + page_suffix + str(page)
        topic_list = requests.get(page_url).json()['topic_list']

        for topic in topic_list['topics']:
            post_url = base_url + 't/' + \
                topic['slug'] + '/' + str(topic['id'])

            post = {}
            post['Question_title'] = topic['title']
            post['Question_link'] = post_url
            post['Question_creation_time'] = topic['created_at']
            post['Question_answer_count'] = topic['posts_count'] - 1
            post['Question_score'] = topic['like_count']
            post['Question_view_count'] = topic['views']
            post['Question_has_accepted_answer'] = topic['has_accepted_answer']
            comments = requests.get(
                post_url + '.json').json()['post_stream']['posts']
            post['Question_body'] = comments[0]['cooked']
            
            answer_list = []
            for comment in comments[1:]:
                answer = {}
                answer['Answer_creation_time'] = comment['created_at']
                answer['Answer_body'] = comment['cooked']
                answer['Answer_score'] = comment['score']
                answer['Answer_has_accepted'] = comment['accepted_answer']
                answer_list.append(answer)                
            post['Answer_list'] = answer_list
            
            post_list.append(post)
            time.sleep(5)

        if 'more_topics_url' not in topic_list.keys():
            break

    with open(os.path.join(path_ts_raw, file_name), 'w') as outfile:
        json_post_list = json.dumps(post_list, indent='\t')
        outfile.write(json_post_list)


In [40]:
# scrape posts from Guild AI
base_url = 'https://my.guild.ai/'
page_suffix = 'c/troubleshooting/6.json?page='
file_name = 'Guild AI.json'
scrape_post(base_url, page_suffix, file_name)

In [49]:
# scrape posts from Weights & Biases
base_url = 'https://community.wandb.ai/'
page_suffix = 'c/w-b-support/36.json?page='
file_name = 'Weights & Biases.json'
scrape_post(base_url, page_suffix, file_name)

In [41]:
# scrape posts from SigOpt
base_url = 'https://community.sigopt.com/'
page_suffix = 'c/general-discussion/9.json?page='
file_name = 'SigOpt.json'
scrape_post(base_url, page_suffix, file_name)

In [42]:
# scrape posts from DVC
base_url = 'https://discuss.dvc.org/'
page_suffix = 'c/questions/9.json?page='
file_name = 'DVC.json'
scrape_post(base_url, page_suffix, file_name)

In [55]:
import glob

# exclude tool-specific posts with negative upvote count
df_ts_questions = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_ts_raw, '*.json')):
    repos = pd.read_json(file_name)
    if 'Question_score' in repos.columns:
        repos = repos[repos['Question_score'] > -1]
    repos['Tool'] = os.path.split(file_name)[1].split('.')[0]
    df_ts_questions = pd.concat([df_ts_questions, repos], ignore_index=True)

df_ts_questions.to_json(os.path.join(path_ts_filtered,
                                     'questions.json'), orient='records', indent=4)

# keep only posts with at least one answer
df_ts_answers = df_ts_questions[df_ts_questions['Question_answer_count'] > 0]

for tool in df_ts_answers['Tool'].unique().tolist():
    number_accepted_answer = df_ts_answers[df_ts_answers['Tool']
                                           == tool]['Question_has_accepted_answer'].sum()
    if number_accepted_answer > 0:
        df_ts_answers = df_ts_answers.drop(df_ts_answers[(df_ts_answers['Tool'] == tool) & (
            df_ts_answers['Question_has_accepted_answer'] == False)].index)

In [56]:
df_ts_question_summary = df_ts_questions.groupby(
    'Tool').count()['Question_title'].reset_index()
df_ts_answer_summary = df_ts_answers.groupby(
    'Tool').count()['Question_title'].reset_index()

df_ts_question_summary.columns = ['Tool', '#Question']
df_ts_answer_summary.columns = ['Tool', '#Answered']

df_summary = pd.merge(df_ts_question_summary, df_ts_answer_summary, on='Tool')
df_summary


Unnamed: 0,Tool,#Question,#Answered
0,Amazon SageMaker,528,167
1,Azure Machine Learning,1435,343
2,DVC,348,330
3,Domino,13,4
4,Guild AI,118,109
5,MLFlow,280,143
6,Polyaxon,43,34
7,SigOpt,15,7
8,Vertex AI,297,32
9,Weights & Biases,735,117


In [4]:
df = pd.read_json(os.path.join(
    path_so_raw, 'bq-results-20230321-204446-1679431620160.json'), lines=True)
df['Question_tags'] = df['Question_tags'].str.split('|')
df.head()

Unnamed: 0,Question_id,Question_title,Question_body,Question_answer_count,Question_comment_count,Question_creation_time,Question_favorite_count,Question_last_edit_time,Question_score,Question_tags,Question_view_count,Answer_body,Answer_comment_count,Answer_creation_time,Answer_last_edit_time,Answer_score
0,3846656,How does the 3 way merge in Mercurial/Meld work?,<p>I'm working on a project where I have a com...,2,0,2010-10-02 16:52:51.763000+00:00,11.0,2012-11-02 14:42:02.530000+00:00,33,"[mercurial, merge, dvcs, 3-way-merge]",15819,<p><strong>Local</strong> is r133</p>\n\n<p><s...,1.0,2010-10-02 16:58:23.490000+00:00,2010-10-02 20:30:59.770000+00:00,35.0
1,1745000,Can I clone part of a Mercurial repository?,<p>Is it possible to clone part of a Mercurial...,8,2,2009-11-16 21:34:33.500000+00:00,11.0,2012-01-20 08:31:22.290000+00:00,45,"[mercurial, dvcs]",18159,"<p>Yes you can. I'm sure you've moved on, but...",3.0,2011-02-14 20:13:26.563000+00:00,2017-05-31 11:39:12+00:00,37.0
2,12070220,Mercurial - cannot commit merge with missing f...,<p>I have done a 'hg merge' however when I att...,3,0,2012-08-22 09:33:57.333000+00:00,2.0,NaT,35,"[mercurial, dvcs]",17364,<p>Try <code>hg status</code> and look for fil...,2.0,2012-08-22 10:19:13.800000+00:00,2012-09-21 12:28:37.273000+00:00,56.0
3,3981043,How to abort a merge in mercurial?,<p>I goofed up a merge. I'd like to revert th...,3,1,2010-10-20 18:24:00.770000+00:00,11.0,2017-07-13 16:44:14.087000+00:00,90,"[version-control, mercurial, merge, dvcs, undo]",31732,<p><code>hg update -C &lt;one of the two merge...,7.0,2010-10-20 18:28:03.953000+00:00,NaT,113.0
4,15056327,How do I synchronise two remote Git repositories?,"<p>I have two repository urls, and I want to s...",6,0,2013-02-24 20:33:38.440000+00:00,11.0,2013-02-24 21:08:23.193000+00:00,27,"[git, version-control, github, dvcs]",58999,"<p>Git branches do not have ""heads"" in the Mer...",9.0,2013-02-24 22:33:15.687000+00:00,2013-02-25 02:21:02.693000+00:00,30.0


In [5]:
# create tag collection
tags = set()
for key, value in tool2tag.items():
    tags = tags.union(value)

In [6]:
# split tags
df['Question_valid_tags'] = [[] for _ in range(len(df))]
for index, row in df.iterrows():
    df.at[index, 'Question_valid_tags'] = list(
        tags.intersection(set(row['Question_tags'])))


In [7]:
# count post number with different tags
arity = 0
while True:
    post_number = df[df['Question_valid_tags'].map(len) > arity].shape[0]
    if post_number < 1:
        break
    arity = arity + 1
    print(f'Posts with at least {arity} tags has {post_number} in total.')


Posts with at least 1 tags has 5308 in total.
Posts with at least 2 tags has 220 in total.
Posts with at least 3 tags has 18 in total.


In [8]:
# exclude Stack Overflow posts with unrelated tags
df_valid = df[df['Question_valid_tags'].map(len) > 0]
df_valid['Question_link'] = df_valid['Question_id'].apply(
    lambda x: f'https://stackoverflow.com/questions/{x}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['Question_link'] = df_valid['Question_id'].apply(


In [9]:
# exclude Stack Overflow posts with negative upvote count
df_qualified = df_valid[df_valid['Question_score'] > -1]

In [10]:
# create a map from tag to tool
tag2tool = dict()
for key, value in tool2tag.items():
    for elem in value:
        tag2tool.setdefault(elem, key)

In [11]:
# extract Stack Overflow post collection with multiple tags based on the tool map
for index, row in df_qualified.iterrows():
    tags = set()
    for tag in row['Question_valid_tags']:
        tags.add(tag2tool[tag])
    df_qualified.at[index, 'Question_valid_tags'] = sorted(list(tags))

In [12]:
# check how the posts with more than one tags look like
df_multiply_tagged = df_qualified[df_qualified['Question_valid_tags'].map(
    len) > 1]
df_multiply_tagged['Question_valid_tags'].value_counts()

[Amazon SageMaker, MLflow]                 16
[Azure Machine Learning, MLflow]           11
[Kedro, MLflow]                             4
[Azure Machine Learning, Kedro, MLflow]     2
[MLflow, Sacred]                            1
[DVC, MLflow]                               1
[Kedro, Neptune]                            1
Name: Question_valid_tags, dtype: int64

In [13]:
# create Stack Overflow post collection with exclusive tags
multiply_tagged_posts_split = []
df_qualified.assign(Tool='')

for index, row in df_qualified.iterrows():
    tags = row['Question_valid_tags']
    df_qualified.at[index, 'Tool'] = tags[0]
    if len(tags) > 1:
        for tag in tags[1:]:
            series = row.copy()
            series['Tool'] = tag
            multiply_tagged_posts_split.append(series)

df_multiply_tagged_posts_split = pd.DataFrame(multiply_tagged_posts_split)
df_qualified_exclusive_tagged = pd.concat(
    [df_qualified, df_multiply_tagged_posts_split], ignore_index=True)
del df_qualified_exclusive_tagged['Question_valid_tags']

# keep Stack Overflow posts with accepted answers
df_qualified_exclusive_tagged_completed = df_qualified_exclusive_tagged.dropna(
    subset=['Answer_body'])

df_qualified_exclusive_tagged.to_json(os.path.join(
    path_so_filtered, 'questions.json'), indent=4, orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_qualified.at[index, 'Tool'] = tags[0]


In [14]:
df_so_question_summary = df_qualified_exclusive_tagged.groupby(
    'Tool').count()['Question_id'].reset_index()
df_so_answer_summary = df_qualified_exclusive_tagged_completed.groupby(
    'Tool').count()['Question_id'].reset_index()

df_so_question_summary.columns = ['Tool', '#Question']
df_so_answer_summary.columns = ['Tool', '#Answered']

df_summary = pd.merge(df_so_question_summary, df_so_answer_summary, on='Tool')
df_summary


Unnamed: 0,Tool,#Question,#Answered
0,Amazon SageMaker,2248,739
1,Azure Machine Learning,1546,594
2,ClearML,40,20
3,Comet,10,4
4,DVC,91,49
5,Kedro,149,60
6,MLflow,552,129
7,Neptune,8,3
8,Optuna,141,37
9,Sacred,10,7


In [15]:
# create question dataset

df_question_so = pd.read_json(os.path.join(path_so_filtered, 'questions.json'))
df_question_ts = pd.read_json(os.path.join(path_ts_filtered, 'questions.json'))

df_question_so['Platform'] = 'Stack Overflow'
df_question_ts['Platform'] = 'Tool-specific'

df_questions = pd.concat([df_question_so, df_question_ts], ignore_index=True)
del df_questions['Question_tags']

df_questions.to_json(os.path.join(
    path_labeling, 'original.json'), indent=4, orient='records')

In [20]:
# add potential field to questions for later filling
df_questions = pd.read_json(os.path.join(path_labeling, 'original.json'))

df_questions['Question_original_content'] = ''
df_questions['Question_gpt_summary_original'] = ''
df_questions['Question_gpt_summary'] = ''
df_questions['Question_preprocessed_content'] = ''
df_questions['Answer_original_content'] = ''
df_questions['Answer_gpt_summary_original'] = ''
df_questions['Answer_gpt_summary'] = ''
df_questions['Answer_preprocessed_content'] = ''

df_questions.to_json(os.path.join(path_labeling, 'topic_modeling.json'), indent=4, orient='records')

In [23]:
# # add potential field to questions for later filling
# df_questions = pd.read_json(os.path.join(path_labeling, 'original.json'))
# df_optuna = pd.read_json(os.path.join(path_labeling, 'optuna.json'))
# df_previous = pd.read_json(os.path.join(path_labeling, 'topic_modeling.json'))


# df_questions['Question_original_content'] = ''
# df_questions['Question_original_content_gpt_summary'] = ''
# df_questions['Question_preprocessed_content'] = ''
# df_questions['Answer_original_content'] = ''
# df_questions['Answer_original_content_gpt_summary'] = ''
# df_questions['Answer_preprocessed_content'] = ''


# for index, row in df_questions.iterrows():
#     for index_2, row_2 in df_previous.iterrows():
#         if row['Question_link'] == row_2['Question_link']:
#             df_questions.at[index, 'Question_original_content_gpt_summary'] = row_2['Question_original_content_gpt_summary']
#             df_questions.at[index, 'Answer_original_content_gpt_summary'] = row_2['Answer_original_content_gpt_summary']
#             break

# # # Experiment 1: feed the original content to BerTopic
# # df_questions['Question_original_content'] = ''

# # # Experiment 2: feed the original content to GPT model and get the generated summary, then feed the summary to BerTopic
# # df_questions['Question_original_content_gpt_summary'] = ''

# # # Experiment 3: feed the preprocessed content to BerTopic
# # df_questions['Question_preprocessed_content'] = ''

# # # Experiment 4: feed the original content to BerTopic
# # df_questions['Answer_original_content'] = ''

# # # Experiment 5: feed the original content to GPT model and get the generated summary, then feed the summary to BerTopic
# # df_questions['Answer_original_content_gpt_summary'] = ''

# # # Experiment 6: feed the preprocessed content to BerTopic
# # df_questions['Answer_preprocessed_content'] = ''

# df_questions.to_json(os.path.join(path_labeling, 'topic_modeling_new.json'), indent=4, orient='records')

In [6]:
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from bs4 import BeautifulSoup
import re

# content filtering patterns
regex = r"(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|((https?:\/)?\/[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+-[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]<>]+)"


def preprocess_text(text, remove_code=False):          
    soup = BeautifulSoup(text, 'html.parser')
    remove_tags = ['script', 'style']
    remove_tags.append('code') if remove_code else None
    for tag in soup(remove_tags):
        tag.decompose()
    text = soup.get_text()
    
    text = text.lower().encode('ascii', errors='ignore').decode('ascii')
    for tool_keywords in tools_keywords.values():
        for tool_keyword in tool_keywords:
            if tool_keyword in text:
                text = text.replace(tool_keyword, '')
    
    text = re.sub(regex, ' ', text, 0, re.DOTALL) if remove_code else text
    text = preprocess_string(text)
    text = remove_stopwords(' '.join(text))
    return text


In [7]:
# create prompt for gpt model
prompt_question = 'Please write a one-sentence summary of the user\'s encountered challenges. For instance, you could begin with a sentence such as: "The user XXXXXX."\n###'
prompt_answer = 'Below is a question-answer pair. Given the context of the question, extract any possible solutions (if any) from the answer and make a high-level, human-readable, and concise summary of them.\n###'

import random

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0

    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0, 1)
            time.sleep(sleep)
            x += 1

In [14]:
# Experiment 1

df_questions = pd.read_json(os.path.join(
    path_labeling, 'topic_modeling.json'))

for index, row in df_questions.iterrows():
    question = preprocess_text(row['Question_title']) + ' ' + preprocess_text(str(row['Question_body']))

    if len(question.split()) < 6 or len(question) < 30:
        df_questions.drop(index, inplace=True)
        print(question)
    else:
        df_questions.at[index, 'Question_original_content'] = question

df_questions.to_json(os.path.join(path_labeling,
                     'topic_modeling.json'), indent=4, orient='records')

yolo azur support yolov import
endpoint deploi webservic grei 
studio error durind deploi autom 
data result point 
test fail custom speech 
frame datafram work 
abl select jupyt notebook 
releas user releas readi
send slack alert failur slack
support spark 
optim gener discuss board optim
advanc nlu standard nlu 
export googl automl translat model 
 
googl translat javascript api 
text speech googl cloud python 
deploi pretrain fasttext model code
integr wai integr articl resourc
 
 start journei data scienc engin
issu handov protocol facebook dialogflow 
cloud vision text coordin format 
 discuss space talk thing relat
support help differ categori


In [17]:
# # Experiment 2

# df_questions = pd.read_json(os.path.join(
#     path_labeling, 'topic_modeling.json'))

# df_questions['Question_gpt_summary_original'] = df_questions['Question_original_content_gpt_summary']
# df_questions['Question_gpt_summary'] = ''

# df_questions['Answer_gpt_summary_original'] = df_questions['Answer_original_content_gpt_summary']
# df_questions['Answer_gpt_summary'] = ''

# del df_questions['Question_original_content_gpt_summary']
# del df_questions['Answer_original_content_gpt_summary']

# for index, row in df_questions.iterrows():
#     content = row['Question_gpt_summary_original']
#     df_questions.at[index, 'Question_gpt_summary'] = preprocess_text(content)
#     content = row['Answer_gpt_summary_original']
#     df_questions.at[index, 'Answer_gpt_summary'] = preprocess_text(content)

# df_questions.to_json(os.path.join(
#     path_labeling, 'topic_modeling.json'), indent=4, orient='records')


In [None]:
# Experiment 2

df_questions = pd.read_json(os.path.join(
    path_labeling, 'topic_modeling.json'))

for index, row in df_questions.iterrows():
    if index % 50 == 0:
        print(f'persisting on question {index}')
        df_questions.to_json(os.path.join(
            path_labeling, 'topic_modeling.json'), indent=4, orient='records')
        
    if row['Question_gpt_summary_original']:
        continue

    try:
        question = prompt_question + 'Title: ' + row['Question_title'] + ' Body: ' + row['Question_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-3.5-turbo',
            messages=[
                {"role": "system", "content": "You are a helpful summarizer."},
                {"role": "user", "content": question},
            ],
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        content = response['choices'][0]['message']['content'].strip()
        df_questions.at[index, 'Question_gpt_summary_original'] = content
        df_questions.at[index, 'Question_gpt_summary'] = preprocess_text(content)
    except Exception as e:
        # output unsuccesful requests
        print(f'{e} on question {row["Question_link"]}')

    time.sleep(5)

df_questions.to_json(os.path.join(
    path_labeling, 'topic_modeling.json'), indent=4, orient='records')


In [24]:
df_questions = pd.read_json(os.path.join(
    path_labeling, 'topic_modeling.json'))

assert (df_questions.shape[0] == df_questions.dropna(
    subset=['Question_gpt_summary_original']).shape[0])

# output the number of asset-management-related Q&A questions
df_questions.shape[0]

9001

In [25]:
# # Experiment 3

# df_questions = pd.read_json(os.path.join(
#     path_labeling, 'topic_modeling.json'))

# for index, row in df_questions.iterrows():
#     if row['Question_gpt_summary']:
#         continue
#     df_questions.at[index, 'Question_gpt_summary'] = preprocess_text(row['Question_gpt_summary_original'])

# df_questions.to_json(os.path.join(
#     path_labeling, 'topic_modeling.json'), indent=4, orient='records')

In [36]:
# Experiment 3

df_questions = pd.read_json(os.path.join(
    path_labeling, 'topic_modeling.json'))

for index, row in df_questions.iterrows():
    content = preprocess_text(row['Question_title'], remove_code=True) + ' ' + preprocess_text(str(row['Question_body']), remove_code=True)
    df_questions.at[index, 'Question_preprocessed_content'] = content

df_questions.to_json(os.path.join(
    path_labeling, 'topic_modeling.json'), indent=4, orient='records')

In [8]:
# Experiment 4

df_answers = pd.read_json(os.path.join(path_labeling, 'topic_modeling.json'))

for index, row in df_answers.iterrows():
    answer = ''
    if row['Answer_body']:
        answer = row['Answer_body']
    elif row['Answer_list']:
        if row['Question_has_accepted_answer']:
            if 'Answer_has_accepted' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    if comment['Answer_has_accepted']:
                        answer = comment['Answer_body']
                        break
        elif 'Answer_body' in row['Answer_list'][0]:
            for comment in row['Answer_list']:
                answer += comment['Answer_body'] + '\n'

    df_answers.at[index, 'Answer_original_content'] = preprocess_text(answer)

df_answers.to_json(os.path.join(path_labeling, 'topic_modeling.json'), indent=4, orient='records')

In [10]:
import openai

openai.api_key = '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'

In [None]:
# Experiment 5

df_answers = pd.read_json(os.path.join(
    path_labeling, 'topic_modeling.json'))

for index, row in df_answers.iterrows():
    if index % 50 == 0:
        print(f'persisting on answer {index}')
        df_answers.to_json(os.path.join(
            path_labeling, 'topic_modeling.json'), indent=4, orient='records')
    
    answer = ''
    if row['Answer_body']:
        answer = row['Answer_body']
    elif row['Answer_list']:
        if row['Question_has_accepted_answer']:
            if 'Answer_has_accepted' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    if comment['Answer_has_accepted']:
                        answer = comment['Answer_body']
                        break
        elif 'Answer_body' in row['Answer_list'][0]:
            for comment in row['Answer_list']:
                answer += comment['Answer_body'] + '\n'

    if not answer or row['Answer_gpt_summary_original']:
        continue

    try:
        answer = prompt_answer + 'Question: ' + row['Question_gpt_summary'] + ' Answer: ' + answer + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-3.5-turbo',
            messages=[
                {"role": "system", "content": "You are a helpful summarizer."},
                {"role": "user", "content": answer},
            ],
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        content = response['choices'][0]['message']['content'].strip()
        df_answers.at[index, 'Answer_gpt_summary_original'] = content
        df_answers.at[index, 'Answer_gpt_summary'] = preprocess_text(content)
    except Exception as e:
        # output unsuccesful requests
        print(f'{e} on answer {row["Question_link"]}')

    time.sleep(1)

df_answers.to_json(os.path.join(
    path_labeling, 'topic_modeling.json'), indent=4, orient='records')


In [12]:
df_questions = pd.read_json(os.path.join(path_labeling, 'topic_modeling.json'))
df_questions = df_questions[df_questions['Answer_original_content'] != '']

assert (df_questions.shape[0] == df_questions.dropna(
    subset=['Answer_gpt_summary_original']).shape[0])

# output the number of asset-management-related Q&A answers
len(df_questions)


4906

In [5]:
# sample size is based on the recommendation from https://www.calculator.net/sample-size-calculator.html

sample_size = 369

df_questions = pd.read_json(os.path.join(
    path_labeling, 'topic_modeling.json'))

df_sample = df_questions[df_questions['Answer_gpt_summary_original'] != ''].sample(n=sample_size, random_state=42)

df_sample.to_json(os.path.join(
    path_labeling, 'sample.json'), indent=4, orient='records')


In [37]:
# Experiment 6

df_answers = pd.read_json(os.path.join(path_labeling, 'topic_modeling.json'))

for index, row in df_answers.iterrows():
    answer = ''
    if row['Answer_body']:
        answer = row['Answer_body']
    elif row['Answer_list']:
        if row['Question_has_accepted_answer']:
            if 'Answer_has_accepted' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    if comment['Answer_has_accepted']:
                        answer = comment['Answer_body']
                        break
        elif 'Answer_body' in row['Answer_list'][0]:
            for comment in row['Answer_list']:
                answer += comment['Answer_body'] + '\n'

    df_answers.at[index, 'Answer_preprocessed_content'] = preprocess_text(answer, remove_code=True)

df_answers.to_json(os.path.join(path_labeling, 'topic_modeling.json'), indent=4, orient='records')