In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

import gensim.corpora as corpora
import pandas as pd
import time
import glob
import json


In [13]:
import os

path_dataset = '../Dataset'

path_so = os.path.join(path_dataset, 'Stack Overflow')
path_ts = os.path.join(path_dataset, 'Tool-specific Others')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_so_raw = os.path.join(path_so, 'Raw')
path_ts_raw = os.path.join(path_ts, 'Raw')
path_so_filtered = os.path.join(path_so, 'Filtered')
path_ts_filtered = os.path.join(path_ts, 'Filtered')

path_labeling_question = os.path.join(path_labeling, 'Question')
path_labeling_answer = os.path.join(path_labeling, 'Answer')
    
if not os.path.exists(path_dataset):
    os.makedirs(path_dataset)

if not os.path.isdir(path_so):
    os.mkdir(path_so)

if not os.path.isdir(path_ts):
    os.mkdir(path_ts)

if not os.path.isdir(path_labeling):
    os.mkdir(path_labeling)

if not os.path.isdir(path_so_raw):
    os.mkdir(path_so_raw)

if not os.path.isdir(path_ts_raw):
    os.mkdir(path_ts_raw)

if not os.path.isdir(path_so_filtered):
    os.mkdir(path_so_filtered)

if not os.path.isdir(path_ts_filtered):
    os.mkdir(path_ts_filtered)

if not os.path.exists(path_labeling_question):
    os.makedirs(path_labeling_question)

if not os.path.exists(path_labeling_answer):
    os.makedirs(path_labeling_answer)


In [6]:
tool2tag = {
    'Amazon SageMaker': {'amazon-sagemaker', 'amazon-sagemaker-experiments', 'amazon-sagemaker-studio'},
    'Azure Machine Learning': {'azure-machine-learning-service', 'azure-machine-learning-studio', 'azure-machine-learning-workbench'},
    'ClearML': {'clearml'},
    'Comet': {'comet-ml'},
    'DVC': {'dvc'},
    'Kedro': {'kedro'},
    'MLflow': {'mlflow'},
    'MLRun': {'mlrun'},
    'Neptune': {'neptune'},
    'Sacred': {'python-sacred'},
    'Vertex AI': {'google-cloud-vertex-ai'},
    'Weights & Biases': {'wandb'}
}

tools_keywords = {
    'Amazon SageMaker': ['amazon sagemaker', 'aws sagemaker', 'sagemaker'],
    'Azure Machine Learning': ['microsoft azure machine learning', 'azure machine learning', 'microsoft azure ml', 'microsoft azureml', 'azure ml', 'azureml'],
    'ClearML': ['clearml'],
    'Comet': ['comet'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'Neptune': ['neptune'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Vertex AI': ['google vertex ai', 'vertex ai'],
    'Weights & Biases': ['weights & biases', 'weights and biases', 'wandb']
}

ignore_tools = {
    
}


In [14]:
import openai

openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
import wandb

wandb_project = 'asset-management-project'
wandb.login()

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
import requests
import json

# function to scrape the posts from the tool-specific discussion fora

def scrape_post(base_url, page_suffix, file_name):
    page = -1
    post_list = []
    
    while True:
        page = page + 1
        page_url = base_url + page_suffix + str(page)
        topic_list = requests.get(page_url).json()['topic_list']

        for topic in topic_list['topics']:
            post_url = base_url + 't/' + \
            topic['slug'] + '/' + str(topic['id'])

            post = {}
            post['Question_title'] = topic['title']
            post['Question_link'] = post_url
            post['Question_creation_time'] = topic['created_at']
            post['Question_answer_count'] = topic['posts_count'] - 1
            post['Question_score'] = topic['like_count']
            post['Question_view_count'] = topic['views']
            post['Question_has_accepted_answer'] = topic['has_accepted_answer']
            comments = requests.get(post_url + '.json').json()['post_stream']['posts']
            post['Question_body'] = comments[0]['cooked']
            post['Answer_list'] = comments[1:]
            post_list.append(post)

            time.sleep(2)

        if 'more_topics_url' not in topic_list.keys():
            break
        
    with open(os.path.join(path_ts_raw, file_name), 'w') as outfile:
        json_post_list = json.dumps(post_list, indent='\t')
        outfile.write(json_post_list)

In [8]:
# scrape posts from Guild AI
base_url = 'https://my.guild.ai/'
page_suffix = 'c/troubleshooting/6.json?page='
file_name = 'Guild AI.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [7]:
# scrape posts from Weights & Biases
base_url = 'https://community.wandb.ai/'
page_suffix = 'c/w-b-support/36.json?page='
file_name = 'Weights & Biases.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [5]:
# scrape posts from SigOpt
base_url = 'https://community.sigopt.com/'
page_suffix = 'c/general-discussion/9.json?page='
file_name = 'SigOpt.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [5]:
# scrape posts from DVC
base_url = 'https://discuss.dvc.org/'
page_suffix = 'c/questions/9.json?page='
file_name = 'DVC.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [28]:
df_ts_questions = pd.DataFrame()

# exclude tool-specific posts with negative upvote count
for file_name in glob.glob(os.path.join(path_ts_raw, '*.json')):
    repos = pd.read_json(file_name)
    if 'Question_score' in repos.columns:
        repos = repos[repos['Question_score'] > -1]
    repos['Tool'] = os.path.split(file_name)[1].split('.')[0]
    df_ts_questions = pd.concat([df_ts_questions, repos], ignore_index=True)
    
df_ts_answers = df_ts_questions[df_ts_questions['Question_answer_count'] > 0]
for tool in df_ts_answers['Tool'].unique().tolist():
    number_accepted_answer = df_ts_answers[df_ts_answers['Tool']
                                            == tool]['Question_has_accepted_answer'].sum()
    if number_accepted_answer > 0:
        df_ts_answers = df_ts_answers.drop(df_ts_answers[(df_ts_answers['Tool'] == tool) & (
            df_ts_answers['Question_has_accepted_answer'] == False)].index)

df_ts_questions.to_json(os.path.join(path_ts_filtered,
              'questions.json'), orient='records', indent=4)

In [94]:
# keep only posts with at least one answer
df_ts_question_summary = df_ts_questions.groupby(
    'Tool').count()['Question_title'].reset_index()
df_ts_answer_summary = df_ts_answers.groupby(
    'Tool').count()['Question_title'].reset_index()

df_ts_question_summary.columns = ['Tool', '#Question']
df_ts_answer_summary.columns = ['Tool', '#Answered']

df_summary = pd.merge(df_ts_question_summary, df_ts_answer_summary, on='Tool')
df_summary


Unnamed: 0,Tool,#Question,#Answered
0,Amazon SageMaker,528,167
1,Azure Machine Learning,1435,343
2,DVC,315,300
3,Domino,13,4
4,Guild AI,115,108
5,MLFlow,280,143
6,Polyaxon,43,34
7,SigOpt,15,7
8,Vertex AI,297,32
9,Weights & Biases,583,92


In [16]:
df = pd.read_json(os.path.join(
    path_so_raw, 'bq-results-20230201-032754-1675222092237.json'), lines=True)
df['Question_tags'] = df['Question_tags'].str.split('|')
df.head()

Unnamed: 0,Question_id,Question_title,Question_body,Question_answer_count,Question_comment_count,Question_creation_time,Question_favorite_count,Question_score,Question_tags,Question_view_count,...,Owner_up_votes,Owner_down_votes,Owner_views,Answer_body,Answer_comment_count,Answer_creation_time,Answer_score,Owner_location,Question_last_edit_time,Answer_last_edit_time
0,70098779,How to connect to MLFlow tracking server that ...,<p>I want to connect to remote tracking server...,1,0,2021-11-24 15:30:11.310000+00:00,1.0,1,"[authorization, tracking, mlflow]",2102,...,0,0,11,"<p><a href=""https://mlflow.org/docs/latest/tra...",2.0,2021-11-24 17:01:13.483000+00:00,2.0,,NaT,NaT
1,38927230,"Panda AssertionError columns passed, passed da...",<p>I am working on Azure ML implementation on ...,1,0,2016-08-12 22:23:17.197000+00:00,3.0,7,"[python, pandas, dataframe, nltk, azure-machin...",48200,...,136,55,339,<p>Try this:</p>\n\n<pre><code>dataframe_outpu...,0.0,2016-08-12 22:26:09.603000+00:00,13.0,"Toronto, ON, Canada",NaT,NaT
2,68773463,AccessDeniedException on sagemaker:CreateDomai...,<p>I am trying to use the AWS SageMaker Studio...,1,0,2021-08-13 13:49:08.683000+00:00,,0,"[amazon-web-services, amazon-iam, amazon-sagem...",366,...,0,0,11,,,NaT,,,NaT,NaT
3,67701971,How to label a text with multiple paragraphs i...,<p>I was trying setup a single label labeling ...,0,2,2021-05-26 09:16:33.420000+00:00,,1,"[amazon-web-services, text, amazon-sagemaker, ...",161,...,75,10,147,,,NaT,,"Zürich, Suïssa",2021-05-26 11:54:00.030000+00:00,NaT
4,48398509,How to Invoke AWS Sagemaker API with c# .NET?,<p>I have trained and deployed a model in AWS ...,1,0,2018-01-23 09:42:48.607000+00:00,,0,"[c#, asp.net, amazon-web-services, aws-sdk, am...",743,...,34,1,124,,,NaT,,Pune India,NaT,NaT


In [25]:
# create tag collection
tags = set()
for key, value in tool2tag.items():
    tags = tags.union(value)

In [26]:
# split tags
df['Question_valid_tags'] = [[] for _ in range(len(df))]
for index, row in df.iterrows():
    df.at[index, 'Question_valid_tags'] = list(tags.intersection(set(row['Question_tags'])))

In [27]:
# count post number with different tags
arity = 0
while True:
    post_number = df[df['Question_valid_tags'].map(len) > arity].shape[0]
    if post_number < 1:
        break
    arity = arity + 1
    print(f'Posts with at least {arity} tags has {post_number} in total.')


Posts with at least 1 tags has 5130 in total.
Posts with at least 2 tags has 220 in total.
Posts with at least 3 tags has 18 in total.


In [28]:
# exclude Stack Overflow posts with unrelated tags
df_valid = df[df['Question_valid_tags'].map(len) > 0]
df_valid['Question_link'] = df_valid['Question_id'].apply(
    lambda x: f'https://stackoverflow.com/questions/{x}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['Question_link'] = df_valid['Question_id'].apply(


In [29]:
# exclude Stack Overflow posts with negative upvote count
df_qualified = df_valid[df_valid['Question_score'] > -1]

In [30]:
# create a map from tag to tool
tag2tool = dict()
for key, value in tool2tag.items():
    for elem in value:
        tag2tool.setdefault(elem, key)

In [31]:
# extract Stack Overflow post collection with multiple tags based on the tool map
for index, row in df_qualified.iterrows():
    tags = set()
    for tag in row['Question_valid_tags']:
        tags.add(tag2tool[tag])
    df_qualified.at[index, 'Question_valid_tags'] = sorted(list(tags))

In [32]:
# check how the posts with more than one tags look like
df_multiply_tagged = df_qualified[df_qualified['Question_valid_tags'].map(
    len) > 1]
df_multiply_tagged['Question_valid_tags'].value_counts()

[Amazon SageMaker, MLFlow]                 16
[Azure Machine Learning, MLFlow]           11
[Kedro, MLFlow]                             4
[Azure Machine Learning, Kedro, MLFlow]     2
[DVC, MLFlow]                               1
[MLFlow, Sacred]                            1
[Kedro, Neptune]                            1
Name: Question_valid_tags, dtype: int64

In [33]:
# create Stack Overflow post collection with exclusive tags
multiply_tagged_posts_split = []
df_qualified.assign(Tool='')

for index, row in df_qualified.iterrows():
    tags = row['Question_valid_tags']
    df_qualified.at[index, 'Tool'] = tags[0]
    if len(tags) > 1:
        for tag in tags[1:]:
            series = row.copy()
            series['Tool'] = tag
            multiply_tagged_posts_split.append(series)

df_multiply_tagged_posts_split = pd.DataFrame(multiply_tagged_posts_split)
df_qualified_exclusive_tagged = pd.concat(
    [df_qualified, df_multiply_tagged_posts_split], ignore_index=True)
del df_qualified_exclusive_tagged['Question_valid_tags']

# keep Stack Overflow posts with accepted answers
df_qualified_exclusive_tagged_completed = df_qualified_exclusive_tagged.dropna(
    subset=['Answer_body'])

df_qualified_exclusive_tagged.to_json(os.path.join(
    path_so_filtered, 'questions.json'), indent=4, orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_qualified.at[index, 'Tool'] = tags[0]


In [34]:
df_so_question_summary = df_qualified_exclusive_tagged.groupby(
    'Tool').count()['Question_id'].reset_index()
df_so_answer_summary = df_qualified_exclusive_tagged_completed.groupby(
    'Tool').count()['Question_id'].reset_index()

df_so_question_summary.columns = ['Tool', '#Question']
df_so_answer_summary.columns = ['Tool', '#Answered']

df_summary = pd.merge(df_so_question_summary, df_so_answer_summary, on='Tool')
df_summary


Unnamed: 0,Tool,#Question,#Answered
0,Amazon SageMaker,2233,737
1,Azure Machine Learning,1530,586
2,ClearML,40,20
3,Comet,10,4
4,DVC,91,49
5,Kedro,149,60
6,MLFlow,551,129
7,Neptune,8,3
8,Sacred,10,7
9,Vertex AI,341,112


In [43]:
# create question dataset

df_question_so = pd.read_json(os.path.join(path_so_filtered, 'questions.json'))
df_question_ts = pd.read_json(os.path.join(path_ts_filtered, 'questions.json'))

df_question_so['Platform'] = 'Stack Overflow'
df_question_ts['Platform'] = 'Tool-specific'

df_questions = pd.concat([df_question_so, df_question_ts], ignore_index=True)
df_questions.to_json(os.path.join(path_labeling, 'original.json'), indent=4, orient='records')

In [44]:
# add potential field to questions for later filling
df_questions = pd.read_json(os.path.join(path_labeling_question, 'original.json'))

# Experiment 1: feed the original content to BerTopic
df_questions['Question_original_content_preprocessed_text'] = ''

# Experiment 2: feed the original content to text-davinci-003 model and get the generated summary, then feed the summary to BerTopic
df_questions['Question_original_content_gpt_summary'] = ''

# Experiment 3: feed the preprocessed content to BerTopic
df_questions['Question_preprocessed_content'] = ''

df_questions.to_json(os.path.join(path_labeling_question, 'topic_modeling.json'), indent=4, orient='records')

In [13]:
# Experiment 1

df_questions = pd.read_json(os.path.join(path_labeling_question, 'topic_modeling.json'))

for index, row in df_questions.iterrows():
    title = row['Question_title'].lower()
    body = str(row['Question_body']).lower()
    content = 'Title: ' + title + '; Content: ' + body
    
    for tool_keyword in tools_keywords[row['Tool']]:
        if tool_keyword in content:
            content = content.replace(tool_keyword, '')
    
    df_questions.at[index, 'Question_original_content_preprocessed_text'] = ' '.join(content.split())
    
df_questions.to_json(os.path.join(path_labeling_question, 'topic_modeling.json'), indent=4, orient='records')

In [16]:
# Experiment 2

import random

question_prompt = 'Please write a one-sentence summary of the user\'s encountered challenges. For instance, you could begin with a sentence such as: "The user XXXX".\n"""'

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0

    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0, 1)
            time.sleep(sleep)
            x += 1


In [17]:
# Experiment 2

df_questions = pd.read_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'))

for index, row in df_questions.iterrows():
    if row['Question_original_content_gpt_summary']:
        continue
    
    try:
        response = retry_with_backoff(
            openai.Completion.create,
            model='text-davinci-003',
            prompt=question_prompt +
            row['Question_original_content_preprocessed_text'] + '"""\n',
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        df_questions.at[index, 'Question_original_content_gpt_summary'] = response['choices'][0]['text'].strip()
    except Exception as e:
        print(f'{e} on question {index}')
        
    if index % 50 == 0:
        print(f'persisting on question {index}')
        df_questions.to_json(os.path.join(
            path_labeling_question, 'topic_modeling.json'), indent=4, orient='records')
        
    time.sleep(5)

df_questions.to_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'), indent=4, orient='records')


This model's maximum context length is 4097 tokens, however you requested 5260 tokens (5060 in your prompt; 200 for the completion). Please reduce your prompt; or completion length. on question 110
persisting on question 150
This model's maximum context length is 4097 tokens, however you requested 5293 tokens (5093 in your prompt; 200 for the completion). Please reduce your prompt; or completion length. on question 153
This model's maximum context length is 4097 tokens, however you requested 4372 tokens (4172 in your prompt; 200 for the completion). Please reduce your prompt; or completion length. on question 181
persisting on question 200
This model's maximum context length is 4097 tokens, however you requested 4348 tokens (4148 in your prompt; 200 for the completion). Please reduce your prompt; or completion length. on question 216
This model's maximum context length is 4097 tokens, however you requested 5660 tokens (5460 in your prompt; 200 for the completion). Please reduce your pr

In [None]:
# Experiment 2

# output unsuccesful summary requests
for index, row in df_questions.iterrows():
    if not row['Question_original_content_gpt_summary']:
        print(row['Issue_link'])

In [None]:
df_questions = pd.read_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'))
assert (df_questions.shape[0] == df_questions.dropna(
    subset=['Question_original_content_gpt_summary']).shape[0])

# output the number of asset-management-related discussion posts
df_questions.shape[0]

In [7]:
# sample size is based on the recommendation from https://www.calculator.net/sample-size-calculator.html

sample_size = 368

df_questions = pd.read_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'))

df_sample = df_questions.sample(n=sample_size, random_state=42)

df_sample.to_json(os.path.join(
    path_labeling_question, 'sample.json'), indent=4, orient='records')


In [29]:
from bs4 import BeautifulSoup
import re

# content filtering patterns
regex_filter = r"(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|((https?:\/)?\/[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]]+)|(info(rmation)?)"

def preprocess_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    for tag in soup(['code']):
        tag.decompose()
    text = soup.get_text()
    text = re.sub(regex_filter, ' ', text, flags=re.S)
    text = ' '.join(text.split())
    return text


In [37]:
# Experiment 3

df_questions = pd.read_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'))

for index, row in df_questions.iterrows():
    title = row['Question_title'].lower().encode('ascii', errors='ignore').decode('ascii')
    body = row['Question_body'].lower().encode('ascii', errors='ignore').decode('ascii')    
    content = 'Title: ' + preprocess_text(title) + '; Content: ' + preprocess_text(body)

    for tool_keyword in tools_keywords[row['Tool']]:
        if tool_keyword in content:
            content = content.replace(tool_keyword, '')

    df_questions.at[index, 'Question_preprocessed_content'] = ' '.join(content.split())

df_questions.to_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'), indent=4, orient='records')


In [20]:
# remove "title" and "content" from the content
# remove "The user" from the beginning of the summary

df_questions = pd.read_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'))

df_questions['Question_preprocessed_content'] = df_questions['Question_preprocessed_content'].apply(
    lambda x: x.replace('Title: ', '').replace('Content: ', ''))
df_questions['Question_original_content_preprocessed_text'] = df_questions['Question_original_content_preprocessed_text'].apply(
    lambda x: x.replace('Title: ', '').replace('Content: ', ''))
df_questions['Question_original_content_gpt_summary'] = df_questions['Question_original_content_gpt_summary'].apply(
    lambda x: x.removeprefix('The user '))

df_questions.to_json(os.path.join(
    path_labeling_question, 'topic_modeling.json'), indent=4, orient='records')


In [55]:
# create answer dataset

df_questions = pd.read_json(os.path.join(path_labeling_question, 'original.json'))
df_answers = []

for index, row in df_questions.iterrows():
    if row['Answer_body']:
        df_answers.append(row)
    elif row['Answer_list']:
        df_answers.append(row)

df_answers = pd.concat(df_answers, axis=1, ignore_index=True).T
df_answers.to_json(os.path.join(path_labeling_answer, 'original.json'), indent=4, orient='records')

In [56]:
# add potential field to questions for later filling
df_answers = pd.read_json(os.path.join(path_labeling_answer, 'original.json'))

# Experiment 1: feed the original content to BerTopic
df_answers['Answer_original_content_preprocessed_text'] = ''

# Experiment 2: feed the original content to text-davinci-003 model and get the generated summary, then feed the summary to BerTopic
df_answers['Answer_original_content_gpt_summary'] = ''

# Experiment 3: feed the preprocessed content to BerTopic
df_answers['Answer_preprocessed_content'] = ''

df_answers.to_json(os.path.join(path_labeling_answer, 'topic_modeling.json'), indent=4, orient='records')

In [8]:
# Experiment 4

with open(os.path.join(path_labeling_answer, 'topic_modeling.json'), encoding='utf8') as answer_file:
    df_answers = json.load(answer_file)
    
    for row in df_answers:
        answer = ''
        if row['Answer_body']:
            answer = row['Answer_body']
        else:
            if row['Question_has_accepted_answer']:
                if 'Answer_has_accepted' in row['Answer_list'][0]:
                    for comment in row['Answer_list']:
                        if comment['Answer_has_accepted']:
                            answer = comment['Answer_body']
                            break
                else:
                    for comment in row['Answer_list']:
                        if comment['accepted_answer']:
                            answer = comment['cooked']
                            break
            elif 'Answer_body' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    answer += comment['Answer_body'] + '\n'
            elif 'cooked' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    answer += comment['cooked'] + '\n'
                    
        answer = answer.lower().encode('ascii', errors='ignore').decode('ascii')
    
        for tool_keyword in tools_keywords[row['Tool']]:
            if tool_keyword in answer:
                answer = answer.replace(tool_keyword, '')
        
        row['Answer_original_content_preprocessed_text'] = 'Answer: ' + ' '.join(answer.split())

with open(os.path.join(path_labeling_answer, 'topic_modeling.json'), 'w') as outfile:
    json_post_list = json.dumps(df_answers, indent='\t')
    outfile.write(json_post_list)

In [11]:
# Experiment 5

import random

answer_prompt = 'Please list the solutions (if any) from the following answer. For instance, you could begin with a sentence such as: "There are three solutions together: [1]. XXX; [2]. YYY; [3]. ZZZ.".\n"""'

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0

    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0, 1)
            time.sleep(sleep)
            x += 1


In [None]:
# Experiment 5

df_answers = pd.read_json(os.path.join(
    path_labeling_answer, 'topic_modeling.json'))

for index, row in df_answers.iterrows():
    if row['Question_original_content_gpt_summary']:
        continue
    
    try:
        response = retry_with_backoff(
            openai.Completion.create,
            model='text-davinci-003',
            prompt=question_prompt +
            row['Answer_original_content_gpt_summary'] + '"""\n',
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        df_answers.at[index, 'Question_original_content_gpt_summary'] = response['choices'][0]['text'].strip()
    except Exception as e:
        print(f'{e} on answer {index}')
        
    if index % 50 == 0:
        print(f'persisting on answer {index}')
        df_answers.to_json(os.path.join(
            path_labeling_answer, 'topic_modeling.json'), indent=4, orient='records')
        
    time.sleep(5)

df_answers.to_json(os.path.join(
    path_labeling_answer, 'topic_modeling.json'), indent=4, orient='records')


In [10]:
from bs4 import BeautifulSoup
import re

# content filtering patterns
regex_filter = r"(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|((https?:\/)?\/[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]]+)|(info(rmation)?)"

def preprocess_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    for tag in soup(['code']):
        tag.decompose()
    text = soup.get_text()
    text = re.sub(regex_filter, ' ', text, flags=re.S)
    text = ' '.join(text.split())
    return text


In [11]:
# Experiment 6

with open(os.path.join(path_labeling_answer, 'topic_modeling.json'), encoding='utf8') as answer_file:
    df_answers = json.load(answer_file)
    
    for row in df_answers:
        answer = ''
        if row['Answer_body']:
            answer = row['Answer_body']
        else:
            if row['Question_has_accepted_answer']:
                if 'Answer_has_accepted' in row['Answer_list'][0]:
                    for comment in row['Answer_list']:
                        if comment['Answer_has_accepted']:
                            answer = comment['Answer_body']
                            break
                else:
                    for comment in row['Answer_list']:
                        if comment['accepted_answer']:
                            answer = comment['cooked']
                            break
            elif 'Answer_body' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    answer += comment['Answer_body'] + '\n'
            elif 'cooked' in row['Answer_list'][0]:
                for comment in row['Answer_list']:
                    answer += comment['cooked'] + '\n'
        
        answer = answer.lower().encode('ascii', errors='ignore').decode('ascii')
        answer = preprocess_text(answer)
    
        for tool_keyword in tools_keywords[row['Tool']]:
            if tool_keyword in answer:
                answer = answer.replace(tool_keyword, '')
        
        row['Answer_preprocessed_content'] = 'Answer: ' + ' '.join(answer.split())

with open(os.path.join(path_labeling_answer, 'topic_modeling.json'), 'w') as outfile:
    json_post_list = json.dumps(df_answers, indent='\t')
    outfile.write(json_post_list)