In [5]:
import pandas as pd
import numpy as np
import time
import os
import re

In [6]:
# Root folder for every dataset used by this notebook (relative to the
# current working directory).
path_dataset = '../Dataset'

# One sub-folder per data source / processing stage.
path_stack_overflow = os.path.join(path_dataset, 'Stack Overflow')
path_tool_specific = os.path.join(path_dataset, 'Tool-specific')
path_labeling = os.path.join(path_dataset, 'Labeling')

# exist_ok=True makes this cell safe to re-run and removes the gap between
# "check whether the folder exists" and "create it".
for path_folder in (path_dataset, path_stack_overflow, path_tool_specific, path_labeling):
    os.makedirs(path_folder, exist_ok=True)


In [7]:
# Stack Overflow tags that identify a question as being about each tool.
# Keys are tool display names; values are the sets of tags mapped to that tool.
tool_tag_mapping = {
    'Amazon SageMaker': {'amazon-sagemaker', 'amazon-sagemaker-experiments', 'amazon-sagemaker-studio', 'amz-sagemaker-distributed-training', 'amazon-sagemaker-debugger', 'amazon-sagemaker-clarify', 'amazon-sagemaker-compilers', 'amazon-sagemaker-neo'},
    'Azure Machine Learning': {'azureml-python-sdk', 'azuremlsdk', 'azure-machine-learning-service', 'azure-machine-learning-studio', 'azure-machine-learning-workbench', 'azure-ml-pipelines', 'azure-ml-component'},
    'ClearML': {'clearml'},
    'Comet': {'comet-ml'},
    'DVC': {'dvc'},
    'Kedro': {'kedro'},
    'MLflow': {'mlflow'},
    'MLRun': {'mlrun'},
    'Neptune': {'neptune', 'neptune-python-utils'},
    'Optuna': {'optuna'},
    'Sacred': {'python-sacred'},
    'Vertex AI': {'google-cloud-vertex-ai', 'vertex-ai-pipeline'},
    'Weights & Biases': {'wandb'}
}

# NOTE(review): presumably the tools whose discussion channel offers no
# "accepted answer" marker -- confirm against the code that consumes this set.
# Names must match the tool names used elsewhere exactly (e.g. 'Guild AI').
tool_no_accepted_answer = {
    'Domino',
    'DVC',
    'Guild AI',
    'MLflow',
    'Polyaxon',
    'SigOpt'
}

# Non-greedy capture of the text between '<' and '>', i.e. one match per tag
# in a string such as '<mlflow><python>'.
regex_tag = r'<(.*?)>'


In [4]:
# create tag collection: every Stack Overflow tag that belongs to any tool
tool_tags = set().union(*tool_tag_mapping.values())

# create a map from tag to tool; setdefault keeps the first tool seen should a
# tag ever be listed under more than one tool
tag2tool = dict()
for tool, tags in tool_tag_mapping.items():
    for tag in tags:
        tag2tool.setdefault(tag, tool)

df = pd.read_csv(os.path.join(path_stack_overflow, 'original.csv'))

# exclude posts without a body in one vectorized step (instead of dropping rows
# one by one while iterating); .copy() so the column assignments below write
# to an independent frame rather than a slice
df = df[df['Question_body'].notna()].copy()

# split tags; a missing tag string is treated as "no tags" instead of raising
tag_pattern = re.compile(regex_tag)
question_tags = [
    tag_pattern.findall(raw_tags) if isinstance(raw_tags, str) else []
    for raw_tags in df['Question_tags']
]

# stored as float so filtered.json keeps the number format (e.g. 3.0) that
# earlier exports of this dataset used
df['Question_tag_count'] = [float(len(tags)) for tags in question_tags]

# map each question's tags to the tools they belong to; sorted so the exported
# lists are identical from run to run (set iteration order is not stable)
df['Tools'] = pd.Series(
    [sorted({tag2tool[tag] for tag in tags if tag in tag2tool}) for tags in question_tags],
    index=df.index,
    dtype=object,
)

# exclude Stack Overflow posts with unrelated tags (no tool-related tag at all)
df = df[df['Tools'].map(len) > 0].copy()

# a question is "self closed" when the accepted answer was written by the asker
df['Question_self_closed'] = df['Poster_id'] == df['Answerer_id']
df['Question_link'] = df['Question_id'].apply(lambda x: f'https://stackoverflow.com/questions/{x}')

# helper columns that are no longer needed in the exported dataset
df = df.drop(columns=['Poster_id', 'Answerer_id', 'Question_id', 'Question_tags'])

df.to_json(os.path.join(path_stack_overflow, 'filtered.json'), indent=4, orient='records')
len(df)


6768

In [36]:
# function to scrape the posts from the tool-specific discussion fora

import requests

def scrape_post(base_url, page_suffix, file_name, delay=5, timeout=30):
    """Scrape all topics of one forum category and save them as a JSON file.

    The category listing is read page by page from
    ``base_url + page_suffix + <page number>`` (starting at page 0); each
    topic's posts are then read from ``base_url + 't/<slug>/<id>.json'``.
    The result is written to ``<path_tool_specific>/<file_name>.json`` with
    one record per topic.

    Parameters
    ----------
    base_url : str
        Forum root URL, including the trailing slash.
    page_suffix : str
        Category listing path ending in the page query parameter,
        e.g. ``'c/questions/9.json?page='``.
    file_name : str
        Output file name without the ``.json`` extension.
    delay : float, optional
        Seconds to wait after each topic request (default 5).
    timeout : float, optional
        Per-request timeout in seconds (default 30), so a stalled connection
        cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the forum answers a request with an error status.

    NOTE(review): only the posts returned by the single topic request are
    searched for the accepted answer -- confirm whether very long threads
    need additional paging.
    """
    records = []
    seen_urls = set()  # a topic can show up on more than one listing page
    page = 0

    while True:
        listing = requests.get(base_url + page_suffix + str(page), timeout=timeout)
        listing.raise_for_status()
        topic_list = listing.json()['topic_list']
        topics = topic_list.get('topics', [])

        for topic in topics:
            post_url = base_url + 't/' + topic['slug'] + '/' + str(topic['id'])

            if post_url in seen_urls:
                continue
            seen_urls.add(post_url)

            thread = requests.get(post_url + '.json', timeout=timeout)
            thread.raise_for_status()
            comments = thread.json()['post_stream']['posts']

            post = {
                'Question_title': topic['title'],
                'Question_link': post_url,
                'Question_created_time': topic['created_at'],
                # the first post is the question itself, the rest are replies
                'Question_comment_count': topic['posts_count'] - 1,
                'Question_score_count': topic['like_count'],
                'Question_view_count': topic['views'],
                'Question_body': comments[0]['cooked'],
                'Question_closed_time': np.nan,
                'Comment_body': np.nan,
                'Question_self_closed': np.nan,
            }

            # .get(): a missing accepted-answer field is treated as
            # "no accepted answer" instead of raising KeyError
            if topic.get('has_accepted_answer'):
                for comment in comments[1:]:
                    if comment.get('accepted_answer'):
                        post['Question_closed_time'] = comment['created_at']
                        post['Comment_body'] = comment['cooked']
                        post['Question_self_closed'] = comment['username'] == comments[0]['username']
                        break

            records.append(post)
            time.sleep(delay)  # throttle requests to the forum

        # Stop on the last page. An empty page also ends the loop, so a listing
        # that keeps advertising more pages cannot cause an endless loop.
        if not topics or 'more_topics_url' not in topic_list:
            break
        page += 1

    # Build the frame once instead of concatenating one row per topic.
    posts = pd.DataFrame(records)
    posts.to_json(os.path.join(path_tool_specific, file_name + '.json'), indent=4, orient='records')

In [38]:
# Discussion boards to scrape, one entry per tool:
# (forum base URL, category listing suffix, output file name)
forum_sources = [
    # DVC
    ('https://discuss.dvc.org/', 'c/questions/9.json?page=', 'DVC'),
    # Guild AI
    ('https://my.guild.ai/', 'c/troubleshooting/6.json?page=', 'Guild AI'),
    # SigOpt
    ('https://community.sigopt.com/', 'c/general-discussion/9.json?page=', 'SigOpt'),
    # Weights & Biases
    ('https://community.wandb.ai/', 'c/w-b-support/36.json?page=', 'Weights & Biases'),
]

# scrape the posts of every board, in the order listed above
for base_url, page_suffix, file_name in forum_sources:
    scrape_post(base_url, page_suffix, file_name)


In [None]:
import subprocess
import sys

# Folder (relative to the working directory) holding the stand-alone scrapers.
path_code = 'Scrape'

# One stand-alone scraper script per tool, run in this order.
scraper_scripts = [
    'Amazon SageMaker.py',
    'Azure Machine Learning.py',
    'Domino.py',
    'MLflow.py',
    'Polyaxon.py',
    'Vertex AI.py',
]

for script_name in scraper_scripts:
    # sys.executable runs the script with the same interpreter (and installed
    # packages) as this kernel, rather than whichever 'python' is first on PATH.
    completed = subprocess.run([sys.executable, os.path.join(path_code, script_name)])
    # Report failures instead of ignoring them; the remaining scripts still run.
    if completed.returncode != 0:
        print(f'{script_name} exited with code {completed.returncode}', file=sys.stderr)


In [8]:
# create question dataset

import glob

# Load every tool-specific scrape and label its posts with the tool that the
# file is named after.
frames_ts = []

# sorted(): glob returns files in filesystem order, which differs between
# machines; sorting keeps the row order of the exported dataset reproducible.
for file_name in sorted(glob.glob(os.path.join(path_tool_specific, '*.json'))):
    posts = pd.read_json(file_name)
    # splitext removes only the trailing '.json', so a tool name that itself
    # contains a dot is kept intact
    tool_name = os.path.splitext(os.path.basename(file_name))[0]
    posts['Tools'] = [[tool_name] for _ in posts.index]
    frames_ts.append(posts)

# Concatenate once; pd.concat raises on an empty list, hence the fallback.
df_questions_ts = pd.concat(frames_ts, ignore_index=True) if frames_ts else pd.DataFrame()

print(df_questions_ts.shape[0])

df_question_so = pd.read_json(os.path.join(path_stack_overflow, 'filtered.json'))

# Record where each question came from before merging both sources.
df_question_so['Platform'] = 'Stack Overflow'
df_questions_ts['Platform'] = 'Tool-specific'

df_questions = pd.concat([df_question_so, df_questions_ts], ignore_index=True)
df_questions.to_json(os.path.join(path_labeling, 'questions.json'), indent=4, orient='records')


5452
