In [1]:
import pandas as pd
import numpy as np
import time
import os

In [2]:
path_dataset = '../Dataset'

path_stack_overflow = os.path.join(path_dataset, 'Stack Overflow')
path_tool_specific = os.path.join(path_dataset, 'Tool-specific')
path_labeling = os.path.join(path_dataset, 'Labeling')  

if not os.path.exists(path_dataset):
    os.makedirs(path_dataset)

if not os.path.isdir(path_stack_overflow):
    os.mkdir(path_stack_overflow)

if not os.path.isdir(path_tool_specific):
    os.mkdir(path_tool_specific)

if not os.path.isdir(path_labeling):
    os.mkdir(path_labeling)


In [3]:
tool_tags = {
    'Amazon SageMaker': {'amazon-sagemaker', 'amazon-sagemaker-experiments', 'amazon-sagemaker-studio'},
    'Azure Machine Learning': {'azure-machine-learning-service', 'azure-machine-learning-studio', 'azure-machine-learning-workbench'},
    'ClearML': {'clearml'},
    'Comet': {'comet-ml'},
    'DVC': {'dvc'},
    'Kedro': {'kedro'},
    'MLflow': {'mlflow'},
    'MLRun': {'mlrun'},
    'Neptune': {'neptune'},
    'Optuna': {'optuna'},
    'Sacred': {'python-sacred'},
    'Vertex AI': {'google-cloud-vertex-ai'},
    'Weights & Biases': {'wandb'}
}

tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['sage maker', 'sagemaker'],
    'Azure Machine Learning': ['azure machine learning', 'azure ml', 'azureml'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai', 'guildai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['vertex ai', 'vertexai'],
    'Weights & Biases': ['weights and biases', 'wandb', 'weights & biases', 'weights&biases', 'w & b', 'w&b']
}

tool_no_accepted_answer = {
    'Domino', 
    'DVC', 
    'Guild AI"', 
    'MLflow', 
    'Polyaxon', 
    'SigOpt'
}


In [4]:
import google.auth
import pandas_gbq

credentials, _ = google.auth.default()

pandas_gbq.context.credentials = credentials
pandas_gbq.context.project = 'stack-overflow-dataset-330612'

with open(os.path.join(path_stack_overflow, 'bigquery.sql'), 'r') as sql_file:
    sql = sql_file.read()
    df = pandas_gbq.read_gbq(sql)
    df['Question_tags'] = df['Question_tags'].str.split('|')
    df['Question_favorite_count'] = df['Question_favorite_count'].fillna(0)

len(df)

Downloading: 100%|[32m██████████[0m|


6999

In [5]:
# create tag collection
tags = set()
for key, value in tool_tags.items():
    tags = tags.union(value)


In [6]:
# split tags
df['Question_valid_tags'] = [[] for _ in range(len(df))]
for index, row in df.iterrows():
    df.at[index, 'Question_valid_tags'] = list(tags.intersection(set(row['Question_tags'])))


In [9]:
# count post number with different tags
arity = 0
while True:
    post_number = df[df['Question_valid_tags'].map(len) > arity].shape[0]
    if post_number < 1:
        break
    arity = arity + 1
    print(f'Posts with at least {arity} tags has {post_number} in total.')


Posts with at least 1 tags has 5308 in total.
Posts with at least 2 tags has 220 in total.
Posts with at least 3 tags has 18 in total.


In [10]:
# exclude Stack Overflow posts with unrelated tags
df_valid = df[df['Question_valid_tags'].map(len) > 0]
df_valid['Question_link'] = df_valid['Question_id'].apply(lambda x: f'https://stackoverflow.com/questions/{x}')
len(df_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['Question_link'] = df_valid['Question_id'].apply(lambda x: f'https://stackoverflow.com/questions/{x}')


5308

In [11]:
# exclude Stack Overflow posts with negative upvote count
df_qualified = df_valid[df_valid['Question_score_count'] > -1]
len(df_qualified)

5175

In [12]:
# create a map from tag to tool
tag2tool = dict()
for key, value in tool_tags.items():
    for elem in value:
        tag2tool.setdefault(elem, key)
        

In [14]:
# extract Stack Overflow post collection with multiple tags based on the tool map
for index, row in df_qualified.iterrows():
    tags = set()
    for tag in row['Question_valid_tags']:
        tags.add(tag2tool[tag])
    df_qualified.at[index, 'Tools'] = list(tags)

del df_qualified['Question_valid_tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_qualified.at[index, 'Tools'] = sorted(list(tags))


In [17]:
df_qualified['Challenge_self_resolution'] = df_qualified['Poster_id'] == df_qualified['Answerer_id']

del df_qualified['Poster_id']
del df_qualified['Answerer_id']
del df_qualified['Question_id']
del df_qualified['Question_tags']

df_qualified.to_json(os.path.join(
    path_stack_overflow, 'questions.json'), indent=4, orient='records')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_qualified['Challenge_self_resolution'] = df_qualified['Poster_id'] == df_qualified['Answerer_id']


In [None]:
# function to scrape the posts from the tool-specific discussion fora

import requests

def scrape_post(base_url, page_suffix, file_name):
    page = -1
    posts = pd.DataFrame()
    
    post_url_lst = set()

    while True:
        page = page + 1
        page_url = base_url + page_suffix + str(page)
        topic_list = requests.get(page_url).json()['topic_list']

        for topic in topic_list['topics']:
            post_url = base_url + 't/' + \
                topic['slug'] + '/' + str(topic['id'])
                
            if post_url in post_url_lst:
                continue
            
            post_url_lst.add(post_url)

            post = {}
            post['Question_title'] = topic['title']
            post['Question_link'] = post_url
            post['Question_created_time'] = topic['created_at']
            post['Question_answer_count'] = topic['posts_count'] - 1
            post['Question_score_count'] = topic['like_count']
            post['Question_view_count'] = topic['views']
            
            comments = requests.get(
                post_url + '.json').json()['post_stream']['posts']
            post['Question_body'] = comments[0]['cooked']
            post['Question_closed_time'] = np.nan
            post['Answer_body'] = np.nan
            post["Question_self_closed"] = np.nan
            
            if topic['has_accepted_answer']:
                for comment in comments[1:]:
                    if comment['accepted_answer']:
                        post['Question_closed_time'] = comment['created_at']
                        post['Answer_body'] = comment['cooked']
                        post['Question_self_closed'] = comment['username'] == comments[0]['username']
                        break
            
            post = pd.DataFrame([post])
            posts = pd.concat([posts, post], ignore_index=True)
            time.sleep(5)

        if 'more_topics_url' not in topic_list.keys():
            break
    
    posts.to_json(os.path.join(path_tool_specific, file_name + '.json'), indent=4, orient='records')

In [None]:
# scrape posts from DVC
base_url = 'https://discuss.dvc.org/'
page_suffix = 'c/questions/9.json?page='
file_name = 'DVC'
scrape_post(base_url, page_suffix, file_name)

# scrape posts from Guild AI
base_url = 'https://my.guild.ai/'
page_suffix = 'c/troubleshooting/6.json?page='
file_name = 'Guild AI'
scrape_post(base_url, page_suffix, file_name)

# scrape posts from SigOpt
base_url = 'https://community.sigopt.com/'
page_suffix = 'c/general-discussion/9.json?page='
file_name = 'SigOpt'
scrape_post(base_url, page_suffix, file_name)

# scrape posts from Weights & Biases
base_url = 'https://community.wandb.ai/'
page_suffix = 'c/w-b-support/36.json?page='
file_name = 'Weights & Biases'
scrape_post(base_url, page_suffix, file_name)


In [None]:
import subprocess

path_code = 'Scrape'

subprocess.run(['python', os.path.join(path_code, 'Amazon SageMaker.py')])
subprocess.run(['python', os.path.join(path_code, 'Azure Machine Learning.py')])
subprocess.run(['python', os.path.join(path_code, 'Domino.py')])
subprocess.run(['python', os.path.join(path_code, 'MLflow.py')])
subprocess.run(['python', os.path.join(path_code, 'Polyaxon.py')])
subprocess.run(['python', os.path.join(path_code, 'Vertex AI.py')])


In [19]:
import glob

# exclude tool-specific posts with negative upvote count
df_questions_ts = pd.DataFrame()
total_post = 0

for file_name in glob.glob(os.path.join(path_tool_specific, '*.json')):
    posts = pd.read_json(file_name)
    total_post += len(posts)    
    if 'Question_score_count' in posts.columns:
        posts = posts[posts['Question_score_count'] > -1]
    posts['Tools'] = [list(os.path.split(file_name)[1].split('.')[0]) for _ in range(len(posts))]
    df_questions_ts = pd.concat([df_questions_ts, posts], ignore_index=True)
    
print(total_post, df_questions_ts.shape[0])


4982 4981


In [11]:
# create question dataset

df_question_so = pd.read_json(os.path.join(path_stack_overflow, 'questions.json'))

df_question_so['Platform'] = 'Stack Overflow'
df_questions_ts['Platform'] = 'Tool-specific'

df_questions = pd.concat([df_question_so, df_questions_ts], ignore_index=True)
df_questions['Question_body'] = df_questions['Question_body'].fillna('')

df_questions = df_questions.reindex(sorted(df_questions.columns), axis=1)
df_questions.to_json(os.path.join(path_labeling, 'questions.json'), indent=4, orient='records')
