In [33]:
import os
import re
import time
import spacy
import pickle
import openai
import random
import enchant
import textstat
import warnings
import itertools
import subprocess
import webbrowser
import collections
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from bs4 import BeautifulSoup
from langdetect import detect 
from collections import namedtuple
from scipy.stats import kruskal, shapiro, mannwhitneyu, kendalltau
from gensim.parsing.preprocessing import remove_stopwords, strip_short, strip_punctuation, STOPWORDS

In [35]:
path_dataset = '../../Dataset'
path_result = '../../Result'

path_rq1 = os.path.join(path_result, 'RQ1')

random_state = 42

warnings.filterwarnings('ignore')

# subprocess.run(['python', '-m' 'spacy', 'download', 'en_core_web_sm'])
nlp = spacy.load("en_core_web_sm")
spell_checker = enchant.Dict("en_US")

pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

openai.api_key = os.getenv('OPENAI_API_KEY', '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')

prompt_summary = '''Refine the title of the following post to make it short and clear in simple English.\n###'''

tools_keyword_mapping = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['amazon', 'aws', 'maker', 'sage'],
    'Azure Machine Learning': ['aml', 'azure', 'microsoft'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild'],
    'Kedro': ['kedro'],
    'MLflow': ['databricks', 'mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['google', 'gcp', 'vertex'],
    'Weights & Biases': ['biases', 'wandb', 'weights']
}

tools_keyword_set = set(itertools.chain(*tools_keyword_mapping.values()))

keywords_image = {
    ".jpg", 
    ".png", 
    ".jpeg", 
    ".gif", 
    ".bmp", 
    ".webp", 
    ".svg", 
    ".tiff"
}

keywords_patch = {
    'pull',
}

keywords_issue = {
    'answers',
    'discussions',
    'forums',
    'issues',
    'questions',
    'stackoverflow',
}

keywords_tool = {
    'github',
    'gitlab',
    'pypi',
}

keywords_doc = {
    'developers',
    'docs',
    'documentation',
    'features',
    'library',
    'org',
    'wiki',
}

keywords_tutorial = {
    'guide',
    'learn',
    'tutorial',
}

error_words_basics = {
    'break',
    'bug',
    'ceas',
    'clos',
    'crash',
    'defect',
    'error',
    'fail',
    'fault',
    'finish',
    'flaw',
    'glitch',
    'halt',
    'malfunction',
    'mistak',
    'paus',
    'shutdown',
    'stop',
    'terminat',
}

stop_words_se = {
    'ability',
    'abilities',
    'accident',
    'accidents',
    'acknowledgement',
    'action',
    'actions',
    'activities',
    'activity',
    'advantage',
    'advantages',
    'advice',
    'ai',
    'alternative',
    'alternatives',
    'announcement',
    'announcements',
    'anomaly'
    'anomalies'
    'answer',
    'answers',
    'appreciation',
    'approach',
    'approaches',
    'article',
    'articles',
    'assistance',
    'attempt',
    'author',
    'behavior',
    'behaviour',
    'benefit',
    'benefits',
    'bit',
    'bits',
    'block',
    'blocks',
    'blog',
    'blogs',
    'body',
    'building',
    'case',
    'cases',
    'categories',
    'categorization',
    'category',
    'cause',
    'causes',
    'challenge',
    'challenges',
    'change',
    'changes',
    # 'char',
    'check',
    'choice',
    'choices',
    'clarification',
    'code',
    'collection',
    'com',
    'combination',
    'commmunication',
    'community',
    'communities',
    'company',
    'companies',
    # 'computer',
    # 'computers',
    # 'concept',
    # 'concepts',
    'concern',
    'concerns',
    # 'condition',
    # 'conditions',
    'confirmation',
    'confusion',
    'consideration',
    # 'content',
    # 'contents',
    'context',
    # 'count',
    'couple',
    'couples',
    'course',
    'courses',
    'cross',
    # 'custom',
    'customer',
    'customers',
    'day',
    'days',
    'demand',
    'description',
    'descriptions',
    'desire',
    'detail',
    'details',
    'devops',
    'difference',
    'differences',
    'difficulties',
    'difficulty',
    'disadvantage',
    'disadvantages',
    'discrepancies',
    'discrepancy',
    'discussion',
    'discussions',
    'dislike',
    'distinction',
    'effect',
    'end',
    # 'engineering',
    'enquiries',
    'enquiry',
    'evidence',
    'example',
    'examples',
    'exception',
    'exceptions',
    'existence',
    'exit',
    'expectation',
    'experience',
    'expert',
    'experts',
    'explanation',
    'explanations',
    'fact',
    'facts',
    'favorite',
    'favorites',
    # 'feature',
    # 'features',
    'feedback',
    'feedbacks',
    'fix',
    'fixes',
    # 'float',
    # 'form',
    # 'forms',
    'forum'
    'forums'
    'functionality',
    'functionalities',
    'future',
    'goal',
    'goals',
    'guarantee',
    'guidance',
    'guideline',
    'guide',
    'guides',
    'guy',
    'guys',
    'harm',
    'hello',
    'help',
    'hour',
    'hours',
    'ibm',
    'idea',
    'ideas',
    'individual',
    'individuals',
    'info',
    'information',
    'inquiries',
    'inquiry',
    'insight',
    'instruction',
    'instructions',
    # 'int',
    'intelligence',
    'intent',
    'interest',
    'introduction',
    'investigation',
    'invitation',
    'ipynb',
    'issue',
    'issues',
    'kind',
    'kinds',
    'lack',
    'learning',
    'level',
    'levels',
    'look',
    'looks',
    'lot',
    'lots',
    'luck',
    'machine',
    'major',
    'manner',
    'manners',
    'manual',
    'mark',
    'means',
    'meaning',
    # 'message',
    # 'messages',
    'method',
    'methods',
    'ml',
    'mlops',
    'minute',
    'minutes',
    'month',
    'months',
    'need',
    'needs',
    'number',
    'numbers',
    'offer',
    'one',
    'ones',
    'opinion',
    'opinions',
    'org',
    'organization',
    'organizations',
    'outcome',
    'part',
    'parts',
    'past',
    'people',
    'person',
    'persons',
    'perspective',
    'perspectives',
    'place',
    'places',
    'point',
    'points',
    'post',
    'posts',
    'practice',
    'practices',
    'problem',
    'problems',
    'product',
    'products',
    'program',
    'programs',
    'project',
    'projects',
    'proposal',
    'proposals',
    'purpose',
    'purposes',
    'py',
    # 'python',
    'qa',
    'question',
    'questions',
    'reason',
    'reasons',
    'research',
    'result',
    'results',
    'scenario',
    'scenarios',
    'science',
    'screenshot',
    'screenshots',
    'second',
    'seconds',
    'section',
    'sense',
    'sentence',
    'shortcoming',
    'shortcomings',
    'show',
    'shows',
    'situation',
    'software',
    'solution',
    'solutions',
    'start',
    # 'state',
    # 'statement',
    # 'states',
    # 'status',
    # 'step',
    # 'steps',
    # 'string',
    'student',
    'students',
    'study',
    'stuff',
    'success',
    'suggestion',
    'suggestions',
    'summary',
    'summaries',
    'surprise',
    'support',
    'supports',
    'talk',
    'task',
    'tasks',
    'teacher',
    'teachers',
    'team',
    'teams',
    'technique',
    'techniques',
    'technologies',
    'technology',
    'term',
    'terms',
    'tip',
    'tips',
    'thank',
    'thanks',
    'thing',
    'things',
    'thought',
    'thoughts',
    'three',
    'title',
    'today',
    'tomorrow',
    # 'tool',
    # 'tools',
    'total',
    'trouble',
    'troubles',
    'truth',
    'try',
    'tutorial',
    'tutorials',
    'two',
    'understand',
    'understanding',
    'usage',
    'use',
    'user',
    'users',
    'uses',
    # 'value',
    # 'values',
    'view',
    'viewpoint',
    'way',
    'ways',
    'weakness',
    'weaknesses',
    'week',
    'weeks',
    'word',
    'words',
    'work',
    'workaround',
    'workarounds',
    'works',
    'yeah',
    'year',
    'years',
    'yesterday',
}

stop_words_ml = {
    'ad',
    'ads',
    'advertisement',
    'advertisements',
    'algorithm',
    'algorithms',
    'analysis',
    'anomaly',
    'asr',
    'audio',
    'autoencoder',
    'automl',
    'autopilot',
    'bert',
    'bi',
    'chatbot',
    'classification',
    'classifier',
    'clustering',
    'cnn',
    'cv',
    'decision',
    'detection',
    'dimensionality',
    'embedding',
    'estimation',
    'extraction',
    'forecast',
    'forecaster',
    'forecasts',
    'forecasting',
    'forest',
    'fraud',
    'gan',
    'gesture',
    'gpt',
    'ica',
    'kmeans',
    'k-means',
    'knn',
    'language',
    'languages',
    'lda',
    'lstm',
    'mining',
    'modelling',
    'ngram',
    'n-gram',
    'nlp',
    'nmf',
    'ocr',
    'pca',
    'processing',
    'recognition',
    'recommendation',
    'recommendations',
    'recommender',
    'reduction',
    'regression',
    'regressor',
    'reinforcement',
    'rf',
    'rl',
    'rnn',
    'segmentation',
    'sentiment',
    'series',
    'sound',
    'spam',
    'speech',
    'stt',
    'strategy',
    'strategies',
    'svd',
    'svm',
    # 'text',
    # 'texts',
    'time',
    'timeseries'
    'topic',
    'topics',
    'transformer',
    'translation',
    'translator',
    'tree',
    'trend',
    'trends',
    'tsne',
    'tts',
    'vae',
    'video',
    'vision',
    'voice',
}

stop_words = STOPWORDS.union(stop_words_se.union(stop_words_ml))


In [87]:
def count_code_line(block_list):
    total_loc = 0
    for block in block_list:
        for line in block.splitlines():
            if line.strip():
                total_loc += 1
    return total_loc

def extract_styles(content):
    soup = BeautifulSoup(content, 'html.parser')
    # extract links
    links = [a['href'] for a in soup.find_all('a', href=True)] 
    # extract code blocks type 1
    code_line1 = count_code_line([c.get_text() for c in soup.find_all('code')]) 
    # extract code blocks type 2
    code_line2 = count_code_line([c.get_text() for c in soup.find_all('blockquote')]) 
    for tag in soup.find_all(['code', 'blockquote', 'a']):
        tag.decompose()
    clean_text = soup.get_text(separator=' ')
    return clean_text, links, code_line1 + code_line2

def extract_code(content):
    code_patterns = [r'```.+?```', r'``.+?``', r'`.+?`']
    clean_text = content
    code_line = 0

    for code_pattern in code_patterns:
        code_snippets = re.findall(code_pattern, clean_text, flags=re.DOTALL)
        code_line += count_code_line(code_snippets)
        clean_text = re.sub(code_pattern, '', clean_text, flags=re.DOTALL)
    
    return clean_text, code_line

def extract_links(text):
    link_pattern1 = r"\!?\[.*?\]\((.*?)\)"
    links1 = re.findall(link_pattern1, text)
    clean_text = re.sub(link_pattern1, '', text)
    link_pattern2 = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    links2 = re.findall(link_pattern2, clean_text)
    clean_text = re.sub(link_pattern2, '', clean_text)
    links = links1 + links2
    return clean_text, links

def split_content(content):
    clean_text, links1, code_line1 = extract_styles(content)
    clean_text, code_line2 = extract_code(clean_text)
    clean_text, links2 = extract_links(clean_text)
    
    links = links1 + links2
    code_line = code_line1 + code_line2
    
    content_collection = namedtuple('Analyzer', ['text', 'links', 'code_line'])
    return content_collection(clean_text, links, code_line)

def word_frequency(text):
    word_counts = collections.Counter(text.split())
    return word_counts

def extract_nouns_verbs(text, verb):
    doc = nlp(text)
    if verb:
        words = [token.text for token in doc if (token.pos_ == "VERB") or (token.pos_ == "NOUN")]
    else:
        words = [token.text for token in doc if token.pos_ == "NOUN"]
    return ' '.join(words)

def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        # In case the detection fails (e.g. if the text is too short or doesn't contain enough features)
        return False

def extract_english(text):
    words = [word for word in text.split() if spell_checker.check(word)]
    return ' '.join(words)

def remove_words_with_substring(text, substring_list):
    words = text.split()
    for substring in substring_list:
        words = [word for word in words if substring not in word]
    return ' '.join(words)

def preprocess_text(text, verb=True):
    clean_text = text.lower()
    clean_text = strip_punctuation(clean_text)
    clean_text = extract_english(clean_text)
    clean_text = remove_words_with_substring(clean_text, error_words_basics)
    clean_text = remove_words_with_substring(clean_text, tools_keyword_set)
    clean_text = remove_stopwords(clean_text, stop_words)
    clean_text = extract_nouns_verbs(clean_text, verb)
    # clean_text = strip_short(clean_text)
    return clean_text

def analyze_links(links):
    image_links = 0
    documentation_links = 0
    tool_links = 0
    issue_links = 0
    patch_links = 0
    tutorial_links = 0
    example_links = 0
    
    for link in links:
        if any([image in link for image in keywords_image]):
            image_links += 1
        elif any([patch in link for patch in keywords_patch]):
            patch_links += 1
        elif any([issue in link for issue in keywords_issue]):
            issue_links += 1
        elif any([tool in link for tool in keywords_tool]):
            tool_links += 1
        elif any([doc in link for doc in keywords_doc]):
            documentation_links += 1
        elif any([tool in link for tool in keywords_tutorial]):
            tutorial_links += 1
        else:
            example_links += 1

    link_analysis = namedtuple('Analyzer', ['image', 'documentation', 'tool', 'issue', 'patch', 'tutorial', 'example'])
    return link_analysis(image_links, documentation_links, tool_links, issue_links, patch_links, tutorial_links, example_links)

def analyze_text(text):
    word_count = textstat.lexicon_count(text)
    readability = textstat.flesch_reading_ease(text)
    reading_time = textstat.reading_time(text)
    
    text_analysis = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return text_analysis(word_count, readability, reading_time)

# expential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0
    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0)
            time.sleep(sleep)
            x += 1

def find_duplicates(in_list):  
    duplicates = []
    unique = set(in_list)
    for each in unique:
        count = in_list.count(each)
        if count > 1:
            duplicates.append(each)
    return duplicates

def remove_nan(in_list):
    return [elem for elem in in_list if pd.notna(elem)]

def input_challenge_type(link):
    print(link)
    
    choice_mapping = {
        "K": 'knowledge',
        "N": 'na',
        "P": 'problem',
    }
    valid_options = ["P", "K", "N"]
    
    webbrowser.open(link)
    choice = input("Choose an option - 'P', 'K', or 'N': ").upper()

    while choice not in valid_options:
        print("Invalid choice. Please choose 'P', 'K', or 'N' only.")
        choice = input("Choose an option - 'P', 'K', or 'N': ").upper()

    return choice_mapping[choice]


In [39]:
df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))

for index, row in df_issues.iterrows():
    df_issues.at[index, 'Challenge_title'] = row['Issue_title']
    df_issues.at[index, 'Challenge_body'] = row['Issue_body']
    df_issues.at[index, 'Challenge_link'] = row['Issue_link']
    df_issues.at[index, 'Challenge_tag_count'] = row['Issue_tag_count']
    df_issues.at[index, 'Challenge_created_time'] = row['Issue_created_time']
    df_issues.at[index, 'Challenge_score_count'] = row['Issue_score_count']
    df_issues.at[index, 'Challenge_closed_time'] = row['Issue_closed_time']
    df_issues.at[index, 'Challenge_repo_issue_count'] = row['Issue_repo_issue_count']
    df_issues.at[index, 'Challenge_repo_star_count'] = row['Issue_repo_star_count']
    df_issues.at[index, 'Challenge_repo_watch_count'] = row['Issue_repo_watch_count']
    df_issues.at[index, 'Challenge_repo_fork_count'] = row['Issue_repo_fork_count']
    df_issues.at[index, 'Challenge_repo_contributor_count'] = row['Issue_repo_contributor_count']
    df_issues.at[index, 'Challenge_self_closed'] = row['Issue_self_closed']
    df_issues.at[index, 'Challenge_comment_count'] = row['Issue_comment_count']
    df_issues.at[index, 'Challenge_comment_body'] = row['Issue_comment_body']
    df_issues.at[index, 'Challenge_comment_score'] = row['Issue_comment_score']

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))
df_discussions = pd.read_json(os.path.join(path_dataset, 'discussions.json'))
df_qa = pd.concat([df_questions, df_discussions], ignore_index=True)

for index, row in df_qa.iterrows():
    df_qa.at[index, 'Challenge_title'] = row['Question_title']
    df_qa.at[index, 'Challenge_body'] = row['Question_body']
    df_qa.at[index, 'Challenge_link'] = row['Question_link']
    df_qa.at[index, 'Challenge_tag_count'] = row['Question_tag_count']
    df_qa.at[index, 'Challenge_topic_count'] = row['Question_topic_count']
    df_qa.at[index, 'Challenge_created_time'] = row['Question_created_time']
    df_qa.at[index, 'Challenge_answer_count'] = row['Question_answer_count']
    df_qa.at[index, 'Challenge_score_count'] = row['Question_score_count']
    df_qa.at[index, 'Challenge_closed_time'] = row['Question_closed_time']
    df_qa.at[index, 'Challenge_favorite_count'] = row['Question_favorite_count']
    df_qa.at[index, 'Challenge_last_edit_time'] = row['Question_last_edit_time']
    df_qa.at[index, 'Challenge_view_count'] = row['Question_view_count']
    df_qa.at[index, 'Challenge_self_closed'] = row['Question_self_closed']
    df_qa.at[index, 'Challenge_comment_count'] = row['Question_comment_count']
    df_qa.at[index, 'Challenge_comment_body'] = row['Question_comment_body']
    df_qa.at[index, 'Challenge_comment_score'] = row['Question_comment_score']

    df_qa.at[index, 'Solution_body'] = row['Answer_body']
    df_qa.at[index, 'Solution_score_count'] = row['Answer_score_count']
    df_qa.at[index, 'Solution_comment_count'] = row['Answer_comment_count']
    df_qa.at[index, 'Solution_comment_body'] = row['Answer_comment_body']
    df_qa.at[index, 'Solution_comment_score'] = row['Answer_comment_score']
    df_qa.at[index, 'Solution_last_edit_time'] = row['Answer_last_edit_time']

df = pd.concat([df_issues, df_qa], ignore_index=True)
df = df[df.columns.drop(list(df.filter(regex=r'(Issue|Question|Answer)_')))]
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [None]:
# name = 'topics'

# df = pd.read_json(os.path.join(path_dataset, f'{name}.json'))
# df_old = pd.read_json(os.path.join(path_dataset, 'original.json'))

# for index, row in df.iterrows():
#     if 'Git' not in row['Platform']:
#         continue
#     for i2, r2 in df_old.iterrows():
#         if 'Git' not in r2['Platform']:
#             continue
#         if row['Challenge_link'] == r2['Challenge_link']:
#             df.at[index, 'Tools'] = r2['Tools']
#             break

# df.to_json(os.path.join(path_dataset, f'{name}.json'), indent=4, orient='records')

In [76]:
# Filter out non-English posts

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    clean_title = remove_words_with_substring(row['Challenge_title'].lower(), tools_keyword_set)
    if not is_english(clean_title):
        df.drop(index, inplace=True)

df.to_json(os.path.join(path_dataset, 'labels.json'), indent=4, orient='records')

In [None]:
df = pd.read_json(os.path.join(path_dataset, 'labels.json'))

for index, row in df.iterrows():
    df.at[index, 'Challenge_type'] = input_challenge_type(row['Challenge_link'])
    if index % 50 == 49:
        df.to_json(os.path.join(path_dataset, 'labels.json'), indent=4, orient='records')
    
df.to_json(os.path.join(path_dataset, 'labels.json'), indent=4, orient='records')

In [None]:
# df = pd.read_json(os.path.join(path_dataset, 'labels.json'))
# df_old = pd.read_json(os.path.join(path_dataset, 'labels+.json'))
# print(len(df))

# # df_difference = pd.concat([df_old, df, df]).drop_duplicates('Challenge_link', keep=False, ignore_index=True)
# df_difference = pd.concat([df, df_old, df_old]).drop_duplicates('Challenge_link', keep=False, ignore_index=True)

# df_all = pd.concat([df_old, df], ignore_index=True)
# df_duplicate = df_all[df_all.duplicated(['Challenge_link'], keep=False)]
# df_duplicate = df_duplicate.drop_duplicates('Challenge_link', ignore_index=True)

# df_new = pd.concat([df_difference, df_duplicate], ignore_index=True)
# print(len(df_new))

# df_new.to_json(os.path.join(path_dataset, 'labels++.json'), indent=4, orient='records')

In [90]:
df = pd.read_json(os.path.join(path_dataset, 'labels+.json'))

# mark = False

for index, row in df.iterrows():
    # if (not mark) and (row['Challenge_link'] != 'https://github.com/allegroai/clearml-server/issues/128'):
    #     continue  
    # mark = True
    if pd.notna(row['Challenge_type']):
        continue
    if ('crash' in row['Challenge_body'].lower()) and ('error' in row['Challenge_body'].lower()):
        print(row['Challenge_link'])
        df.at[index, 'Challenge_type'] = 'problem'

df.to_json(os.path.join(path_dataset, 'labels+.json'), indent=4, orient='records')

https://stackoverflow.com/questions/65084685
https://github.com/kubeflow/pipelines/issues/1624
https://stackoverflow.com/questions/60092509


In [None]:
# Discard all the rows that do not have a challenge type

df = pd.read_json(os.path.join(path_dataset, 'labels.json'))

df = df[df['Challenge_type'] != 'na']
    
df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [None]:
# Generate GPT summaries

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    if index % 100 == 99:
        print(f'persisting on post {index}')
        df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

    if pd.notna(row['Challenge_gpt_summary']):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + '\nBody: ' + row['Challenge_body'] + '###\nTitle: '
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=50,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, 'Challenge_gpt_summary'] = response['choices'][0]['message']['content']
    except Exception as e:
        print(f'{e} on post {row["Challenge_link"]}')

    time.sleep(5)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


In [None]:
# Post level preprocessing

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    clean_title = preprocess_text(row['Challenge_title'])
    df.at[index, 'Challenge_preprocessed_title'] = clean_title
    
    clean_summary = preprocess_text(row['Challenge_gpt_summary'])
    df.at[index, 'Challenge_preprocessed_gpt_summary'] = clean_summary
    
    body_analyzer = split_content(row['Challenge_body'])
    clean_body = preprocess_text(body_analyzer.text)
    df.at[index, 'Challenge_preprocessed_content'] = clean_title + ' ' + clean_body
    
    # link_analyzer = analyze_links(challenge_analyzer.links)
    # text_analyzer = analyze_text(challenge_analyzer.text)
    
    # df.at[index, 'Challenge_code_count'] = challenge_analyzer.code_line
    # df.at[index, 'Challenge_word_count'] = text_analyzer.word_count
    # df.at[index, 'Challenge_readability'] = text_analyzer.readability
    # df.at[index, 'Challenge_reading_time'] = text_analyzer.reading_time
    # df.at[index, 'Challenge_link_count_image'] = link_analyzer.image
    # df.at[index, 'Challenge_link_count_documentation'] = link_analyzer.documentation
    # df.at[index, 'Challenge_link_count_example'] = link_analyzer.example
    # df.at[index, 'Challenge_link_count_issue'] = link_analyzer.issue
    # df.at[index, 'Challenge_link_count_patch'] = link_analyzer.patch
    # df.at[index, 'Challenge_link_count_tool'] = link_analyzer.tool
    # df.at[index, 'Challenge_link_count_tutorial'] = link_analyzer.tutorial

    # if pd.notna(row['Challenge_comment_body']):
    #     comment_analyzer = split_content(row['Challenge_comment_body'])
    #     link_analyzer = analyze_links(comment_analyzer.links)
    #     text_analyzer = analyze_text(comment_analyzer.text)
        
    #     df.at[index, 'Challenge_comment_code_count'] = comment_analyzer.code_line
    #     df.at[index, 'Challenge_comment_word_count'] = text_analyzer.word_count
    #     df.at[index, 'Challenge_comment_readability'] = text_analyzer.readability
    #     df.at[index, 'Challenge_comment_reading_time'] = text_analyzer.reading_time
    #     df.at[index, 'Challenge_comment_link_count_image'] = link_analyzer.image
    #     df.at[index, 'Challenge_comment_link_count_documentation'] = link_analyzer.documentation
    #     df.at[index, 'Challenge_comment_link_count_example'] = link_analyzer.example
    #     df.at[index, 'Challenge_comment_link_count_issue'] = link_analyzer.issue
    #     df.at[index, 'Challenge_comment_link_count_patch'] = link_analyzer.patch
    #     df.at[index, 'Challenge_comment_link_count_tool'] = link_analyzer.tool
    #     df.at[index, 'Challenge_comment_link_count_tutorial'] = link_analyzer.tutorial

    # if pd.notna(row['Solution_body']):
    #     solution_analyzer = split_content(row['Solution_body'])
    #     link_analyzer = analyze_links(solution_analyzer.links)
    #     text_analyzer = analyze_text(solution_analyzer.text)
        
    #     df.at[index, 'Solution_code_count'] = solution_analyzer.code_line
    #     df.at[index, 'Solution_word_count'] = text_analyzer.word_count
    #     df.at[index, 'Solution_readability'] = text_analyzer.readability
    #     df.at[index, 'Solution_reading_time'] = text_analyzer.reading_time
    #     df.at[index, 'Solution_link_count_image'] = link_analyzer.image
    #     df.at[index, 'Solution_link_count_documentation'] = link_analyzer.documentation
    #     df.at[index, 'Solution_link_count_example'] = link_analyzer.example
    #     df.at[index, 'Solution_link_count_issue'] = link_analyzer.issue
    #     df.at[index, 'Solution_link_count_patch'] = link_analyzer.patch
    #     df.at[index, 'Solution_link_count_tool'] = link_analyzer.tool
    #     df.at[index, 'Solution_link_count_tutorial'] = link_analyzer.tutorial
        
    # if pd.notna(row['Solution_comment_body']):
    #     comment_analyzer = split_content(row['Solution_comment_body'])
    #     link_analyzer = analyze_links(comment_analyzer.links)
    #     text_analyzer = analyze_text(comment_analyzer.text)
        
    #     df.at[index, 'Solution_comment_code_count'] = comment_analyzer.code_line
    #     df.at[index, 'Solution_comment_word_count'] = text_analyzer.word_count
    #     df.at[index, 'Solution_comment_readability'] = text_analyzer.readability
    #     df.at[index, 'Solution_comment_reading_time'] = text_analyzer.reading_time
    #     df.at[index, 'Solution_comment_link_count_image'] = link_analyzer.image
    #     df.at[index, 'Solution_comment_link_count_documentation'] = link_analyzer.documentation
    #     df.at[index, 'Solution_comment_link_count_example'] = link_analyzer.example
    #     df.at[index, 'Solution_comment_link_count_issue'] = link_analyzer.issue
    #     df.at[index, 'Solution_comment_link_count_patch'] = link_analyzer.patch
    #     df.at[index, 'Solution_comment_link_count_tool'] = link_analyzer.tool
    #     df.at[index, 'Solution_comment_link_count_tutorial'] = link_analyzer.tutorial
    
    df.at[index, 'Challenge_solved_time'] = (row['Challenge_closed_time'] - row['Challenge_created_time']) / pd.Timedelta(hours=1)

df = df.reindex(sorted(df.columns), axis=1)
df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [5]:
# Draw sankey diagram of tool and platform

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))
df['State'] = df['Challenge_solved_time'].apply(lambda x: 'closed' if not pd.isna(x) else 'open')

# df_discussion = df.sample(33, random_state=42)
# df_discussion['Platform'] = 'GitHub Discussion'
# df = pd.concat([df, df_discussion], ignore_index=True)

df = df.explode('Tools')
categories = ['Platform', 'Tools', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

labels = {}
newDf = pd.DataFrame()
for i in range(len(categories)):
    labels.update(df[categories[i]].value_counts().to_dict())
    if i == len(categories)-1:
        break
    tempDf = df_info[[categories[i], categories[i+1], 'value']]
    tempDf.columns = ['source', 'target', 'value']
    newDf = pd.concat([newDf, tempDf])
    
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()
source = newDf['source'].apply(lambda x: list(labels).index(x))
target = newDf['target'].apply(lambda x: list(labels).index(x))
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.pdf'))


In [83]:
df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))
df_inquiry = []

for name, group in df.groupby(['Challenge_type']):
    info = {
        'Type': name,
        'Prevalence (%)': round(len(group) / len(df) * 100, 2),
        'Unresolved rate (%)': round(len(group[group['Challenge_closed_time'].isna()]) / len(group) * 100, 2),
        'Median resolution time (hours)': round(group['Challenge_solved_time'].median(), 2),
    }
    df_inquiry.append(info)

df_inquiry = pd.DataFrame(df_inquiry)
print(df_inquiry.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{lrrr}
\toprule
     Type &  Prevalence (\%) &  Unresolved rate (\%) &  Median resolved time (hours) \\
\midrule
knowledge &           45.06 &                67.76 &                         21.96 \\
  problem &           54.94 &                65.40 &                         67.75 \\
\bottomrule
\end{tabular}



In [22]:
df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))
df = df[df['Challenge_solved_time'].notna()]
df_problem = df[df['Challenge_type'] == 'problem']['Challenge_solved_time'].tolist()
df_knowledge = df[df['Challenge_type'] == 'knowledge']['Challenge_solved_time'].tolist()

print("P-value of problem inquiry:", shapiro(df_problem)[1])
print("P-value of knowledge inquiry:", shapiro(df_knowledge)[1])
print("P-value of two type of inquiries:", mannwhitneyu(df_problem, df_knowledge)[1])


P-value of problem inquiry: 0.0
P-value of knowledge inquiry: 0.0
P-value of two type of inquiries: 1.1262472909473114e-24


In [None]:
prompt_topic = '''You will be given a set of terms refering to specific software engineering topics. Please summarize each topic in a phrase and attach one sentence description in the asset management context. Also, you must guarantee that those phrases are not duplicate with one another.###\n'''

with open(os.path.join(path_rq1, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

    topic_term_list = []
    for index, topic in enumerate(topic_terms):
        terms = ', '.join([term[0] for index, term in enumerate(topic)])
        topic_term = f'Topic {index}: {terms}'
        topic_term_list.append(topic_term)

    prompt = prompt_topic + '\n'.join(topic_term_list) + '\n###\n'
    completion = openai.ChatCompletion.create(
        model='gpt-3.5-turbo-16k',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0,
        max_tokens=8000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        timeout=1000,
        stream=False)

    topics = completion.choices[0].message.content
    print(topics)

In [4]:
topics = '''Topic 0: Docker - A platform for building, packaging, and distributing applications in containers. In the asset management context, Docker is used to create reproducible and isolated environments for deploying machine learning models.
Topic 1: Columns - The vertical sections of a table that hold data of a specific type. In the asset management context, columns are used to represent features or attributes of a dataset.
Topic 2: Model - A representation of a real-world process or system that can be used to make predictions or generate outputs. In the asset management context, models are trained and deployed to make predictions on new data.
Topic 3: Labels - Tags or annotations assigned to data points to indicate their class or category. In the asset management context, labels are used for supervised learning tasks to train models.
Topic 4: Model Prediction - The output or result generated by a trained machine learning model when given input data. In the asset management context, model predictions are used to make decisions or generate insights.
Topic 5: API Gateway - A service that acts as a single entry point for multiple APIs, providing centralized management and control. In the asset management context, an API gateway can be used to expose machine learning models as RESTful APIs.
Topic 6: Logs - Recorded events or messages that provide information about the execution of a program or system. In the ML asset managementset managementset managementset managementset managementset management context, logs are used to track and monitor the performance of machine learning models.
Topic 7: Plots - Visual representations of data or mathematical functions. In the asset management context, plots are used to analyze and visualize the performance or behavior of machine learning models.
Topic 8: PyTorch - An open-source machine learning framework that provides a flexible and efficient way to build and train deep learning models. In the asset management context, PyTorch is used for developing and deploying deep learning models.
Topic 9: Custom Training - The process of training a machine learning model using custom algorithms or techniques tailored to a specific problem or domain. In the asset management context, custom training allows for more specialized and optimized models.
Topic 10: Hyperparameter - A parameter whose value is set before the learning process begins and remains constant during training. In the asset management context, hyperparameters are tuned to optimize the performance of machine learning models.
Topic 11: Parameters - Variables or values that are used as inputs to a function or model. In the asset management context, parameters are used to configure and customize the behavior of machine learning models.
Topic 12: Bucket - A logical container for storing objects (files) in cloud storage systems. In the asset management context, buckets are used to store and organize data, models, and other artifacts.
Topic 13: TensorFlow Model - A machine learning model built using the TensorFlow framework. In the asset management context, TensorFlow models are trained and deployed for various tasks such as image classification or natural language processing.
Topic 14: Custom Model - A machine learning model that is built from scratch or customized to fit specific requirements or constraints. In the asset management context, custom models are developed and deployed to solve unique problems.
Topic 15: Lookup Error - An error raised when a key or index used on a mapping or sequence is invalid, In the asset management context, lookup error can occur when incompatible data types are used in model training or inference.
Topic 16: Notebook Instance - A virtual environment that provides an interactive interface for writing and running code, typically used for data exploration and experimentation. In the asset management context, notebook instances are used for developing and testing machine learning models.
Topic 17: Pipeline Component - A modular unit or step in a data processing or machine learning pipeline. In the asset management context, pipeline components are used to define and execute the steps involved in training and deploying models.
Topic 18: Version - A specific release or iteration of a software or model. In the asset management context, versions are used to track and manage changes to models, code, or configurations.
Topic 19: Directory - A location or folder that stores files and other directories. In the asset management context, directories are used to organize and manage data, code, and other resources.
Topic 20: Sweep - A process of systematically exploring a range of hyperparameter values to find the optimal configuration for a machine learning model. In the asset management context, sweeps are used to automate hyperparameter tuning.
Topic 21: Web Service - A service or application that provides functionality or data over the internet using standard web protocols. In the asset management context, web services can be used to expose machine learning models as APIs.
Topic 22: Authentication - The process of verifying the identity of a user or system. In the asset management context, authentication is used to control access to machine learning models and resources.
Topic 23: Deployment - The process of making a machine learning model or application available for use in a production environment. In the asset management context, deployment involves deploying trained models to serve predictions or perform tasks.
Topic 24: Git Repo - A repository or storage space for version-controlled code and project files using the Git version control system. In the asset management context, Git repos are used to manage and track changes to machine learning models and code.
Topic 25: Port - A communication endpoint or interface in a computer network. In the asset management context, ports are used to specify the network location or address where machine learning models or services can be accessed.
Topic 26: Keras - A high-level deep learning framework that runs on top of other deep learning libraries such as TensorFlow or Theano. In the asset management context, Keras is used for building and training neural networks.
Topic 27: Compute Instances - Virtual machines or instances that provide computational resources for running applications or processes. In the asset management context, compute instances are used to train and deploy machine learning models.
Topic 28: Cluster - A group of interconnected computers or servers that work together to perform a task or provide a service. In the asset management context, clusters can be used for distributed training or parallel processing of machine learning tasks.
Topic 29: Permission - The authorization or access rights granted to a user or system to perform certain actions or access specific resources. In the ML asset managementset managementset managementset managementset managementset management context, permissions are used to control access to machine learning models, data, or resources.
Topic 30: Team - A group of individuals working together on a project or towards a common goal. In the asset management context, teams collaborate to develop, train, and deploy machine learning models.
Topic 31: Script RStudio - A script or program written in the R programming language and executed in the RStudio integrated development environment. In the asset management context, RStudio scripts can be used for data analysis, model training, or experimentation.
Topic 32: Metrics Evaluation - The process of assessing or measuring the performance or quality of a machine learning model using various metrics or evaluation criteria. In the asset management context, metrics evaluation is used to assess the effectiveness of models.
Topic 33: Data Studio - A platform or tool for visualizing, analyzing, and exploring data. In the asset management context, data studios are used to gain insights from data, perform data preprocessing, or create visualizations.
Topic 34: Storage - The act or process of storing or saving data or information. In the asset management context, storage refers to the systems or services used to store and manage data, models, or artifacts.
Topic 35: Inference Pipeline - A sequence or flow of steps or processes that transform input data into predictions or outputs. In the asset management context, inference pipelines are used to process and generate predictions from input data.
Topic 36: Training - The process of teaching or learning from data to improve the performance or accuracy of a machine learning model. In the asset management context, training involves feeding data to models to optimize their parameters or weights.
Topic 37: Endpoint - A specific URL or network address that provides access to a service or resource. In the asset management context, endpoints are used to expose machine learning models or APIs for making predictions or serving requests.
Topic 38: Config - Short for configuration, it refers to the settings or parameters that define the behavior or operation of a system or application. In the asset management context, config files are used to specify the configuration of machine learning models or systems.
Topic 39: Training Model - A machine learning model that has been trained on a specific dataset to make predictions or perform a task. In the asset management context, training models are developed and optimized for specific tasks or domains.
Topic 40: Object Attribute - A characteristic or property of an object in object-oriented programming. In the asset management context, object attributes can represent features, parameters, or metadata associated with machine learning models or data.
Topic 41: Model Registry - A centralized repository or database for storing and managing machine learning models. In the asset management context, model registries are used to track, version, and organize trained models.
Topic 42: Batch Transform - The process of applying a trained machine learning model to a large batch or set of input data to generate predictions or outputs. In the asset management context, batch transforms are used for bulk inference or processing of data.
Topic 43: Data - Information or facts that are collected, stored, or processed. In the asset management context, data refers to the input or training data used to develop and train machine learning models.
Topic 44: Apache Spark - An open-source distributed computing system designed for big data processing and analytics. In the asset management context, Apache Spark can be used for distributed training or processing of large-scale machine learning tasks.
Topic 45: Component - A modular or self-contained unit that performs a specific function or task. In the asset management context, components are used to build and assemble machine learning pipelines or systems.
Topic 46: Memory - The temporary storage or workspace used by a computer or system to hold data or instructions. In the asset management context, memory is used to store and manipulate data during the training or inference process.
Topic 47: Pipeline Data - The input or output data that flows through a machine learning pipeline or system. In the asset management context, pipeline data represents the data being processed or transformed by the pipeline.
Topic 48: Huggingface Model - A machine learning model built using the Hugging Face library, which specializes in natural language processing tasks. In the asset management context, Hugging Face models are used for tasks such as text classification or language generation.
Topic 49: Model Endpoint - The network address or URL where a trained machine learning model can be accessed or invoked to make predictions or serve requests. In the asset management context, model endpoints are used to expose models as APIs.
Topic 50: Batch Predictions - The process of making predictions or generating outputs for a large batch or set of input data using a trained machine learning model. In the asset management context, batch predictions are used for bulk inference or processing of data.
Topic 51: Network - A collection of interconnected nodes or devices that can communicate and exchange data. In the ML asset managementset managementset managementset managementset managementset management context, networks are used to connect and transfer data between different components or systems in a machine learning workflow.
Topic 52: Environment Variables - Variables or settings that define the operating environment or behavior of a system or application. In the ML asset managementset management context, environment variables can be used to configure or customize the execution of machine learning models or workflows.
Topic 53: Python - A popular programming language known for its simplicity and readability. In the ML asset managementset management context, Python is widely used for developing and implementing machine learning models and workflows.
Topic 54: Pandas - A Python library for data manipulation and analysis, particularly for working with structured data. In the asset management context, Pandas is used for data preprocessing, cleaning, and transformation.
Topic 55: Training Job - A specific instance or execution of the training process for a machine learning model. In the asset management context, training jobs are launched to train models on specific datasets or configurations.
Topic 56: Blob Storage - A type of cloud storage service that stores unstructured data as blobs or binary large objects. In the asset management context, blob storage is used to store and manage large files or datasets.
Topic 57: Stream Analytics - The process of analyzing and processing real-time or streaming data to extract insights or perform actions. In the asset management context, stream analytics can be used for real-time monitoring or processing of data in machine learning workflows.
Topic 58: Model Neo - A framework or platform for optimizing and deploying machine learning models on edge devices or hardware accelerators. In the asset management context, Neo is used to optimize and deploy models for efficient inference.
Topic 59: Script - A sequence of instructions or commands written in a programming language. In the asset management context, scripts are used to automate tasks, run experiments, or train machine learning models.
Topic 60: TensorBoard - A web-based tool provided by TensorFlow for visualizing and analyzing the training process and performance of machine learning models. In the asset management context, TensorBoard is used for monitoring and debugging models.
Topic 61: Server - A computer or system that provides resources or services to other computers or clients over a network. In the asset management context, servers are centralized service that tracks and manages various aspects of machine learning experiments.
Topic 62: Custom Images - Customized or specialized images that are created for specific purposes or requirements. In the asset management context, custom images can be used to create reproducible and customized environments for training or deploying models.
Topic 63: Loss - A measure of the error or difference between predicted and actual values in a machine learning model. In the asset management context, loss is used to evaluate and optimize the performance of models during training.
Topic 64: PySpark - The Python API for Apache Spark, a distributed computing system for big data processing and analytics. In the asset management context, PySpark is used for distributed training or processing of machine learning tasks.
Topic 65: Notebook - An interactive document or interface that combines code, text, and visualizations, typically used for data analysis, exploration, and documentation. In the asset management context, notebooks are used for developing, testing, and documenting machine learning models.
Topic 66: API - Short for Application Programming Interface, it defines the methods and protocols for communication between software components or systems. In the asset management context, APIs are used to expose machine learning models or services for integration or consumption.
Topic 67: Pipeline - A sequence or flow of steps or processes that transform input data into predictions or outputs. In the asset management context, pipelines are used to automate and streamline the end-to-end process of training and deploying machine learning models.
Topic 68: File - A named collection of data or information stored on a computer or storage system. In the asset management context, files are used to store code, data, models, or other resources related to machine learning workflows.
Topic 69: Dependencies - External libraries, packages, or modules that are required for a software or system to function properly. In the ML asset managementset managementset managementset managementset managementset management context, dependencies are used to specify and manage the required software components for running machine learning models or workflows.
Topic 70: Studio - A platform or environment that provides tools, services, and resources for developing, training, and deploying machine learning models. In the asset management context, studios are used for end-to-end machine learning workflows and collaboration.
Topic 71: Experiment - A systematic process or procedure carried out to test or validate a hypothesis or idea. In the asset management context, experiments are conducted to evaluate and compare different models, algorithms, or configurations.
Topic 72: List Index - The position or location of an element in a list or array. In the asset management context, list indexes are used to access or manipulate specific elements or values in data structures.
Topic 73: Model Deployment - The process of making a trained machine learning model available for use in a production environment or system. In the asset management context, model deployment involves deploying models to serve predictions or perform tasks.
Topic 74: Database - A structured collection of data or information stored in a computer or system. In the asset management context, databases are used to store and manage structured data related to machine learning models or applications.
Topic 75: ModuleNotFoundError - An error that occurs when a required module or package cannot be found or imported. In the asset management context, ModuleNotFoundError can occur when dependencies or required libraries are not properly installed or accessible.
Topic 76: Dataset - A structured collection of data or information, typically organized in tabular form or as a set of files. In the asset management context, datasets are used for training, testing, or evaluating machine learning models.
Topic 77: Metrics - Quantitative measures or indicators used to assess the performance or quality of a machine learning model. In the asset management context, metrics are used to evaluate and compare models based on their accuracy, precision, recall, or other criteria.
Topic 78: Text - Written or printed words or characters that represent language or information. In the asset management context, text data is commonly used for natural language processing tasks, such as sentiment analysis or text classification.
Topic 79: Files Studio - The file management or storage system within a machine learning studio or platform. In the asset management context, Files Studio is used to organize, store, and manage code, data, models, or other resources.
Topic 80: Python Packages - Pre-built or reusable collections of Python modules or code that provide specific functionality or features. In the asset management context, Python packages are used to extend the capabilities of Python for machine learning tasks.
Topic 81: Instance - A single occurrence or example of something. In the asset management context, instances refer to virtual machines or computing resources used for running applications or processes.
Topic 82: Files - Collections of data or information stored on a computer or storage system. In the asset management context, files are used to store code, data, models, or other resources related to machine learning workflows.
Topic 83: Experiment Workspace - A dedicated workspace or environment for conducting experiments, managing data, and developing machine learning models. In the asset management context, experiment workspaces provide a collaborative and organized environment for experimentation.
Topic 84: Image - A visual representation or graphical file format that contains visual information or data. In the asset management context, images can represent visual data, such as images or charts, or be used to create visualizations or diagrams.
Topic 85: Job - A task or process that is executed or performed by a computer or system. In the asset management context, jobs refer to specific tasks or operations related to training, deploying, or managing machine learning models.
Topic 86: Athena Table - A table or data structure in Amazon Athena, a serverless query service for analyzing data in Amazon S3 using SQL. In the asset management context, Athena tables can be used to query and analyze data for machine learning tasks.
Topic 87: Packages RPackage - Collections of pre-built or reusable code, functions, or modules in the R programming language. In the asset management context, R packages are used to extend the capabilities of R for machine learning tasks.
Topic 88: Lambda - A serverless computing service that allows you to run code without provisioning or managing servers. In the asset management context, Lambdas can be used to execute functions or processes related to machine learning workflows.
Topic 89: Studio Domain - A specific domain or environment within a machine learning studio or platform. In the asset management context, studio domains provide dedicated spaces or resources for different projects, teams, or purposes.
Topic 90: Report - A document or presentation that provides information, analysis, or findings on a specific topic or subject. In the ML asset managementset managementset managementset managementset managementset management context, reports can be used to communicate and share insights or results from machine learning experiments or analyses.
Topic 91: Model Artifacts - The files, data, or resources that are generated or produced during the training or development of a machine learning model. In the asset management context, model artifacts include trained weights, configurations, or metadata.
Topic 92: Limit - A restriction or constraint on the resources, usage, or behavior of a system or application. In the asset management context, limits are used to control or manage the usage or availability of resources for machine learning tasks.
Topic 93: Studio Model - A machine learning model that is developed, trained, or deployed within a machine learning studio or platform. In the asset management context, studio models are managed and monitored within the studio environment.
Topic 94: Model Inference - The process of using a trained machine learning model to make predictions or generate outputs based on new or unseen data. In the asset management context, model inference is the primary task of deploying and using machine learning models.
Topic 95: Batch Model - A machine learning model that is designed or optimized for batch processing or inference on large sets of data. In the asset management context, batch models are used for bulk inference or processing of data.
Topic 96: Artifact Root - The root or top-level directory or folder where artifacts or files are stored or organized. In the asset management context, artifact roots are used to define the structure or hierarchy of stored artifacts.
Topic 97: Files Pipeline - The files or data that flow through a machine learning pipeline or system. In the asset management context, files pipelines represent the input or output data being processed or transformed by the pipeline.
Topic 98: Pipeline Parameters - The configurable settings or values that control the behavior or execution of a machine learning pipeline. In the asset management context, pipeline parameters are used to customize or adapt the pipeline for specific tasks or requirements.
Topic 99: Artifacts - Files, data, or resources that are generated, produced, or used during the development, training, or deployment of machine learning models. In the asset management context, artifacts include code, models, data, or other resources related to machine learning workflows.
Topic 100: Studio Notebook - A notebook or document within a machine learning studio or platform that provides an interactive interface for writing and running code, documenting processes, or sharing insights. In the asset management context, studio notebooks are used for developing, testing, and documenting machine learning models.
Topic 101: Group - A collection or set of individuals or entities that are related or share common characteristics or goals. In the asset management context, groups can refer to teams, projects, or entities involved in machine learning workflows.
Topic 102: Designer - A person or role responsible for designing or creating machine learning models, systems, or workflows. In the asset management context, designers play a key role in developing and optimizing machine learning solutions.
Topic 103: Model Files - The files or resources that are associated with a machine learning model, such as the model architecture, weights, or configurations. In the asset management context, model files are used to store and manage the components of a trained model.
Topic 104: Datasets - Collections of data or information that are used for training, testing, or evaluating machine learning models. In the asset management context, datasets are used to feed data into models and assess their performance.
Topic 105: Workspace Access - The ability or permission to access and use a specific workspace or environment for machine learning tasks. In the asset management context, workspace access controls who can view, edit, or execute workflows within a workspace.
Topic 106: Models - Machine learning models or algorithms that are trained or developed to perform specific tasks or make predictions. In the asset management context, models are the core components of machine learning workflows and systems.
Topic 107: Loading Model - The process of loading or importing a trained machine learning model into memory or a runtime environment for inference or use. In the asset management context, loading models is a critical step in deploying and using machine learning models.
Topic 108: Resource Students - The resources, materials, or tools provided to students for learning or practicing machine learning concepts or techniques. In the asset management context, resource students refer to the educational resources or materials available to students.
Topic 109: Model Pipeline - A sequence or flow of steps or processes that transform input data into predictions or outputs using a machine learning model. In the asset management context, model pipelines are used to automate and streamline the process of training and deploying models.
Topic 110: Files Notebook - The files or resources associated with a notebook within a machine learning studio or platform. In the asset management context, files notebooks store code, data, or other resources related to the development or execution of machine learning workflows.
Topic 111: Module - A self-contained or reusable unit of code or functionality that can be imported or used in other programs or systems. In the asset management context, modules are used to organize and encapsulate code or functions related to machine learning tasks.
Topic 112: Pipeline Step - A specific task or operation within a machine learning pipeline. In the asset management context, pipeline steps represent the individual components or processes involved in training, evaluating, or deploying machine learning models.
Topic 113: Framework - A software or toolset that provides a foundation or structure for developing or implementing applications or systems. In the asset management context, frameworks provide the building blocks and libraries for developing and deploying machine learning models.
Topic 114: Storage Accounts - Accounts or services that provide storage resources or capabilities for storing and managing data. In the ML asset managementset managementset managementset managementset managementset management context, storage accounts are used to store and manage data, models, or other resources related to machine learning workflows.
Topic 115: Quota - A predefined or limited amount or quantity of resources or usage that is allocated or available for a specific purpose. In the asset management context, quotas are used to control or manage the usage or availability of resources for machine learning tasks.
Topic 116: Output - The result or outcome generated by a machine learning model or system. In the asset management context, outputs can refer to predictions, classifications, or any other generated information or data.
Topic 117: Hyperparameter Search - The process of systematically exploring a range of hyperparameter values to find the optimal configuration for a machine learning model. In the asset management context, hyperparameter search is used to automate the tuning of model parameters.
Topic 118: Checkpoints - Saved or intermediate states of a machine learning model during the training process. In the asset management context, checkpoints are used to save and restore model weights or configurations for resuming or evaluating training.
Topic 119: Model Data - The data or input used to train or evaluate a machine learning model. In the asset management context, model data represents the features, labels, or input used to develop and optimize machine learning models.
Topic 120: File Format - The structure or organization of data or information stored in a file. In the asset management context, file formats define how data is stored, accessed, or interpreted by machine learning models or systems.
Topic 121: Pipeline Endpoint - The network address or URL where a machine learning pipeline can be accessed or invoked to perform a specific task or operation. In the asset management context, pipeline endpoints are used to trigger or execute machine learning pipelines.
Topic 122: Feature Store - A centralized repository or database for storing and managing features or attributes used in machine learning models. In the asset management context, feature stores are used to organize, share, and reuse features across different models or pipelines.
Topic 123: Data Training - The process of using labeled or annotated data to train a machine learning model. In the asset management context, data training involves feeding data into models to optimize their parameters or weights.
Topic 124: Runs - Instances or executions of a machine learning workflow or experiment. In the asset management context, runs represent specific iterations or versions of a model or pipeline.
Topic 125: YML TemplatedConfigLoader - A configuration file format and loader that uses YAML syntax to define and load configuration settings or parameters. In the asset management context, YML TemplatedConfigLoader is used to specify and load configuration settings for machine learning workflows.
Topic 126: Kernel - A software component or module that provides the core functionality or services of an operating system. In the asset management context, kernels are used to execute and run code within a notebook or development environment.
Topic 127: Run Experiment - The process of executing or running a machine learning experiment to evaluate or validate a hypothesis or idea. In the asset management context, running experiments involves training models, evaluating performance, or generating insights.
Topic 128: Workspace - A dedicated environment or space for developing, training, and deploying machine learning models. In the asset management context, workspaces provide the infrastructure, tools, and resources for end-to-end machine learning workflows.
Topic 129: PyTorch Lightning - A lightweight PyTorch wrapper or library that simplifies the training and development of PyTorch models. In the asset management context, PyTorch Lightning is used to streamline and standardize the training process.
Topic 130: CUDA Memory - The memory or storage space on a GPU (Graphics Processing Unit) that is used for processing and executing CUDA (Compute Unified Device Architecture) operations. In the asset management context, CUDA memory is used for training and running deep learning models.
Topic 131: Estimator - An object or component that encapsulates the training, evaluation, and prediction capabilities of a machine learning model. In the asset management context, estimators are used to define and configure machine learning models for training or inference.
Topic 132: Task - A specific job, operation, or activity that needs to be performed or completed. In the asset management context, tasks refer to the individual steps or processes involved in training, evaluating, or deploying machine learning models.
Topic 133: Run Command - The execution or invocation of a specific command or instruction in a command-line interface or terminal. In the asset management context, run commands are used to trigger or execute machine learning workflows or processes.
Topic 134: Model Output - The result or output generated by a machine learning model when given input data. In the asset management context, model outputs can be predictions, classifications, or any other generated information or data.
Topic 135: Artifacts UI - The user interface or graphical interface for managing, viewing, or accessing artifacts or files within a machine learning platform or system. In the ML asset managementset managementset managementset managementset managementset management context, artifacts UI provides a visual interface for interacting with stored files or resources.
Topic 136: Code - Instructions or commands written in a programming language that can be executed or interpreted by a computer or system. In the asset management context, code is used to develop, train, and deploy machine learning models and workflows.
Topic 137: Account - A user account or profile that provides access and permissions to a system, platform, or service. In the asset management context, accounts are used to manage and control access to machine learning resources or platforms.
Topic 138: Package - A collection or bundle of code, files, or resources that are distributed or installed together as a single unit. In the asset management context, packages are used to package and distribute machine learning models, libraries, or tools.
Topic 139: Scoring - A numerical value or measure that represents the performance, quality, or effectiveness of a machine learning model. In the asset management context, scores are used to evaluate and compare models based on their accuracy, precision, recall, or other criteria.'''

In [5]:
topic_list = []
for index, topic_entry in enumerate(topics.split('\n')):
    topic_name, topic_info = topic_entry.split(': ')[-1].split(' - ')
    topic = {
        'Index': index + 1,
        'Topic': topic_name,
        'Description': topic_info,
    }
    topic_list.append(topic)
topic_df = pd.DataFrame(topic_list)
print(topic_df.to_latex(index=False))

\begin{tabular}{rll}
\toprule
 Index &                     Topic &                                                                                                                                                                                                                                                                                                                                        Description \\
\midrule
     1 &                    Docker &                                                                                                                           A platform for building, packaging, and distributing applications in containers. In the asset management context, Docker is used to create reproducible and isolated environments for deploying machine learning models. \\
     2 &                   Columns &                                                                                                                                                                       

In [6]:
macro_topic2index_list = [
    ('Code Development', [15,40,59,66,72,133,136]),
    ('Code Management', [24]),
    ('Compute Management', [28,46,81,92,108,115,130]),
    ('Data Development', [1,3,7,44,54,57,64,86]),
    ('Data Management', [12,34,43,56,74,76,78,84,96,99,104,122,123]),
    # ('Documentation Management', []),
    ('Environment Management', [0,11,16,18,27,31,33,38,45,52,53,62,65,69,70,75,79,80,87,89,100,102,105,111,113,125,126,128,138]),
    ('Experiment Management', [30,61,71,83,101,118,124,127,132]),
    ('Input/Output Management', [19,68,82,110,116,120,134,135]),
    ('Model Development', [8,9,10,13,20,26,36,39,55,117,129,131]),
    ('Model Management', [2,14,41,48,91,93,103,106,107,119]),
    ('Model Serving', [4,5,21,23,35,37,42,49,50,58,73,88,94,95,139]),
    ('Network Management', [25,51]),
    ('Observability Management', [6,32,60,63,77,90]),
    ('Pipeline Management', [17,47,67,85,97,98,109,112,121]),
    # ('QA Management', []),
    ('Security Management', [22,29,114,137]),
    # ('User Interface Management', []),
    # ('Miscellaneous', []),
]

topic_list = []
macro_topic_mapping = {}
macro_topic_indexing = {}
macro_topic2index_dict = {}
for index, topic_set in enumerate(macro_topic2index_list):
    macro_topic2index_dict[topic_set[0]] = topic_set[1]
    macro_topic_indexing[index] = topic_set[0]
    topic_list.extend(topic_set[1])
    for topic in topic_set[1]:
        macro_topic_mapping[topic] = index

print(find_duplicates(topic_list))
print(len(topic_df) == len(topic_list))
print(set(range(len(topic_list))).difference(set(range(topic_df.shape[0]))))

[]
True
set()


In [43]:
# assign human-readable & high-level topics to challenges & solutions

df = pd.read_json(os.path.join(path_rq1, 'topics.json'))
df['Challenge_topic_macro'] = -1

for index, row in df.iterrows():
    if row['Challenge_topic'] in macro_topic_mapping:
        df.at[index, 'Challenge_topic_macro'] = int(macro_topic_mapping[row['Challenge_topic']])
    else:
        df.drop(index, inplace=True)

df.to_json(os.path.join(path_rq1, 'filtered.json'), indent=4, orient='records')

df_number = pd.DataFrame()

values = []
labels = []

for index, group in df.groupby('Challenge_topic_macro'):
    topic_list = [topic + 1 for topic in macro_topic2index_dict[macro_topic_indexing[index]]]
    entry = {
        'Index': index + 1,
        'Macro-topic': macro_topic_indexing[index],
        'Percentage (%)': round(len(group)/len(df)*100, 2),
        'Unresolved rate (%)': round(len(group[group['Challenge_solved_time'].isna()])/len(group)*100, 2),
        'Median resolution time (hours)': round(group['Challenge_solved_time'].median(), 2),
        'Topic list': topic_list,
    }
    df_number = pd.concat([df_number, pd.DataFrame([entry])], ignore_index=True)
    labels.append(macro_topic_indexing[index])
    values.append(len(group))

print(df_number.to_latex(float_format="%.2f", index=False))
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.show()

\begin{tabular}{rlrrrl}
\toprule
 Index &              Macro-topic &  Percentage (\%) &  Unresolved rate (\%) &  Median resolution time (hours) &                                                                                                                   Topic list \\
\midrule
     1 &         Code Development &            4.08 &                64.27 &                           53.26 &                                                                                               [16, 41, 60, 67, 73, 134, 137] \\
     2 &          Code Management &            0.45 &                69.23 &                           27.06 &                                                                                                                         [25] \\
     3 &       Compute Management &            3.46 &                66.58 &                           20.23 &                                                                                              [29, 47, 82, 93, 109, 116, 131] \\


In [29]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

values_k = []
values_p = []

for index, group in df.groupby('Challenge_topic'):
    group_k = group[group['Challenge_type'] == 'knowledge']
    group_p = group[group['Challenge_type'] == 'problem']
    values_k.append(len(group_k))
    values_p.append(len(group_p))

print("P-value of problem inquiry:", shapiro(values_p)[1])
print("P-value of knowledge inquiry:", shapiro(values_k)[1])
print("P-value of two type of inquiries:", mannwhitneyu(values_k, values_p)[1])

P-value of problem inquiry: 1.3298123491267688e-08
P-value of knowledge inquiry: 1.7512710170919377e-09
P-value of two type of inquiries: 0.03528018186508468


In [37]:
for index, diff in enumerate([p - k for p, k in zip(values_p, values_k)]):
    if abs(diff) > 50:
        print(f'{index+1}: {diff}')

1: 63
6: 53
23: 57
25: 56
29: 69
46: 52
69: 58
75: 99


In [36]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df_topics = []

for name, group in df.groupby('Challenge_topic_macro'):
    topic_info = {
        'Prevalence': group['Challenge_topic_macro'].count(),
        'Unresolved rate (%)': group['Challenge_solved_time'].isna().sum() / len(group) * 100,
        'Median resolved time (h)': group['Challenge_solved_time'].median(),
    }
    df_topics.append(topic_info)

df_topics = pd.DataFrame(df_topics)
df_correlation = pd.DataFrame(index=df_topics.columns, columns=df_topics.columns)

# Iterate over each pair of columns
for i, col1 in enumerate(df_topics.columns):
    for j, col2 in enumerate(df_topics.columns):
        if i < j:
            tau, pvalue = kendalltau(df_topics[col1], df_topics[col2])
            df_correlation.loc[col1, col2] = f'{tau:.2f}/{pvalue:.2f}'

print(df_correlation.to_latex())

\begin{tabular}{llll}
\toprule
{} & Prevalence & Unresolved rate (\%) & Median resolved time (h) \\
\midrule
Prevalence               &        NaN &          -0.13/0.49 &                0.19/0.32 \\
Unresolved rate (\%)      &        NaN &                 NaN &               -0.37/0.06 \\
Median resolved time (h) &        NaN &                 NaN &                      NaN \\
\bottomrule
\end{tabular}



In [16]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df = df[df['Challenge_solved_time'].notna()]
resolved_time_list = []

for name, group in df.groupby('Challenge_topic_macro'):
    resolved_time = group['Challenge_solved_time'].apply(lambda x: np.log(x+1))
    resolved_time_list.append(resolved_time)

print(f'Kruskal-Wallis H-test: p-value = {kruskal(*resolved_time_list).pvalue:.3f}')

Kruskal-Wallis H-test: p-value = 0.001


In [None]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
resolved_time_list = []

for name, group in df.groupby('Challenge_topic_macro'):
    resolved_time = group['Challenge_solved_time'].apply(lambda x: np.log(x+1))
    resolved_time_list.append(resolved_time)

traces = [go.Box(y=lst, name=f"{macro_topic_indexing[i]}") for i, lst in enumerate(resolved_time_list)]
fig = go.Figure(data=traces)
fig.update_layout(
    xaxis_title="Macro-topics",
    yaxis_title="Resolved time (hours) in log scale",
    showlegend=False
)
fig.show()
fig.write_image(os.path.join(path_rq1, 'Resolved time in log scale.pdf'))

In [None]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df['Challenge_solved_time'] = df['Challenge_solved_time'].apply(lambda x: np.log(x+1))
resolved_time_list = []

for name, group in df.groupby('Challenge_topic_macro'):
    resolved_time_list.append(group['Challenge_solved_time'])

traces = [go.Box(y=lst, name=f"{macro_topic_indexing[i]}") for i, lst in enumerate(resolved_time_list)]
fig = go.Figure(data=traces)
fig.update_layout(
    xaxis_title="Macro-topics",
    yaxis_title="Resolved time (hours) in log scale",
    showlegend=False
)
fig.show()
fig.write_image(os.path.join(path_rq1, 'Resolved time in log scale.pdf'))

In [35]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df['Challenge_solved_time'] = df['Challenge_solved_time'].apply(lambda x: np.log(x+1))
df['Challenge_topic_macro'] = df['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])
    
fig = px.box(df, x="Challenge_topic_macro", y="Challenge_solved_time", color="Challenge_type")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.update_layout(
    xaxis_title="Macro-topics",
    yaxis_title="Resolved time (hours) in log scale",
    legend_title='Inquiry type',
)
fig.show()
fig.write_image(os.path.join(path_rq1, 'Resolved time in log scale (based on inquiry types).pdf'))

In [33]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
topic_names = df['Challenge_topic_macro'].unique()
df_comparison = pd.DataFrame(index=topic_names, columns=topic_names)
# resolved_time_list = []

for name, group in df.groupby('Challenge_topic_macro'):
    for name2, group2 in df.groupby('Challenge_topic_macro'):
        if name < name2:
            continue
        values = remove_nan(group['Challenge_solved_time'])
        values2 = remove_nan(group2['Challenge_solved_time'])
        statistic, pvalue = mannwhitneyu(values, values2)
        df_comparison.at[name, name2] = f'{statistic:.4f}/{pvalue:.4f}'
        
#     # resolved_time_list.append(group['Challenge_solved_time'])


# # Iterate over each pair of columns
# for i, col1 in enumerate(df_topics.columns):
#     for j, col2 in enumerate(df_topics.columns):
#         if i < j:
#             tau, pvalue = kendalltau(df_topics[col1], df_topics[col2])
#             df_correlation.loc[col1, col2] = f'{tau:.2f}/{pvalue:.2f}'

print(df_comparison.to_latex())

\begin{tabular}{llllllllllllllll}
\toprule
{} &                 12 &                 9  &                 8  &                  5  &                 0  &                  10 &                 11 &                 2  &                 7  &                 6  &                  4  &                 13 &                 3  &                14 &                1  \\
\midrule
12 &  35378.0000/1.0000 &  31864.5000/0.3302 &  45049.0000/0.1750 &  137593.0000/0.0402 &  24097.0000/0.2051 &   75684.0000/0.0341 &  10651.0000/0.4754 &  22069.0000/0.0001 &  31622.0000/0.0173 &  37528.0000/0.3669 &   38848.0000/0.0002 &                NaN &  28486.0000/0.0010 &               NaN &  2517.0000/0.2201 \\
9  &                NaN &  25992.0000/1.0000 &  36891.5000/0.7252 &  113142.0000/0.3702 &  19738.0000/0.6766 &                 NaN &                NaN &  18317.0000/0.0016 &  25993.5000/0.1443 &  30680.0000/0.9504 &   32098.5000/0.0050 &                NaN &  23460.5000/0.0173 &               NaN &  20

In [25]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

values_k = []
values_p = []

for index, group in df.groupby('Challenge_topic_macro'):
    group_k = group[group['Challenge_type'] == 'knowledge']
    group_p = group[group['Challenge_type'] == 'problem']
    values_k.append(group_k['Challenge_solved_time'].isna().sum()/len(group_k))
    values_p.append(group_p['Challenge_solved_time'].isna().sum()/len(group_p))

values_k = remove_nan(values_k)
values_p = remove_nan(values_p)

print("P-value of problem inquiry:", shapiro(values_p)[1])
print("P-value of knowledge inquiry:", shapiro(values_k)[1])
print("P-value of two type of inquiries:", mannwhitneyu(values_k, values_p)[1])

P-value of problem inquiry: 0.5501300096511841
P-value of knowledge inquiry: 0.631529688835144
P-value of two type of inquiries: 0.4553018964198552


In [29]:
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

values_k = []
values_p = []

for index, group in df.groupby('Challenge_topic'):
    group_k = group[group['Challenge_type'] == 'knowledge']
    group_p = group[group['Challenge_type'] == 'problem']
    values_k.append(group_k['Challenge_solved_time'].median())
    values_p.append(group_p['Challenge_solved_time'].median())

values_k = remove_nan(values_k)
values_p = remove_nan(values_p)

print("P-value of problem inquiry:", shapiro(values_p)[1])
print("P-value of knowledge inquiry:", shapiro(values_k)[1])
print("P-value of two type of inquiries:", mannwhitneyu(values_k, values_p)[1])

P-value of problem inquiry: 2.157887777458163e-18
P-value of knowledge inquiry: 7.568273346198627e-25
P-value of two type of inquiries: 2.7861333502044098e-11


In [32]:
for index, diff in enumerate([p - k for p, k in zip(values_p, values_k)]):
    if abs(diff) > 500:
        print(f'{index+1}: {diff}')

15: -2233.3722222222
96: 635.2957094444
107: -11147.100535833351
131: 1073.1950750000499
