In [9]:
import os
import re
import time
import spacy
import pickle
import openai
import random
import enchant
import textstat
import warnings
import itertools
import subprocess
import webbrowser
import collections
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from bs4 import BeautifulSoup
from langdetect import detect 
from collections import namedtuple
from scipy.stats import kruskal, shapiro, mannwhitneyu, kendalltau
from gensim.parsing.preprocessing import remove_stopwords, strip_short, strip_punctuation, STOPWORDS

In [14]:
# Input/output locations, relative to the notebook's working directory.
path_dataset = '../../Dataset'
path_result = '../../Result'

path_rq1 = os.path.join(path_result, 'RQ1')

random_state = 42

warnings.filterwarnings('ignore')

# The spaCy model has to be downloaded once before it can be loaded:
# subprocess.run(['python', '-m', 'spacy', 'download', 'en_core_web_sm'])
nlp = spacy.load("en_core_web_sm")
spell_checker = enchant.Dict("en_US")

pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

# The API key is read from the environment only. A secret must never be
# hard-coded as a fallback value: the key that used to be embedded here has
# been exposed through the notebook and should be revoked.
openai.api_key = os.getenv('OPENAI_API_KEY')

prompt_summary = '''Refine the title of the following post to make it short and clear in simple English.\n###'''

# Tool name -> lower-case keywords associated with that tool.
tools_keyword_mapping = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['amazon', 'aws', 'maker', 'sage'],
    'Azure Machine Learning': ['aml', 'azure', 'microsoft'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild'],
    'H2O AI Cloud': ['h2o ai', 'h2o.ai'],
    'Kedro': ['kedro'],
    'MLflow': ['databricks', 'mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['google', 'gcp', 'vertex'],
    'Weights & Biases': ['biases', 'wandb', 'weights'],
}

# Flat set of every keyword above; used for substring-based word removal.
tools_keyword_set = {
    keyword
    for keyword_list in tools_keyword_mapping.values()
    for keyword in keyword_list
}

# Substrings looked for in a link by analyze_links, one set per link category.
keywords_image = {".jpg", ".png", ".jpeg", ".gif", ".bmp", ".webp", ".svg", ".tiff"}

keywords_patch = {'pull'}

keywords_issue = {'answers', 'discussions', 'forums', 'issues', 'questions', 'stackoverflow'}

keywords_tool = {'github', 'gitlab', 'pypi'}

keywords_doc = {'developers', 'docs', 'documentation', 'features', 'library', 'org', 'wiki'}

keywords_tutorial = {'guide', 'learn', 'tutorial'}

# Word stems: preprocess_text drops every word that contains one of these
# substrings (e.g. 'clos' covers 'close', 'closed', 'closing').
error_words_basics = {
    'break', 'bug', 'ceas', 'clos', 'crash',
    'defect', 'error', 'fail', 'fault', 'fix',
    'finish', 'flaw', 'glitch', 'halt', 'malfunction',
    'mistak', 'paus', 'shutdown', 'stop', 'terminat',
}

# Generic software-engineering / forum vocabulary treated as stop words.
# Entries that are commented out are deliberately kept in the text.
# NOTE: every entry needs a trailing comma. Python silently concatenates
# adjacent string literals, which previously merged 'anomaly'/'anomalies'/
# 'answer' and 'forum'/'forums'/'future' into single bogus entries.
stop_words_se = {
    'ability',
    'abilities',
    'accident',
    'accidents',
    'acknowledgement',
    'action',
    'actions',
    'activities',
    'activity',
    'advantage',
    'advantages',
    'advice',
    'ai',
    'alternative',
    'alternatives',
    'announcement',
    'announcements',
    'anomaly',
    'anomalies',
    'answer',
    'answers',
    'appreciation',
    'approach',
    'approaches',
    'article',
    'articles',
    'assistance',
    'attempt',
    'author',
    'behavior',
    'behaviour',
    'benefit',
    'benefits',
    'bit',
    'bits',
    'block',
    'blocks',
    'blog',
    'blogs',
    'body',
    'building',
    'case',
    'cases',
    'categories',
    'categorization',
    'category',
    'cause',
    'causes',
    'challenge',
    'challenges',
    'change',
    'changes',
    # 'char',
    'check',
    'choice',
    'choices',
    'clarification',
    'code',
    'collection',
    'com',
    'combination',
    # 'communication',
    # 'community',
    # 'communities',
    # 'company',
    # 'companies',
    # 'computer',
    # 'computers',
    # 'concept',
    # 'concepts',
    'concern',
    'concerns',
    # 'condition',
    # 'conditions',
    'confirmation',
    'confusion',
    'consideration',
    # 'content',
    # 'contents',
    'context',
    # 'count',
    'couple',
    'couples',
    'course',
    'courses',
    'cross',
    # 'custom',
    'day',
    'days',
    'demand',
    'description',
    'descriptions',
    'desire',
    'detail',
    'details',
    'devops',
    'difference',
    'differences',
    'difficulties',
    'difficulty',
    'disadvantage',
    'disadvantages',
    'discrepancies',
    'discrepancy',
    'discussion',
    'discussions',
    'dislike',
    'distinction',
    'effect',
    'end',
    # 'engineering',
    'enquiries',
    'enquiry',
    'evidence',
    'example',
    'examples',
    'exception',
    'exceptions',
    'existence',
    'exit',
    'expectation',
    'experience',
    'expert',
    'experts',
    'explanation',
    'explanations',
    'fact',
    'facts',
    'favorite',
    'favorites',
    # 'feature',
    # 'features',
    'feedback',
    'feedbacks',
    'fix',
    'fixes',
    # 'float',
    # 'form',
    # 'forms',
    'forum',
    'forums',
    # 'functionality',
    # 'functionalities',
    'future',
    'goal',
    'goals',
    'guarantee',
    'guidance',
    'guideline',
    'guide',
    'guides',
    'guy',
    'guys',
    'harm',
    'hello',
    'help',
    'hour',
    'hours',
    'ibm',
    'idea',
    'ideas',
    'individual',
    'individuals',
    'info',
    'information',
    'inquiries',
    'inquiry',
    'insight',
    'instruction',
    'instructions',
    # 'int',
    'intelligence',
    'intent',
    'interest',
    'introduction',
    'investigation',
    'invitation',
    'ipynb',
    'issue',
    'issues',
    'kind',
    'kinds',
    'lack',
    'learning',
    'level',
    'levels',
    'look',
    'looks',
    'lot',
    'lots',
    'luck',
    'machine',
    'major',
    'manner',
    'manners',
    'manual',
    'mark',
    'means',
    'meaning',
    # 'message',
    # 'messages',
    'method',
    'methods',
    'ml',
    'mlops',
    'minute',
    'minutes',
    'month',
    'months',
    'need',
    'needs',
    'number',
    'numbers',
    'offer',
    'one',
    'ones',
    'opinion',
    'opinions',
    # 'org',
    # 'organization',
    # 'organizations',
    'outcome',
    'part',
    'parts',
    'past',
    'people',
    'person',
    'persons',
    'perspective',
    'perspectives',
    'place',
    'places',
    'point',
    'points',
    'post',
    'posts',
    'practice',
    'practices',
    'problem',
    'problems',
    'product',
    'products',
    'program',
    'programs',
    'project',
    'projects',
    'proposal',
    'proposals',
    'purpose',
    'purposes',
    'py',
    # 'python',
    'qa',
    'question',
    'questions',
    'reason',
    'reasons',
    # 'research',
    'result',
    'results',
    'scenario',
    'scenarios',
    'science',
    'screenshot',
    'screenshots',
    'second',
    'seconds',
    'section',
    'sense',
    'shortcoming',
    'shortcomings',
    'show',
    'shows',
    'situation',
    'software',
    'solution',
    'solutions',
    'start',
    # 'state',
    # 'statement',
    # 'states',
    # 'status',
    # 'step',
    # 'steps',
    # 'string',
    # 'student',
    # 'students',
    'study',
    'stuff',
    'success',
    'suggestion',
    'suggestions',
    'summary',
    'summaries',
    'surprise',
    'support',
    'supports',
    'talk',
    'talks',
    'task',
    'tasks',
    # 'teacher',
    # 'teachers',
    # 'team',
    # 'teams',
    'technique',
    'techniques',
    'technologies',
    'technology',
    'term',
    'terms',
    'tip',
    'tips',
    'thank',
    'thanks',
    'thing',
    'things',
    'thought',
    'thoughts',
    'three',
    'title',
    'today',
    'tomorrow',
    # 'tool',
    # 'tools',
    'total',
    'trouble',
    'troubles',
    'truth',
    'try',
    'tutorial',
    'tutorials',
    'two',
    'understand',
    'understanding',
    'usage',
    'use',
    'user',
    'users',
    'uses',
    # 'value',
    # 'values',
    'view',
    'viewpoint',
    'way',
    'ways',
    'weakness',
    'weaknesses',
    'week',
    'weeks',
    'word',
    'words',
    'work',
    'workaround',
    'workarounds',
    'works',
    'yeah',
    'year',
    'years',
    'yesterday',
}

# Machine-learning task/model vocabulary treated as stop words.
# NOTE: every entry needs a trailing comma. A missing comma after
# 'timeseries' used to merge it with 'topic' into 'timeseriestopic'.
stop_words_ml = {
    'ad',
    'ads',
    'advertisement',
    'advertisements',
    'algorithm',
    'algorithms',
    'analysis',
    'anomaly',
    'asr',
    'audio',
    'autoencoder',
    'automl',
    'autopilot',
    'bert',
    'bi',
    'chatbot',
    'classification',
    'classifier',
    'clustering',
    'cnn',
    'cv',
    'decision',
    'detection',
    'dimensionality',
    'embedding',
    'estimation',
    'extraction',
    'forecast',
    'forecaster',
    'forecasts',
    'forecasting',
    'forest',
    'fraud',
    'gan',
    'gesture',
    'gpt',
    'ica',
    'kmeans',
    'k-means',
    'knn',
    'language',
    'languages',
    'lda',
    'lstm',
    'mining',
    'modelling',
    'ngram',
    'n-gram',
    'nlp',
    'nmf',
    'ocr',
    'pca',
    'processing',
    'recognition',
    'recommendation',
    'recommendations',
    'recommender',
    'reduction',
    'regression',
    'regressor',
    'reinforcement',
    'rf',
    'rl',
    'rnn',
    'segmentation',
    'sentiment',
    'series',
    'sound',
    'spam',
    'speech',
    'stt',
    'strategy',
    'strategies',
    'svd',
    'svm',
    # 'text',
    # 'texts',
    'time',
    'timeseries',
    'topic',
    'topics',
    'transformer',
    'transformers',
    'translation',
    'translator',
    'tree',
    'trend',
    'trends',
    'tsne',
    'tts',
    'vae',
    'video',
    'vision',
    'voice',
}

# Combined stop-word list: gensim's defaults plus the two custom sets above.
stop_words = STOPWORDS | stop_words_se | stop_words_ml


In [15]:
def count_code_line(block_list):
    """Count the non-blank lines over all strings in ``block_list``.

    A line counts when it still has content after stripping whitespace.
    """
    return sum(
        1
        for snippet in block_list
        for row in snippet.splitlines()
        if row.strip()
    )

def extract_styles(content):
    """Strip links, code and block quotes from an HTML fragment.

    Returns a 3-tuple ``(clean_text, links, code_lines)``:
    - ``clean_text``: text left once every ``<code>``, ``<blockquote>`` and
      ``<a>`` element is removed, with text nodes joined by a space;
    - ``links``: the ``href`` of every ``<a>`` element that has one;
    - ``code_lines``: non-blank lines inside ``<code>`` elements plus those
      inside ``<blockquote>`` elements.
    """
    document = BeautifulSoup(content, 'html.parser')
    hrefs = [anchor['href'] for anchor in document.find_all('a', href=True)]

    # Both kinds of element are counted as code before they are removed.
    code_lines = sum(
        count_code_line([node.get_text() for node in document.find_all(tag_name)])
        for tag_name in ('code', 'blockquote')
    )

    for node in document.find_all(['code', 'blockquote', 'a']):
        node.decompose()

    return document.get_text(separator=' '), hrefs, code_lines

def extract_code(content):
    """Remove backtick-delimited code from ``content``.

    Triple-, double- and single-backtick spans are handled in that order, so a
    longer fence is consumed before a shorter one can match inside it.

    Returns ``(clean_text, code_lines)`` where ``code_lines`` is the number of
    non-blank lines found in the removed snippets.
    """
    remaining = content
    total = 0

    for fence in (r'```.+?```', r'``.+?``', r'`.+?`'):
        matcher = re.compile(fence, flags=re.DOTALL)
        total += count_code_line(matcher.findall(remaining))
        remaining = matcher.sub('', remaining)

    return remaining, total

def extract_links(text):
    """Remove Markdown links/images and bare URLs from ``text``.

    Returns ``(clean_text, links)``. ``links`` lists the targets of Markdown
    ``[label](target)`` / ``![alt](target)`` constructs first, followed by the
    bare ``http(s)://`` URLs found in what remains.
    """
    markdown_link = re.compile(r"\!?\[.*?\]\((.*?)\)")
    bare_url = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )

    markdown_targets = markdown_link.findall(text)
    without_markdown = markdown_link.sub('', text)

    urls = bare_url.findall(without_markdown)
    without_urls = bare_url.sub('', without_markdown)

    return without_urls, markdown_targets + urls

def split_content(content):
    """Separate a post body into prose, links and an amount of code.

    Runs ``extract_styles`` (HTML), then ``extract_code`` (backticks), then
    ``extract_links`` (Markdown links and bare URLs) on the text left by the
    previous step.

    Returns an ``Analyzer`` named tuple with fields ``text``, ``links`` and
    ``code_line``.
    """
    text, html_links, html_code_lines = extract_styles(content)
    text, fenced_code_lines = extract_code(text)
    text, inline_links = extract_links(text)

    Analyzer = namedtuple('Analyzer', ['text', 'links', 'code_line'])
    return Analyzer(
        text=text,
        links=html_links + inline_links,
        code_line=html_code_lines + fenced_code_lines,
    )

def word_frequency(text):
    """Return a ``collections.Counter`` of the whitespace-separated words in ``text``."""
    tally = collections.Counter()
    tally.update(text.split())
    return tally

def extract_nouns_verbs(text, verb):
    """Keep only the nouns (and, when ``verb`` is truthy, the verbs) of ``text``.

    Part-of-speech tags come from the module-level spaCy pipeline ``nlp``.
    The kept tokens are returned joined by single spaces, in document order.
    """
    wanted_tags = {"NOUN", "VERB"} if verb else {"NOUN"}
    return ' '.join(token.text for token in nlp(text) if token.pos_ in wanted_tags)

def is_english(text):
    """Return True when ``detect`` classifies ``text`` as English.

    Any error raised by the detector (e.g. the text is too short or has too
    few features) is treated as "not English". Only ``Exception`` is caught,
    so Ctrl-C / interpreter shutdown are no longer swallowed.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        return False

def extract_english(text):
    """Keep only the words of ``text`` accepted by the module-level ``spell_checker``.

    Words are split on whitespace and re-joined with single spaces.
    """
    return ' '.join(filter(spell_checker.check, text.split()))

def remove_words_with_substring(text, substring_list):
    """Drop every word of ``text`` that contains any string in ``substring_list``.

    Words are split on whitespace; the survivors are returned joined by single
    spaces, in their original order.
    """
    needles = tuple(substring_list)
    survivors = [
        word
        for word in text.split()
        if not any(needle in word for needle in needles)
    ]
    return ' '.join(survivors)

def preprocess_text(text, verb=True):
    """Normalise ``text`` for analysis.

    Steps, in order: lower-case; strip punctuation; keep spell-checked words;
    drop words containing an error stem or a tool keyword; drop stop words;
    keep nouns (plus verbs when ``verb`` is true).
    """
    cleaned = strip_punctuation(text.lower())
    cleaned = extract_english(cleaned)
    for banned_substrings in (error_words_basics, tools_keyword_set):
        cleaned = remove_words_with_substring(cleaned, banned_substrings)
    cleaned = remove_stopwords(cleaned, stop_words)
    # strip_short is intentionally not applied.
    return extract_nouns_verbs(cleaned, verb)

def analyze_links(links):
    """Count links per category using substring matching.

    Each link goes to the first category, in the order image, patch, issue,
    tool, documentation, tutorial, whose keyword set has an entry contained in
    the link; links matching nothing are counted as ``example``.

    Returns an ``Analyzer`` named tuple with fields ``image``,
    ``documentation``, ``tool``, ``issue``, ``patch``, ``tutorial``, ``example``.
    """
    # Order matters: the first matching rule wins.
    ordered_rules = (
        ('image', keywords_image),
        ('patch', keywords_patch),
        ('issue', keywords_issue),
        ('tool', keywords_tool),
        ('documentation', keywords_doc),
        ('tutorial', keywords_tutorial),
    )
    tally = dict.fromkeys(
        ['image', 'documentation', 'tool', 'issue', 'patch', 'tutorial', 'example'], 0
    )

    for link in links:
        for category, needles in ordered_rules:
            if any(needle in link for needle in needles):
                tally[category] += 1
                break
        else:
            tally['example'] += 1

    Analyzer = namedtuple('Analyzer', list(tally))
    return Analyzer(**tally)

def analyze_text(text):
    """Compute textstat metrics for ``text``.

    Returns an ``Analyzer`` named tuple with ``word_count``
    (``textstat.lexicon_count``), ``readability``
    (``textstat.flesch_reading_ease``) and ``reading_time``
    (``textstat.reading_time``).
    """
    Analyzer = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return Analyzer(
        word_count=textstat.lexicon_count(text),
        readability=textstat.flesch_reading_ease(text),
        reading_time=textstat.reading_time(text),
    )

# Exponential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying on failure with exponential backoff.

    Parameters
    ----------
    fn : callable
        The function to call.
    retries : int
        Maximum number of retries after the first attempt.
    backoff_in_seconds : float
        Base delay; retry ``k`` (0-based) waits ``backoff_in_seconds * 2**k``
        seconds plus up to one second of random jitter.
    *args, **kwargs
        Forwarded to ``fn``. Because of the parameter order, positional
        arguments for ``fn`` can only follow explicit ``retries`` and
        ``backoff_in_seconds`` values; keyword arguments are unaffected.

    Returns
    -------
    Whatever ``fn`` returns on its first successful call.

    Raises
    ------
    The last exception raised by ``fn`` once ``retries`` is exhausted.
    """
    attempt = 0
    while True:
        try:
            return fn(*args, **kwargs)
        except Exception:
            if attempt == retries:
                raise

            # random.uniform needs both bounds; the former one-argument call
            # raised TypeError here, so no retry was ever performed.
            delay = backoff_in_seconds * 2 ** attempt + random.uniform(0, 1)
            time.sleep(delay)
            attempt += 1

def find_duplicates(in_list):
    """Return the distinct elements that occur more than once in ``in_list``.

    Each duplicated element appears once in the result, ordered by first
    occurrence. Counting is done in a single pass instead of one
    ``list.count`` scan per distinct element. Elements must be hashable.
    """
    occurrences = collections.Counter(in_list)
    return [item for item, seen in occurrences.items() if seen > 1]

def remove_nan(in_list):
    """Return the elements of ``in_list`` that are not missing per ``pd.notna``."""
    kept = []
    for elem in in_list:
        if pd.notna(elem):
            kept.append(elem)
    return kept

def input_challenge_type(link):
    """Open ``link`` in the web browser and ask the user to label the post.

    The link is printed, opened with ``webbrowser.open``, and the user is
    prompted until they enter P, K or N (case-insensitive).

    Returns ``'problem'``, ``'knowledge'`` or ``'na'`` respectively.
    """
    print(link)

    answers = {
        "P": 'problem',
        "K": 'knowledge',
        "N": 'na',
    }

    webbrowser.open(link)

    reply = input("Choose an option - 'P', 'K', or 'N': ").upper()
    while reply not in answers:
        print("Invalid choice. Please choose 'P', 'K', or 'N' only.")
        reply = input("Choose an option - 'P', 'K', or 'N': ").upper()

    return answers[reply]


In [11]:
# Merge issues, questions and discussions into one frame with a common
# "Challenge_*" / "Solution_*" schema and persist it as original.json.

# Field suffixes carried over from each source prefix.
issue_fields = [
    'title', 'body', 'link', 'tag_count', 'created_time', 'score_count',
    'closed_time', 'repo_issue_count', 'repo_star_count', 'repo_watch_count',
    'repo_fork_count', 'repo_contributor_count', 'self_closed',
    'comment_count', 'comment_body', 'comment_score',
]
question_fields = [
    'title', 'body', 'link', 'tag_count', 'topic_count', 'created_time',
    'answer_count', 'score_count', 'closed_time', 'favorite_count',
    'last_edit_time', 'view_count', 'self_closed', 'comment_count',
    'comment_body', 'comment_score',
]
answer_fields = [
    'body', 'score_count', 'comment_count', 'comment_body', 'comment_score',
    'last_edit_time',
]


def _prefix_mapping(old_prefix, new_prefix, fields):
    """Build a ``{old_prefix_field: new_prefix_field}`` column rename mapping."""
    return {f'{old_prefix}_{field}': f'{new_prefix}_{field}' for field in fields}


# errors='raise' keeps the strictness of the former row-by-row copy: a missing
# source column still fails loudly instead of being skipped.
df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json')).rename(
    columns=_prefix_mapping('Issue', 'Challenge', issue_fields), errors='raise'
)

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))
df_discussions = pd.read_json(os.path.join(path_dataset, 'discussions.json'))
df_qa = pd.concat([df_questions, df_discussions], ignore_index=True).rename(
    columns={
        **_prefix_mapping('Question', 'Challenge', question_fields),
        **_prefix_mapping('Answer', 'Solution', answer_fields),
    },
    errors='raise',
)

df = pd.concat([df_issues, df_qa], ignore_index=True)
# Drop whatever source-specific columns were not mapped above.
df = df[df.columns.drop(list(df.filter(regex=r'(Issue|Question|Answer)_')))]
# The column is 'Challenge_link'; the former lower-case name raised KeyError.
df = df.sort_values(by=['Challenge_link'])

df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [None]:
# name = 'topics'

# df = pd.read_json(os.path.join(path_dataset, f'{name}.json'))
# df_old = pd.read_json(os.path.join(path_dataset, 'original.json'))

# for index, row in df.iterrows():
#     for i2, r2 in df_old.iterrows():
#         if row['Challenge_link'] == r2['Challenge_link']:
#             df.at[index, 'Tools'] = r2['Tools']
#             break

# df.to_json(os.path.join(path_dataset, f'{name}.json'), indent=4, orient='records')

In [12]:
# Filter out non-English posts

posts = pd.read_json(os.path.join(path_dataset, 'original.json'))

# Tool keywords are removed from the lower-cased title before language
# detection, so only the remaining words are classified.
english_mask = posts['Challenge_title'].apply(
    lambda title: is_english(remove_words_with_substring(title.lower(), tools_keyword_set))
)
df = posts.loc[english_mask.astype(bool)]

df.to_json(os.path.join(path_dataset, 'labels.json'), indent=4, orient='records')

In [None]:
# Manually label each post as 'problem', 'knowledge' or 'na'.
labels_path = os.path.join(path_dataset, 'labels.json')
df = pd.read_json(labels_path)

if 'Challenge_type' not in df.columns:
    df['Challenge_type'] = None

for index, row in df.iterrows():
    # Resume support: posts labelled in an earlier (interrupted) session are
    # skipped, so the periodic checkpoints below are not overwritten.
    if pd.notna(row['Challenge_type']):
        continue

    df.at[index, 'Challenge_type'] = input_challenge_type(row['Challenge_link'])
    # Checkpoint when the row index is 49, 99, 149, ...
    if index % 50 == 49:
        df.to_json(labels_path, indent=4, orient='records')

df.to_json(labels_path, indent=4, orient='records')

In [13]:
# df = pd.read_json(os.path.join(path_dataset, 'labels.json'))
# df_old = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# # df_difference = pd.concat([df_old, df, df]).drop_duplicates('Challenge_link', keep=False, ignore_index=True)
# df_difference = pd.concat([df, df_old, df_old]).drop_duplicates('Challenge_link', keep=False, ignore_index=True)

# df_all = pd.concat([df_old, df], ignore_index=True)
# df_duplicate = df_all[df_all.duplicated(['Challenge_link'], keep='last')]

# df_new = pd.concat([df_difference, df_duplicate], ignore_index=True)
# df_new.to_json(os.path.join(path_dataset, 'preprocessed+.json'), indent=4, orient='records')

In [None]:
# Discard the posts that were labelled 'na' (no challenge type)

labelled = pd.read_json(os.path.join(path_dataset, 'labels.json'))

df = labelled.loc[labelled['Challenge_type'].ne('na')]

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [22]:
# df = pd.read_json(os.path.join(path_dataset, 'labels.json'))
# df_old = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# for index, row in df.iterrows():
#   for i2, r2 in df_old.iterrows():
#       if row['Challenge_link'] == r2['Challenge_link']:
#           df.at[index, 'Challenge_type'] = r2['Challenge_type']
#           break

# df.to_json(os.path.join(path_dataset, 'labels.json'), indent=4, orient='records')

In [None]:
# Generate GPT-4 titles for the challenges

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# On the first run the summary column does not exist yet; create it so the
# "already summarised" check below does not raise KeyError.
if 'Challenge_gpt_summary' not in df.columns:
    df['Challenge_gpt_summary'] = None

for index, row in df.iterrows():
    # Checkpoint when the row index is 99, 199, ...
    if index % 100 == 99:
        print(f'persisting on post {index}')
        df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

    # Posts summarised in a previous run are not sent again.
    if pd.notna(row['Challenge_gpt_summary']):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + '\nBody: ' + row['Challenge_body'] + '###\nRefined Title: '
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=50,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, 'Challenge_gpt_summary'] = response['choices'][0]['message']['content']
    except Exception as e:
        # Best effort: report the failure and move on; the post stays without
        # a summary and is picked up again on the next run.
        print(f'{e} on post {row["Challenge_link"]}')

    # Fixed pause between requests.
    time.sleep(5)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


In [16]:
# Post level preprocessing

def _text_or_empty(value):
    """Return ``value`` when it is a string, otherwise ``''`` (covers None/NaN)."""
    return value if isinstance(value, str) else ''


df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    clean_title = preprocess_text(_text_or_empty(row['Challenge_title']))
    df.at[index, 'Challenge_preprocessed_title'] = clean_title

    # The GPT summary is missing for posts whose API call failed; such posts
    # get an empty preprocessed summary instead of crashing the whole run.
    clean_summary = preprocess_text(_text_or_empty(row['Challenge_gpt_summary']))
    df.at[index, 'Challenge_preprocessed_gpt_summary'] = clean_summary

    body_analyzer = split_content(_text_or_empty(row['Challenge_body']))
    clean_body = preprocess_text(body_analyzer.text)
    df.at[index, 'Challenge_preprocessed_content'] = clean_title + '\n' + clean_body

    # NOTE: per-post code/word/readability/link-category metrics for the
    # challenge, its comments, the solution and its comments are not computed.
    # The disabled code for them referenced an undefined `challenge_analyzer`
    # and was removed; split_content / analyze_links / analyze_text remain
    # available to reinstate those columns.

    # Hours between creation and closing.
    df.at[index, 'Challenge_solved_time'] = (row['Challenge_closed_time'] - row['Challenge_created_time']) / pd.Timedelta(hours=1)

df = df.reindex(sorted(df.columns), axis=1)
df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [13]:
# Draw sankey diagram of tool and platform

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))
# A post with a solved time is 'closed', otherwise 'open'.
df['State'] = df['Challenge_solved_time'].isna().map({True: 'open', False: 'closed'})

# One row per (post, tool) pair.
df = df.explode('Tools')
categories = ['Platform', 'Tools', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

# Node name -> number of rows carrying it, in category order.
labels = {}
for category in categories:
    labels.update(df[category].value_counts().to_dict())

# One flow table per pair of adjacent categories (Platform->Tools, Tools->State).
flows = [
    df_info[[upstream, downstream, 'value']].rename(
        columns={upstream: 'source', downstream: 'target'}
    )
    for upstream, downstream in zip(categories, categories[1:])
]
newDf = pd.concat(flows).groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Sankey links refer to nodes by their position in the label list.
node_index = {name: position for position, name in enumerate(labels)}
source = newDf['source'].apply(lambda name: node_index[name])
target = newDf['target'].apply(lambda name: node_index[name])
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.pdf'))


In [28]:
# Summary table per challenge type: prevalence, unresolved rate and median
# resolution time.
df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))
inquiry_rows = []

# Group by a scalar key: with a one-element list, recent pandas versions yield
# 1-tuples as group names, which would end up in the 'Type' column.
for name, group in df.groupby('Challenge_type'):
    unresolved = group['Challenge_closed_time'].isna().sum()
    inquiry_rows.append({
        'Type': name,
        'Prevalence (%)': round(len(group) / len(df) * 100, 2),
        'Unresolved rate (%)': round(unresolved / len(group) * 100, 2),
        'Median resolution time (hours)': round(group['Challenge_solved_time'].median(), 2),
    })

df_inquiry = pd.DataFrame(inquiry_rows)
print(df_inquiry.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{lrrr}
\toprule
     Type &  Prevalence (\%) &  Unresolved rate (\%) &  Median resolution time (hours) \\
\midrule
knowledge &           41.42 &                66.92 &                           22.75 \\
  problem &           58.58 &                64.19 &                           65.43 \\
\bottomrule
\end{tabular}



In [29]:
# Compare resolution times of problem vs. knowledge inquiries (solved posts only).
df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json')).dropna(subset=['Challenge_solved_time'])
df_problem = df.loc[df['Challenge_type'] == 'problem', 'Challenge_solved_time'].tolist()
df_knowledge = df.loc[df['Challenge_type'] == 'knowledge', 'Challenge_solved_time'].tolist()

# Element [1] of each scipy result is the p-value.
p_problem = shapiro(df_problem)[1]
print("P-value of problem inquiry:", p_problem)
p_knowledge = shapiro(df_knowledge)[1]
print("P-value of knowledge inquiry:", p_knowledge)
p_between = mannwhitneyu(df_problem, df_knowledge)[1]
print("P-value of two type of inquiries:", p_between)


P-value of problem inquiry: 0.0
P-value of knowledge inquiry: 0.0
P-value of two type of inquiries: 1.7583830975428765e-20


In [9]:
prompt_topic = '''You will be given a set of topics refering to specific software engineering concepts. Please summarize each topic in a phrase and attach one sentence description in the MLOps context. Also, you must guarantee that those phrases are not duplicate with one another.###\n'''

# NOTE(review): pickle.load can execute arbitrary code; only load a
# 'Topic terms.pickle' produced by this project.
with open(os.path.join(path_rq1, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

# One "Topic <i>: term, term, ..." line per topic, built from the first
# element of each entry in the topic.
topic_term_list = [
    f'Topic {index}: ' + ', '.join(term[0] for term in topic)
    for index, topic in enumerate(topic_terms)
]

prompt = prompt_topic + '\n'.join(topic_term_list) + '\n###\n'
completion = openai.ChatCompletion.create(
    model='gpt-3.5-turbo-16k',
    messages=[{'role': 'user', 'content': prompt}],
    temperature=0,
    max_tokens=8000,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=500,
    stream=False,
)

topics = completion.choices[0].message.content
print(topics)

Topic 0: Pipeline Step - A step in a pipeline that represents a specific task or action.
MLOps Context: Each pipeline step represents a specific action or task in the MLOps workflow, such as data preprocessing or model training.

Topic 1: Log - A record of events or messages generated by a system or application.
MLOps Context: Logs are used to track and monitor the execution of MLOps pipelines and provide visibility into the system's behavior and performance.

Topic 2: Docker Image - A lightweight, standalone, and executable software package that includes everything needed to run a piece of software, including the code, runtime, libraries, and system tools.
MLOps Context: Docker images are commonly used in MLOps to package and deploy machine learning models and their dependencies in a consistent and reproducible manner.

Topic 3: Model Tar - A compressed archive file format used to store and distribute machine learning models.
MLOps Context: Model tar files are commonly used in MLOps t

In [5]:
topics = '''Topic 0: Pipeline Step - A step in a pipeline that represents a specific task or action.
MLOps Context: Each pipeline step represents a specific action or task in the MLOps workflow, such as data preprocessing or model training.

Topic 1: Log - A record of events or messages generated by a system or application.
MLOps Context: Logs are used to track and monitor the execution of MLOps pipelines and provide visibility into the system's behavior and performance.

Topic 2: Docker Image - A lightweight, standalone, and executable software package that includes everything needed to run a piece of software, including the code, runtime, libraries, and system tools.
MLOps Context: Docker images are commonly used in MLOps to package and deploy machine learning models and their dependencies in a consistent and reproducible manner.

Topic 3: Model Tar - A compressed archive file format used to store and distribute machine learning models.
MLOps Context: Model tar files are commonly used in MLOps to package and distribute trained machine learning models for deployment and inference.

Topic 4: File Directory - A hierarchical structure used to organize and store files and folders.
MLOps Context: File directories are used in MLOps to organize and manage the various artifacts, scripts, and data used in the machine learning workflow.

Topic 5: Plot - A graphical representation of data or information.
MLOps Context: Plots and charts are commonly used in MLOps to visualize and analyze the performance and behavior of machine learning models.

Topic 6: Runs - Instances of executing a pipeline or workflow.
MLOps Context: Runs represent the execution of MLOps pipelines or workflows, allowing for tracking, monitoring, and managing the different stages and steps of the process.

Topic 7: Artifacts - Any output or intermediate result generated during the execution of a pipeline or workflow.
MLOps Context: Artifacts in MLOps include trained models, evaluation metrics, logs, and any other files or data generated during the machine learning workflow.

Topic 8: Predict - To make a forecast or estimate based on available data or a trained model.
MLOps Context: Predictions are a key component of MLOps, where trained models are used to make predictions or forecasts based on new or unseen data.

Topic 9: Labeling - The process of assigning predefined categories or tags to data for training or evaluation purposes.
MLOps Context: Labeling is an important step in MLOps, where data is annotated or labeled to create labeled datasets for training and evaluating machine learning models.

Topic 10: Model Training - The process of training a machine learning model using labeled data to learn patterns and make predictions.
MLOps Context: Model training is a core component of MLOps, where machine learning models are trained using labeled datasets to learn patterns and make predictions.

Topic 11: Sweep - A process of systematically exploring a range of hyperparameters or configurations to find the best performing model.
MLOps Context: Sweeps are used in MLOps to automate the process of hyperparameter tuning and find the optimal configuration for a machine learning model.

Topic 12: Workspace - A virtual environment or platform where data, code, and experiments are organized and managed.
MLOps Context: Workspaces provide a centralized and collaborative environment for MLOps teams to manage and track their data, code, and experiments.

Topic 13: Install Package - The process of adding or setting up a software package or library in a system or environment.
MLOps Context: Installing packages is a common task in MLOps to set up the necessary dependencies and libraries required for running machine learning workflows.

Topic 14: Managed Notebook - A cloud-based notebook environment that provides a pre-configured and managed environment for running code and experiments.
MLOps Context: Managed notebooks are used in MLOps to provide a collaborative and scalable environment for running and managing machine learning experiments and workflows.

Topic 15: Deploying Model - The process of making a trained machine learning model available for use in a production environment.
MLOps Context: Deploying models is a critical step in MLOps, where trained models are deployed to production environments to serve predictions or perform inference.

Topic 16: Experiment - A systematic process of testing and evaluating hypotheses or models to gain insights or make predictions.
MLOps Context: Experiments are a core component of MLOps, where different models, algorithms, or configurations are tested and evaluated to improve the performance of machine learning workflows.

Topic 17: Parameters - Variables or settings that can be adjusted or passed to a program or function to control its behavior.
MLOps Context: Parameters are used in MLOps to configure and customize the behavior of machine learning models, pipelines, or workflows.

Topic 18: Compute Instance - A virtual machine or server used to run computations or execute code.
MLOps Context: Compute instances are used in MLOps to provide the necessary computational resources for running machine learning models, training jobs, or data processing tasks.

Topic 19: Batch Transform - The process of applying a trained machine learning model to a large batch of data for prediction or inference.
MLOps Context: Batch transforms are used in MLOps to apply trained models to large datasets in a batch or batch-wise manner for prediction or inference.

Topic 20: Python Install - The process of setting up or configuring the Python programming language and its associated packages or libraries.
MLOps Context: Python installation is a common task in MLOps to set up the Python environment and install the necessary packages and libraries for running machine learning workflows.

Topic 21: Permissions Role - A set of permissions or access rights assigned to a user or group of users in a system or application.
MLOps Context: Permissions roles are used in MLOps to control and manage the access and permissions of users or groups to different resources, such as datasets, models, or pipelines.

Topic 22: Web Service - A software system or application that provides functionality or resources over the internet or a network.
MLOps Context: Web services are used in MLOps to expose machine learning models or pipelines as APIs or endpoints for serving predictions or performing inference.

Topic 23: Metrics - Quantitative measures or indicators used to evaluate or assess the performance or quality of a system or model.
MLOps Context: Metrics are used in MLOps to evaluate and monitor the performance and quality of machine learning models, pipelines, or workflows.

Topic 24: Server - A computer or system that provides resources or services to other computers or clients over a network.
MLOps Context: Servers are used in MLOps to host and serve machine learning models, APIs, or web services for prediction or inference.

Topic 25: Authentication - The process of verifying the identity or credentials of a user or system to grant access to resources or services.
MLOps Context: Authentication is an important aspect of MLOps to ensure secure access and control to machine learning models, pipelines, or data.

Topic 26: Creating Studio - The process of setting up or configuring a development or collaboration environment for machine learning workflows.
MLOps Context: Creating a studio involves setting up a collaborative environment for MLOps teams to work on machine learning projects, share code, and manage experiments.

Topic 27: Loading Data - The process of reading or importing data from a file or external source into a system or application.
MLOps Context: Loading data is a common task in MLOps to import and prepare datasets for training, evaluation, or inference in machine learning workflows.

Topic 28: Inference Endpoint - A network-accessible interface or service that allows clients to send data and receive predictions or inferences from a trained machine learning model.
MLOps Context: Inference endpoints are used in MLOps to deploy and serve trained machine learning models, allowing clients to send data and receive predictions or inferences.

Topic 29: Saving Model - The process of persisting or storing a trained machine learning model to a file or storage system for later use or deployment.
MLOps Context: Saving models is a critical step in MLOps to store and persist trained machine learning models for later use in deployment or inference.

Topic 30: Model Endpoint - A network-accessible interface or service that exposes a trained machine learning model for prediction or inference.
MLOps Context: Model endpoints are used in MLOps to deploy and serve trained machine learning models, providing a network-accessible interface for prediction or inference.

Topic 31: Environment - The combination of hardware, software, and settings that provide a specific computing or development environment.
MLOps Context: Environments are used in MLOps to provide consistent and reproducible computing or development environments for running machine learning workflows.

Topic 32: Spark - An open-source distributed computing system designed for big data processing and analytics.
MLOps Context: Spark is commonly used in MLOps for distributed data processing, machine learning, and analytics tasks, enabling scalable and efficient processing of large datasets.

Topic 33: Log Metrics - The process of recording or tracking specific metrics or variables during the execution of a pipeline or workflow.
MLOps Context: Logging metrics is a common practice in MLOps to track and monitor the performance and behavior of machine learning models, pipelines, or workflows.

Topic 34: Bucket - A logical container or storage unit used to store and organize files or objects in cloud storage systems.
MLOps Context: Buckets are used in MLOps to store and manage datasets, models, or other artifacts in cloud storage systems, providing a scalable and reliable storage solution.

Topic 35: Scoring - The process of evaluating or assigning scores or probabilities to data based on a trained machine learning model.
MLOps Context: Scoring is a common task in MLOps, where trained models are used to assign scores or probabilities to new or unseen data for classification or prediction.

Topic 36: Training - The process of teaching or learning from data to develop a machine learning model or algorithm.
MLOps Context: Training is a core component of MLOps, where machine learning models are developed and optimized using labeled datasets to learn patterns and make predictions.

Topic 37: Limit Exceeded - The condition where a system or resource has reached or exceeded its predefined limit or capacity.
MLOps Context: Limit exceeded refers to situations in MLOps where a system or resource, such as compute capacity or API rate limits, has reached its maximum capacity or allowed usage.

Topic 38: Deployment - The process of making a software application or system available for use in a specific environment or context.
MLOps Context: Deployment is a critical step in MLOps, where trained machine learning models or pipelines are deployed to production environments for serving predictions or performing inference.

Topic 39: Model Registry - A centralized repository or catalog for storing and managing versions of machine learning models.
MLOps Context: Model registries are used in MLOps to store and manage different versions of trained machine learning models, allowing for version control and easy access.

Topic 40: Custom Job - A job or task that is specifically tailored or customized to meet specific requirements or needs.
MLOps Context: Custom jobs are used in MLOps to define and execute specialized tasks or workflows that are tailored to specific requirements or use cases.

Topic 41: Storage - The process or act of storing or saving data or information for later use or retrieval.
MLOps Context: Storage is a critical component of MLOps, where data, models, and other artifacts are stored and managed for training, evaluation, or deployment.

Topic 42: Push - The act of sending or uploading changes or updates to a remote repository or server.
MLOps Context: Pushing changes or updates to a remote repository or server is a common practice in MLOps to share and collaborate on code, models, or other artifacts.

Topic 43: Cluster - A group or collection of interconnected computers or servers that work together to perform a specific task or function.
MLOps Context: Clusters are used in MLOps to provide the necessary computational resources for distributed data processing, training, or inference tasks.

Topic 44: Notebook Instance - A virtual environment or platform that provides an interactive interface for running code, analyzing data, and creating visualizations.
MLOps Context: Notebook instances are used in MLOps to provide a collaborative and interactive environment for developing, testing, and documenting machine learning workflows.

Topic 45: Service Timeout - The condition where a service or operation takes longer than the predefined time limit to complete.
MLOps Context: Service timeouts refer to situations in MLOps where a service or operation, such as model deployment or API response, takes longer than the predefined time limit to complete.

Topic 46: Loading Model - The process of reading or importing a trained machine learning model from a file or storage system into memory for use or inference.
MLOps Context: Loading models is a common task in MLOps to load trained machine learning models into memory for deployment, inference, or further processing.

Topic 47: Tracking - The process of recording or monitoring the progress, changes, or status of a task, experiment, or workflow.
MLOps Context: Tracking is an essential aspect of MLOps, where the progress, changes, or status of machine learning tasks, experiments, or workflows are recorded and monitored.

Topic 48: Optimization - The process of improving or maximizing the performance, efficiency, or effectiveness of a system, model, or process.
MLOps Context: Optimization is a key objective in MLOps, where machine learning models, pipelines, or workflows are optimized to improve performance, accuracy, or efficiency.

Topic 49: Object Attribute - A characteristic or property associated with an object or entity in a programming or data context.
MLOps Context: Object attributes are used in MLOps to represent and store specific characteristics or properties of objects or entities, such as features or metadata of datasets or models.

Topic 50: Studio Lab - A virtual environment or platform that provides a hands-on learning or experimentation environment for machine learning workflows.
MLOps Context: Studio labs are used in MLOps to provide a controlled and interactive environment for learning, experimenting, and practicing machine learning concepts and techniques.

Topic 51: Upload - The process of transferring or sending files or data from a local system or device to a remote server or storage system.
MLOps Context: Uploading files or data is a common task in MLOps to transfer datasets, models, or other artifacts from a local system to a remote server or cloud storage system.

Topic 52: Train Model - The process of training a machine learning model using labeled data to learn patterns and make predictions.
MLOps Context: Training models is a core component of MLOps, where machine learning models are trained using labeled datasets to learn patterns and make predictions.

Topic 53: File Notebook - A file or document that contains code, text, and visualizations used for analysis, documentation, or collaboration.
MLOps Context: File notebooks are used in MLOps to store and organize code, documentation, and visualizations related to machine learning tasks, experiments, or workflows.

Topic 54: Batch Prediction - The process of applying a trained machine learning model to a batch of data to generate predictions or inferences.
MLOps Context: Batch predictions are used in MLOps to apply trained models to large batches of data for prediction or inference, enabling efficient processing of large datasets.

Topic 55: Connect - The process of establishing a connection or link between different systems, services, or components.
MLOps Context: Connecting systems, services, or components is a common task in MLOps to enable communication, data transfer, or integration between different parts of the machine learning workflow.

Topic 56: Executing Script - The process of running or executing a script or program to perform a specific task or function.
MLOps Context: Executing scripts is a common task in MLOps to run or automate specific tasks, such as data preprocessing, model training, or evaluation.

Topic 57: Validation - The process of checking or verifying the correctness, accuracy, or quality of data, models, or processes.
MLOps Context: Validation is an important step in MLOps to ensure the correctness, accuracy, and quality of machine learning models, datasets, or pipelines.

Topic 58: Import - The process of bringing or incorporating external data, code, or libraries into a system or environment.
MLOps Context: Importing data, code, or libraries is a common task in MLOps to incorporate external resources or dependencies into the machine learning workflow.

Topic 59: Handle Columns - The process of managing or manipulating specific columns or attributes of a dataset or table.
MLOps Context: Handling columns involves managing or manipulating specific attributes or features of datasets or tables in the machine learning workflow.

Topic 60: Drift Monitoring - The process of continuously monitoring and detecting changes or deviations in data or model performance over time.
MLOps Context: Drift monitoring is an important aspect of MLOps to detect and address data or model drift, ensuring the accuracy and reliability of machine learning models in production.

Topic 61: Job - A specific task or assignment that needs to be executed or completed.
MLOps Context: Jobs represent specific tasks or assignments in the MLOps workflow, such as data preprocessing, model training, or deployment.

Topic 62: Pandas - A popular open-source data manipulation and analysis library for the Python programming language.
MLOps Context: Pandas is commonly used in MLOps for data preprocessing, manipulation, and analysis tasks, providing powerful tools for working with structured data.

Topic 63: Installation - The process of setting up or configuring software or dependencies in a system or environment.
MLOps Context: Installation is a common task in MLOps to set up the necessary software, libraries, or dependencies required for running machine learning workflows.

Topic 64: Index Range - A specified range or subset of indices or values within a dataset or array.
MLOps Context: Index ranges are used in MLOps to select or subset specific indices or values within datasets or arrays for processing or analysis.

Topic 65: Data Versioning - The process of managing and tracking different versions or revisions of datasets or data files.
MLOps Context: Data versioning is an important aspect of MLOps to track and manage different versions of datasets, ensuring reproducibility and traceability in machine learning workflows.

Topic 66: Service - A software system or application that provides specific functionality or resources to other systems or users.
MLOps Context: Services are used in MLOps to provide specific functionality or resources, such as model serving, data storage, or compute resources, to support machine learning workflows.

Topic 67: Train File - A file or dataset used for training a machine learning model.
MLOps Context: Train files are used in MLOps as input data for training machine learning models, providing the necessary labeled or unlabeled data for learning patterns and making predictions.

Topic 68: Encoding Mismatch - The condition where the encoding or format of data does not match the expected or required encoding.
MLOps Context: Encoding mismatches refer to situations in MLOps where the encoding or format of data does not match the expected or required encoding, leading to parsing or processing errors.

Topic 69: Search - The process of looking for or finding specific information, resources, or patterns within a dataset or system.
MLOps Context: Searching is a common task in MLOps to find specific information, resources, or patterns within datasets, code, or documentation to support machine learning workflows.

Topic 70: Memory Leak - The condition where a program or system continues to consume memory without releasing or deallocating it, leading to memory exhaustion or performance issues.
MLOps Context: Memory leaks refer to situations in MLOps where a program or system continues to consume memory without releasing or deallocating it, potentially leading to memory exhaustion or performance issues.

Topic 71: Training Stuck - The condition where a training job or process becomes unresponsive or fails to complete due to various issues or errors.
MLOps Context: Training stuck refers to situations in MLOps where a training job or process becomes unresponsive or fails to complete due to issues such as resource constraints, errors, or system failures.

Topic 72: Download - The process of transferring or retrieving files or data from a remote server or storage system to a local system or device.
MLOps Context: Downloading files or data is a common task in MLOps to retrieve trained models, datasets, or other artifacts from a remote server or cloud storage system to a local system or device.

Topic 73: Models - Trained machine learning models that can make predictions or inferences based on input data.
MLOps Context: Models are a core component of MLOps, representing the trained machine learning algorithms or models that can be used to make predictions or inferences.

Topic 74: Distributed Training - The process of training machine learning models using distributed computing resources or multiple machines to accelerate training and handle large datasets.
MLOps Context: Distributed training is used in MLOps to leverage distributed computing resources or multiple machines to train machine learning models more efficiently and handle large datasets.

Topic 75: Model Deployment - The process of making a trained machine learning model available for use in a production environment.
MLOps Context: Model deployment is a critical step in MLOps, where trained machine learning models are deployed to production environments to serve predictions or perform inference.

Topic 76: Running Script - The process of executing or running a script or program to perform a specific task or function.
MLOps Context: Running scripts is a common task in MLOps to execute or automate specific tasks, such as data preprocessing, model training, or evaluation.

Topic 77: Model Inference - The process of using a trained machine learning model to make predictions or inferences based on input data.
MLOps Context: Model inference is a core component of MLOps, where trained machine learning models are used to make predictions or inferences based on new or unseen data.

Topic 78: Notebook - An interactive document or environment that combines code, text, and visualizations for data analysis, exploration, and documentation.
MLOps Context: Notebooks are commonly used in MLOps to develop, test, and document machine learning workflows, allowing for interactive code execution, data analysis, and collaboration.

Topic 79: Version Conflict - The condition where different versions or revisions of software, libraries, or dependencies are incompatible or cause conflicts.
MLOps Context: Version conflicts refer to situations in MLOps where different versions or revisions of software, libraries, or dependencies are incompatible or cause conflicts, leading to errors or unexpected behavior.

Topic 80: Studio Domain - A specific domain or context within a machine learning studio or environment.
MLOps Context: Studio domains are used in MLOps to provide specific contexts or environments for different machine learning projects, teams, or use cases within a larger machine learning studio or platform.

Topic 81: Deploying Endpoint - The process of making a machine learning model or service available as an endpoint or API for external clients or systems to access and use.
MLOps Context: Deploying endpoints is a critical step in MLOps, where machine learning models or services are made available as endpoints or APIs for external clients or systems to access and use.

Topic 82: Pipeline - A sequence of steps or tasks that are executed in a specific order to complete a workflow or process.
MLOps Context: Pipelines are a fundamental concept in MLOps, representing the sequence of steps or tasks that are executed to complete a machine learning workflow or process.

Topic 83: Register Model - The process of recording or cataloging a trained machine learning model in a central repository or catalog.
MLOps Context: Registering models is a common practice in MLOps to record or catalog trained machine learning models in a central repository or catalog, allowing for version control and easy access.

Topic 84: Input Type - The format or structure of the input data required by a machine learning model or algorithm.
MLOps Context: Input types refer to the specific format or structure of the input data required by machine learning models or algorithms in MLOps workflows.

Topic 85: Component - A modular or reusable piece of code or functionality that performs a specific task or function.
MLOps Context: Components are used in MLOps to represent modular or reusable pieces of code or functionality that perform specific tasks or functions within machine learning workflows.

Topic 86: Clone - The process of creating an identical or copy of an existing object, repository, or environment.
MLOps Context: Cloning is a common practice in MLOps to create identical or copy objects, repositories, or environments for replication, experimentation, or version control.

Topic 87: Custom Training - The process of designing and implementing a specialized or customized training process or workflow for specific machine learning tasks or requirements.
MLOps Context: Custom training involves designing and implementing specialized or customized training processes or workflows to meet specific machine learning tasks or requirements in MLOps.

Topic 88: Endpoint - A network-accessible interface or service that provides specific functionality or resources to clients or systems.
MLOps Context: Endpoints are used in MLOps to provide specific functionality or resources, such as model serving, data access, or API endpoints, to clients or systems.

Topic 89: Invoke Endpoint - The process of calling or sending requests to a machine learning model or service endpoint to receive predictions or inferences.
MLOps Context: Invoking endpoints is a common task in MLOps to call or send requests to machine learning model or service endpoints to receive predictions or inferences.

Topic 90: File Pickle - A file format used to serialize and store Python objects in a binary format for later use or retrieval.
MLOps Context: File pickles are commonly used in MLOps to store and serialize trained machine learning models or objects in a binary format for later use or retrieval.

Topic 91: Feature Store - A centralized repository or system for storing and managing features or attributes used in machine learning workflows.
MLOps Context: Feature stores are used in MLOps to store and manage features or attributes used in machine learning workflows, enabling efficient feature engineering and reuse.

Topic 92: Estimator - A high-level API or interface that encapsulates the training and evaluation of machine learning models.
MLOps Context: Estimators are used in MLOps to provide a high-level API or interface for training and evaluating machine learning models, simplifying the development and deployment process.

Topic 93: Account - A user or system identity used to access or interact with resources or services in a system or application.
MLOps Context: Accounts are used in MLOps to represent user or system identities that are granted access or permissions to specific resources or services.

Topic 94: Update - The process of making changes or modifications to a system, software, or configuration to improve functionality or address issues.
MLOps Context: Updating systems, software, or configurations is a common practice in MLOps to incorporate improvements, bug fixes, or new features into the machine learning workflow.

Topic 95: Table - A structured representation of data organized in rows and columns.
MLOps Context: Tables are commonly used in MLOps to represent structured data, such as datasets or results, organized in rows and columns for analysis or processing.

Topic 96: Installing Package - The process of adding or setting up a software package or library in a system or environment.
MLOps Context: Installing packages is a common task in MLOps to set up the necessary dependencies and libraries required for running machine learning workflows.

Topic 97: Import Data - The process of bringing or incorporating external data into a system or environment for analysis or processing.
MLOps Context: Importing data is a common task in MLOps to incorporate external datasets or data sources into the machine learning workflow for analysis, training, or evaluation.

Topic 98: Face Transformers - A library or framework for natural language processing and understanding tasks, developed by Hugging Face.
MLOps Context: Face transformers, developed by Hugging Face, is a popular library or framework used in MLOps for natural language processing and understanding tasks, such as text classification or sentiment analysis.

Topic 99: Training Job - A specific task or assignment that involves training a machine learning model using labeled data.
MLOps Context: Training jobs represent specific tasks or assignments in the MLOps workflow that involve training machine learning models using labeled datasets.

Topic 100: Report - A document or summary that provides information, analysis, or insights on a specific topic or subject.
MLOps Context: Reports are commonly used in MLOps to document and communicate information, analysis, or insights on machine learning tasks, experiments, or workflows.

Topic 101: Output - The result or outcome produced by a system, process, or model.
MLOps Context: Outputs in MLOps refer to the results or outcomes produced by machine learning models, pipelines, or workflows, such as predictions, evaluations, or visualizations.

Topic 102: Export - The process of transferring or saving data or information from a system or environment to an external file or storage system.
MLOps Context: Exporting data or information is a common task in MLOps to transfer or save results, predictions, or other outputs from a system or environment to an external file or storage system.

Topic 103: Trained Object - An object or entity that has undergone training or learning from data to make predictions or inferences.
MLOps Context: Trained objects in MLOps refer to entities, such as machine learning models or algorithms, that have undergone training or learning from data to make predictions or inferences.

Topic 104: Run - The execution or performance of a specific task, process, or workflow.
MLOps Context: Runs represent the execution or performance of specific tasks, processes, or workflows in the MLOps context, such as model training, evaluation, or deployment.

Topic 105: Version - A specific release or iteration of a software, model, or artifact.
MLOps Context: Versions in MLOps refer to specific releases or iterations of software, models, or artifacts, allowing for version control, reproducibility, and traceability.

Topic 106: Auto Scaling - The process of automatically adjusting the computational resources or capacity based on the workload or demand.
MLOps Context: Auto scaling is used in MLOps to automatically adjust the computational resources or capacity based on the workload or demand of machine learning tasks or workflows.

Topic 107: Batch Size - The number of samples or data points processed or trained in a single iteration or batch.
MLOps Context: Batch size refers to the number of samples or data points processed or trained in a single iteration or batch during the training or inference process in machine learning workflows.

Topic 108: Deploy Model - The process of making a trained machine learning model available for use in a production environment.
MLOps Context: Deploying models is a critical step in MLOps, where trained machine learning models are made available for use in production environments to serve predictions or perform inference.

Topic 109: Blob Storage - A type of cloud storage that stores unstructured data as binary large objects (BLOBs).
MLOps Context: Blob storage is commonly used in MLOps to store and manage unstructured data, such as images, documents, or logs, in a scalable and cost-effective manner.

Topic 110: Cost - The amount of money, resources, or effort required or consumed to complete a task or process.
MLOps Context: Cost refers to the amount of money, resources, or effort required or consumed to complete machine learning tasks, processes, or workflows in MLOps.

Topic 111: Access - The ability or permission to interact with or use specific resources, services, or functionalities.
MLOps Context: Access refers to the ability or permission granted to users or systems to interact with or use specific resources, services, or functionalities in MLOps.

Topic 112: Scheduling Notebook - The process of setting up a schedule or automated execution for running notebooks or code at specific times or intervals.
MLOps Context: Scheduling notebooks allows for the automated execution of code or notebooks at specific times or intervals, enabling regular updates, data processing, or model training in MLOps.

Topic 113: Tuning Model - The process of adjusting or optimizing the hyperparameters or configurations of a machine learning model to improve its performance or accuracy.
MLOps Context: Tuning models involves adjusting or optimizing the hyperparameters or configurations of machine learning models to improve their performance or accuracy in MLOps.

Topic 114: Tests - The process of evaluating or verifying the correctness, functionality, or performance of a system, model, or process.
MLOps Context: Tests are used in MLOps to evaluate or verify the correctness, functionality, or performance of machine learning systems, models, or processes.

Topic 115: Tracked Directory - A directory or folder that is monitored or tracked for changes, updates, or modifications.
MLOps Context: Tracked directories are used in MLOps to monitor or track specific directories or folders for changes, updates, or modifications, enabling automated workflows or processes.

Topic 116: Create Endpoint - The process of setting up or configuring a machine learning model or service as an endpoint or API for external clients or systems to access and use.
MLOps Context: Creating endpoints involves setting up or configuring machine learning models or services as endpoints or APIs for external clients or systems to access and use in MLOps.

Topic 117: Designer - A person or role responsible for designing or creating machine learning models, workflows, or systems.
MLOps Context: Designers in MLOps are responsible for designing or creating machine learning models, workflows, or systems, ensuring their effectiveness, efficiency, and reliability.

Topic 118: Request Endpoint - The process of sending or making requests to a machine learning model or service endpoint to receive predictions or inferences.
MLOps Context: Requesting endpoints involves sending or making requests to machine learning model or service endpoints to receive predictions or inferences based on input data in MLOps.

Topic 119: Resource Group - A logical grouping or container for organizing and managing resources or services in a cloud or computing environment.
MLOps Context: Resource groups are used in MLOps to logically group and manage resources or services, such as compute instances, storage, or networking, in a cloud or computing environment.

Topic 120: Model Serve - The process of making a trained machine learning model available for use in a serving or production environment.
MLOps Context: Model serving involves making trained machine learning models available for use in serving or production environments, allowing for real-time predictions or inferences in MLOps.

Topic 121: File Size - The size or amount of storage space occupied by a file or data.
MLOps Context: File size refers to the size or amount of storage space occupied by a file or data, which can impact storage requirements, transfer times, or memory usage in MLOps.'''

In [7]:
# Parse the `topics` text into one record per topic and print the result as a
# LaTeX table. Each entry is expected to look like
#   "<n>: <name> - <description>\nMLOps Context: <description for MLOps>"
topic_list = []
for index, topic_entry in enumerate(topics.split('Topic ')):
    # Skip empty fragments (e.g. the one in front of the first 'Topic ' marker).
    if not topic_entry:
        continue
    # Split on the first separator only: a description that itself contains
    # ' - ' (or repeats 'MLOps Context: ') must not break the two-name unpacking.
    topic_name, topic_info = topic_entry.split(' - ', 1)
    topic_description, topic_description_mlops = topic_info.split('MLOps Context: ', 1)
    topic = {
        'Index': index,
        # Drop the leading "<n>: " numbering and keep the topic name only.
        'Topic': topic_name.split(': ')[-1],
        'Description': topic_description.strip(),
        # 'Description (MLOps)': topic_description_mlops.strip(),
    }
    topic_list.append(topic)
topic_df = pd.DataFrame(topic_list)
print(topic_df.to_latex(index=False))

\begin{tabular}{rll}
\toprule
Index & Topic & Description \\
\midrule
1 & Pipeline Step & A step in a pipeline that represents a specific task or action. \\
2 & Log & A record of events or messages generated by a system or application. \\
3 & Docker Image & A lightweight, standalone, and executable software package that includes everything needed to run a piece of software, including the code, runtime, libraries, and system tools. \\
4 & Model Tar & A compressed archive file format used to store and distribute machine learning models. \\
5 & File Directory & A hierarchical structure used to organize and store files and folders. \\
6 & Plot & A graphical representation of data or information. \\
7 & Runs & Instances of executing a pipeline or workflow. \\
8 & Artifacts & Any output or intermediate result generated during the execution of a pipeline or workflow. \\
9 & Predict & To make a forecast or estimate based on available data or a trained model. \\
10 & Labeling & The process of a

In [18]:
# (macro-topic name, indices of the fine-grained topics assigned to it)
macro_topic2index_list = [
    ('Code Development', [17,49,56,64,76,85]),
    ('Code Management', []),
    ('Compute Management', [18,37,43,44,45,66,70,106,119,121]),
    ('Data Development', [9,32,59,62,95]),
    ('Data Management', [7,27,34,47,65,68,91,97,102,103,109]),
    ('Environment Management', [2,12,13,14,20,26,31,50,58,63,78,79,80,94,96,105,117]),
    ('Experiment Management', [6,16,104]),
    ('File Management', [4,41,42,51,53,72,84,86,90,101,115]),
    ('Model Development', [10,11,36,48,52,57,67,69,71,74,87,92,98,99,107,113]),
    ('Model Management', [3,29,39,46,73,83]),
    ('Model Serving', [8,15,19,28,30,35,38,54,75,77,81,88,89,108,116,120]),
    ('Network Management', [22,24,55,118]),
    ('Observability Management', [1,5,23,33,60,100]),
    ('Pipeline Management', [0,40,61,82,112]),
    ('Quality Assurance Management', [114]),
    ('Security Management', [21,25,93,111]),
    # ('User Interface Management', []),
    ('Miscellaneous', [110]),
]

topic_list = []              # every assigned topic index, in declaration order
macro_topic_mapping = {}     # topic index -> position of its macro-topic
macro_topic_indexing = {}    # macro-topic position -> macro-topic name
macro_topic2index_dict = {}  # macro-topic name -> its topic indices
for index, (macro_name, topic_indices) in enumerate(macro_topic2index_list):
    macro_topic2index_dict[macro_name] = topic_indices
    macro_topic_indexing[index] = macro_name
    topic_list.extend(topic_indices)
    for topic in topic_indices:
        macro_topic_mapping[topic] = index

# Sanity checks on the manual assignment:
# 1) no topic index is assigned twice,
# 2) as many indices are assigned as there are topics in topic_df,
# 3) no topic index in 0..len(topic_df)-1 is left unassigned.
# The third check used to compare two ranges of equal length, which is always
# empty once check 2 passes; it now compares against the assigned indices.
print(find_duplicates(topic_list))
print(len(topic_df) == len(topic_list))
print(set(range(topic_df.shape[0])).difference(topic_list))

[]
True
set()


In [43]:
# assign human-readable & high-level topics to challenges & solutions

df = pd.read_json(os.path.join(path_rq1, 'topics.json'))

# Keep only the posts whose topic belongs to a macro-topic and attach the
# macro-topic position. Done as a vectorised filter + map instead of dropping
# rows from the frame while iterating over it.
has_macro_topic = df['Challenge_topic'].isin(list(macro_topic_mapping))
df = df[has_macro_topic].copy()
df['Challenge_topic_macro'] = df['Challenge_topic'].map(macro_topic_mapping).astype(int)

df.to_json(os.path.join(path_rq1, 'filtered.json'), indent=4, orient='records')

# Per macro-topic summary: share of posts, unresolved rate and median
# resolution time. Rows are collected in a list and turned into a frame once,
# rather than concatenating a one-row frame per iteration.
records = []
values = []  # number of posts per macro-topic (pie chart values)
labels = []  # macro-topic names (pie chart labels)

for index, group in df.groupby('Challenge_topic_macro'):
    macro_name = macro_topic_indexing[index]
    solved_time = group['Challenge_solved_time']
    records.append({
        # Displayed indices are 1-based.
        'Index': index + 1,
        'Macro-topic': macro_name,
        'Percentage (%)': round(len(group) / len(df) * 100, 2),
        # A missing solved time is counted as an unresolved post.
        'Unresolved rate (%)': round(solved_time.isna().sum() / len(group) * 100, 2),
        'Median resolution time (hours)': round(solved_time.median(), 2),
        'Topic list': [topic + 1 for topic in macro_topic2index_dict[macro_name]],
    })
    labels.append(macro_name)
    values.append(len(group))

df_number = pd.DataFrame(records)

print(df_number.to_latex(float_format="%.2f", index=False))
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.show()

\begin{tabular}{rlrrrl}
\toprule
 Index &              Macro-topic &  Percentage (\%) &  Unresolved rate (\%) &  Median resolution time (hours) &                                                                                                                   Topic list \\
\midrule
     1 &         Code Development &            4.08 &                64.27 &                           53.26 &                                                                                               [16, 41, 60, 67, 73, 134, 137] \\
     2 &          Code Management &            0.45 &                69.23 &                           27.06 &                                                                                                                         [25] \\
     3 &       Compute Management &            3.46 &                66.58 &                           20.23 &                                                                                              [29, 47, 82, 93, 109, 116, 131] \\


In [29]:
# Count knowledge vs. problem inquiries for every topic, then test both count
# distributions for normality (Shapiro-Wilk) and against each other
# (Mann-Whitney U).
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

values_k = []
values_p = []

for _, members in df.groupby('Challenge_topic'):
    inquiry_type = members['Challenge_type']
    values_k.append(len(members[inquiry_type == 'knowledge']))
    values_p.append(len(members[inquiry_type == 'problem']))

normality_p = shapiro(values_p)[1]
normality_k = shapiro(values_k)[1]
difference = mannwhitneyu(values_k, values_p)[1]

print("P-value of problem inquiry:", normality_p)
print("P-value of knowledge inquiry:", normality_k)
print("P-value of two type of inquiries:", difference)

P-value of problem inquiry: 1.3298123491267688e-08
P-value of knowledge inquiry: 1.7512710170919377e-09
P-value of two type of inquiries: 0.03528018186508468


In [37]:
# List the (1-based) positions where problem and knowledge counts differ by
# more than 50.
for position, (problem_count, knowledge_count) in enumerate(zip(values_p, values_k), start=1):
    gap = problem_count - knowledge_count
    if abs(gap) > 50:
        print(f'{position}: {gap}')

1: 63
6: 53
23: 57
25: 56
29: 69
46: 52
69: 58
75: 99


In [36]:
# Kendall rank correlation between prevalence, unresolved rate and median
# resolved time, computed across macro-topics.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

# One row of summary statistics per macro-topic.
df_topics = pd.DataFrame([
    {
        'Prevalence': members['Challenge_topic_macro'].count(),
        'Unresolved rate (%)': members['Challenge_solved_time'].isna().sum() / len(members) * 100,
        'Median resolved time (h)': members['Challenge_solved_time'].median(),
    }
    for _, members in df.groupby('Challenge_topic_macro')
])

metrics = df_topics.columns
df_correlation = pd.DataFrame(index=metrics, columns=metrics)

# Fill the upper triangle only; each cell holds "tau/p-value".
for first, second in itertools.combinations(metrics, 2):
    tau, pvalue = kendalltau(df_topics[first], df_topics[second])
    df_correlation.loc[first, second] = f'{tau:.2f}/{pvalue:.2f}'

print(df_correlation.to_latex())

\begin{tabular}{llll}
\toprule
{} & Prevalence & Unresolved rate (\%) & Median resolved time (h) \\
\midrule
Prevalence               &        NaN &          -0.13/0.49 &                0.19/0.32 \\
Unresolved rate (\%)      &        NaN &                 NaN &               -0.37/0.06 \\
Median resolved time (h) &        NaN &                 NaN &                      NaN \\
\bottomrule
\end{tabular}



In [16]:
# Kruskal-Wallis H-test on log-scaled resolution times of the resolved posts,
# with one sample per macro-topic.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df = df[df['Challenge_solved_time'].notna()]

resolved_time_list = [
    members['Challenge_solved_time'].apply(lambda hours: np.log(hours+1))
    for _, members in df.groupby('Challenge_topic_macro')
]

h_test = kruskal(*resolved_time_list)
print(f'Kruskal-Wallis H-test: p-value = {h_test.pvalue:.3f}')

Kruskal-Wallis H-test: p-value = 0.001


In [None]:
# Box plot of log-scaled resolution time, one box per macro-topic.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
resolved_time_list = []
traces = []

for name, group in df.groupby('Challenge_topic_macro'):
    resolved_time = group['Challenge_solved_time'].apply(lambda x: np.log(x+1))
    resolved_time_list.append(resolved_time)
    # Name the box after the groupby key, not the position in the list: a
    # macro-topic without any post produces no group, so positions and
    # macro-topic indices drift apart and later boxes would be mislabelled.
    traces.append(go.Box(y=resolved_time, name=f"{macro_topic_indexing[name]}"))

fig = go.Figure(data=traces)
fig.update_layout(
    xaxis_title="Macro-topics",
    yaxis_title="Resolved time (hours) in log scale",
    showlegend=False
)
fig.show()
fig.write_image(os.path.join(path_rq1, 'Resolved time in log scale.pdf'))

In [None]:
# Box plot of log-scaled resolution time, one box per macro-topic. The solved
# time column is converted to log scale before grouping.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df['Challenge_solved_time'] = df['Challenge_solved_time'].apply(lambda x: np.log(x+1))
resolved_time_list = []
traces = []

for name, group in df.groupby('Challenge_topic_macro'):
    resolved_time_list.append(group['Challenge_solved_time'])
    # Name the box after the groupby key, not the position in the list: a
    # macro-topic without any post produces no group, so positions and
    # macro-topic indices drift apart and later boxes would be mislabelled.
    traces.append(go.Box(y=group['Challenge_solved_time'], name=f"{macro_topic_indexing[name]}"))

fig = go.Figure(data=traces)
fig.update_layout(
    xaxis_title="Macro-topics",
    yaxis_title="Resolved time (hours) in log scale",
    showlegend=False
)
fig.show()
fig.write_image(os.path.join(path_rq1, 'Resolved time in log scale.pdf'))

In [35]:
# Box plot of log-scaled resolution time per macro-topic, split by inquiry type.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df = df.assign(
    # log(x + 1) of the solved time
    Challenge_solved_time=df['Challenge_solved_time'].apply(lambda hours: np.log(hours+1)),
    # replace the macro-topic position by its name for the x axis
    Challenge_topic_macro=df['Challenge_topic_macro'].apply(lambda position: macro_topic_indexing[position]),
)

fig = px.box(df, x="Challenge_topic_macro", y="Challenge_solved_time", color="Challenge_type")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
axis_and_legend_titles = dict(
    xaxis_title="Macro-topics",
    yaxis_title="Resolved time (hours) in log scale",
    legend_title='Inquiry type',
)
fig.update_layout(**axis_and_legend_titles)
fig.show()
fig.write_image(os.path.join(path_rq1, 'Resolved time in log scale (based on inquiry types).pdf'))

In [None]:
# df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
# topic_names = df['Challenge_topic_macro'].unique()
# df_comparison = pd.DataFrame(index=topic_names, columns=topic_names)

# for name, group in df.groupby('Challenge_topic_macro'):
#     for name2, group2 in df.groupby('Challenge_topic_macro'):
#         if name < name2:
#             continue
#         values = remove_nan(group['Challenge_solved_time'])
#         values2 = remove_nan(group2['Challenge_solved_time'])
#         statistic, pvalue = mannwhitneyu(values, values2)
#         df_comparison.at[name, name2] = f'{statistic:.4f}/{pvalue:.4f}'

# print(df_comparison.to_latex())

In [25]:
# Unresolved rate per macro-topic, computed separately for knowledge and
# problem inquiries, followed by Shapiro-Wilk and Mann-Whitney U tests.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

values_k = []
values_p = []

for _, members in df.groupby('Challenge_topic_macro'):
    for inquiry, rates in (('knowledge', values_k), ('problem', values_p)):
        solved_time = members.loc[members['Challenge_type'] == inquiry, 'Challenge_solved_time']
        # Share of posts of this inquiry type without a solved time.
        rates.append(solved_time.isna().sum()/len(solved_time))

values_k = remove_nan(values_k)
values_p = remove_nan(values_p)

normality_p = shapiro(values_p)[1]
normality_k = shapiro(values_k)[1]
difference = mannwhitneyu(values_k, values_p)[1]

print("P-value of problem inquiry:", normality_p)
print("P-value of knowledge inquiry:", normality_k)
print("P-value of two type of inquiries:", difference)

P-value of problem inquiry: 0.5501300096511841
P-value of knowledge inquiry: 0.631529688835144
P-value of two type of inquiries: 0.4553018964198552


In [29]:
# Median resolution time per topic, computed separately for knowledge and
# problem inquiries, followed by Shapiro-Wilk and Mann-Whitney U tests.
df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))

values_k = []
values_p = []

for index, group in df.groupby('Challenge_topic'):
    group_k = group[group['Challenge_type'] == 'knowledge']
    group_p = group[group['Challenge_type'] == 'problem']
    values_k.append(group_k['Challenge_solved_time'].median())
    values_p.append(group_p['Challenge_solved_time'].median())

# Filter NaN medians into separate lists used only for the tests. values_k and
# values_p keep one entry per topic so that they stay aligned position by
# position; overwriting them with independently filtered lists would pair
# medians of different topics in any later element-wise comparison.
# NOTE(review): remove_nan is defined elsewhere in the notebook; presumably it
# returns the input without its NaN entries - confirm it does not modify the
# list in place.
tested_k = remove_nan(values_k)
tested_p = remove_nan(values_p)

print("P-value of problem inquiry:", shapiro(tested_p)[1])
print("P-value of knowledge inquiry:", shapiro(tested_k)[1])
print("P-value of two type of inquiries:", mannwhitneyu(tested_k, tested_p)[1])

P-value of problem inquiry: 2.157887777458163e-18
P-value of knowledge inquiry: 7.568273346198627e-25
P-value of two type of inquiries: 2.7861333502044098e-11


In [32]:
# List the (1-based) positions where the problem and knowledge medians differ
# by more than 500.
for position, (problem_median, knowledge_median) in enumerate(zip(values_p, values_k), start=1):
    gap = problem_median - knowledge_median
    if abs(gap) > 500:
        print(f'{position}: {gap}')

15: -2233.3722222222
96: 635.2957094444
107: -11147.100535833351
131: 1073.1950750000499
