In [1]:
import os
import re
import time
import spacy
import pickle
import openai
import random
import enchant
import textstat
import itertools
import collections
import pandas as pd
import plotly.graph_objects as go

from bs4 import BeautifulSoup
from collections import namedtuple
from gensim.parsing.preprocessing import remove_stopwords, strip_short, strip_punctuation, STOPWORDS

In [2]:
# Locations of the input datasets and of the generated results
path_dataset = '../../Dataset'
path_result = '../../Result'
path_rq1 = os.path.join(path_result, 'RQ1')

random_state = 42

# Shared NLP resources used by the preprocessing helpers
nlp = spacy.load("en_core_web_sm")
spell_checker = enchant.Dict("en_US")

pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

# The API key is read from the environment only; never commit a key to the notebook.
# A key was previously hardcoded here as a fallback and must be treated as leaked (revoke it).
# When OPENAI_API_KEY is unset this is None and the OpenAI calls below fail with an auth error.
openai.api_key = os.getenv('OPENAI_API_KEY')

prompt_summary = '''Summarize the core idea of the post into a succinct sentence, suppressing verbosity such as "The core idea of the post is that...". Use only common English words rather than unique symbols found in the text.\n###'''

# Maps each tool's display name to the lowercase keywords associated with it.
# All keyword values are flattened into `tools_keyword_list` and added to the
# stop-word list further down, so they are removed from preprocessed text.
tools_keyword_mapping = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['sagemaker', 'amazon', 'aws'],
    'Azure Machine Learning': ['azure', 'microsoft'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow', 'databricks'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['vertex', 'google', 'gcp'],
    'Weights & Biases': ['weights', 'biases', 'wandb']
}

# Keyword sets used by `analyze_links` to classify a link by substring match.
# A link is assigned to the first category whose keyword it contains, checked in
# this order: image, patch, issue, tool, documentation, tutorial; anything else
# is counted as an example link.

# File extensions that mark a link as an image
keywords_image = {
    ".jpg", 
    ".png", 
    ".jpeg", 
    ".gif", 
    ".bmp", 
    ".webp", 
    ".svg", 
    ".tiff"
}

# Substrings that mark a link as a patch (e.g. a pull request URL)
keywords_patch = {
    'pull',
}

# Substrings that mark a link as an issue or Q&A thread
keywords_issue = {
    'answers',
    'discussions',
    'forums',
    'issues',
    'questions',
    'stackoverflow',
}

# Substrings that mark a link as pointing to a code host or package index
keywords_tool = {
    'github',
    'gitlab',
    'pypi',
}

# Substrings that mark a link as documentation
# NOTE(review): 'org' matches as a plain substring, so any link containing
# "org" (including ".org" domains) lands here unless an earlier category
# matched first - confirm this is intended.
keywords_doc = {
    'developers',
    'docs',
    'documentation',
    'features',
    'library',
    'org',
    'wiki',
}

# Substrings that mark a link as a tutorial
keywords_tutorial = {
    'guide',
    'learn',
    'tutorial',
}

# Domain-specific stop words removed from preprocessed text in addition to the
# gensim STOPWORDS and the tool keywords. Every entry must end with a comma:
# adjacent string literals without one are silently concatenated by Python,
# which previously merged 'anomaly', 'anomalies' and 'answer' into one bogus word.
stop_words_custom = {
    'ability',
    'abilities',
    'accident',
    'accidents',
    'acknowledgement',
    'action',
    'actions',
    'activities',
    'activity',
    'ad',
    'ads',
    'advice',
    'alternative',
    'alternatives',
    'analysis',
    'analyses',
    'announcement',
    'anomaly',
    'anomalies',
    'answer',
    'answers',
    'appreciation',
    'approach',
    'approaches',
    'article',
    'articles',
    'assertion',
    'assistance',
    'assumption',
    'attempt',
    'author',
    'behavior',
    'behaviour',
    'benefit',
    'bit',
    'bits',
    'block',
    'blocks',
    'blog',
    'blogs',
    'body',
    'bug',
    'bugs',
    'building',
    'case',
    'cases',
    'categories',
    'category',
    'cause',
    'causes',
    'challenge',
    'challenges',
    'change',
    'changes',
    'char',
    'character',
    'characters',
    'check',
    'choice',
    'choices',
    'classification',
    'cloud',
    'collection',
    'com',
    'combination',
    'communication',
    'community',
    'communities',
    'company',
    'companies',
    'concept',
    'concepts',
    'concern',
    'concerns',
    'condition',
    'conditions',
    'confirmation',
    'confusion',
    'consideration',
    'contact',
    'content',
    'contents',
    'control',
    'count',
    'couple',
    'couples',
    'course',
    'courses',
    'crash',
    'crashes',
    'cross',
    'current',
    'custom',
    'customer',
    'customers',
    'day',
    'days',
    'decision',
    'default',
    'demand',
    'demo',
    'description',
    'desire',
    'desktop',
    'detail',
    'details',
    'differ',
    'difference',
    'differences',
    'difficulties',
    'difficulty',
    'discrepancies',
    'discrepancy',
    'discussion',
    'dislike',
    'distinction',
    'edit',
    'effect',
    'end',
    'enquiries',
    'enquiry',
    'error',
    'errors',
    'evidence',
    'example',
    'examples',
    'exception',
    'exceptions',
    'existence',
    'exit',
    'expectation',
    'experience',
    'expert',
    'experts',
    'explanation',
    'face',
    'fact',
    'facts',
    'fail',
    'failure',
    'favorite',
    'favorites',
    'fault',
    'feature',
    'features',
    'feedback',
    'feedbacks',
    'fix',
    'fixes',
    'float',
    'forecast',
    'forecasting',
    'form',
    'forms',
    'functionality',
    'functionalities',
    'future',
    'goal',
    'goals',
    'guarantee',
    'guidance',
    'guideline',
    'guide',
    'guy',
    'guys',
    'harm',
    'help',
    'hour',
    'hours',
    'ibm',
    'idea',
    'ideas',
    'individual',
    'individuals',
    'info',
    'information',
    'inquiries',
    'inquiry',
    'insight',
    'instruction',
    'instructions',
    'int',
    'intelligence',
    'interest',
    'introduction',
    'investigation',
    'invitation',
    'issue',
    'issues',
    'kind',
    'kinds',
    'lack',
    'language',
    'languages',
    'laptop',
    'learn',
    'learning',
    'level',
    'levels',
    # 'location',
    # 'locations',
    'look',
    'looks',
    'lot',
    'lots',
    'luck',
    'machine',
    'machines',
    'major',
    'manner',
    'manners',
    'manual',
    'mark',
    'meaning',
    'message',
    'messages',
    'method',
    'methods',
    'mind',
    'minute',
    'minutes',
    'mistake',
    'mistakes',
    'moment',
    'month',
    'months',
    'need',
    'needs',
    'note',
    'notes',
    'number',
    'numbers',
    'offer',
    'one',
    'ones',
    'opinion',
    'opinions',
    'org',
    'organization',
    'outcome',
    'part',
    'parts',
    'past',
    'people',
    'permit',
    'person',
    'persons',
    'perspective',
    'perspectives',
    'picture',
    'pictures',
    'place',
    'places',
    'plan',
    'point',
    'points',
    'post',
    'posts',
    'price',
    'problem',
    'problems',
    'processing',
    'product',
    'products',
    'program',
    'programs',
    'project',
    'projects',
    'proposal',
    'purpose',
    'purposes',
    # 'python',
    'question',
    'questions',
    'raise',
    'reason',
    'reasons',
    'recommendation',
    'recommendations',
    'regression',
    'research',
    'result',
    'results',
    'return',
    'returns',
    'scenario',
    'scenarios',
    'science',
    'screen',
    'screenshot',
    'screenshots',
    'second',
    'seconds',
    'section',
    'self',
    'sense',
    'sentence',
    'setup',
    'shape',
    'show',
    'shows',
    'site',
    'situation',
    'software',
    'solution',
    'solutions',
    'speech',
    'start',
    'state',
    'statement',
    'states',
    'status',
    'step',
    'steps',
    'string',
    'study',
    'stuff',
    'success',
    'suggestion',
    'suggestions',
    'summary',
    'summaries',
    'surprise',
    'support',
    'talk',
    'task',
    'tasks',
    'technique',
    'techniques',
    'technologies',
    'technology',
    'term',
    'terms',
    'text',
    'time',
    'times',
    'thank',
    'thanks',
    'thing',
    'things',
    'thought',
    'three',
    'title',
    'today',
    'tomorrow',
    'tool',
    'tools',
    'topic',
    'topics',
    'total',
    'trouble',
    'troubles',
    'truth',
    'try',
    'tutorial',
    'tutorials',
    'two',
    'understand',
    'understanding',
    # 'url',
    # 'urls',
    'use',
    'user',
    'users',
    'uses',
    'value',
    'values',
    'variant',
    'variants',
    'versus',
    'video',
    'videos',
    'view',
    'viewpoint',
    'vision',
    'voice',
    'way',
    'ways',
    'week',
    'weeks',
    'word',
    'words',
    'work',
    'workaround',
    'workarounds',
    'works',
    'yeah',
    'year',
    'years',
    'yesterday',
}

# Flatten every tool's keyword list into one set of tool-related words
tools_keyword_list = set(itertools.chain.from_iterable(tools_keyword_mapping.values()))
# Final stop-word list: gensim defaults + tool keywords + custom domain words
stop_words_list = STOPWORDS.union(tools_keyword_list, stop_words_custom)

In [3]:
def count_code_line(block_list):
    """Count the non-blank lines across a collection of code snippets.

    Args:
        block_list: Iterable whose items are either code strings or iterables
            of code strings.

    Returns:
        int: Number of lines that contain at least one non-whitespace character.
    """
    total_loc = 0
    for blocks in block_list:
        # A bare string is a single snippet. Iterating it directly would walk
        # its characters, which counts non-whitespace characters, not lines.
        snippets = [blocks] if isinstance(blocks, str) else blocks
        for block in snippets:
            total_loc += sum(1 for line in block.splitlines() if line.strip())
    return total_loc

def extract_styles(content):
    """Parse HTML content into plain text, link targets, and a code-size count.

    Returns:
        tuple: ``(clean_text, links, code_line)`` where ``clean_text`` is the
        document text with a space between text nodes, ``links`` holds the
        ``href`` of every ``<a>`` tag that has one, and ``code_line`` is the
        summed ``count_code_line`` result for ``<code>`` and ``<blockquote>``
        elements.
    """
    soup = BeautifulSoup(content, 'html.parser')

    links = []
    for anchor in soup.find_all('a', href=True):
        links.append(anchor['href'])

    # Both <code> and <blockquote> elements are measured as code
    code_line = 0
    for tag_name in ('code', 'blockquote'):
        tag_texts = [element.get_text() for element in soup.find_all(tag_name)]
        code_line += count_code_line(tag_texts)

    clean_text = soup.get_text(separator=' ')
    return clean_text, links, code_line

def extract_code(content):
    """Remove backtick-delimited code from text and measure what was removed.

    Returns:
        tuple: ``(clean_text, code_line)`` - the text without the matched code
        spans, and the summed ``count_code_line`` result for those spans.
    """
    # Triple-backtick fences are handled first, then double, then single
    fence_patterns = (r'```.+?```', r'``.+?``', r'`.+?`')
    remaining = content
    code_line = 0

    for fence in fence_patterns:
        matcher = re.compile(fence, flags=re.DOTALL)
        code_line += count_code_line(matcher.findall(remaining))
        remaining = matcher.sub('', remaining)

    return remaining, code_line

def extract_links(text):
    """Pull Markdown-style and bare http(s) links out of ``text``.

    Returns:
        tuple: ``(clean_text, links)`` - the text with every matched link
        removed, and the link targets (Markdown targets first, then bare URLs).
    """
    # [label](target) and ![alt](target); only the target is captured
    markdown_link = re.compile(r"\!?\[.*?\]\((.*?)\)")
    bare_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

    markdown_targets = markdown_link.findall(text)
    without_markdown = markdown_link.sub('', text)

    # Bare URLs are searched only in what is left after Markdown links are gone
    url_targets = bare_url.findall(without_markdown)
    clean_text = bare_url.sub('', without_markdown)

    return clean_text, markdown_targets + url_targets

def split_content(content):
    """Separate a post into plain text, links, and a code-size count.

    The content goes through ``extract_styles`` (HTML), ``extract_code``
    (backtick code) and ``extract_links`` (Markdown / bare URLs) in that order.

    Returns:
        namedtuple ``Analyzer(text, links, code_line)``.
    """
    html_text, html_links, html_code = extract_styles(content)
    fence_free_text, fenced_code = extract_code(html_text)
    plain_text, text_links = extract_links(fence_free_text)

    content_collection = namedtuple('Analyzer', ['text', 'links', 'code_line'])
    return content_collection(
        text=plain_text,
        links=html_links + text_links,
        code_line=html_code + fenced_code,
    )

def word_frequency(text):
    """Return a ``collections.Counter`` of the whitespace-separated words in ``text``."""
    tally = collections.Counter()
    tally.update(text.split())
    return tally

def extract_nouns(text):
    """Return the tokens of ``text`` that the spaCy pipeline tags as NOUN, space-joined."""
    noun_tokens = []
    for token in nlp(text):
        if token.pos_ == "NOUN":
            noun_tokens.append(token.text)
    return ' '.join(noun_tokens)

def extract_english(text):
    """Return the whitespace-separated words of ``text`` accepted by the spell checker, space-joined."""
    accepted_words = filter(spell_checker.check, text.split())
    return ' '.join(accepted_words)

def preprocess_text(text):
    """Normalize text for topic modeling.

    Steps, in order: lowercase, strip punctuation, keep spell-checked words,
    keep nouns, drop short tokens, remove the words in ``stop_words_list``.
    """
    cleaned = strip_punctuation(text.lower())
    for step in (extract_english, extract_nouns, strip_short):
        cleaned = step(cleaned)
    return remove_stopwords(cleaned, stop_words_list)

def analyze_links(links):
    """Count links per category using substring keyword matching.

    Each link is counted once, under the first category (in the order listed
    in ``rules``) that has a keyword contained in the link; links matching no
    category are counted as examples.

    Returns:
        namedtuple ``Analyzer(image, documentation, tool, issue, patch, tutorial, example)``.
    """
    # Order matters: the first matching category wins
    rules = (
        ('image', keywords_image),
        ('patch', keywords_patch),
        ('issue', keywords_issue),
        ('tool', keywords_tool),
        ('documentation', keywords_doc),
        ('tutorial', keywords_tutorial),
    )
    fields = ['image', 'documentation', 'tool', 'issue', 'patch', 'tutorial', 'example']
    tally = dict.fromkeys(fields, 0)

    for link in links:
        for category, keywords in rules:
            if any(keyword in link for keyword in keywords):
                tally[category] += 1
                break
        else:
            tally['example'] += 1

    link_analysis = namedtuple('Analyzer', fields)
    return link_analysis(**tally)

def analyze_text(text):
    """Compute textstat metrics for ``text``.

    Returns:
        namedtuple ``Analyzer(word_count, readability, reading_time)`` holding
        the results of ``textstat.lexicon_count``,
        ``textstat.flesch_reading_ease`` and ``textstat.reading_time``.
    """
    text_analysis = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return text_analysis(
        word_count=textstat.lexicon_count(text),
        readability=textstat.flesch_reading_ease(text),
        reading_time=textstat.reading_time(text),
    )

# exponential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    """Call ``fn`` and retry with exponential backoff plus jitter when it raises.

    Args:
        fn: Callable to invoke.
        retries: Number of retries after the first failed attempt.
        backoff_in_seconds: Base delay; the wait before retry ``k`` (0-based)
            is ``backoff_in_seconds * 2 ** k`` plus a random 0-1 second jitter.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: The last exception raised by ``fn`` once retries are exhausted.
    """
    attempt = 0
    while True:
        try:
            return fn(*args, **kwargs)
        # Only retry ordinary errors; KeyboardInterrupt / SystemExit must
        # propagate immediately instead of being slept on and retried.
        except Exception:
            if attempt == retries:
                raise

            delay = backoff_in_seconds * 2 ** attempt + random.uniform(0, 1)
            time.sleep(delay)
            attempt += 1

def find_duplicates(in_list):
    """Return the values that occur more than once in ``in_list``.

    Each duplicated value appears once in the result, ordered by its first
    appearance in ``in_list``. Counting is done in a single pass.
    """
    occurrences = collections.Counter(in_list)
    return [value for value, count in occurrences.items() if count > 1]

In [4]:
# Merge issue and question records into one frame with a shared
# Challenge_* / Solution_* schema.

# Suffixes of the Issue_* columns copied to Challenge_*
issue_fields = [
    'title', 'body', 'link', 'tag_count', 'created_time', 'score_count',
    'closed_time', 'repo_issue_count', 'repo_star_count', 'repo_watch_count',
    'repo_fork_count', 'repo_contributor_count', 'self_closed',
    'comment_count', 'comment_body', 'comment_score',
]
# Suffixes of the Question_* columns copied to Challenge_*
question_fields = [
    'title', 'body', 'link', 'tag_count', 'topic_count', 'created_time',
    'answer_count', 'score_count', 'closed_time', 'favorite_count',
    'last_edit_time', 'view_count', 'self_closed', 'comment_count',
    'comment_body', 'comment_score',
]
# Suffixes of the Answer_* columns copied to Solution_*
answer_fields = [
    'body', 'score_count', 'comment_count', 'comment_body', 'comment_score',
    'last_edit_time',
]

df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))
# Copy whole columns at once instead of cell by cell; this keeps each column's dtype
df_issues = df_issues.assign(
    **{f'Challenge_{field}': df_issues[f'Issue_{field}'] for field in issue_fields}
)

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))
df_questions = df_questions.assign(
    **{f'Challenge_{field}': df_questions[f'Question_{field}'] for field in question_fields},
    **{f'Solution_{field}': df_questions[f'Answer_{field}'] for field in answer_fields},
)

df = pd.concat([df_issues, df_questions], ignore_index=True)
# Drop the source-specific columns; the regex is case-sensitive, so lowercase
# names such as Challenge_answer_count and Challenge_repo_issue_count are kept.
df = df[df.columns.drop(list(df.filter(regex=r'Issue|Question|Answer')))]
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [4]:
# Draw a Sankey diagram linking platform -> tool -> open/closed state

df = pd.read_json(os.path.join(path_dataset, 'original.json'))
df = df.explode('Tools')  # one row per (post, tool) pair
df['State'] = df['Challenge_closed_time'].apply(lambda x: 'closed' if not pd.isna(x) else 'open')

categories = ['Platform', 'Tools', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

# Node label -> occurrence count, inserted in category order.
# NOTE(review): a value shared by two categories collapses into a single node.
labels = {}
for category in categories:
    labels.update(df[category].value_counts().to_dict())

# One (source, target, value) frame per adjacent category pair, concatenated once
link_frames = [
    df_info[[left, right, 'value']].rename(columns={left: 'source', right: 'target'})
    for left, right in zip(categories, categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Node position lookup built once instead of re-listing the labels for every row
label_position = {label: position for position, label in enumerate(labels)}
source = newDf['source'].apply(lambda x: label_position[x])
target = newDf['target'].apply(lambda x: label_position[x])
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.png'))


  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [None]:
# Experiment 1 & 2

def _store_content_metrics(frame, row_index, prefix, analyzer):
    """Write the code, text, and link statistics of one analyzed document into ``frame``.

    Args:
        frame: DataFrame that receives the values.
        row_index: Index label of the row to fill.
        prefix: Column name prefix, e.g. ``'Challenge'`` or ``'Solution_comment'``.
        analyzer: Result of ``split_content`` (fields ``text``, ``links``, ``code_line``).
    """
    link_stats = analyze_links(analyzer.links)
    text_stats = analyze_text(analyzer.text)

    frame.at[row_index, f'{prefix}_code_count'] = analyzer.code_line
    frame.at[row_index, f'{prefix}_word_count'] = text_stats.word_count
    frame.at[row_index, f'{prefix}_readability'] = text_stats.readability
    frame.at[row_index, f'{prefix}_reading_time'] = text_stats.reading_time
    frame.at[row_index, f'{prefix}_link_count_image'] = link_stats.image
    frame.at[row_index, f'{prefix}_link_count_documentation'] = link_stats.documentation
    frame.at[row_index, f'{prefix}_link_count_example'] = link_stats.example
    frame.at[row_index, f'{prefix}_link_count_issue'] = link_stats.issue
    frame.at[row_index, f'{prefix}_link_count_patch'] = link_stats.patch
    frame.at[row_index, f'{prefix}_link_count_tool'] = link_stats.tool
    frame.at[row_index, f'{prefix}_link_count_tutorial'] = link_stats.tutorial


df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    title_analyzer = split_content(row['Challenge_title'])
    # Join title and body with a space so the last title word does not fuse
    # with the first body word (which would corrupt both tokens).
    challenge_analyzer = split_content(row['Challenge_title'] + ' ' + row['Challenge_body'])

    df.at[index, 'Challenge_preprocessed_title'] = preprocess_text(title_analyzer.text)
    df.at[index, 'Challenge_preprocessed_content'] = preprocess_text(challenge_analyzer.text)
    _store_content_metrics(df, index, 'Challenge', challenge_analyzer)

    # Optional parts of a post: analyzed only when their body is present
    for prefix in ('Challenge_comment', 'Solution', 'Solution_comment'):
        body = row[f'{prefix}_body']
        if pd.notna(body):
            _store_content_metrics(df, index, prefix, split_content(body))

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [4]:
# Experiment 3: ask gpt-3.5-turbo for a one-sentence summary of every post

summary_output_path = os.path.join(path_dataset, 'preprocessed.json')
df = pd.read_json(summary_output_path)

# Request settings shared by every completion call
summary_request_options = dict(
    model='gpt-3.5-turbo',
    temperature=0,
    max_tokens=50,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=50,
    stream=False,
)

for index, row in df.iterrows():
    # Checkpoint the frame on every 100th row
    if (index + 1) % 100 == 0:
        print(f'persisting on post {index}')
        df.to_json(summary_output_path, indent=4, orient='records')

    # Resume guard, currently disabled:
    # if pd.notna(row['Challenge_gpt_summary']):
    #     continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + ' Body: ' + row['Challenge_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            messages=[{"role": "user", "content": prompt}],
            **summary_request_options
        )
        summary = response['choices'][0]['message']['content']
        df.at[index, 'Challenge_gpt_summary'] = summary
    except Exception as e:
        # A failed post is reported and skipped; its summary stays unset
        print(f'{e} on post {row["Challenge_link"]}')

    # Pause between requests
    time.sleep(1)

df.to_json(summary_output_path, indent=4, orient='records')


  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [None]:
# Experiment 3: preprocess the GPT summaries

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    summary = row['Challenge_gpt_summary']
    # The summarization loop only prints API failures, so a post can have no
    # summary; skip it instead of crashing preprocess_text on a missing value.
    if pd.isna(summary):
        continue
    df.at[index, 'Challenge_preprocessed_summary'] = preprocess_text(summary)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [None]:
# import openai
# from bertopic.backend import OpenAIBackend

# # openai.api_key = MY_API_KEY
# embedding_model = OpenAIBackend(delay_in_seconds=0.1, batch_size=10)

# from bertopic import BERTopic

# df = pd.read_json(os.path.join(path_special_output, 'labels.json'))

# docs = df[df['Challenge_summary'] != 'na']['Challenge_summary'].tolist() + df[df['Challenge_root_cause'] != 'na']['Challenge_root_cause'].tolist()

# topic_model = BERTopic(embedding_model=embedding_model)
# topics, probs = topic_model.fit_transform(docs)
# topic_model.get_topic_info()

In [33]:
# def minimize_weighted_sum(df, sort_column):
#     df_new = df.sort_values(sort_column, ascending=False)
#     n = len(df)
#     center_idx = (n - 1) // 2
#     direction = -1
#     distance = 0

#     for _, row in df_new.iterrows():
#         # Calculate the new index
#         new_idx = center_idx + direction * distance
        
#         # Place the element from the sorted list into the new list
#         df.iloc[new_idx] = row

#         # If we've just moved to the left, increase the distance
#         if direction == -1:
#             distance += 1

#         # Switch the direction
#         direction *= -1

#     return df


In [20]:
prompt_topic = '''You will be given a list of stemmed words refering to specific software engineering topics. Please summarize each topic in terms and attach a one-liner description based on the stemmed words. Also, you must guarantee that the summaries are exclusive to one another.###\n'''

# CAUTION: pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join(path_rq1, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

# One "Topic i: term, term, ..." line per topic; only the first element of each
# term entry is used. (A stray closing bracket used to be appended to each line.)
topic_term_list = []
for index, topic in enumerate(topic_terms):
    terms = ', '.join([term[0] for term in topic])
    topic_term_list.append(f'Topic {index}: {terms}')

# The request is made after the file is closed; it does not need the handle
prompt = prompt_topic + '\n'.join(topic_term_list) + '\n###\n'
completion = openai.ChatCompletion.create(
    model='gpt-4',
    messages=[{'role': 'user', 'content': prompt}],
    temperature=0,
    max_tokens=3000,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=300,
    stream=False)

topics = completion.choices[0].message.content
print(topics)

Topic 0: Model Management - Handling and manipulation of models including training, saving, importing, and exporting.
Topic 1: Data Pipelining - The process of managing and processing data through multiple pipelines.
Topic 2: Package Installation - The process of installing, importing, and managing software packages using pip.
Topic 3: Logging - The process of creating, tracking, and managing logs during model training.
Topic 4: Docker Operations - Building, running, and managing Docker images and files.
Topic 5: Access Management - Managing access permissions, roles, and tokens for secure operations.
Topic 6: Data Labeling - The process of labeling data for training and object recognition.
Topic 7: Git Operations - Managing data, files, and version control using Git.
Topic 8: Bucket Operations - Managing files, data, and paths in storage buckets.
Topic 9: Sweep Operations - Configuring, running, and managing multiple sweeps.
Topic 10: Quota Management - Managing request quotas and han

In [13]:
# Topic summaries stored as a literal, one "Topic N: name - description" line
# per topic. This assignment replaces the `topics` value set by the GPT-4 cell.
# NOTE(review): the text appears to be pasted from that cell's printed
# response - confirm it is still in sync with 'Topic terms.pickle'.
topics = '''Topic 0: Model Management - Handling and manipulation of models including training, saving, importing, and exporting.
Topic 1: Data Pipelining - The process of managing and processing data through multiple pipelines.
Topic 2: Package Installation - The process of installing, importing, and managing software packages using pip.
Topic 3: Logging - The process of creating, tracking, and managing logs during model training.
Topic 4: Docker Operations - Building, running, and managing Docker images and files.
Topic 5: Access Management - Managing access permissions, roles, and tokens for secure operations.
Topic 6: Data Labeling - The process of labeling data for training and object recognition.
Topic 7: Git Operations - Managing data, files, and version control using Git.
Topic 8: Bucket Operations - Managing files, data, and paths in storage buckets.
Topic 9: Sweep Operations - Configuring, running, and managing multiple sweeps.
Topic 10: Quota Management - Managing request quotas and handling limit exceptions.
Topic 11: Remote Operations - Configuring, running, and connecting to remote files and executions.
Topic 12: Batch Processing - Managing and processing data, files, and jobs in batches.
Topic 13: Lambda Functions - Invoking and processing data using Lambda functions.
Topic 14: Database Operations - Connecting, importing, and running operations on databases.
Topic 15: Language Translation - Translating documents and languages using models.
Topic 16: Panda Operations - Managing and converting files using Panda.
Topic 17: Speech Processing - Handling audio files, generating speech, and transcribing services.
Topic 18: Spark Operations - Configuring, implementing, and managing data using Spark.
Topic 19: Instance Management - Creating, managing, and removing instances.
Topic 20: Column Operations - Managing, cleaning, and visualizing data in columns.'''

In [15]:
# One entry per non-empty line of the topic summaries
topic_list = [line for line in topics.split('\n') if line]

# Macro topic ("<number>: <name>") -> indices of the topics grouped under it
macro_topic_mapping_inverse = {
    '1: Observability Management': [3],
    '2: Lifecycle Management': [1],
    '3: Compute Management': [9, 10, 12, 13, 18],
    '4: Environment Management': [2, 4, 19],
    '5: Access Management': [5, 11],
    '6: Model Management': [0],
    '7: Data Management': [6, 8, 14, 16, 20],
    '8: Code Management': [7],
}

macro_topic_list = []       # every topic index that has a macro topic
macro_topic_mapping = {}    # topic index -> macro topic label
macro_topic_indexing = {}   # macro topic number -> macro topic name
for macro_topic, sub_topics in macro_topic_mapping_inverse.items():
    number, _, title = macro_topic.partition(': ')
    macro_topic_indexing[int(number)] = title
    macro_topic_list += sub_topics
    macro_topic_mapping.update(dict.fromkeys(sub_topics, macro_topic))

# Sanity checks: topics assigned twice, full coverage, and unassigned topic indices
print(find_duplicates(macro_topic_list))
print(len(macro_topic_list) == len(topic_list))
print(set(range(len(topic_list))).difference(set(macro_topic_list)))

[]
False
{17, 15}


In [187]:
# # assign human-readable & high-level topics to challenges & solutions

# df = pd.read_json(os.path.join(path_special_output, 'labels.json'))
# df['Challenge_topic_macro'] = -1

# for index, row in df.iterrows():
#     if row['Challenge_topic'] in macro_topic_mapping:
#         df.at[index, 'Challenge_topic_macro'] = int(macro_topic_mapping[row['Challenge_topic']].split(':')[0])
#     else:
#         df.drop(index, inplace=True)

# df.to_json(os.path.join(path_special_output, 'labels.json'), indent=4, orient='records')

In [178]:
# assign human-readable & high-level topics to challenges & solutions

df = pd.read_json(os.path.join(path_rq1, 'topics.json'))

# Keep only the posts whose topic belongs to a macro topic. This is done with
# one boolean mask instead of dropping rows in place while iterating.
has_macro_topic = df['Challenge_topic'].isin(list(macro_topic_mapping))
df = df[has_macro_topic].copy()
# The macro topic number is the part of the label before the colon
df['Challenge_topic_macro'] = (
    df['Challenge_topic']
    .map(lambda topic: int(macro_topic_mapping[topic].split(':')[0]))
    .astype(int)
)

df.to_json(os.path.join(path_rq1, 'filtered.json'), indent=4, orient='records')

# Share and size of each macro topic, built from records in a single pass
topic_summary_rows = [
    {
        'Topic': macro_topic_indexing[name],
        'Percentage': round(len(group)/len(df)*100, 2),
        'Number': len(group),
    }
    for name, group in df.groupby('Challenge_topic_macro')
]
df_number = pd.DataFrame(topic_summary_rows, columns=['Topic', 'Percentage', 'Number'])

df_number = df_number.sort_values('Percentage', ascending=False)
print(df_number.to_latex(float_format="%.2f", index=False))

\begin{tabular}{lrr}
\toprule
Topic & Percentage & Number \\
\midrule
Model Management & 21.39 & 2378 \\
Compute Management & 20.05 & 2229 \\
Environment Management & 17.90 & 1990 \\
Data Management & 13.13 & 1460 \\
Lifecycle Management & 9.94 & 1105 \\
Access Management & 7.84 & 872 \\
Observability Management & 6.65 & 739 \\
Code Management & 3.09 & 344 \\
\bottomrule
\end{tabular}



In [None]:
# Draw a Sankey diagram linking challenge type -> macro topic -> open/closed state

df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df['State'] = df['Challenge_closed_time'].apply(lambda x: 'closed' if not pd.isna(x) else 'open')
# Replace the macro topic number with its name for display
df['Challenge_topic_macro'] = df['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])
categories = ['Challenge_type', 'Challenge_topic_macro', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

# Node label -> occurrence count, inserted in category order.
# NOTE(review): a value shared by two categories collapses into a single node.
labels = {}
for category in categories:
    labels.update(df[category].value_counts().to_dict())

# One (source, target, value) frame per adjacent category pair, concatenated once
link_frames = [
    df_info[[left, right, 'value']].rename(columns={left: 'source', right: 'target'})
    for left, right in zip(categories, categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Node position lookup built once instead of re-listing the labels for every row
label_position = {label: position for position, label in enumerate(labels)}
source = newDf['source'].apply(lambda x: label_position[x])
target = newDf['target'].apply(lambda x: label_position[x])
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_rq1, 'State type topic sankey.png'))
