In [1]:
import os
import re
import time
import spacy
import pickle
import openai
import random
import enchant
import textstat
import itertools
import collections
import subprocess
import pandas as pd
import plotly.graph_objects as go

from bs4 import BeautifulSoup
from langdetect import detect
from collections import namedtuple
from gensim.parsing.preprocessing import remove_stopwords, strip_short, strip_punctuation, STOPWORDS

In [2]:
path_dataset = '../../Dataset'
path_result = '../../Result'
path_rq1 = os.path.join(path_result, 'RQ1')

random_state = 42

# subprocess.run(['python', '-m' 'spacy', 'download', 'en_core_web_sm'])
nlp = spacy.load("en_core_web_sm")
spell_checker = enchant.Dict("en_US")

pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

openai.api_key = os.getenv('OPENAI_API_KEY', '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')

prompt_summary = '''Refine the post title to make it short and clear in simple English.\n###'''

tools_keyword_mapping = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['amazon', 'aws', 'maker', 'sage'],
    'Azure Machine Learning': ['azure', 'microsoft'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild'],
    'Kedro': ['kedro'],
    'MLflow': ['databricks', 'mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['google', 'gcp', 'vertex'],
    'Weights & Biases': ['biases', 'wandb', 'weights']
}

tools_keyword_list = set(itertools.chain(*tools_keyword_mapping.values()))

keywords_image = {
    ".jpg", 
    ".png", 
    ".jpeg", 
    ".gif", 
    ".bmp", 
    ".webp", 
    ".svg", 
    ".tiff"
}

keywords_patch = {
    'pull',
}

keywords_issue = {
    'answers',
    'discussions',
    'forums',
    'issues',
    'questions',
    'stackoverflow',
}

keywords_tool = {
    'github',
    'gitlab',
    'pypi',
}

keywords_doc = {
    'developers',
    'docs',
    'documentation',
    'features',
    'library',
    'org',
    'wiki',
}

keywords_tutorial = {
    'guide',
    'learn',
    'tutorial',
}

stop_words_se = {
    'ability',
    'abilities',
    'accident',
    'accidents',
    # 'acknowledgement',
    'action',
    'actions',
    'activities',
    'activity',
    'advice',
    'ai',
    'alternative',
    'alternatives',
    # 'announcement',
    'anomaly'
    'anomalies'
    'answer',
    'answers',
    'appreciation',
    'approach',
    'approaches',
    'article',
    'articles',
    'assistance',
    'attempt',
    'author',
    'behavior',
    'behaviour',
    'benefit',
    'bit',
    'bits',
    'block',
    'blocks',
    # 'blog',
    # 'blogs',
    'body',
    'bug',
    'bugs',
    'building',
    'case',
    'cases',
    'categories',
    'categorization',
    'category',
    'cause',
    'causes',
    'challenge',
    'challenges',
    'change',
    'changes',
    # 'char',
    'check',
    'choice',
    'choices',
    'clarification',
    'collection',
    'com',
    'combination',
    # 'commmunication',
    # 'community',
    # 'communities',
    # 'company',
    # 'companies',
    # 'computer',
    # 'computers',
    # 'concept',
    # 'concepts',
    'concern',
    'concerns',
    # 'condition',
    # 'conditions',
    'confirmation',
    'confusion',
    'consideration',
    # 'content',
    # 'contents',
    'context',
    # 'count',
    'couple',
    'couples',
    # 'course',
    # 'courses',
    'crash',
    'crashes',
    'cross',
    # 'custom',
    'customer',
    'customers',
    'day',
    'days',
    'demand',
    # 'description',
    'desire',
    'detail',
    'details',
    'devops',
    'difference',
    'differences',
    'difficulties',
    'difficulty',
    'discrepancies',
    'discrepancy',
    'discussion',
    'dislike',
    'distinction',
    'effect',
    'end',
    # 'engineering',
    'enquiries',
    'enquiry',
    'error',
    'errors',
    'evidence',
    'example',
    'examples',
    'exception',
    'exceptions',
    'existence',
    'exit',
    'expectation',
    'experience',
    'expert',
    'experts',
    # 'explanation',
    'fact',
    'facts',
    'fail',
    'failure',
    'favorite',
    'favorites',
    'fault',
    'faults',
    # 'feature',
    # 'features',
    # 'feedback',
    # 'feedbacks',
    'fix',
    'fixes',
    # 'float',
    'form',
    'forms',
    'functionality',
    'functionalities',
    'future',
    'goal',
    'goals',
    'guarantee',
    # 'guidance',
    # 'guideline',
    # 'guide',
    'guy',
    'guys',
    'harm',
    'hello',
    'help',
    'hour',
    'hours',
    'ibm',
    'idea',
    'ideas',
    'individual',
    'individuals',
    'info',
    'information',
    'inquiries',
    'inquiry',
    'insight',
    # 'instruction',
    # 'instructions',
    # 'int',
    'intelligence',
    'intent',
    'interest',
    'introduction',
    'investigation',
    'invitation',
    'ipynb',
    'issue',
    'issues',
    'kind',
    'kinds',
    'lack',
    'learning',
    'level',
    'levels',
    'look',
    'looks',
    'lot',
    'lots',
    'luck',
    'machine',
    'major',
    'manner',
    'manners',
    # 'manual',
    'mark',
    'meaning',
    # 'message',
    # 'messages',
    'method',
    'methods',
    'ml',
    'mlops',
    'minute',
    'minutes',
    'mistake',
    'mistakes',
    'month',
    'months',
    'need',
    'needs',
    'number',
    'numbers',
    'offer',
    'one',
    'ones',
    'opinion',
    'opinions',
    # 'org',
    # 'organization',
    'outcome',
    'part',
    'parts',
    'past',
    'people',
    'person',
    'persons',
    'perspective',
    'perspectives',
    'place',
    'places',
    'point',
    'points',
    'post',
    'posts',
    'practice',
    'practices',
    'problem',
    'problems',
    # 'product',
    # 'products',
    # 'program',
    # 'programs',
    # 'project',
    # 'projects',
    # 'proposal',
    'purpose',
    'purposes',
    'py',
    # 'python',
    'qa',
    'question',
    'questions',
    'reason',
    'reasons',
    # 'recognition',
    # 'recommendation',
    # 'recommendations',
    # 'recommender',
    # 'regression',
    # 'request',
    'research',
    'result',
    'results',
    'scenario',
    'scenarios',
    'science',
    'screenshot',
    'screenshots',
    'second',
    'seconds',
    'section',
    'sense',
    'sentence',
    'show',
    'shows',
    'situation',
    'software',
    'solution',
    'solutions',
    'start',
    # 'state',
    # 'statement',
    # 'states',
    # 'status',
    # 'step',
    # 'steps',
    # 'string',
    'study',
    'stuff',
    'success',
    'suggestion',
    'suggestions',
    'summary',
    'summaries',
    'surprise',
    # 'support',
    'talk',
    # 'task',
    # 'tasks',
    # 'technique',
    # 'techniques',
    # 'technologies',
    # 'technology',
    'term',
    'terms',
    'times',
    'thank',
    'thanks',
    'thing',
    'things',
    'thought',
    'three',
    'title',
    'today',
    'tomorrow',
    # 'tool',
    # 'tools',
    'topic',
    'topics',
    'total',
    'trouble',
    'troubles',
    'truth',
    'try',
    'two',
    'understand',
    'understanding',
    'usage',
    'use',
    'user',
    'users',
    'uses',
    # 'value',
    # 'values',
    'view',
    'viewpoint',
    'way',
    'ways',
    'week',
    'weeks',
    'word',
    'words',
    'work',
    'workaround',
    'workarounds',
    'works',
    'yeah',
    'year',
    'years',
    'yesterday',
}

stop_words_ml = {
    'ad',
    'ads',
    'algorithm',
    'algorithms',
    'analysis',
    'anomaly',
    'asr',
    'audio',
    'automl',
    'autopilot',
    'bert',
    'bi',
    'chatbot',
    'classification',
    'classifier',
    'clustering',
    'cnn',
    'cv',
    'decision',
    'detection',
    'dimensionality'
    'forecast',
    'forecasts',
    'forecasting',
    'forest',
    'fraud',
    'gan',
    'gesture',
    'gpt',
    'ica',
    'knn',
    'language',
    'languages',
    'lda',
    'lstm',
    'nlp',
    'nmf',
    'ocr',
    'pca',
    'phone',
    'processing',
    'recognition',
    'recommendation',
    'recommendations',
    'recommender',
    'reduction',
    'regression',
    'regressor',
    'reinforcement',
    'rf',
    'rl',
    'rnn',
    'segmentation',
    'sentiment',
    'series',
    'sota',
    'sound',
    'spam',
    'speech',
    'stt',
    'svd',
    'svm',
    'time',
    'translation',
    'translator',
    'tree',
    'tsne',
    'tts',
    'video',
    'vision',
    'voice',
}

stop_words_level1 = STOPWORDS.union(stop_words_se)
stop_words_level2 = stop_words_level1.union(stop_words_ml)


In [3]:
def count_code_line(block_list):
    total_loc = 0
    for blocks in block_list:
        for block in blocks:
            for line in block.splitlines():
                if line.strip():
                    total_loc += 1
    return total_loc

def extract_styles(content):
    soup = BeautifulSoup(content, 'html.parser')
    clean_text = soup.get_text(separator=' ')
    # extract links
    links = [a['href'] for a in soup.find_all('a', href=True)] 
    # extract code blocks type 1
    code_line1 = count_code_line([c.get_text() for c in soup.find_all('code')]) 
    # extract code blocks type 2
    code_line2 = count_code_line([c.get_text() for c in soup.find_all('blockquote')]) 
    code_line = code_line1 + code_line2
    return clean_text, links, code_line

def extract_code(content):
    code_patterns = [r'```.+?```', r'``.+?``', r'`.+?`']
    clean_text = content
    code_line = 0

    for code_pattern in code_patterns:
        code_snippets = re.findall(code_pattern, clean_text, flags=re.DOTALL)
        code_line += count_code_line(code_snippets)
        clean_text = re.sub(code_pattern, '', clean_text, flags=re.DOTALL)
    
    return clean_text, code_line

def extract_links(text):
    link_pattern1 = r"\!?\[.*?\]\((.*?)\)"
    links1 = re.findall(link_pattern1, text)
    clean_text = re.sub(link_pattern1, '', text)
    link_pattern2 = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    links2 = re.findall(link_pattern2, clean_text)
    clean_text = re.sub(link_pattern2, '', clean_text)
    links = links1 + links2
    return clean_text, links

def split_content(content):
    clean_text, links1, code_line1 = extract_styles(content)
    clean_text, code_line2 = extract_code(clean_text)
    clean_text, links2 = extract_links(clean_text)
    
    links = links1 + links2
    code_line = code_line1 + code_line2
    
    content_collection = namedtuple('Analyzer', ['text', 'links', 'code_line'])
    return content_collection(clean_text, links, code_line)

def word_frequency(text):
    word_counts = collections.Counter(text.split())
    return word_counts

def extract_nouns(text):
    doc = nlp(text)
    words = [token.text for token in doc if token.pos_ == "NOUN"]
    return ' '.join(words)

def is_english(text):
    try:
        language = detect(text)
        return language == 'en'
    except:
        # In case the detection fails (e.g. if the text is too short or doesn't contain enough features)
        return False

def extract_english(text):
    words = [word for word in text.split() if spell_checker.check(word)]
    return ' '.join(words)

def remove_words_with_substring(text, substring_list):
    words = text.split()
    for substring in substring_list:
        words = [word for word in words if substring not in word]
    return ' '.join(words)

def preprocess_text(text, level=1):
    clean_text = text.lower()
    clean_text = remove_words_with_substring(clean_text, tools_keyword_list)
    clean_text = strip_punctuation(clean_text)
    # clean_text = extract_english(clean_text)
    # clean_text = strip_short(clean_text)
    clean_text = extract_nouns(clean_text)
    match level:
        case 1:
            clean_text = remove_stopwords(clean_text, stop_words_level1)
        case 2:
            clean_text = remove_stopwords(clean_text, stop_words_level2)
    return clean_text

def analyze_links(links):
    image_links = 0
    documentation_links = 0
    tool_links = 0
    issue_links = 0
    patch_links = 0
    tutorial_links = 0
    example_links = 0
    
    for link in links:
        if any([image in link for image in keywords_image]):
            image_links += 1
        elif any([patch in link for patch in keywords_patch]):
            patch_links += 1
        elif any([issue in link for issue in keywords_issue]):
            issue_links += 1
        elif any([tool in link for tool in keywords_tool]):
            tool_links += 1
        elif any([doc in link for doc in keywords_doc]):
            documentation_links += 1
        elif any([tool in link for tool in keywords_tutorial]):
            tutorial_links += 1
        else:
            example_links += 1

    link_analysis = namedtuple('Analyzer', ['image', 'documentation', 'tool', 'issue', 'patch', 'tutorial', 'example'])
    return link_analysis(image_links, documentation_links, tool_links, issue_links, patch_links, tutorial_links, example_links)

def analyze_text(text):
    word_count = textstat.lexicon_count(text)
    readability = textstat.flesch_reading_ease(text)
    reading_time = textstat.reading_time(text)
    
    text_analysis = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return text_analysis(word_count, readability, reading_time)

# expential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0
    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0)
            time.sleep(sleep)
            x += 1

def find_duplicates(in_list):  
    duplicates = []
    unique = set(in_list)
    for each in unique:
        count = in_list.count(each)
        if count > 1:
            duplicates.append(each)
    return duplicates

In [None]:
df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))

for index, row in df_issues.iterrows():
    df_issues.at[index, 'Challenge_title'] = row['Issue_title']
    df_issues.at[index, 'Challenge_body'] = row['Issue_body']
    df_issues.at[index, 'Challenge_link'] = row['Issue_link']
    df_issues.at[index, 'Challenge_tag_count'] = row['Issue_tag_count']
    df_issues.at[index, 'Challenge_created_time'] = row['Issue_created_time']
    df_issues.at[index, 'Challenge_score_count'] = row['Issue_score_count']
    df_issues.at[index, 'Challenge_closed_time'] = row['Issue_closed_time']
    df_issues.at[index, 'Challenge_repo_issue_count'] = row['Issue_repo_issue_count']
    df_issues.at[index, 'Challenge_repo_star_count'] = row['Issue_repo_star_count']
    df_issues.at[index, 'Challenge_repo_watch_count'] = row['Issue_repo_watch_count']
    df_issues.at[index, 'Challenge_repo_fork_count'] = row['Issue_repo_fork_count']
    df_issues.at[index, 'Challenge_repo_contributor_count'] = row['Issue_repo_contributor_count']
    df_issues.at[index, 'Challenge_self_closed'] = row['Issue_self_closed']
    df_issues.at[index, 'Challenge_comment_count'] = row['Issue_comment_count']
    df_issues.at[index, 'Challenge_comment_body'] = row['Issue_comment_body']
    df_issues.at[index, 'Challenge_comment_score'] = row['Issue_comment_score']

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))

for index, row in df_questions.iterrows():
    df_questions.at[index, 'Challenge_title'] = row['Question_title']
    df_questions.at[index, 'Challenge_body'] = row['Question_body']
    df_questions.at[index, 'Challenge_link'] = row['Question_link']
    df_questions.at[index, 'Challenge_tag_count'] = row['Question_tag_count']
    df_questions.at[index, 'Challenge_topic_count'] = row['Question_topic_count']
    df_questions.at[index, 'Challenge_created_time'] = row['Question_created_time']
    df_questions.at[index, 'Challenge_answer_count'] = row['Question_answer_count']
    df_questions.at[index, 'Challenge_score_count'] = row['Question_score_count']
    df_questions.at[index, 'Challenge_closed_time'] = row['Question_closed_time']
    df_questions.at[index, 'Challenge_favorite_count'] = row['Question_favorite_count']
    df_questions.at[index, 'Challenge_last_edit_time'] = row['Question_last_edit_time']
    df_questions.at[index, 'Challenge_view_count'] = row['Question_view_count']
    df_questions.at[index, 'Challenge_self_closed'] = row['Question_self_closed']
    df_questions.at[index, 'Challenge_comment_count'] = row['Question_comment_count']
    df_questions.at[index, 'Challenge_comment_body'] = row['Question_comment_body']
    df_questions.at[index, 'Challenge_comment_score'] = row['Question_comment_score']

    df_questions.at[index, 'Solution_body'] = row['Answer_body']
    df_questions.at[index, 'Solution_score_count'] = row['Answer_score_count']
    df_questions.at[index, 'Solution_comment_count'] = row['Answer_comment_count']
    df_questions.at[index, 'Solution_comment_body'] = row['Answer_comment_body']
    df_questions.at[index, 'Solution_comment_score'] = row['Answer_comment_score']
    df_questions.at[index, 'Solution_last_edit_time'] = row['Answer_last_edit_time']

df = pd.concat([df_issues, df_questions], ignore_index=True)
df = df[df.columns.drop(list(df.filter(regex=r'(Issue|Question|Answer)_')))]
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [5]:
# Draw sankey diagram of tool and platform

df = pd.read_json(os.path.join(path_dataset, 'original.json'))
df = df.explode('Tools')
df['State'] = df['Challenge_closed_time'].apply(lambda x: 'closed' if not pd.isna(x) else 'open')

categories = ['Platform', 'Tools', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

labels = {}
newDf = pd.DataFrame()
for i in range(len(categories)):
    labels.update(df[categories[i]].value_counts().to_dict())
    if i == len(categories)-1:
        break
    tempDf = df_info[[categories[i], categories[i+1], 'value']]
    tempDf.columns = ['source', 'target', 'value']
    newDf = pd.concat([newDf, tempDf])
    
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()
source = newDf['source'].apply(lambda x: list(labels).index(x))
target = newDf['target'].apply(lambda x: list(labels).index(x))
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.pdf'))


  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [6]:
# df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# del df['Challenge_preprocessed_content']
# del df['Challenge_preprocessed_gpt_summary']
# del df['Challenge_preprocessed_title']

# df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

In [51]:
# Post level preprocessing

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    title_analyzer = split_content(row['Challenge_title'])
    # clean_title1 = preprocess_text(title_analyzer.text)
    clean_title2 = preprocess_text(title_analyzer.text, 2)
    
    # challenge_analyzer = split_content(row['Challenge_title'] + row['Challenge_body'])
    # link_analyzer = analyze_links(challenge_analyzer.links)
    # text_analyzer = analyze_text(challenge_analyzer.text)
    # clean_text1 = preprocess_text(challenge_analyzer.text)
    # clean_text2 = preprocess_text(challenge_analyzer.text, 2)
    
    # df.at[index, 'Challenge_preprocessed_title1'] = clean_title1
    df.at[index, 'Challenge_preprocessed_title2'] = clean_title2
    # df.at[index, 'Challenge_preprocessed_content1'] = clean_text1
    # df.at[index, 'Challenge_preprocessed_content2'] = clean_text2
    # df.at[index, 'Challenge_code_count'] = challenge_analyzer.code_line
    # df.at[index, 'Challenge_word_count'] = text_analyzer.word_count
    # df.at[index, 'Challenge_readability'] = text_analyzer.readability
    # df.at[index, 'Challenge_reading_time'] = text_analyzer.reading_time
    # df.at[index, 'Challenge_link_count_image'] = link_analyzer.image
    # df.at[index, 'Challenge_link_count_documentation'] = link_analyzer.documentation
    # df.at[index, 'Challenge_link_count_example'] = link_analyzer.example
    # df.at[index, 'Challenge_link_count_issue'] = link_analyzer.issue
    # df.at[index, 'Challenge_link_count_patch'] = link_analyzer.patch
    # df.at[index, 'Challenge_link_count_tool'] = link_analyzer.tool
    # df.at[index, 'Challenge_link_count_tutorial'] = link_analyzer.tutorial

    # if pd.notna(row['Challenge_comment_body']):
    #     comment_analyzer = split_content(row['Challenge_comment_body'])
    #     link_analyzer = analyze_links(comment_analyzer.links)
    #     text_analyzer = analyze_text(comment_analyzer.text)
        
    #     df.at[index, 'Challenge_comment_code_count'] = comment_analyzer.code_line
    #     df.at[index, 'Challenge_comment_word_count'] = text_analyzer.word_count
    #     df.at[index, 'Challenge_comment_readability'] = text_analyzer.readability
    #     df.at[index, 'Challenge_comment_reading_time'] = text_analyzer.reading_time
    #     df.at[index, 'Challenge_comment_link_count_image'] = link_analyzer.image
    #     df.at[index, 'Challenge_comment_link_count_documentation'] = link_analyzer.documentation
    #     df.at[index, 'Challenge_comment_link_count_example'] = link_analyzer.example
    #     df.at[index, 'Challenge_comment_link_count_issue'] = link_analyzer.issue
    #     df.at[index, 'Challenge_comment_link_count_patch'] = link_analyzer.patch
    #     df.at[index, 'Challenge_comment_link_count_tool'] = link_analyzer.tool
    #     df.at[index, 'Challenge_comment_link_count_tutorial'] = link_analyzer.tutorial

    # if pd.notna(row['Solution_body']):
    #     solution_analyzer = split_content(row['Solution_body'])
    #     link_analyzer = analyze_links(solution_analyzer.links)
    #     text_analyzer = analyze_text(solution_analyzer.text)
        
    #     df.at[index, 'Solution_code_count'] = solution_analyzer.code_line
    #     df.at[index, 'Solution_word_count'] = text_analyzer.word_count
    #     df.at[index, 'Solution_readability'] = text_analyzer.readability
    #     df.at[index, 'Solution_reading_time'] = text_analyzer.reading_time
    #     df.at[index, 'Solution_link_count_image'] = link_analyzer.image
    #     df.at[index, 'Solution_link_count_documentation'] = link_analyzer.documentation
    #     df.at[index, 'Solution_link_count_example'] = link_analyzer.example
    #     df.at[index, 'Solution_link_count_issue'] = link_analyzer.issue
    #     df.at[index, 'Solution_link_count_patch'] = link_analyzer.patch
    #     df.at[index, 'Solution_link_count_tool'] = link_analyzer.tool
    #     df.at[index, 'Solution_link_count_tutorial'] = link_analyzer.tutorial
        
    # if pd.notna(row['Solution_comment_body']):
    #     comment_analyzer = split_content(row['Solution_comment_body'])
    #     link_analyzer = analyze_links(comment_analyzer.links)
    #     text_analyzer = analyze_text(comment_analyzer.text)
        
    #     df.at[index, 'Solution_comment_code_count'] = comment_analyzer.code_line
    #     df.at[index, 'Solution_comment_word_count'] = text_analyzer.word_count
    #     df.at[index, 'Solution_comment_readability'] = text_analyzer.readability
    #     df.at[index, 'Solution_comment_reading_time'] = text_analyzer.reading_time
    #     df.at[index, 'Solution_comment_link_count_image'] = link_analyzer.image
    #     df.at[index, 'Solution_comment_link_count_documentation'] = link_analyzer.documentation
    #     df.at[index, 'Solution_comment_link_count_example'] = link_analyzer.example
    #     df.at[index, 'Solution_comment_link_count_issue'] = link_analyzer.issue
    #     df.at[index, 'Solution_comment_link_count_patch'] = link_analyzer.patch
    #     df.at[index, 'Solution_comment_link_count_tool'] = link_analyzer.tool
    #     df.at[index, 'Solution_comment_link_count_tutorial'] = link_analyzer.tutorial

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [52]:
# Post level preprocessing

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    title_analyzer = split_content(row['Challenge_title'])
    if not is_english(title_analyzer.text):
        df.drop(index, inplace=True)
        continue
    clean_title1 = preprocess_text(title_analyzer.text)
    clean_title2 = preprocess_text(title_analyzer.text, 2)
    df.at[index, 'Challenge_preprocessed_title1'] = clean_title1
    df.at[index, 'Challenge_preprocessed_title2'] = clean_title2
    
    challenge_analyzer = split_content(row['Challenge_title'] + row['Challenge_body'])
    link_analyzer = analyze_links(challenge_analyzer.links)
    text_analyzer = analyze_text(challenge_analyzer.text)
    clean_text1 = preprocess_text(challenge_analyzer.text)
    clean_text2 = preprocess_text(challenge_analyzer.text, 2)
    
    df.at[index, 'Challenge_preprocessed_content1'] = clean_text1
    df.at[index, 'Challenge_preprocessed_content2'] = clean_text2
    df.at[index, 'Challenge_code_count'] = challenge_analyzer.code_line
    df.at[index, 'Challenge_word_count'] = text_analyzer.word_count
    df.at[index, 'Challenge_readability'] = text_analyzer.readability
    df.at[index, 'Challenge_reading_time'] = text_analyzer.reading_time
    df.at[index, 'Challenge_link_count_image'] = link_analyzer.image
    df.at[index, 'Challenge_link_count_documentation'] = link_analyzer.documentation
    df.at[index, 'Challenge_link_count_example'] = link_analyzer.example
    df.at[index, 'Challenge_link_count_issue'] = link_analyzer.issue
    df.at[index, 'Challenge_link_count_patch'] = link_analyzer.patch
    df.at[index, 'Challenge_link_count_tool'] = link_analyzer.tool
    df.at[index, 'Challenge_link_count_tutorial'] = link_analyzer.tutorial

    if pd.notna(row['Challenge_comment_body']):
        comment_analyzer = split_content(row['Challenge_comment_body'])
        link_analyzer = analyze_links(comment_analyzer.links)
        text_analyzer = analyze_text(comment_analyzer.text)
        
        df.at[index, 'Challenge_comment_code_count'] = comment_analyzer.code_line
        df.at[index, 'Challenge_comment_word_count'] = text_analyzer.word_count
        df.at[index, 'Challenge_comment_readability'] = text_analyzer.readability
        df.at[index, 'Challenge_comment_reading_time'] = text_analyzer.reading_time
        df.at[index, 'Challenge_comment_link_count_image'] = link_analyzer.image
        df.at[index, 'Challenge_comment_link_count_documentation'] = link_analyzer.documentation
        df.at[index, 'Challenge_comment_link_count_example'] = link_analyzer.example
        df.at[index, 'Challenge_comment_link_count_issue'] = link_analyzer.issue
        df.at[index, 'Challenge_comment_link_count_patch'] = link_analyzer.patch
        df.at[index, 'Challenge_comment_link_count_tool'] = link_analyzer.tool
        df.at[index, 'Challenge_comment_link_count_tutorial'] = link_analyzer.tutorial

    if pd.notna(row['Solution_body']):
        solution_analyzer = split_content(row['Solution_body'])
        link_analyzer = analyze_links(solution_analyzer.links)
        text_analyzer = analyze_text(solution_analyzer.text)
        
        df.at[index, 'Solution_code_count'] = solution_analyzer.code_line
        df.at[index, 'Solution_word_count'] = text_analyzer.word_count
        df.at[index, 'Solution_readability'] = text_analyzer.readability
        df.at[index, 'Solution_reading_time'] = text_analyzer.reading_time
        df.at[index, 'Solution_link_count_image'] = link_analyzer.image
        df.at[index, 'Solution_link_count_documentation'] = link_analyzer.documentation
        df.at[index, 'Solution_link_count_example'] = link_analyzer.example
        df.at[index, 'Solution_link_count_issue'] = link_analyzer.issue
        df.at[index, 'Solution_link_count_patch'] = link_analyzer.patch
        df.at[index, 'Solution_link_count_tool'] = link_analyzer.tool
        df.at[index, 'Solution_link_count_tutorial'] = link_analyzer.tutorial
        
    if pd.notna(row['Solution_comment_body']):
        comment_analyzer = split_content(row['Solution_comment_body'])
        link_analyzer = analyze_links(comment_analyzer.links)
        text_analyzer = analyze_text(comment_analyzer.text)
        
        df.at[index, 'Solution_comment_code_count'] = comment_analyzer.code_line
        df.at[index, 'Solution_comment_word_count'] = text_analyzer.word_count
        df.at[index, 'Solution_comment_readability'] = text_analyzer.readability
        df.at[index, 'Solution_comment_reading_time'] = text_analyzer.reading_time
        df.at[index, 'Solution_comment_link_count_image'] = link_analyzer.image
        df.at[index, 'Solution_comment_link_count_documentation'] = link_analyzer.documentation
        df.at[index, 'Solution_comment_link_count_example'] = link_analyzer.example
        df.at[index, 'Solution_comment_link_count_issue'] = link_analyzer.issue
        df.at[index, 'Solution_comment_link_count_patch'] = link_analyzer.patch
        df.at[index, 'Solution_comment_link_count_tool'] = link_analyzer.tool
        df.at[index, 'Solution_comment_link_count_tutorial'] = link_analyzer.tutorial

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.




The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.


The input looks more like a URL than markup. You may want to use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



In [4]:
# GPT summary

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    if index % 100 == 99:
        print(f'persisting on post {index}')
        df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

    if pd.notna(row['Challenge_gpt_summary']):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + '\nBody: ' + row['Challenge_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-3.5-turbo',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=50,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, 'Challenge_gpt_summary'] = response['choices'][0]['message']['content']
    except Exception as e:
        print(f'{e} on post {row["Challenge_link"]}')

    time.sleep(0.5)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


persisting on post 99
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/awslabs/gluonts/issues/426
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/huggingface/transformers/issues/13875
persisting on post 199
persisting on post 299
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/rom1504/img2dataset/issues/219
persisting on post 399
persisting on post 499
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/SeldonIO/seldon-core/issues/4497
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/SeldonIO/seldon-core/issues/4014
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/SeldonIO/seldon-core/issues/3846
Random.uniform() missing 1 required positional argument: 'b' on post https://github.com/SeldonIO/MLServer/issues/811
persisting on post 599
Random.uniform() missin

In [19]:
# GPT summary

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    clean_summary1 = preprocess_text(row['Challenge_gpt_summary'])
    clean_summary2 = preprocess_text(row['Challenge_gpt_summary'], 2)
    df.at[index, 'Challenge_preprocessed_gpt_summary1'] = clean_summary1
    df.at[index, 'Challenge_preprocessed_gpt_summary2'] = clean_summary2

df.sort_index(axis=1, inplace=True)
df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [7]:
# df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))
# df_old = pd.read_json(os.path.join(path_dataset, 'original.json'))

# for index, row in df.iterrows():
#     if 'Stack' not in row['Platform']:
#         continue
#     for i2, r2 in df_old.iterrows():
#         if 'Stack' not in r2['Platform']:
#             continue
#         if row['Challenge_link'] == r2['Challenge_link']:
#             df.at[index, 'Tools'] = r2['Tools']
#             break

# df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


In [None]:
# import openai
# from bertopic.backend import OpenAIBackend

# # openai.api_key = MY_API_KEY
# embedding_model = OpenAIBackend(delay_in_seconds=0.1, batch_size=10)

# from bertopic import BERTopic

# df = pd.read_json(os.path.join(path_special_output, 'labels.json'))

# docs = df[df['Challenge_summary'] != 'na']['Challenge_summary'].tolist() + df[df['Challenge_root_cause'] != 'na']['Challenge_root_cause'].tolist()

# topic_model = BERTopic(embedding_model=embedding_model)
# topics, probs = topic_model.fit_transform(docs)
# topic_model.get_topic_info()

In [33]:
# def minimize_weighted_sum(df, sort_column):
#     df_new = df.sort_values(sort_column, ascending=False)
#     n = len(df)
#     center_idx = (n - 1) // 2
#     direction = -1
#     distance = 0

#     for _, row in df_new.iterrows():
#         # Calculate the new index
#         new_idx = center_idx + direction * distance
        
#         # Place the element from the sorted list into the new list
#         df.iloc[new_idx] = row

#         # If we've just moved to the left, increase the distance
#         if direction == -1:
#             distance += 1

#         # Switch the direction
#         direction *= -1

#     return df


In [None]:
prompt_topic = '''You will be given a list of words refering to specific software engineering topics. Please summarize each topic in two to three words and attach one sentence description. Also, you must guarantee that the terms are not duplicate with one another.###\n'''

with open(os.path.join(path_rq1, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

    topic_term_list = []
    for index, topic in enumerate(topic_terms):
        terms = ', '.join([term[0] for index, term in enumerate(topic) if index < 5])
        topic_term = f'Topic {index}: {terms}]'
        topic_term_list.append(topic_term)

    prompt = prompt_topic + '\n'.join(topic_term_list) + '\n###\n'
    completion = openai.ChatCompletion.create(
        model='gpt-4',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        timeout=300,
        stream=False)

    topics = completion.choices[0].message.content
    print(topics)

In [13]:
topics = ''''''

In [15]:
topic_list = [topic for topic in topics.split('\n') if topic]
macro_topic_mapping_inverse = {
    'Environment Management': [0,8,21,28,44,56,58,60,62,75,80,83,86,95,97,107,112,118,122,138,141,156,158],#107?
    'Resource Management': [23,36,38,49,81,96,117,119,123,159],
    'Pipeline Management': [20,32,42,47,67,74,105,109,131,135],
    'Experiment Management': [13,59,88,128,165],
    'Code Development': [1,43,51,70,79,90,91,100,108,126,129,140,145,160],#129?
    'Data Development': [24,26,27,30,46,53,82,84,89,130],#89?
    'Data Management': [3,15,17,29,33,34,39,71,72,101,104,116,134,157,163],
    'Model Development': [4,7,11,14,63,106,115,142,143,148,149],
    'Model Management': [2,5,18,61,77,87,94,114,125,133],
    'Model Serving': [6,16,22,31,40,52,57,64,69,103,113,124,127,139,144,152,166],
    'Version Management': [45,65,78,120,161],
    'Network Management': [25,66,93,121,132,164],#164?
    'Observability Management': [10,12,35,54,76,137,147],
    'Security Management': [9,73,99,153,155],
    'Documentation Management': [],
    'File Management': [37,41,50,92,111,136,146,150,151],#146?#150?
    'QA Management': [98,110,162],
    'Visualization Management': [19,154],
    'Discarded': [48,55,68,85,102],
}
        
macro_topic_list = []
macro_topic_mapping = {}
macro_topic_indexing = {}
for macro_topic, sub_topics in macro_topic_mapping_inverse.items():
    index, name = int(macro_topic.split(': ')[0]), macro_topic.split(': ')[1]
    macro_topic_indexing[index] = name
    macro_topic_list.extend(sub_topics)
    for topic in sub_topics:
        macro_topic_mapping[topic] = macro_topic

print(find_duplicates(macro_topic_list))
print(len(macro_topic_list) == len(topic_list))
print(set(range(len(topic_list))).difference(set(macro_topic_list)))

[]
False
{17, 15}


In [187]:
# # assign human-readable & high-level topics to challenges & solutions

# df = pd.read_json(os.path.join(path_special_output, 'labels.json'))
# df['Challenge_topic_macro'] = -1

# for index, row in df.iterrows():
#     if row['Challenge_topic'] in macro_topic_mapping:
#         df.at[index, 'Challenge_topic_macro'] = int(macro_topic_mapping[row['Challenge_topic']].split(':')[0])
#     else:
#         df.drop(index, inplace=True)

# df.to_json(os.path.join(path_special_output, 'labels.json'), indent=4, orient='records')

In [178]:
# assign human-readable & high-level topics to challenges & solutions

df = pd.read_json(os.path.join(path_rq1, 'topics.json'))
df['Challenge_topic_macro'] = -1

for index, row in df.iterrows():
    if row['Challenge_topic'] in macro_topic_mapping:
        df.at[index, 'Challenge_topic_macro'] = int(macro_topic_mapping[row['Challenge_topic']].split(':')[0])
    else:
        df.drop(index, inplace=True)

df.to_json(os.path.join(path_rq1, 'filtered.json'), indent=4, orient='records')

df_number = pd.DataFrame()

for name, group in df.groupby('Challenge_topic_macro'):
    entry = {
        'Topic': macro_topic_indexing[name],
        'Percentage': round(len(group)/len(df)*100, 2),
        'Number': len(group),
    }
    df_number = pd.concat([df_number, pd.DataFrame([entry])], ignore_index=True)

df_number = df_number.sort_values('Percentage', ascending=False)
print(df_number.to_latex(float_format="%.2f", index=False))

\begin{tabular}{lrr}
\toprule
Topic & Percentage & Number \\
\midrule
Model Management & 21.39 & 2378 \\
Compute Management & 20.05 & 2229 \\
Environment Management & 17.90 & 1990 \\
Data Management & 13.13 & 1460 \\
Lifecycle Management & 9.94 & 1105 \\
Access Management & 7.84 & 872 \\
Observability Management & 6.65 & 739 \\
Code Management & 3.09 & 344 \\
\bottomrule
\end{tabular}



In [None]:
# Draw sankey diagram of tool and platform

df = pd.read_json(os.path.join(path_rq1, 'filtered.json'))
df['State'] = df['Challenge_closed_time'].apply(lambda x: 'closed' if not pd.isna(x) else 'open')
df['Challenge_topic_macro'] = df['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])
categories = ['Challenge_type', 'Challenge_topic_macro', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

labels = {}
newDf = pd.DataFrame()
for i in range(len(categories)):
    labels.update(df[categories[i]].value_counts().to_dict())
    if i == len(categories)-1:
        break
    tempDf = df_info[[categories[i], categories[i+1], 'value']]
    tempDf.columns = ['source', 'target', 'value']
    newDf = pd.concat([newDf, tempDf])
    
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()
source = newDf['source'].apply(lambda x: list(labels).index(x))
target = newDf['target'].apply(lambda x: list(labels).index(x))
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_rq1, 'State type topic sankey.png'))
