In [20]:
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np

from matplotlib import pyplot as plt

import plotly.graph_objects as go
import plotly.express as px

import pandas as pd
# Show complete frames when a DataFrame is displayed: no row, column or
# cell-width truncation. One option per call keeps each setting greppable.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

import subprocess
import sys

# Install the spaCy transformer pipeline with the interpreter that runs this
# kernel (sys.executable) rather than whatever `python` the shell resolves to,
# and without going through a shell. check=True turns a failed download into
# an immediate error instead of a confusing spacy.load() failure later on.
spacy_download_cmd = [sys.executable, "-m", "spacy",
                      "download", "en_core_web_trf", "-q"]
subprocess.run(spacy_download_cmd, check=True)


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


CompletedProcess(args='python -m spacy download en_core_web_trf -q', returncode=0)

In [2]:
# Display the value of every top-level expression in a cell, not only the last
# one (IPython's default for ast_node_interactivity is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [21]:
# All paths are resolved against the parent of the current working directory:
# input data is read from <parent>/Dataset, results are written under
# <parent>/Result.
project_root = os.path.dirname(os.getcwd())

path_dataset = os.path.join(project_root, 'Dataset')
path_result = os.path.join(project_root, 'Result')

path_general = os.path.join(path_result, 'General')
path_challenge = os.path.join(path_result, 'Challenge')
path_solution = os.path.join(path_result, 'Solution')

path_challenge_information = os.path.join(path_challenge, 'Information')
path_solution_information = os.path.join(path_solution, 'Information')

path_challenge_evolution = os.path.join(path_challenge, 'Evolution')
path_solution_evolution = os.path.join(path_solution, 'Evolution')

# exist_ok=True replaces the separate exists()/makedirs() pair, so the cell is
# safe to re-run and has no check-then-create race. Parents are listed before
# children for readability only; makedirs creates missing parents anyway.
for output_dir in (
    path_result,
    path_general,
    path_challenge,
    path_solution,
    path_challenge_information,
    path_solution_information,
    path_challenge_evolution,
    path_solution_evolution,
):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# combine issues and questions

import re
import spacy

# Refer to https://textacy.readthedocs.io/en/stable/api_reference/text_stats.html
from textacy import text_stats

nlp = spacy.load('en_core_web_trf')

# Raw string, and the scheme alternation is grouped: only ftp://, http:// and
# https:// URLs match. The previous pattern '(?P<url>ftp|https?://[^\s]+)'
# parsed as "ftp" OR "https?://...", so the bare word "ftp" counted as a link.
link_pattern = r'(?P<url>(?:ftp|https?)://[^\s]+)'

# Statistics stored per text as Challenge_<name> / Solution_<name> columns.
TEXT_STATISTICS = (
    'word_count',
    'unique_word_count',
    'sentence_count',
    'information_entropy',
    'readability',
    'link_count',
)


def compute_text_statistics(text):
    """Return a dict with one value per entry of TEXT_STATISTICS for `text`."""
    doc = nlp(text)
    word_count, unique_word_count = text_stats.utils.compute_n_words_and_types(doc)
    return {
        'word_count': word_count,
        'unique_word_count': unique_word_count,
        'sentence_count': text_stats.basics.n_sents(doc),
        'information_entropy': text_stats.basics.entropy(doc),
        'readability': text_stats.readability.automated_readability_index(doc),
        'link_count': len(re.findall(link_pattern, text)),
    }


def add_text_statistics(df, challenge_prefix):
    """Add Challenge_* and Solution_* text statistics to `df` in place.

    The challenge text is '<prefix>_title' + '. ' + '<prefix>_body'; the
    solution text is 'Answer_body'. Rows without answer text keep NaN in the
    Solution_* statistics columns.
    """
    for name in TEXT_STATISTICS:
        df[f'Solution_{name}'] = np.nan

    for index, row in df.iterrows():
        challenge_content = row[f'{challenge_prefix}_title'] + \
            '. ' + str(row[f'{challenge_prefix}_body'])
        for name, value in compute_text_statistics(challenge_content).items():
            df.at[index, f'Challenge_{name}'] = value

        discussion = row['Answer_body']
        # One guard for both sources (previously pd.notna for issues and plain
        # truthiness for questions): skip None, NaN and empty strings alike.
        if isinstance(discussion, str) and discussion:
            for name, value in compute_text_statistics(discussion).items():
                df.at[index, f'Solution_{name}'] = value


def harmonise_columns(df, challenge_prefix, challenge_fields, solution_fields, discarded):
    """Rename '<prefix>_<field>' -> 'Challenge_<field>' and 'Answer_<field>' ->
    'Solution_<field>', then drop the `discarded` source-only columns.

    errors='raise' keeps the fail-fast behaviour of the former per-column
    copy: a missing source column is an error, not a silent no-op.
    """
    renames = {f'{challenge_prefix}_{field}': f'Challenge_{field}'
               for field in challenge_fields}
    renames.update({f'Answer_{field}': f'Solution_{field}'
                    for field in solution_fields})
    return df.rename(columns=renames, errors='raise').drop(columns=discarded)


df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))
add_text_statistics(df_issues, 'Issue')
# Issues expose separate vote counters; the shared schema has a single score.
df_issues['Challenge_score'] = df_issues['Issue_upvote_count'] - \
    df_issues['Issue_downvote_count']
df_issues = harmonise_columns(
    df_issues,
    'Issue',
    challenge_fields=[
        'link',
        'original_content',
        'preprocessed_content',
        'gpt_summary',
        'creation_time',
        'answer_count',
        'closed_time',
    ],
    solution_fields=[
        'original_content',
        'preprocessed_content',
        'gpt_summary',
    ],
    discarded=[
        'Issue_title',
        'Issue_body',
        'Issue_upvote_count',
        'Issue_downvote_count',
        'Issue_gpt_summary_original',
        'Answer_body',
        'Answer_list',
        'Answer_gpt_summary_original',
    ],
)

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))
add_text_statistics(df_questions, 'Question')
df_questions = harmonise_columns(
    df_questions,
    'Question',
    challenge_fields=[
        'link',
        'original_content',
        'preprocessed_content',
        'gpt_summary',
        'creation_time',
        'answer_count',
        'comment_count',
        'score',
        'closed_time',
        'favorite_count',
        'last_edit_time',
        'view_count',
        'follower_count',
        'converted_from_issue',
    ],
    solution_fields=[
        'comment_count',
        'last_edit_time',
        'score',
        'original_content',
        'preprocessed_content',
        'gpt_summary',
    ],
    discarded=[
        'Question_title',
        'Question_body',
        'Question_gpt_summary_original',
        'Answer_body',
        'Answer_list',
        'Answer_gpt_summary_original',
    ],
)

# Stack both sources; columns that exist for only one source become NaN for
# the other. Columns are sorted so the output layout is stable.
df_all = pd.concat([df_issues, df_questions], ignore_index=True)
df_all = df_all.reindex(sorted(df_all.columns), axis=1)
df_all.to_json(os.path.join(path_dataset, 'original.json'),
               indent=4, orient='records')

In [56]:
# Scratch check: look up the stemmed form gensim produces for a word (here
# "confusing" -> 'confus') so it can be added to the custom stop-word list.
from gensim.parsing.preprocessing import preprocess_string
preprocess_string("confusing")

['confus']

In [25]:
# remove custom stop words from challenges and solutions

from gensim.parsing.preprocessing import remove_stopwords

# Custom stop words, matched as whole tokens against *stemmed* text, so every
# entry has to be in stemmed form (e.g. 'confus', not 'confusing').
# Commented-out entries are stems that are deliberately NOT removed.
# Six historical entries were not valid stems and could never match stemmed
# tokens ('gurante' is a misspelling; 'notice', 'perceive', 'pretty',
# 'realize', 'recognize' are unstemmed). Their stemmed forms are added next to
# them; the old spellings are kept so nothing that matched before stops
# matching.
stop_words_custom = [
    'altern',
    'amaz',
    'amazon',
    'answer',
    'appear',
    # 'api',
    'applic',
    'appreci',
    'approach',
    'aris',
    'ask',
    'assum',
    'astonish',
    'attempt',
    'aw',
    'awesom',
    'azur',
    'bad',
    # 'begin',
    'behavior',
    'behaviour',
    'best',
    'better',
    'case',
    'categori',
    'caus',
    'challeng',
    'cloudera',
    # 'close',
    'code',
    'command',
    'confus',
    'consid',
    'contain',
    'content',
    'correct',
    'correctli',
    'correspond',
    'couldn',
    'curiou',
    'custom',
    'deep',
    'demand',
    'demo',
    'despit',
    'differ',
    'differenti',
    'difficult',
    'difficulti',
    'discuss',
    'distinguish',
    'easi',
    'effect',
    'emerg',
    'encount',
    # 'end',
    'enquiri',
    'error',
    'especi',
    'exampl',
    'excit',
    'expect',
    'experi',
    'databrick',
    'domo',
    'face',
    'fascin',
    'fail',
    'failur',
    'fairli',
    'favorit',
    'favourit',
    'feel',
    'firstli',
    'fix',
    'gcp',
    'given',
    'good',
    'googl',
    'guarante',  # stem of "guarantee"
    'gurante',
    'happen',
    'hard',
    'hei',
    'hello',
    'help',
    'ibm',
    'impli',
    'implic',
    'includ',
    'incorrect',
    'incorrectli',
    'incred',
    'indic',
    'info',
    'inform',
    'inquiri',
    'insight',
    'instead',
    'interest',
    'invalid',
    'issu',
    'kind',
    'know',
    'lead',
    'learn',
    'like',
    'look',
    'machin',
    'main',
    'major',
    'manner',
    'marvel',
    'mean',
    'meaning',
    'meaningfulli',
    'meaningless',
    'mention',
    'method',
    'microsoft',
    'mind',
    'mistak',
    'mistakenli',
    # 'multipl',
    'need',
    'new',
    'non',
    'notic',  # stem of "notice"
    'notice',
    'occas',
    'occasion',
    'occur',
    'offer',
    'old',
    'own',
    # 'open',
    'oracl',
    'ought',
    'outcom',
    'particular',
    'particularli',
    'perceiv',  # stem of "perceive"
    'perceive',
    'perspect',
    'point',
    'pointless',
    'possibl',
    'pretti',  # stem of "pretty"
    'pretty',
    'problem',
    'product',
    # 'program',
    'project',
    'provid',
    'python',
    'pytorch',
    'question',
    'realiz',  # stem of "realize"
    'realize',
    'recogn',  # stem of "recognize"
    'recognize',
    'refer',
    'regard',
    'requir',
    'resolv',
    'respond',
    'result',
    'right',
    'rightli',
    'scenario',
    'scikit',
    'script',
    'second',
    'secondli',
    'seek',
    'seen',
    'shall',
    'shan',
    'shock',
    'shouldn',
    'similar',
    'situat',
    'sklearn',
    'snippet',
    'snowflak',
    'solut',
    'solv',
    'sound',
    # 'sourc',
    'special',
    'specif',
    # 'start',
    'startl',
    'strang',
    'struggl',
    'stun',
    'succe',
    'success',
    'suggest',
    'super',
    'talk',
    'tensorflow',
    'thank',
    'think',
    'thirdli',
    'thought',
    'topic',
    'try',
    'unabl',
    'understand',
    'unexpect',
    'us',
    'user',
    'usual',
    'valid',
    'view',
    'viewpoint',
    'wai',
    'want',
    'weird',
    'worst',
    'won',
    'wonder',
    'work',
    'wors',
    'wouldn',
    'wrong',
    'wrongli',
    'ye',
]

df_all = pd.read_json(os.path.join(path_dataset, 'original.json'))

# remove_stopwords tests `token in stopwords` for every token; a frozenset
# makes that test constant-time instead of a scan over the whole list.
stop_word_set = frozenset(stop_words_custom)


def strip_stop_words(text):
    """Remove custom stop words from `text`; pass missing values through.

    Anything that is not a non-empty string (None, NaN, '') is returned
    unchanged, so each column is guarded on its own value. Previously all
    three Solution_* columns were gated on the truthiness of
    Solution_gpt_summary alone, and NaN is truthy.
    """
    if isinstance(text, str) and text:
        return remove_stopwords(text, stopwords=stop_word_set)
    return text


for column in (
    'Challenge_original_content',
    'Challenge_preprocessed_content',
    'Challenge_gpt_summary',
    'Solution_original_content',
    'Solution_preprocessed_content',
    'Solution_gpt_summary',
):
    df_all[column] = df_all[column].map(strip_stop_words)

df_all.to_json(os.path.join(path_dataset, 'preprocessed.json'),
               indent=4, orient='records')

In [57]:
df_all = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# remove issues with uninformed content: fewer than 6 tokens or fewer than 30
# characters left in the challenge text (solutions are not filtered)
challenge_text = df_all['Challenge_original_content']
is_uninformed = challenge_text.map(
    lambda text: len(text.split()) < 6 or len(text) < 30).astype(bool)

# list what is about to be discarded, in row order
for text in challenge_text[is_uninformed]:
    print('Challenge: ', text)

df_all.drop(df_all.index[is_uninformed], inplace=True)

df_all.to_json(os.path.join(path_dataset, 'filtered.json'),
               indent=4, orient='records')


Challenge:  initi studio lab titl
Challenge:  reproduc featur appli patch stage
Challenge:  access allow access member
Challenge:  tensorboard default logger option
Challenge:  connect databas ye connect databas
Challenge:  save model save model studi
Challenge:  api api studio calcul
Challenge:  run notebook develop notebook
Challenge:  endpoint public endpoint access publicli
Challenge:  price know estim price design
Challenge:  support distribut gpu plan date
Challenge:  sdk plane gen chang
Challenge:  descriptor creat messag super confus
Challenge:  model state deploi web servic
Challenge:  monitor setup monitor pipelin job
Challenge:  disappear dont migrat design access
Challenge:  build bot build bot hand book
Challenge:  framework dont document
Challenge:  copi workspac duplic workspac debug
Challenge:  algorithm suitabl reorgan attend
Challenge:  csv export datacsvcsv
Challenge:  mapinputport execut maml mapinputport studio
Challenge:  graph dataset graph dataset
Challenge:  ap

In [59]:
# Draw sankey diagram of tool and platform

df_all = pd.read_json(os.path.join(path_dataset, 'original.json'))
# a challenge counts as closed as soon as it has a closing timestamp
df_all['State'] = np.where(
    df_all['Challenge_closed_time'].isna(), 'open', 'closed')

categories = ['Platform', 'Tool', 'State']

# one row per (Platform, Tool, State) combination with its frequency
df_all = df_all.groupby(categories).size().reset_index(name='value')
df_all.to_json(os.path.join(path_general, 'Tool platform info.json'),
               indent=4, orient='records')

# every pair of neighbouring categories contributes one layer of links
link_frames = [
    df_all[[left, right, 'value']].set_axis(
        ['source', 'target', 'value'], axis=1)
    for left, right in zip(categories, categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# plotly addresses nodes by position in the label list
label = list(np.unique(df_all[categories].values))
label_position = {name: position for position, name in enumerate(label)}
source = newDf['source'].map(label_position)
target = newDf['target'].map(label_position)
value = newDf['value']

link = dict(source=source, target=target, value=value)
node = dict(label=label)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(
    path_general, 'Tool platform sankey.png'))

In [None]:
# post-process the clustered topics out of the best training model

df_topics = pd.read_json(os.path.join(path_dataset, 'topics.json'))
df_topics = df_topics[df_topics['Solution_topic'] > -1]

# as if we assign the topic id as the label; the c_/s_ prefixes keep a
# challenge topic and a solution topic with the same id apart as nodes
label_challenge_map = {
    topic: f'c_{topic}'
    for topic in df_topics['Challenge_topic'].unique().tolist()
}
label_solution_map = {
    topic: f's_{topic}'
    for topic in df_topics['Solution_topic'].unique().tolist()
}

df_topics = df_topics.replace(
    {'Challenge_topic': label_challenge_map, 'Solution_topic': label_solution_map})

categories = ['Challenge_topic', 'Solution_topic']
df_topics = df_topics.groupby(categories).size().reset_index(name='value')

# we only visualize large topics
df_topics = df_topics[df_topics['value'] > 50]

# every pair of neighbouring categories contributes one layer of links
link_frames = [
    df_topics[[left, right, 'value']].set_axis(
        ['source', 'target', 'value'], axis=1)
    for left, right in zip(categories, categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# plotly addresses nodes by position in the label list
label = list(np.unique(df_topics[categories].values))
label_position = {name: position for position, name in enumerate(label)}
source = newDf['source'].map(label_position)
target = newDf['target'].map(label_position)
value = newDf['value']

link = dict(source=source, target=target, value=value)
node = dict(label=label)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(height=1000, width=1000, font=dict(size=30))
fig.write_image(os.path.join(path_challenge_information,
                'Challenge solution sankey.png'))

In [None]:
# Create challenge topic distribution tree map

df_topics = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Participation = answers + comments. The combine step of this notebook sets
# Challenge_comment_count only for rows coming from questions.json, so rows
# from issues.json hold NaN there. A plain `+` would make their participation
# NaN as well; fill_value=0 counts a missing operand as zero instead.
df_topics['Challenge_participation_count'] = df_topics['Challenge_answer_count'].add(
    df_topics['Challenge_comment_count'], fill_value=0)

fig = px.treemap(
    df_topics,
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Challenge_topic',
    width=2000,
    height=1000,
)
fig.write_image(os.path.join(path_challenge_information,
                'Challenge_topic_distribution.png'))


In [26]:
# Create solution topic distribution tree map

df_topics = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# .copy(): a column is added below, which must not happen on a filtered view
# of the original frame (SettingWithCopyWarning).
df_topics = df_topics[df_topics['Solution_topic'] > -1].copy()
# Participation = answers + comments. The combine step of this notebook sets
# Challenge_comment_count only for rows coming from questions.json, so rows
# from issues.json hold NaN there. A plain `+` would make their participation
# NaN as well; fill_value=0 counts a missing operand as zero instead.
df_topics['Challenge_participation_count'] = df_topics['Challenge_answer_count'].add(
    df_topics['Challenge_comment_count'], fill_value=0)

fig = px.treemap(
    df_topics,
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Solution_topic',
    width=2000,
    height=1000,
)
fig.write_image(os.path.join(path_solution_information,
                'Solution_topic_distribution.png'))


In [None]:
# Collect challenge statistics information

df_challenge = pd.read_json(os.path.join(path_dataset, 'topics.json'))
df_challenge['Challenge_solved_time'] = df_challenge['Challenge_closed_time'] - \
    df_challenge['Challenge_creation_time']
# The combine step of this notebook sets Challenge_comment_count only for rows
# coming from questions.json; fill_value=0 keeps rows with a missing comment
# count from getting a NaN participation count.
df_challenge['Challenge_participation_count'] = df_challenge['Challenge_answer_count'].add(
    df_challenge['Challenge_comment_count'], fill_value=0)

total_count = df_challenge['Challenge_topic'].count()
topic_records = []

for name, group in df_challenge.groupby('Challenge_topic'):
    Mean_score = group['Challenge_score'].mean()
    Mean_link_count = group['Challenge_link_count'].mean()
    Mean_information_entropy = group['Challenge_information_entropy'].mean()
    Mean_answer_count = group['Challenge_answer_count'].mean()
    Mean_comment_count = group['Challenge_comment_count'].mean()
    Mean_participation_count = Mean_answer_count + Mean_comment_count
    # NOTE: a zero mean participation yields inf/NaN here, not an exception.
    Score_participation_ratio = Mean_score / Mean_participation_count
    Score_participation_weighted_product = (
        group['Challenge_score'] * group['Challenge_participation_count']).mean()
    Count = group['Challenge_topic'].count()
    Count_ratio = Count / total_count * 100  # percentage of all challenges
    Solved_ratio = group['Challenge_closed_time'].notna().sum() / Count
    # solved times are reported in hours
    Mean_solved_time = group['Challenge_solved_time'].mean(
    ) / pd.Timedelta(hours=1)
    Median_solved_time = group['Challenge_solved_time'].median(
    ) / pd.Timedelta(hours=1)
    topic_records.append({
        'Challenge_topic': name,
        'Mean_score': Mean_score,
        'Mean_link_count': Mean_link_count,
        'Mean_information_entropy': Mean_information_entropy,
        'Mean_answer_count': Mean_answer_count,
        'Mean_comment_count': Mean_comment_count,
        'Score_participation_ratio': Score_participation_ratio,
        'Score_participation_weighted_product': Score_participation_weighted_product,
        'Count_ratio': Count_ratio,
        'Solved_ratio': Solved_ratio,
        'Mean_solved_time': Mean_solved_time,
        'Median_solved_time': Median_solved_time,
    })

df_topics = pd.DataFrame(topic_records)
df_topics.to_json(os.path.join(path_challenge_information,
                  'general.json'), indent=4, orient='records')
df_topics = df_topics.set_index('Challenge_topic')

# One bar chart per statistic: (column, file name, title, ascending sort).
# The mean/median solved-time titles used to be swapped; each title now names
# the column that is actually plotted. File names are unchanged.
challenge_bar_charts = [
    ('Mean_score', 'Mean_score.png', 'Challenge mean score', False),
    ('Mean_link_count', 'Mean_link_count.png',
     'Challenge mean link count', False),
    ('Mean_information_entropy', 'Mean_information_entropy.png',
     'Challenge mean info entropy', False),
    ('Mean_answer_count', 'Mean_answer_count.png',
     'Challenge mean answer count', False),
    ('Mean_comment_count', 'Mean_comment_count.png',
     'Challenge mean comment count', False),
    ('Score_participation_ratio', 'Score_participation_ratio.png',
     'Challenge score participation ratio', False),
    ('Score_participation_weighted_product', 'Score_participation_weighted_product.png',
     'Challenge score participation weighted product', False),
    ('Count_ratio', 'Count_ratio.png', 'Challenge count ratio', False),
    ('Solved_ratio', 'solved_ratio.png', 'Challenge Solved ratio', True),
    ('Mean_solved_time', 'mean_solved_time.png',
     'Challenge mean solved time', False),
    ('Median_solved_time', 'median_solved_time.png',
     'Challenge median solved time', False),
]

for column, file_name, title, ascending in challenge_bar_charts:
    fig = df_topics.sort_values(column, ascending=ascending)[column].plot(
        kind='bar', figsize=(15, 8), title=title, rot=15).get_figure()
    fig.savefig(os.path.join(path_challenge_information, file_name))
    # close this specific figure so figures do not pile up in memory
    plt.close(fig)


In [None]:
# Collect solution statistics information

df_solution = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# keep only rows that were assigned to a solution topic (-1 is excluded)
df_solution = df_solution[df_solution['Solution_topic'] > -1]

# Everything below works on df_solution. The previous version iterated over
# df_challenge, a variable left over from another cell, which bypassed the
# filter above and failed on a fresh kernel.
total_count = df_solution['Solution_topic'].count()
topic_records = []

for name, group in df_solution.groupby('Solution_topic'):
    # 'Answer_score' is renamed to 'Solution_score' when issues and questions
    # are combined earlier in this notebook.
    Mean_score = group['Solution_score'].mean()
    Mean_link_count = group['Solution_link_count'].mean()
    Mean_information_entropy = group['Solution_information_entropy'].mean()
    Count_ratio = group['Solution_topic'].count() / total_count * 100  # percentage
    topic_records.append({
        'Solution_topic': name,
        'Mean_score': Mean_score,
        'Mean_link_count': Mean_link_count,
        'Mean_information_entropy': Mean_information_entropy,
        'Count_ratio': Count_ratio,
    })

df_topics = pd.DataFrame(topic_records)
df_topics.to_json(os.path.join(path_solution_information,
                  'general.json'), indent=4, orient='records')
df_topics = df_topics.set_index('Solution_topic')

# One bar chart per statistic: (column, file name, title). All charts are
# sorted in descending order. File names are unchanged.
solution_bar_charts = [
    ('Mean_score', 'Mean_score.png', 'Solution mean score'),
    ('Mean_link_count', 'Mean_link_count.png', 'Solution mean link count'),
    ('Mean_information_entropy', 'Mean_information_entropy.png',
     'Solution mean info entropy'),
    ('Count_ratio', 'Count_ratio.png', 'Solution count ratio'),
]

for column, file_name, title in solution_bar_charts:
    fig = df_topics.sort_values(column, ascending=False)[column].plot(
        kind='bar', figsize=(15, 8), title=title, rot=15).get_figure()
    fig.savefig(os.path.join(path_solution_information, file_name))
    # close this specific figure so figures do not pile up in memory
    plt.close(fig)


In [4]:
import scipy.interpolate
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess


def smooth(x, y, xgrid, lowess_kw=None, n_samples=50):
    """Fit LOWESS to one bootstrap resample of (x, y) and evaluate it on xgrid.

    Parameters
    ----------
    x, y : numpy arrays of equal length (they are indexed with an index array).
    xgrid : points at which the smoothed curve is evaluated.
    lowess_kw : keyword arguments forwarded to statsmodels' lowess; None means
        no extra arguments. The interpolation below pairs the smoothed values
        with the resampled x positions, so callers should pass
        return_sorted=False (as the callers in this notebook do).
    n_samples : number of points drawn with replacement for the resample.

    Returns
    -------
    numpy array with the smoothed values at xgrid. It may contain NaN/inf when
    the resample has duplicate x values; the caller aggregates with
    nan-aware functions.
    """
    # The default None used to be unpacked with ** and raised a TypeError.
    lowess_kw = {} if lowess_kw is None else lowess_kw
    samples = np.random.choice(len(x), n_samples, replace=True)
    y_s = y[samples]
    x_s = x[samples]
    y_sm = sm_lowess(y_s, x_s, **lowess_kw)
    # regularly sample it onto the grid
    y_grid = scipy.interpolate.interp1d(
        x_s, y_sm, fill_value='extrapolate')(xgrid)
    return y_grid


def lowess_with_confidence_bounds(x, y, conf_interval=0.95, lowess_kw=None, n_boot=100):
    """
    Perform Lowess regression and determine a confidence interval by bootstrap resampling

    Parameters
    ----------
    x, y : numpy arrays of equal length.
    conf_interval : central coverage of the band, e.g. 0.95 gives the
        2.5th and 97.5th percentiles of the bootstrap curves.
    lowess_kw : keyword arguments forwarded (via smooth) to the LOWESS fit;
        None means no extra arguments.
    n_boot : number of bootstrap curves to fit.

    Returns
    -------
    (xgrid, mean, stderr, clower, cupper): the evaluation grid spanning
    [x.min(), x.max()] (np.linspace default size), and per grid point the
    nan-aware mean, the standard error, and the lower/upper percentile bounds
    across the bootstrap curves.
    """
    # Only scipy.interpolate is imported at the top of this cell; import the
    # stats submodule explicitly instead of relying on another package having
    # loaded it as a side effect.
    import scipy.stats

    # Normalise here so the helper never receives None to unpack.
    lowess_kw = {} if lowess_kw is None else lowess_kw

    xgrid = np.linspace(x.min(), x.max())

    # one column per bootstrap curve, one row per grid point
    smooths = np.stack([smooth(x, y, xgrid, lowess_kw)
                        for _ in range(n_boot)]).T

    mean = np.nanmean(smooths, axis=1)
    stderr = scipy.stats.sem(smooths, axis=1)

    clower = np.nanpercentile(smooths, (1-conf_interval)*50, axis=1)
    cupper = np.nanpercentile(smooths, (1+conf_interval)*50, axis=1)

    return xgrid, mean, stderr, clower, cupper

In [73]:
df_all = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
# earliest and latest challenge creation time in the data set
creation_times = df_all['Challenge_creation_time']
min(creation_times), max(creation_times)


(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [None]:
# Explore challenge topics evolution

df_challenge = pd.read_json(os.path.join(path_dataset, 'topics.json'))
df_challenge = df_challenge[(df_challenge['Challenge_creation_time'] > '2014-09-14')
                            & (df_challenge['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_challenge.groupby('Challenge_topic'):
    # number of challenges created per two-week bin
    group = group.groupby(pd.Grouper(key='Challenge_creation_time', freq='2W')).agg(
        Count=('Challenge_topic', 'count')).reset_index()
    if group.empty:
        # nothing to resample or plot for this topic
        continue
    dates = pd.to_datetime(group['Challenge_creation_time']).values
    # LOWESS needs numeric x: whole days since 1970-01-01, converted in one
    # vectorised step
    x = dates.astype('datetime64[D]').astype('int64')
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    # Convert the day-number grid back to datetimes so that observations and
    # the fitted curve share one explicit date axis. Plotting raw day numbers
    # next to datetimes only lines up when matplotlib's date epoch happens to
    # be 1970-01-01.
    xgrid_dates = pd.to_datetime(xgrid, unit='D')
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, y, 'k.', label='Observations')
    ax.plot(xgrid_dates, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid_dates, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.legend(loc='best')
    # explicit extension: do not depend on the savefig.format default
    fig.savefig(os.path.join(path_challenge_evolution,
                f'Topic_{name}.png'), bbox_inches="tight")
    plt.close(fig)

In [28]:
df_all = pd.read_json(os.path.join(path_dataset, 'topics.json'))
df_solution = df_all[df_all['Solution_topic'] > -1]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
# earliest and latest creation time among challenges with a solution topic
solved_creation_times = df_solution['Challenge_creation_time']
min(solved_creation_times), max(solved_creation_times)

(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [7]:
# Explore solution topics evolution

df_solution = pd.read_json(os.path.join(path_dataset, 'topics.json'))
df_solution = df_solution[df_solution['Solution_topic'] > -1]
df_solution = df_solution[(df_solution['Challenge_creation_time'] > '2014-09-14')
                          & (df_solution['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_solution.groupby('Solution_topic'):
    # number of challenges closed per week; rows without a closing time do
    # not fall into any bin
    group = group.groupby(pd.Grouper(key='Challenge_closed_time', freq='W')).agg(
        Count=('Solution_topic', 'count')).reset_index()
    if group.empty:
        # no closed challenge in this topic: the bootstrap cannot sample from
        # zero points, so skip the topic instead of crashing
        continue
    dates = pd.to_datetime(group['Challenge_closed_time']).values
    # LOWESS needs numeric x: whole days since 1970-01-01, converted in one
    # vectorised step
    x = dates.astype('datetime64[D]').astype('int64')
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    # Convert the day-number grid back to datetimes so that observations and
    # the fitted curve share one explicit date axis. Plotting raw day numbers
    # next to datetimes only lines up when matplotlib's date epoch happens to
    # be 1970-01-01.
    xgrid_dates = pd.to_datetime(xgrid, unit='D')
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, y, 'k.', label='Observations')
    ax.plot(xgrid_dates, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid_dates, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.legend(loc='best')
    # explicit extension: do not depend on the savefig.format default
    fig.savefig(os.path.join(path_solution_evolution,
                f'Topic_{name}.png'), bbox_inches="tight")
    plt.close(fig)