In [2]:
import os
import numpy as np
import plotly.graph_objects as go
from matplotlib import pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Lift pandas' display limits: a value of None removes the cap on the number
# of rows, the number of columns and the width of each cell's text.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

In [10]:
from IPython.core.interactiveshell import InteractiveShell
# "all" makes IPython display the value of every top-level expression in a
# cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# Input and output folders. Both roots are siblings of the current working
# directory (i.e. they live in the parent of wherever the notebook runs).
_project_root = os.path.dirname(os.getcwd())

path_dataset = os.path.join(_project_root, 'Dataset')

# Output tree:
#   Result/
#     General/
#     Challenge/{Information, Evolution}
#     Solution/{Information, Evolution}
path_result = os.path.join(_project_root, 'Result')
path_general = os.path.join(path_result, 'General')
path_challenge = os.path.join(path_result, 'Challenge')
path_solution = os.path.join(path_result, 'Solution')
path_challenge_information = os.path.join(path_challenge, 'Information')
path_solution_information = os.path.join(path_solution, 'Information')
path_challenge_evolution = os.path.join(path_challenge, 'Evolution')
path_solution_evolution = os.path.join(path_solution, 'Evolution')

# `exist_ok=True` replaces the "check, then create" pattern: it is idempotent
# on re-runs and cannot fail if the folder appears between check and creation.
# The dataset folder is an input and is deliberately not created here.
for _output_dir in (
    path_result,
    path_general,
    path_challenge,
    path_solution,
    path_challenge_information,
    path_solution_information,
    path_challenge_evolution,
    path_solution_evolution,
):
    os.makedirs(_output_dir, exist_ok=True)

In [28]:
# combine issues and questions
#
# Both inputs are mapped onto a shared `Challenge_*` / `Solution_*` column
# schema, their source-specific columns are dropped, and the two frames are
# stacked and written to `original.json`.

import re
from collections import Counter
from scipy.stats import entropy

# Raw string: in a plain literal `\s` is an invalid escape sequence, which
# newer Python versions warn about. The compiled pattern is unchanged.
link_pattern = r'(?P<url>https?://[^\s]+)'
_link_regex = re.compile(link_pattern)


def _char_entropy(text):
    """Return the Shannon entropy (in bits) of the characters in `text`.

    Non-string input (None / NaN for a missing body) yields None so that the
    value is stored as missing rather than raising.
    """
    if not isinstance(text, str):
        return None
    total = len(text)
    # Counter makes one pass over the text instead of one `count` scan per
    # distinct character.
    return entropy([n / total for n in Counter(text).values()], base=2)


def _count_links(text):
    """Return how many http(s) URLs `link_pattern` finds in `text`."""
    return len(_link_regex.findall(text))


def _title_and_body(df, prefix):
    """Join `<prefix>_title` and `<prefix>_body` into one text per row."""
    return df[f'{prefix}_title'] + ' ' + df[f'{prefix}_body'].astype(str)


df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))
df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))

# ---- issues -> challenge schema -------------------------------------------
issue_text = _title_and_body(df_issues, 'Issue')

df_issues['Challenge_link'] = df_issues['Issue_link']
df_issues['Challenge_original_content'] = df_issues['Issue_original_content']
df_issues['Challenge_preprocessed_content'] = df_issues['Issue_preprocessed_content']
df_issues['Challenge_summary'] = df_issues['Issue_gpt_summary']
df_issues['Challenge_creation_time'] = df_issues['Issue_creation_time']
df_issues['Challenge_answer_count'] = df_issues['Issue_answer_count']
# The issue data has no comment-count column, so it is fixed to 0.
df_issues['Challenge_comment_count'] = 0
df_issues['Challenge_score'] = df_issues['Issue_upvote_count'] - \
    df_issues['Issue_downvote_count']
df_issues['Challenge_closed_time'] = df_issues['Issue_closed_time']
df_issues['Challenge_info_entropy'] = issue_text.apply(_char_entropy)
# A missing title makes the joined text NaN; cast to str so the count is 0
# instead of an exception.
df_issues['Challenge_link_count'] = issue_text.astype(str).apply(_count_links)
df_issues['Solution_summary'] = df_issues['Fix_manual_summary']

# ---- questions -> challenge / solution schema ------------------------------
question_text = _title_and_body(df_questions, 'Question')

df_questions['Challenge_link'] = df_questions['Question_link']
df_questions['Challenge_original_content'] = df_questions['Question_original_content']
df_questions['Challenge_preprocessed_content'] = df_questions['Question_preprocessed_content']
df_questions['Challenge_summary'] = df_questions['Question_gpt_summary']
df_questions['Challenge_creation_time'] = df_questions['Question_creation_time']
df_questions['Challenge_answer_count'] = df_questions['Question_answer_count']
df_questions['Challenge_comment_count'] = df_questions['Question_comment_count']
df_questions['Challenge_score'] = df_questions['Question_score']
df_questions['Challenge_closed_time'] = df_questions['Question_closed_time']
df_questions['Challenge_info_entropy'] = question_text.apply(_char_entropy)
df_questions['Challenge_link_count'] = question_text.astype(str).apply(_count_links)
df_questions['Solution_original_content'] = df_questions['Answer_original_content']
df_questions['Solution_preprocessed_content'] = df_questions['Answer_preprocessed_content']
df_questions['Solution_summary'] = df_questions['Answer_gpt_summary']
# Entropy stays missing when there is no answer body.
df_questions['Solution_info_entropy'] = df_questions['Answer_body'].apply(_char_entropy)
# `astype(str)` turns a missing body into a short placeholder string, so the
# link count for unanswered questions is 0 rather than missing.
df_questions['Solution_link_count'] = df_questions['Answer_body'].astype(str).apply(_count_links)

# ---- drop the source-specific columns --------------------------------------
df_issues = df_issues.drop(columns=[
    'Issue_title',
    'Issue_body',
    'Issue_link',
    'Issue_creation_time',
    'Issue_answer_count',
    'Issue_upvote_count',
    'Issue_downvote_count',
    'Issue_original_content',
    'Issue_preprocessed_content',
    'Issue_gpt_summary_original',
    'Issue_gpt_summary',
    'Issue_closed_time',
    'Fix_manual_summary_original',
    'Fix_manual_summary',
])

df_questions = df_questions.drop(columns=[
    'Question_title',
    'Question_body',
    'Question_link',
    'Question_creation_time',
    'Question_answer_count',
    'Question_comment_count',
    'Question_score',
    'Question_original_content',
    'Question_preprocessed_content',
    'Question_gpt_summary_original',
    'Question_gpt_summary',
    'Question_closed_time',
    'Answer_body',
    'Answer_list',
    'Answer_original_content',
    'Answer_preprocessed_content',
    'Answer_gpt_summary_original',
    'Answer_gpt_summary',
])

# Columns present in only one frame become missing values for the other.
df_all = pd.concat([df_issues, df_questions], ignore_index=True)
df_all.to_json(os.path.join(path_dataset, 'original.json'),
               indent=4, orient='records')

In [29]:
# Draw sankey diagram of tool and platform

df_all = pd.read_json(os.path.join(path_dataset, 'original.json'))

# One row per (Platform, Tool) pair, with the number of challenges as `value`.
categories = ['Platform', 'Tool']
df_all = df_all.groupby(categories).size().reset_index(name='value')

# Each pair of adjacent categories contributes one set of links. The frames
# are collected in a list and concatenated once, rather than being appended
# to an empty DataFrame inside the loop.
link_frames = [
    df_all[[left, right, 'value']].rename(
        columns={left: 'source', right: 'target'})
    for left, right in zip(categories[:-1], categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Sankey links refer to nodes by position in `label`; a dict lookup replaces
# a linear `list.index` scan per row.
label = list(np.unique(df_all[categories].values))
label_position = {name: position for position, name in enumerate(label)}
source = newDf['source'].map(label_position)
target = newDf['target'].map(label_position)
value = newDf['value']

link = dict(source=source, target=target, value=value)
node = dict(label=label)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(
    path_general, 'Tool platform sankey.png'))

In [30]:
# remove custom stop words from challenges and solutions

from gensim.parsing.preprocessing import remove_stopwords

# Refer to https://venturebeat.com/data-infrastructure/top-10-data-lake-solution-vendors-in-2022/
# Extra stop words passed to gensim's `remove_stopwords` further down.
# The entries look like word stems ('applic', 'azur', 'issu', ...), so they
# presumably only match text that has already been stemmed — TODO confirm
# against the preprocessing that produced the dataset.
# Entries that are commented out are currently NOT treated as stop words.
stop_words_custom = [
    'altern',
    'amazon',
    'answer',
    # 'api',
    'applic',
    'appreci',
    'approach',
    'aris',
    'ask',
    'assum',
    'attempt',
    'aw',
    'azur',
    'bad',
    # 'begin',
    'behavior',
    'behaviour',
    'best',
    'better',
    'caus',
    'challeng',
    'cloudera',
    # 'close',
    'code',
    'command',
    'consid',
    'contain',
    'content',
    'correct',
    'correctli',
    'correspond',
    'couldn',
    'curiou',
    'custom',
    'deep',
    'demand',
    'demo',
    'despit',
    'differ',
    'differenti',
    'difficult',
    'difficulti',
    'distinguish',
    'easi',
    'effect',
    'encount',
    # 'end',
    'enquiri',
    'error',
    'especi',
    'exampl',
    'expect',
    'experi',
    'databrick',
    'domo',
    'face',
    'fail',
    'failur',
    'firstli',
    'fix',
    'gcp',
    'given',
    'good',
    'googl',
    # NOTE(review): 'gurante' may be a typo for the stem 'guarante' — confirm.
    'gurante',
    'happen',
    'hard',
    'hei',
    'hello',
    'help',
    'ibm',
    'includ',
    'incorrect',
    'incorrectli',
    'info',
    'inform',
    'inquiri',
    'insight',
    'instead',
    'intern',
    'invalid',
    'issu',
    'lead',
    'learn',
    'like',
    'look',
    'machin',
    'main',
    'major',
    'manner',
    'mention',
    'method',
    'microsoft',
    'mind',
    'mistak',
    'mistakenli',
    # 'multipl',
    'need',
    'new',
    'non',
    'occur',
    'offer',
    'old',
    'own',
    # 'open',
    'oracl',
    'ought',
    'outcom',
    'particular',
    'particularli',
    'perspect',
    'possibl',
    'problem',
    'product',
    # 'program',
    'project',
    'provid',
    'python',
    'pytorch',
    'question',
    'requir',
    'resolv',
    'respond',
    'result',
    'right',
    'rightli',
    'scikit',
    'script',
    'second',
    'secondli',
    'seek',
    'seen',
    'shall',
    'shan',
    'shouldn',
    'similar',
    'sklearn',
    'snippet',
    'snowflak',
    'solut',
    'solv',
    # 'sourc',
    'special',
    'specif',
    # 'start',
    'strang',
    'struggl',
    'succe',
    'success',
    'suggest',
    'tensorflow',
    'thank',
    'think',
    'thirdli',
    'thought',
    'topic',
    'try',
    'unabl',
    'understand',
    'unexpect',
    'us',
    'user',
    'valid',
    'view',
    'viewpoint',
    'wai',
    'want',
    'weird',
    'worst',
    'won',
    'wonder',
    'work',
    'wors',
    'wouldn',
    'wrong',
    'wrongli',
]

df_all = pd.read_json(os.path.join(path_dataset, 'original.json'))

# Set membership is O(1); `remove_stopwords` tests every token against this.
_stop_word_set = frozenset(stop_words_custom)


def _strip_stop_words(text, strip_markers):
    """Remove the custom stop words from one cell value.

    text: the cell value. Anything that is not a string (None / NaN for a
        missing value) is returned unchanged instead of raising.
    strip_markers: when True, the literal markers 'Title: ' and 'Answer: '
        are deleted before stop-word removal.
    """
    if not isinstance(text, str):
        return text
    if strip_markers:
        text = text.replace('Title: ', '').replace('Answer: ', '')
    return remove_stopwords(text, stopwords=_stop_word_set)


def _has_text(value):
    """True only for a non-empty string (NaN is truthy, so `if value` is not enough)."""
    return isinstance(value, str) and bool(value)


# (column suffix, whether the 'Title: ' / 'Answer: ' markers are stripped)
_content_columns = (
    ('original_content', True),
    ('preprocessed_content', True),
    ('summary', False),
)

# Challenge columns are cleaned on every row.
for _suffix, _strip_markers in _content_columns:
    _column = f'Challenge_{_suffix}'
    df_all[_column] = df_all[_column].apply(
        _strip_stop_words, strip_markers=_strip_markers)

# Solution columns are cleaned only on rows that have solution content.
# NOTE(review): rows with a `Solution_summary` but no
# `Solution_original_content` are therefore left uncleaned, as before —
# confirm that this is intended.
has_solution = df_all['Solution_original_content'].apply(_has_text).astype(bool)
for _suffix, _strip_markers in _content_columns:
    _column = f'Solution_{_suffix}'
    df_all.loc[has_solution, _column] = df_all.loc[has_solution, _column].apply(
        _strip_stop_words, strip_markers=_strip_markers)

df_all.to_json(os.path.join(path_dataset, 'preprocessed.json'),
               indent=4, orient='records')

In [31]:
df_all = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# Content with fewer words or fewer characters than these limits is treated
# as uninformative.
MIN_CONTENT_WORDS = 6
MIN_CONTENT_CHARS = 30


def _is_uninformative(text):
    """True when `text` has too few words or too few characters."""
    return len(text.split()) < MIN_CONTENT_WORDS or len(text) < MIN_CONTENT_CHARS


def _is_uninformative_solution(value):
    """True for a non-empty solution string that is too short.

    Missing solutions (None / NaN) and empty strings are never flagged; NaN
    is truthy, so a plain `if value` check would try to split a float.
    """
    return isinstance(value, str) and bool(value) and _is_uninformative(value)


# remove challenges whose content, or whose solution content, is uninformative
challenge_too_short = df_all['Challenge_original_content'].apply(
    _is_uninformative).astype(bool)
solution_too_short = df_all['Solution_original_content'].apply(
    _is_uninformative_solution).astype(bool)

# Print the text that causes each removal: the challenge content when the
# challenge itself is too short, otherwise the solution content.
for challenge, solution, drop_for_challenge, drop_for_solution in zip(
        df_all['Challenge_original_content'],
        df_all['Solution_original_content'],
        challenge_too_short,
        solution_too_short):
    if drop_for_challenge:
        print(challenge)
    elif drop_for_solution:
        print(solution)

# Select the surviving rows with a mask instead of dropping rows in place
# while iterating over the frame.
df_all = df_all[~(challenge_too_short | solution_too_short)]

df_all.to_json(os.path.join(path_dataset, 'filtered.json'),
               indent=4, orient='records')


initi studio lab titl
modulenotfounderror modul tensorboard
combin param param work
access allow access member given
load
deploy
log val loss
import
tensorboard default logger option
logger
connect databas ye connect databas
afraid moment set built autopilot
run end run finish creat
vpc endpoint api api st
support hive adl
cloudform templat sure
csv file folder recordio
littl involv ye
file pin azurestor
processingstep uri locat refer
leav case come accross
pipelin moment
cli sdk
let know happen
delet creat endpoint
report appear bug team investig
work
like templat run creat instanc
passau refer visual studio magazin
bug got todai close
upgrad sqlalchemi
file path ensur path
tri work let file transient
think chang log stream servic
strang instal pip instal
situat chang git avail
sdk support automl sdk support
actual come smdebug version downgrad
updat core
forc gpu devic devic gpu
begin list
accord document maximum
randomcutforest rcf lean randomforest algorithm
sure estim framework ve

In [None]:
import plotly.graph_objects as go

# Sankey diagram linking challenge topics to solution topics.
df_topics = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Keep only rows whose solution topic id is above -1.
df_topics = df_topics[df_topics['Solution_topic'] > -1]

# Topic ids serve as node labels. They are prefixed with 'c_' / 's_' so that
# a challenge topic and a solution topic sharing an id stay separate nodes.
label_challenge_original = df_topics['Challenge_topic'].unique().tolist()
label_challenge_refined = [f'c_{label}' for label in label_challenge_original]
label_challenge_map = dict(
    zip(label_challenge_original, label_challenge_refined))

label_solution_original = df_topics['Solution_topic'].unique().tolist()
label_solution_refined = [f's_{label}' for label in label_solution_original]
label_solution_map = dict(zip(label_solution_original, label_solution_refined))

df_topics = df_topics.replace(
    {'Challenge_topic': label_challenge_map, 'Solution_topic': label_solution_map})

# One row per (challenge topic, solution topic) pair with its row count.
categories = ['Challenge_topic', 'Solution_topic']
df_topics = df_topics.groupby(categories).size().reset_index(name='value')

# Only pairs with more than 50 rows are drawn. The threshold applies to each
# challenge/solution pair, not to a topic's overall size.
df_topics = df_topics[df_topics['value'] > 50]

# Each pair of adjacent categories contributes one set of links. The frames
# are collected in a list and concatenated once, rather than being appended
# to an empty DataFrame inside the loop.
link_frames = [
    df_topics[[left, right, 'value']].rename(
        columns={left: 'source', right: 'target'})
    for left, right in zip(categories[:-1], categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Sankey links refer to nodes by position in `label`; a dict lookup replaces
# a linear `list.index` scan per row.
label = list(np.unique(df_topics[categories].values))
label_position = {name: position for position, name in enumerate(label)}
source = newDf['source'].map(label_position)
target = newDf['target'].map(label_position)
value = newDf['value']

link = dict(source=source, target=target, value=value)
node = dict(label=label)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(height=1000, width=1000, font=dict(size=30))
fig.write_image(os.path.join(path_challenge_information,
                'Challenge solution sankey.png'))

In [None]:
# Create challenge topic distribution tree map

import plotly.express as px

topics_file = os.path.join(path_dataset, 'topics.json')
df_topics = pd.read_json(topics_file)

# Participation = answers + comments; it sets the area of each tile.
participation = (df_topics['Challenge_answer_count']
                 + df_topics['Challenge_comment_count'])
df_topics['Challenge_participation_count'] = participation

# Tiles are nested Tool -> Platform and coloured by challenge topic.
treemap_options = dict(
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Challenge_topic',
    width=2000,
    height=1000,
)
fig = px.treemap(df_topics, **treemap_options)

treemap_file = os.path.join(path_challenge_information,
                            'Challenge_topic_distribution.png')
fig.write_image(treemap_file)


In [26]:
# Create solution topic distribution tree map

import plotly.express as px

df_topics = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Keep only rows whose solution topic id is above -1. `assign` then builds a
# new frame, so the participation column is never written onto a filtered
# slice of the loaded frame (which triggers pandas' SettingWithCopy warning).
df_topics = df_topics[df_topics['Solution_topic'] > -1].assign(
    Challenge_participation_count=lambda frame: (
        frame['Challenge_answer_count'] + frame['Challenge_comment_count']),
)

# Tiles are nested Tool -> Platform, sized by participation (answers +
# comments) and coloured by solution topic.
fig = px.treemap(
    df_topics,
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Solution_topic',
    width=2000,
    height=1000,
)
fig.write_image(os.path.join(path_solution_information,
                'Solution_topic_distribution.png'))


In [None]:
# Collect challenge statistics information
#
# Computes one row of summary statistics per challenge topic, writes them to
# `general.json`, and saves one bar chart per statistic.

df_challenge = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Time from creation to closing; missing when there is no closed time.
df_challenge['Challenge_solved_time'] = df_challenge['Challenge_closed_time'] - \
    df_challenge['Challenge_creation_time']
df_challenge['Challenge_participation_count'] = df_challenge['Challenge_answer_count'] + \
    df_challenge['Challenge_comment_count']

total_count = df_challenge['Challenge_topic'].count()
topic_records = []

for name, group in df_challenge.groupby('Challenge_topic'):
    Mean_score = group['Challenge_score'].mean()
    Mean_link_count = group['Challenge_link_count'].mean()
    Mean_info_entropy = group['Challenge_info_entropy'].mean()
    Mean_answer_count = group['Challenge_answer_count'].mean()
    Mean_comment_count = group['Challenge_comment_count'].mean()
    Mean_participation_count = Mean_answer_count + Mean_comment_count
    # Ratio of the two means; inf/NaN if a topic has no participation at all.
    Score_participation_ratio = Mean_score / Mean_participation_count
    # Mean of the per-challenge product score * participation.
    Score_participation_weighted_product = (
        group['Challenge_score'] * group['Challenge_participation_count']).mean()
    Count = group['Challenge_topic'].count()
    Count_ratio = Count / total_count * 100  # percentage of all challenges
    # Share of the topic's challenges that have a closed time.
    Solved_ratio = group['Challenge_closed_time'].notna().sum() / Count
    # Dividing by a one-hour Timedelta expresses the durations in hours.
    Mean_solved_time = group['Challenge_solved_time'].mean(
    ) / pd.Timedelta(hours=1)
    Median_solved_time = group['Challenge_solved_time'].median(
    ) / pd.Timedelta(hours=1)
    topic_records.append({
        'Challenge_topic': name,
        'Mean_score': Mean_score,
        'Mean_link_count': Mean_link_count,
        'Mean_info_entropy': Mean_info_entropy,
        'Mean_answer_count': Mean_answer_count,
        'Mean_comment_count': Mean_comment_count,
        'Score_participation_ratio': Score_participation_ratio,
        'Score_participation_weighted_product': Score_participation_weighted_product,
        'Count_ratio': Count_ratio,
        'Solved_ratio': Solved_ratio,
        'Mean_solved_time': Mean_solved_time,
        'Median_solved_time': Median_solved_time,
    })

df_topics = pd.DataFrame(topic_records)
df_topics.to_json(os.path.join(path_challenge_information,
                  'general.json'), indent=4, orient='records')
df_topics = df_topics.set_index('Challenge_topic')


def _save_challenge_bar_chart(stats, column, title, filename, ascending=False):
    """Save a bar chart of `stats[column]`, one bar per topic, sorted by value.

    stats: per-topic statistics frame indexed by topic.
    column: name of the statistic to plot.
    title: chart title.
    filename: file name inside `path_challenge_information`.
    ascending: sort direction of the bars (descending by default).
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    stats.sort_values(column, ascending=ascending)[column].plot(
        kind='bar', ax=ax, title=title, rot=15)
    fig.savefig(os.path.join(path_challenge_information, filename))
    plt.close(fig)


# (column, title, file name, ascending)
# File names, including their mixed capitalisation, are unchanged.
_challenge_charts = [
    ('Mean_score', 'Challenge mean score', 'Mean_score.png', False),
    ('Mean_link_count', 'Challenge mean link count', 'Mean_link_count.png', False),
    ('Mean_info_entropy', 'Challenge mean info entropy', 'Mean_info_entropy.png', False),
    ('Mean_answer_count', 'Challenge mean answer count', 'Mean_answer_count.png', False),
    ('Mean_comment_count', 'Challenge mean comment count', 'Mean_comment_count.png', False),
    ('Score_participation_ratio', 'Challenge score participation ratio',
     'Score_participation_ratio.png', False),
    ('Score_participation_weighted_product', 'Challenge score participation weighted product',
     'Score_participation_weighted_product.png', False),
    ('Count_ratio', 'Challenge count ratio', 'Count_ratio.png', False),
    # Solved ratio is the only chart sorted in ascending order.
    ('Solved_ratio', 'Challenge Solved ratio', 'solved_ratio.png', True),
    # The titles of the next two charts used to be swapped.
    ('Mean_solved_time', 'Challenge mean solved time', 'mean_solved_time.png', False),
    ('Median_solved_time', 'Challenge median solved time', 'median_solved_time.png', False),
]

for _column, _title, _filename, _ascending in _challenge_charts:
    _save_challenge_bar_chart(df_topics, _column, _title, _filename, _ascending)


In [None]:
# Collect solution statistics information
#
# Computes one row of summary statistics per solution topic, writes them to
# `general.json`, and saves one bar chart per statistic.

df_solution = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Keep only rows whose solution topic id is above -1.
df_solution = df_solution[df_solution['Solution_topic'] > -1]

# These statistics must come from the filtered `df_solution`. The previous
# code read `df_challenge`, a frame left over from another cell that still
# contained the rows excluded above.
total_count = df_solution['Solution_topic'].count()
topic_records = []

for name, group in df_solution.groupby('Solution_topic'):
    Mean_score = group['Answer_score'].mean()
    Mean_link_count = group['Solution_link_count'].mean()
    Mean_info_entropy = group['Solution_info_entropy'].mean()
    Count_ratio = group['Solution_topic'].count() / total_count * 100  # percentage
    topic_records.append({
        # NOTE(review): this key holds the *solution* topic id. It is left as
        # 'Challenge_topic' so the JSON layout stays unchanged for any
        # existing reader — consider renaming it to 'Solution_topic'.
        'Challenge_topic': name,
        'Mean_score': Mean_score,
        'Mean_link_count': Mean_link_count,
        'Mean_info_entropy': Mean_info_entropy,
        'Count_ratio': Count_ratio,
    })

df_topics = pd.DataFrame(topic_records)
df_topics.to_json(os.path.join(path_solution_information,
                  'general.json'), indent=4, orient='records')
# Rename the index only for plotting, so the x-axis label of each chart reads
# 'Solution_topic'.
df_topics = df_topics.set_index('Challenge_topic').rename_axis('Solution_topic')


def _save_solution_bar_chart(stats, column, title, filename):
    """Save a bar chart of `stats[column]`, one bar per topic, largest first.

    stats: per-topic statistics frame indexed by topic.
    column: name of the statistic to plot.
    title: chart title.
    filename: file name inside `path_solution_information`.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    stats.sort_values(column, ascending=False)[column].plot(
        kind='bar', ax=ax, title=title, rot=15)
    fig.savefig(os.path.join(path_solution_information, filename))
    plt.close(fig)


# (column, title, file name). Titles now say "Solution"; they used to be
# copied verbatim from the challenge charts.
_solution_charts = [
    ('Mean_score', 'Solution mean score', 'Mean_score.png'),
    ('Mean_link_count', 'Solution mean link count', 'Mean_link_count.png'),
    ('Mean_info_entropy', 'Solution mean info entropy', 'Mean_info_entropy.png'),
    ('Count_ratio', 'Solution count ratio', 'Count_ratio.png'),
]

for _column, _title, _filename in _solution_charts:
    _save_solution_bar_chart(df_topics, _column, _title, _filename)


In [4]:
import scipy.interpolate
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess


def smooth(x, y, xgrid, lowess_kw=None, n_samples=50):
    """Fit LOWESS to one bootstrap resample of (x, y) and evaluate it on xgrid.

    x, y: 1-D numpy arrays of equal length (they are indexed with an integer
        array).
    xgrid: x positions at which the smoothed curve is returned.
    lowess_kw: optional dict of extra keyword arguments for statsmodels'
        `lowess` (e.g. `frac`, `it`). None means "no extra arguments".
    n_samples: number of points drawn, with replacement, for the resample.

    Returns the smoothed values at `xgrid`. The resample is drawn from
    numpy's global random state, so results vary between calls unless that
    state is seeded. Because points are drawn with replacement, x values can
    repeat; NOTE(review): repeated x values may produce NaN in the
    interpolated output — confirm against scipy's `interp1d` behaviour.
    """
    # `None` used to reach `**lowess_kw` and raise a TypeError.
    kwargs = dict(lowess_kw or {})
    # The interpolation below pairs each smoothed value with its own x_s
    # entry, which requires lowess to return values in input order.
    kwargs['return_sorted'] = False

    samples = np.random.choice(len(x), n_samples, replace=True)
    y_s = y[samples]
    x_s = x[samples]
    y_sm = sm_lowess(y_s, x_s, **kwargs)
    # regularly sample it onto the grid
    y_grid = scipy.interpolate.interp1d(
        x_s, y_sm, fill_value='extrapolate')(xgrid)
    return y_grid


def lowess_with_confidence_bounds(x, y, conf_interval=0.95, lowess_kw=None, n_boot=100):
    """
    Perform Lowess regression and determine a confidence interval by bootstrap resampling

    x, y: 1-D numpy arrays of equal length.
    conf_interval: central fraction covered by the returned band
        (0.95 gives the 2.5th and 97.5th percentiles).
    lowess_kw: optional dict of extra keyword arguments for the LOWESS fit.
        None means "no extra arguments".
    n_boot: number of bootstrap curves to fit.

    Returns (xgrid, mean, stderr, clower, cupper): the evaluation grid
    (numpy's default of 50 evenly spaced points between x.min() and
    x.max()), then, per grid point across the bootstrap curves, the
    NaN-ignoring mean, the standard error, and the lower / upper percentile
    bounds.
    """
    # Imported here because only `scipy.interpolate` is imported at the top
    # of this cell; `scipy.stats` is not guaranteed to be loaded otherwise.
    from scipy.stats import sem

    # Normalise the options once: `None` must not reach `**lowess_kw`, and
    # `smooth` needs lowess output in input order.
    kwargs = {**(lowess_kw or {}), 'return_sorted': False}

    xgrid = np.linspace(x.min(), x.max())

    # One column per bootstrap curve, one row per grid point.
    smooths = np.stack([smooth(x, y, xgrid, kwargs) for _ in range(n_boot)]).T

    mean = np.nanmean(smooths, axis=1)
    # NOTE(review): unlike the mean and the percentiles, the standard error
    # is not NaN-aware, so one NaN curve value makes that grid point NaN.
    stderr = sem(smooths, axis=1)

    clower = np.nanpercentile(smooths, (1-conf_interval)*50, axis=1)
    cupper = np.nanpercentile(smooths, (1+conf_interval)*50, axis=1)

    return xgrid, mean, stderr, clower, cupper

In [73]:
topics_file = os.path.join(path_dataset, 'topics.json')
df_all = pd.read_json(topics_file)
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
# Show the earliest and latest challenge creation times as a pair.
creation_times = df_all['Challenge_creation_time']
min(creation_times), max(creation_times)


(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [None]:
# Explore challenge topics evolution
#
# For every challenge topic: count new challenges per two-week bin, fit a
# bootstrapped LOWESS trend, and save the chart.

df_challenge = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Restrict to the studied creation-time window (both bounds exclusive).
df_challenge = df_challenge[(df_challenge['Challenge_creation_time'] > '2014-09-14')
                            & (df_challenge['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_challenge.groupby('Challenge_topic'):
    group = group.groupby(pd.Grouper(key='Challenge_creation_time', freq='2W')).agg(
        Count=('Challenge_topic', 'count')).reset_index()
    if group.empty:
        # Nothing to fit or plot for this topic.
        continue

    dates = pd.to_datetime(group['Challenge_creation_time']).values
    # LOWESS needs numeric x: whole days since the Unix epoch.
    x = dates.astype('datetime64[D]').astype('int64')
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    # Convert the grid back to dates so the trend and the observations share
    # one datetime axis. Plotting raw day numbers against datetimes only
    # lines up when matplotlib's date epoch happens to be 1970-01-01.
    xgrid_dates = pd.to_datetime(xgrid, unit='D')

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, y, 'k.', label='Observations')
    ax.plot(xgrid_dates, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid_dates, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.set_title(f'Challenge topic {name}: challenges created per two weeks')
    ax.set_xlabel('Challenge creation time')
    ax.set_ylabel('Count')
    ax.legend(loc='best')
    fig.savefig(os.path.join(path_challenge_evolution,
                f'Topic_{name}'), bbox_inches="tight")
    plt.close(fig)

In [28]:
topics_file = os.path.join(path_dataset, 'topics.json')
df_all = pd.read_json(topics_file)
# Keep only rows whose solution topic id is above -1.
has_solution_topic = df_all['Solution_topic'] > -1
df_solution = df_all[has_solution_topic]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
# Show the earliest and latest creation times among those rows as a pair.
solution_creation_times = df_solution['Challenge_creation_time']
min(solution_creation_times), max(solution_creation_times)

(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [7]:
# Explore solution topics evolution
#
# For every solution topic: count challenges closed per week, fit a
# bootstrapped LOWESS trend, and save the chart.

df_solution = pd.read_json(os.path.join(path_dataset, 'topics.json'))
# Keep only rows whose solution topic id is above -1.
df_solution = df_solution[df_solution['Solution_topic'] > -1]
# Restrict to the studied creation-time window (both bounds exclusive).
# NOTE(review): the window is applied to the creation time while the charts
# are binned by closed time, so bins can fall after the upper bound —
# confirm that this is intended.
df_solution = df_solution[(df_solution['Challenge_creation_time'] > '2014-09-14')
                          & (df_solution['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_solution.groupby('Solution_topic'):
    # Rows without a closed time cannot be binned; dropping them first lets a
    # topic with no closed times be skipped cleanly.
    group = group.dropna(subset=['Challenge_closed_time'])
    if group.empty:
        continue
    group = group.groupby(pd.Grouper(key='Challenge_closed_time', freq='W')).agg(
        Count=('Solution_topic', 'count')).reset_index()

    dates = pd.to_datetime(group['Challenge_closed_time']).values
    # LOWESS needs numeric x: whole days since the Unix epoch.
    x = dates.astype('datetime64[D]').astype('int64')
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    # Convert the grid back to dates so the trend and the observations share
    # one datetime axis. Plotting raw day numbers against datetimes only
    # lines up when matplotlib's date epoch happens to be 1970-01-01.
    xgrid_dates = pd.to_datetime(xgrid, unit='D')

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, y, 'k.', label='Observations')
    ax.plot(xgrid_dates, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid_dates, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.set_title(f'Solution topic {name}: challenges closed per week')
    ax.set_xlabel('Challenge closed time')
    ax.set_ylabel('Count')
    ax.legend(loc='best')
    fig.savefig(os.path.join(path_solution_evolution,
                f'Topic_{name}'), bbox_inches="tight")
    plt.close(fig)