In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from statsmodels.nonparametric.smoothers_lowess import lowess

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [10]:
path_result = '../../Result'
path_dataset = '../../Dataset'

path_rq12 = os.path.join(path_result, 'RQ12')
path_rq4 = os.path.join(path_result, 'RQ3')
path_rq6 = os.path.join(path_result, 'RQ6')

warnings.filterwarnings('ignore')

tools_open_post = [
    'Domino',
    'DVC',
    'Guild AI',
    'MLflow',
    'SigOpt'
]

macro_topic_indexing = {
    0: 'Code Development',
    1: 'Code Management',
    2: 'Compute Management',
    3: 'Data Development',
    4: 'Data Management',
    5: 'Environment Management',
    6: 'Experiment Management',
    7: 'File Management',
    8: 'Model Development',
    9: 'Model Management',
    10: 'Model Deployment',
    11: 'Network Management',
    12: 'Observability Management',
    13: 'Pipeline Management',
    14: 'Security Management',
    15: 'User Interface Management'
}


In [4]:
def nan_helper(y):
    """Helper to handle indices and logical indices of NaNs.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices= index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x= nan_helper(y)
        >>> y[nans]= np.interp(x(nans), x(~nans), y[~nans])
    """
    return np.isnan(y), lambda z: np.nonzero(z)[0]


def extrapolate_nans_1d(y):
    nans, x = nan_helper(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    return y

def filter_open_posts(df):
    df_copy = df.copy()
    for index, row in df_copy.iterrows():
        if ('Tool' in row['Platform']) and (row['Tools'][0] in tools_open_post):
            df_copy.drop(index, inplace=True)
    return df_copy


In [12]:
# plot the challenge mertics evolution

df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))
df['Challenge_topic_macro'] = df['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])
df = filter_open_posts(df)

fig_challenge_topic_count = go.Figure()
fig_challenge_score = go.Figure()
fig_challenge_view_count = go.Figure()
fig_challenge_topic_closed_count = go.Figure()
fig_challenge_unsolved_rate = go.Figure()

for name, group in df.groupby('Challenge_topic_macro'):
    # plot challenge topic count evolution
    group_count = group.groupby(pd.Grouper(key='Challenge_created_time', freq='Q'))[
        'Challenge_topic_macro'].count().reset_index()
    # group_count = group_count[group_count['Challenge_created_time'] < lastest_time]
    x = pd.to_datetime(group_count['Challenge_created_time']).values
    y = np.diff(group_count['Challenge_topic_macro'].values)
    y = extrapolate_nans_1d(y)
    y = np.insert(y, 0, 0)
    x_lowess = np.array([i.astype('datetime64[D]').astype(int) for i in x])
    y_lowess = lowess(y, x_lowess, return_sorted=False)
    fig_challenge_topic_count.add_trace(
        go.Scatter(x=x, y=y_lowess, mode='lines', name=name))

    group_evolution = group.groupby(pd.Grouper(key='Challenge_created_time', freq='Q'))[
        ['Challenge_view_count', 'Challenge_score_count']].sum().reset_index()

    # plot challenge score count evolution
    y = np.diff(group_evolution['Challenge_score_count'].values)
    y = extrapolate_nans_1d(y)
    y = np.insert(y, 0, 0)
    y_lowess = lowess(y, x_lowess, return_sorted=False)
    fig_challenge_score.add_trace(go.Scatter(
        x=x, y=y_lowess, mode='lines', name=name))

    # plot challenge view count evolution
    y = np.diff(group_evolution['Challenge_view_count'].values)
    y = extrapolate_nans_1d(y)
    y = np.insert(y, 0, 0)
    y_lowess = lowess(y, x_lowess, return_sorted=False)
    fig_challenge_view_count.add_trace(go.Scatter(
        x=x, y=y_lowess, mode='lines', name=name))
    
    # plot challenge solved rate evolution
    group_all = group.groupby(pd.Grouper(key='Challenge_created_time', freq='Q'))['Challenge_topic_macro'].count(
    ).cumsum().reset_index().rename(columns={'Challenge_created_time': 'Date', 'Challenge_topic_macro': 'All'})
    group_closed = group.groupby(pd.Grouper(key='Challenge_closed_time', freq='Q'))['Challenge_topic_macro'].count(
    ).cumsum().reset_index().rename(columns={'Challenge_closed_time': 'Date', 'Challenge_topic_macro': 'Solved'})
    group_solved = pd.merge(group_closed, group_all, on='Date', how='outer').fillna(
        0).sort_values(by='Date')
    # group_solved = group_solved[group_solved['Date'] < lastest_time]
    x = pd.to_datetime(group_solved['Date']).values
    y = (1 - group_solved['Solved'] / group_solved['All']) * 100
    y = extrapolate_nans_1d(y)
    x_lowess = np.array([i.astype('datetime64[D]').astype(int) for i in x])
    y_lowess = lowess(y, x_lowess, return_sorted=False)
    fig_challenge_unsolved_rate.add_trace(
        go.Scatter(x=x, y=y_lowess, mode='lines', name=name))

    # plot challenge closed topic count evolution
    group_evolution = group.groupby(pd.Grouper(key='Challenge_closed_time', freq='Q'))[
        'Challenge_topic_macro'].count().reset_index()
    # group_evolution = group_evolution[group_evolution['Challenge_closed_time'] < lastest_time]
    x = pd.to_datetime(group_evolution['Challenge_closed_time']).values
    y = np.diff(group_evolution['Challenge_topic_macro'].values)
    y = extrapolate_nans_1d(y)
    y = np.insert(y, 0, 0)
    x_lowess = np.array([i.astype('datetime64[D]').astype(int) for i in x])
    y_lowess = lowess(y, x_lowess, return_sorted=False)
    fig_challenge_topic_closed_count.add_trace(
        go.Scatter(x=x, y=y_lowess, mode='lines', name=name))

fig_challenge_topic_count.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0))
fig_challenge_view_count.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0))
fig_challenge_score.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0))
fig_challenge_topic_closed_count.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0))
fig_challenge_unsolved_rate.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0))

fig_challenge_topic_count.write_image(os.path.join(
    path_rq6, f'Challenge_topic_count_increase_rate.pdf'))
fig_challenge_view_count.write_image(os.path.join(
    path_rq6, f'Challenge_view_count_increase_rate.pdf'))
fig_challenge_score.write_image(os.path.join(
    path_rq6, f'Challenge_score_count_increase_rate.pdf'))
fig_challenge_topic_closed_count.write_image(os.path.join(
    path_rq6, f'Challenge_topic_closed_count_increase_rate.pdf'))
fig_challenge_unsolved_rate.write_image(os.path.join(
    path_rq6, f'Challenge_unsolved_rate.pdf'))


In [16]:
# plot challenge median solved & open time evolution

df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))
df['Challenge_topic_macro'] = df['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])

fig_challenge_open_time = go.Figure()

for name, group in df.groupby('Challenge_topic_macro'):
    x = group.groupby(pd.Grouper(key='Challenge_created_time', freq='Q')).count().reset_index()['Challenge_created_time']
    y = []
    for quarter in x:
        group_created = group[group['Challenge_created_time'] <= quarter]
        group_open = group_created[group_created['Challenge_closed_time'] > quarter]
        group_open['Challenge_open_time'] = (quarter - group_open['Challenge_created_time']) / pd.Timedelta(hours=1)
        group_closed = group_created[~group_created.isin(group_open)]
        open_time = pd.concat([group_closed['Challenge_resolved_time'], group_open['Challenge_open_time']]).median()
        # log scale all numerical values for better visualization of long-tailed distributions
        open_time = np.log(open_time + 1)
        y.append(open_time)
    x = pd.to_datetime(x).values
    x_lowess = np.array([i.astype('datetime64[D]').astype(int) for i in x])
    y_lowess = lowess(y, x_lowess, return_sorted=False)
    fig_challenge_open_time.add_trace(go.Scatter(x=x, y=y_lowess, mode='lines', name=name))

fig_challenge_open_time.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0))
fig_challenge_open_time.write_image(os.path.join(
    path_rq6, f'Challenge_open_time.pdf'))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/