In [11]:
import os
import warnings
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go

from statsmodels.nonparametric.smoothers_lowess import lowess

In [12]:
# --- Input / output locations, relative to this notebook -------------------
path_result, path_dataset = '../../Result', '../../Dataset'

path_rq12 = os.path.join(path_result, 'RQ12')
# NOTE(review): the variable is named rq4 but the folder is 'RQ3' -- confirm
# which of the two is intended.
path_rq4 = os.path.join(path_result, 'RQ3')
path_rq6 = os.path.join(path_result, 'RQ6')

# Suppress every warning emitted from this point on.
warnings.filterwarnings('ignore')

# Tool names that filter_open_posts compares with the first entry of a
# post's 'Tools' value.
tools_open_post = ['Domino', 'DVC', 'Guild AI', 'MLflow', 'SigOpt']

# Integer macro-topic id -> display name; the id is the position in the list.
macro_topic_indexing = dict(enumerate([
    'Code Development',
    'Code Management',
    'Computation Management',
    'Data Development',
    'Data Management',
    'Environment Management',
    'Experiment Management',
    'File Management',
    'Model Development',
    'Model Management',
    'Model Deployment',
    'Network Management',
    'Observability Management',
    'Pipeline Management',
    'Security Management',
    'User Interface Management',
]))

# One hex colour per macro-topic id (indexed with the same integer id).
colors = [
    '#e6194b',  # 0: red
    '#3cb44b',  # 1: green
    '#ffe119',  # 2: yellow
    '#4363d8',  # 3: blue
    '#f58231',  # 4: orange
    '#911eb4',  # 5: purple
    '#46f0f0',  # 6: cyan
    '#f032e6',  # 7: magenta
    '#bcf60c',  # 8: lime green
    '#fabebe',  # 9: pink
    '#008080',  # 10: teal
    '#e6beff',  # 11: lavender
    '#9a6324',  # 12: brown
    '#fffac8',  # 13: beige
    '#800000',  # 14: maroon
    '#aaffc3',  # 15: mint
]


In [13]:
def nan_helper(y):
    """Locate the NaN entries of a 1-D array.

    Parameters
    ----------
    y : 1-D numpy array
        Values that may contain NaNs.

    Returns
    -------
    nan_mask : boolean array
        True wherever ``y`` is NaN.
    to_positions : callable
        Converts a boolean mask into the integer positions of its True
        entries, i.e. ``np.nonzero(mask)[0]``.

    Example
    -------
    Linear interpolation of the NaNs of ``y``::

        nans, x = nan_helper(y)
        y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """
    def to_positions(mask):
        return np.nonzero(mask)[0]

    nan_mask = np.isnan(y)
    return nan_mask, to_positions


def extrapolate_nans_1d(y):
    """Fill the NaNs of a 1-D sequence by linear interpolation, in place.

    NaNs between two valid values are interpolated linearly over their
    positions; NaNs before the first or after the last valid value take that
    nearest valid value (``np.interp`` clamps outside the sample range).

    Parameters
    ----------
    y : 1-D numpy array or pandas Series
        Values to repair. The object is modified in place.

    Returns
    -------
    The same object ``y``, with its NaNs filled. It is returned unchanged
    when it is empty, contains no NaN, or contains only NaNs (there is then
    nothing to interpolate from).
    """
    nan_mask = np.asarray(np.isnan(y), dtype=bool)

    # np.interp raises ValueError when it is given no sample points, which
    # happens for empty and all-NaN input; with no NaN there is nothing to do.
    if not nan_mask.any() or nan_mask.all():
        return y

    positions = np.arange(nan_mask.size)
    valid_values = np.asarray(y, dtype=float)[~nan_mask]
    y[nan_mask] = np.interp(positions[nan_mask], positions[~nan_mask], valid_values)
    return y

def filter_open_posts(df, open_tools=None):
    """Return a copy of ``df`` without the posts of open-post tools.

    A row is removed when its 'Platform' value contains 'Tool' and the first
    entry of its 'Tools' value is one of ``open_tools``. Rows whose 'Tools'
    value is empty are kept. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Platform' and 'Tools'.
    open_tools : collection of str, optional
        Tool names to filter out. Defaults to the notebook-level
        ``tools_open_post`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining rows, original index preserved.
    """
    if open_tools is None:
        open_tools = tools_open_post

    def is_open_post(platform, tools):
        # 'Tools' is only inspected for tool platforms, as before.
        if 'Tool' not in platform:
            return False
        has_tool = hasattr(tools, '__len__') and len(tools) > 0
        return has_tool and tools[0] in open_tools

    # One boolean mask instead of dropping rows one by one.
    drop_mask = np.fromiter(
        (is_open_post(platform, tools)
         for platform, tools in zip(df['Platform'], df['Tools'])),
        dtype=bool,
        count=len(df),
    )
    return df.loc[~drop_mask].copy()


In [14]:
# Integer topic id -> display name. The id of a topic is its position in the
# list below (0-based), so the order of the entries must not change.
topic_mapping = dict(enumerate([
    # 0-9
    'Pipeline Step',
    'Notebook Instance',
    'Docker Image',
    'Version',
    'Log',
    'Plots',
    'Studio',
    'PyTorch',
    'Prediction',
    'Labeling Job',
    # 10-19
    'Hyperparameter Tuning',
    'TensorFlow',
    'Spark',
    'RStudio',
    'Workspace',
    'Batch Transform',
    'Web Service',
    'Experiment',
    'Instance',
    'Sweep',
    # 20-29
    'Deploying Model',
    'Columns Values',
    'Inference Endpoint',
    'Compute',
    'Account',
    'File Directory',
    'Object Attribute',
    'Parameters Parameter',
    'File Pandas',
    'Git Repo',
    # 30-39
    'Training Job',
    'Save Model',
    'Bucket',
    'Limit Exceeded',
    'Data Factory',
    'Format Convert',
    'Server',
    'Feature Store',
    'Environment',
    'Initialization Init',
    # 40-49
    'Custom Metrics',
    'Compute Cluster',
    'Model Endpoint',
    'Memory',
    'Stuck Queue',
    'Distributed Training',
    'File Upload',
    'Python',
    'TensorBoard',
    'Deploying Endpoint',
    # 50-59
    'Authentication',
    'Module Import',
    'Deployment Model',
    'Artifact Download',
    'File Read',
    'Batch Prediction',
    'API',
    'API Key',
    'File Storage',
    'Creating Terraform',
    # 60-69
    'CUDA Memory',
    'Register Model',
    'Log Metrics',
    'Config Configuration',
    'Inference Pipeline',
    'Invoke Endpoint',
    'Blob Storage',
    'Comparison Performance',
    'Loading Model',
    'Pickle File',
    # 70-79
    'Endpoint Lambda',
    'File Download',
    'Broken Link',
    'Permissions Authorization',
    'Exporting Data',
    'Datasets',
    'Model Registry',
    'Image Read',
    'Artifacts',
    'Deploy Model',
    # 80-89
    'Endpoint Content',
    'Creating Model',
    'Train Model',
    'Score Model',
    'Endpoint Ping',
    'Parallelization Job',
    'Module Usage',
    'Deployment Kubernetes',
    'Data Stores',
    'Permission Denied',
    # 90-99
    'Training Reports',
    'Log Model',
    'Model Monitoring',
    'Model Deployment',
    'Pipeline TrainingStep',
    'Missing Module',
    'Install Package',
    'Logs CloudWatch',
    'Import Notebook',
    'Training Model',
    # 100-109
    'Installation',
    'Quota Request',
    'NEO Compiling',
    'Connection',
    'Training Container',
    'Huggingface Model',
    'Container Registry',
    'Model ONNX',
    'Models',
    'Globals YAML',
    # 110-119
    'Deployment ACI',
    'Trained Model',
    'Web Interface',
    'scikit-learn',
    'Dependency',
    'Nested Runs',
    'Import Data',
    'Endpoint Deploying',
    'Endpoint Prediction',
    'Lifecycle Configuration',
    # 120-129
    'Output Inputs',
    'Custom Job',
    'Jupyter Notebook',
    'Response Endpoint',
    'Shared Cache',
    'Training YOLO',
    'Checkpoints Training',
    'Endpoint Transitioning',
    'Training File',
    'Inference',
    # 130-132
    'Connecting RDS',
    'Model Serving',
    'Tuning Model',
]))

# Quarterly evolution of every topic that belongs to one macro-topic.
# Seven figures are built: topic / score / view counts, their
# quarter-over-quarter change ("rate"), and the share of unsolved posts.

# Key of 'Observability Management' in macro_topic_indexing.
MACRO_TOPIC_ID = 12
# Fraction of the points used for each local regression of the smoother.
LOWESS_FRAC = 0.5
# With fewer points the raw values are plotted instead of a smoothed curve.
MIN_POINTS_FOR_LOWESS = 3

df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))
df = df[df['Challenge_topic_macro'] == MACRO_TOPIC_ID]

fig_challenge_topic_count = go.Figure()
fig_challenge_score_count = go.Figure()
fig_challenge_view_count = go.Figure()
fig_challenge_topic_rate = go.Figure()
fig_challenge_score_rate = go.Figure()
fig_challenge_view_rate = go.Figure()
fig_challenge_unsolved_rate = go.Figure()

for index, group in df.groupby('Challenge_topic'):
    name = topic_mapping[index]

    # Posts of this topic, binned by the quarter in which they were created.
    created_by_quarter = group.groupby(
        pd.Grouper(key='Challenge_created_time', freq='Q'))
    group_count = created_by_quarter['Challenge_topic'].count()
    group_evolution = created_by_quarter[
        ['Challenge_view_count', 'Challenge_score_count']].sum()

    quarters = pd.to_datetime(group_count.index).values
    topic_counts = group_count.values
    score_counts = group_evolution['Challenge_score_count'].values
    view_counts = group_evolution['Challenge_view_count'].values

    # (figure, x values, y values). A "rate" is the change relative to the
    # previous quarter; prepending the first value makes the first change 0
    # and keeps the length equal to the number of quarters, also when a
    # topic only spans a single quarter.
    curves = [
        (fig_challenge_topic_count, quarters, topic_counts),
        (fig_challenge_topic_rate, quarters,
         np.diff(topic_counts, prepend=topic_counts[:1])),
        (fig_challenge_score_count, quarters, score_counts),
        (fig_challenge_score_rate, quarters,
         np.diff(score_counts, prepend=score_counts[:1])),
        (fig_challenge_view_count, quarters, view_counts),
        (fig_challenge_view_rate, quarters,
         np.diff(view_counts, prepend=view_counts[:1])),
    ]

    # Unsolved rate: cumulative number of closed posts against the cumulative
    # number of created posts, ignoring posts of open-post tools.
    tracked = filter_open_posts(group)
    group_all = tracked.groupby(
        pd.Grouper(key='Challenge_created_time', freq='Q')
    )['Challenge_topic'].count().cumsum().reset_index().rename(
        columns={'Challenge_created_time': 'Date', 'Challenge_topic': 'All'})
    group_closed = tracked.groupby(
        pd.Grouper(key='Challenge_closed_time', freq='Q')
    )['Challenge_topic'].count().cumsum().reset_index().rename(
        columns={'Challenge_closed_time': 'Date', 'Challenge_topic': 'Solved'})
    group_solved = pd.merge(
        group_closed, group_all, on='Date', how='outer').sort_values(by='Date')
    # Both columns are running totals: a quarter that is missing from one
    # side keeps that side's previous total (forward fill). Only the quarters
    # before the first observation are really zero.
    group_solved[['Solved', 'All']] = (
        group_solved[['Solved', 'All']].ffill().fillna(0))

    unsolved = (1 - group_solved['Solved'] / group_solved['All']) * 100
    # Quarters without any created post yet divide by zero; treat them as
    # missing so that they are interpolated rather than plotted as +/-inf.
    unsolved = unsolved.replace([np.inf, -np.inf], np.nan).to_numpy(
        dtype=float, copy=True)
    if np.isfinite(unsolved).any():
        curves.append((
            fig_challenge_unsolved_rate,
            pd.to_datetime(group_solved['Date']).values,
            extrapolate_nans_1d(unsolved),
        ))

    for figure, x, y in curves:
        # The smoother needs a numeric x axis: whole days since the epoch.
        x_lowess = x.astype('datetime64[D]').astype(int)
        if len(y) >= MIN_POINTS_FOR_LOWESS:
            y_lowess = lowess(y, x_lowess, return_sorted=False, frac=LOWESS_FRAC)
        else:
            y_lowess = np.asarray(y, dtype=float)
        figure.add_trace(go.Scatter(x=x, y=y_lowess, mode='lines', name=name))

# Title and figure, listed in the order in which the figures are displayed.
titled_figures = [
    ('topic count evolution', fig_challenge_topic_count),
    ('view count evolution', fig_challenge_view_count),
    ('score count evolution', fig_challenge_score_count),
    ('topic rate evolution', fig_challenge_topic_rate),
    ('view rate evolution', fig_challenge_view_rate),
    ('score rate evolution', fig_challenge_score_rate),
    ('unsolved rate evolution', fig_challenge_unsolved_rate),
]

for title, figure in titled_figures:
    figure.update_layout(
        title=title,
        width=1000,
        height=750,
        legend=dict(
            x=0,
            y=1,
            xanchor='auto',
            yanchor='auto'
        ))

for _, figure in titled_figures:
    figure.show()

In [15]:
# plot the challenge metrics evolution, one curve per macro-topic
#
# Seven figures are built, laid out, exported as PDF into path_rq6 and
# displayed: topic / score / view counts, their quarter-over-quarter change
# ("rate"), and the share of unsolved posts.

# Fraction of the points used for each local regression of the smoother.
LOWESS_FRAC = 0.5
# With fewer points the raw values are plotted instead of a smoothed curve.
MIN_POINTS_FOR_LOWESS = 3

df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))
# Posts of open-post tools are excluded from every metric of this cell.
df = filter_open_posts(df)

fig_challenge_topic_count = go.Figure()
fig_challenge_score_count = go.Figure()
fig_challenge_view_count = go.Figure()
fig_challenge_topic_rate = go.Figure()
fig_challenge_score_rate = go.Figure()
fig_challenge_view_rate = go.Figure()
fig_challenge_unsolved_rate = go.Figure()

for index, group in df.groupby('Challenge_topic_macro'):
    name = macro_topic_indexing[index]
    color = colors[index]

    # Posts of this macro-topic, binned by the quarter of their creation.
    created_by_quarter = group.groupby(
        pd.Grouper(key='Challenge_created_time', freq='Q'))
    group_count = created_by_quarter['Challenge_topic_macro'].count()
    group_evolution = created_by_quarter[
        ['Challenge_view_count', 'Challenge_score_count']].sum()

    quarters = pd.to_datetime(group_count.index).values
    topic_counts = group_count.values
    score_counts = group_evolution['Challenge_score_count'].values
    view_counts = group_evolution['Challenge_view_count'].values

    # (figure, x values, y values). A "rate" is the change relative to the
    # previous quarter; prepending the first value makes the first change 0
    # and keeps the length equal to the number of quarters, also when a
    # macro-topic only spans a single quarter.
    curves = [
        (fig_challenge_topic_count, quarters, topic_counts),
        (fig_challenge_topic_rate, quarters,
         np.diff(topic_counts, prepend=topic_counts[:1])),
        (fig_challenge_score_count, quarters, score_counts),
        (fig_challenge_score_rate, quarters,
         np.diff(score_counts, prepend=score_counts[:1])),
        (fig_challenge_view_count, quarters, view_counts),
        (fig_challenge_view_rate, quarters,
         np.diff(view_counts, prepend=view_counts[:1])),
    ]

    # Unsolved rate: cumulative number of closed posts against the cumulative
    # number of created posts. `df` was already passed through
    # filter_open_posts above, so the group needs no second filtering.
    group_all = group_count.cumsum().reset_index().rename(
        columns={'Challenge_created_time': 'Date', 'Challenge_topic_macro': 'All'})
    group_closed = group.groupby(
        pd.Grouper(key='Challenge_closed_time', freq='Q')
    )['Challenge_topic_macro'].count().cumsum().reset_index().rename(
        columns={'Challenge_closed_time': 'Date', 'Challenge_topic_macro': 'Solved'})
    group_solved = pd.merge(
        group_closed, group_all, on='Date', how='outer').sort_values(by='Date')
    # Both columns are running totals: a quarter that is missing from one
    # side keeps that side's previous total (forward fill). Only the quarters
    # before the first observation are really zero.
    group_solved[['Solved', 'All']] = (
        group_solved[['Solved', 'All']].ffill().fillna(0))

    unsolved = (1 - group_solved['Solved'] / group_solved['All']) * 100
    # Quarters without any created post yet divide by zero; treat them as
    # missing so that they are interpolated rather than plotted as +/-inf.
    unsolved = unsolved.replace([np.inf, -np.inf], np.nan).to_numpy(
        dtype=float, copy=True)
    if np.isfinite(unsolved).any():
        curves.append((
            fig_challenge_unsolved_rate,
            pd.to_datetime(group_solved['Date']).values,
            extrapolate_nans_1d(unsolved),
        ))

    for figure, x, y in curves:
        # The smoother needs a numeric x axis: whole days since the epoch.
        x_lowess = x.astype('datetime64[D]').astype(int)
        if len(y) >= MIN_POINTS_FOR_LOWESS:
            y_lowess = lowess(y, x_lowess, return_sorted=False, frac=LOWESS_FRAC)
        else:
            y_lowess = np.asarray(y, dtype=float)
        figure.add_trace(go.Scatter(
            x=x, y=y_lowess, mode='lines', name=name, marker=dict(color=color)))

# (figure, legend x, legend y, PDF file name), in display order.
figure_specs = [
    (fig_challenge_topic_count, 0, 1, 'Challenge topic count.pdf'),
    (fig_challenge_view_count, 0, 1, 'Challenge view count.pdf'),
    (fig_challenge_score_count, 0, 1, 'Challenge score count.pdf'),
    (fig_challenge_topic_rate, 0, 1, 'Challenge topic rate.pdf'),
    (fig_challenge_view_rate, 0, 0, 'Challenge view rate.pdf'),
    (fig_challenge_score_rate, 0, 0, 'Challenge score rate.pdf'),
    (fig_challenge_unsolved_rate, 1, 0, 'Challenge unsolved rate.pdf'),
]

for figure, legend_x, legend_y, _ in figure_specs:
    figure.update_layout(
        width=1000,
        height=750,
        margin=dict(l=0, r=0, t=0, b=0),
        legend=dict(
            x=legend_x,
            y=legend_y,
            xanchor='auto',
            yanchor='auto'
        ))

# NOTE(review): the return values are discarded; presumably these calls only
# warm up the static-image renderer before the PDF export -- confirm.
for figure, _, _, _ in figure_specs:
    pio.full_figure_for_development(figure, warn=False)

# The export fails when the target folder is missing, so create it first.
os.makedirs(path_rq6, exist_ok=True)
for figure, _, _, file_name in figure_specs:
    figure.write_image(os.path.join(path_rq6, file_name), engine="kaleido")

for figure, _, _, _ in figure_specs:
    figure.show()

In [16]:
# plot challenge median solved & open time evolution
#
# For every macro-topic and every quarter: take the posts created up to the
# end of that quarter, use the resolution time of the posts closed by then
# and the age (in hours) of the posts still open, and plot the log-scaled
# median of the two combined.

df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))

fig_challenge_open_time = go.Figure()

for index, group in df.groupby('Challenge_topic_macro'):
    name = macro_topic_indexing[index]
    quarters = group.groupby(
        pd.Grouper(key='Challenge_created_time', freq='Q')).size().index
    y = []

    for quarter in quarters:
        # A quarter is labelled with midnight of its last day, while its bin
        # still contains the posts created during that day; compare against
        # the start of the following day so those posts are not left out.
        quarter_end = quarter + pd.Timedelta(days=1)
        group_created = group[group['Challenge_created_time'] < quarter_end]

        # NOTE(review): posts without a closing time compare as False here
        # and are therefore handled as closed (as before) -- confirm that
        # this is intended.
        is_open = group_created['Challenge_closed_time'] >= quarter_end
        open_hours = (
            quarter_end - group_created.loc[is_open, 'Challenge_created_time']
        ) / pd.Timedelta(hours=1)
        # NOTE(review): presumably Challenge_resolved_time is expressed in
        # hours as well -- confirm against the dataset.
        closed_hours = group_created.loc[~is_open, 'Challenge_resolved_time']

        open_time = pd.concat([closed_hours, open_hours]).median()
        # log scale all numerical values for better visualization of
        # long-tailed distributions
        y.append(np.log1p(open_time))

    x = pd.to_datetime(quarters).values
    # The smoother needs a numeric x axis: whole days since the epoch.
    x_lowess = x.astype('datetime64[D]').astype(int)
    y_lowess = lowess(y, x_lowess, return_sorted=False, frac=0.5)
    fig_challenge_open_time.add_trace(go.Scatter(
        x=x, y=y_lowess, mode='lines', name=name, marker=dict(color=colors[index])))

fig_challenge_open_time.update_layout(
    width=1000,
    height=750,
    margin=dict(l=0, r=0, t=0, b=0),
    legend=dict(
        x=0,
        y=1,
        xanchor='auto',
        yanchor='auto'
    ))
# The export fails when the target folder is missing, so create it first.
os.makedirs(path_rq6, exist_ok=True)
fig_challenge_open_time.write_image(os.path.join(
    path_rq6, 'Challenge open time.pdf'))
fig_challenge_open_time.show()