In [None]:
import os
import shap
import pandas as pd
import plotly.graph_objects as go
import numpy as np

from xgboost import XGBRegressor, XGBClassifier
from matplotlib import pyplot as plt
from sklearn.inspection import permutation_importance
from scipy.stats import mannwhitneyu


In [None]:
# Significance level for the hypothesis tests below: the probability of rejecting the
# null hypothesis when it is actually true. A p-value under this threshold is reported
# as a difference between the two platforms.
alpha = 0.05

# Seed handed to the models and to the permutation importance computation as `random_state`.
random_state = 42

In [None]:
# Macro (higher level) topic -> list of the fine-grained topics grouped under it.
# Each list element is a single-key dict; the comment above an element explains the grouping.
macro_topic_ensemble_inverse = [
    # Code versioning refers to the practice of tracking changes to software code over time.
    {'Code Management': ['Code Versioning']},
    # These words are all related to data management and analysis. They refer to various tasks and techniques used to organize, manipulate, store, transfer, and analyze data.
    {'Data Management': ['Artifact Management', 'Columnar Manipulation', 'CSV Manipulation', 'Data Labeling', 'Data Storage',
                         'Data Transfer', 'Data Visualization', 'Database Connectivity', 'Dataset Versioning', 'Pandas Dataframe', 'Batch Processing']},
    # All of these words are related to the development and management of machine learning models.
    {'Model Management': ['Hyperparameter Tuning',
                          'Model Evaluation', 'Model Exporting', 'Model Registry']},
    # These words are all related to the management and optimization of data pipelines in software development.
    {'Lifecycle Management': ['Pipeline Configuration',
                              'Pipeline Configuration (Data)', 'Pipeline Configuration (Model)', 'Run Management', 'Kubernetes Orchestration']},
    # All of these words relate to the configuration and management of infrastructure aspects of computer systems and networks. Specifically, they involve setting up and optimizing different components such as processing power, memory, network connections, and software to ensure that they work together efficiently and effectively.
    {'Infrastructure Management': ['Apache Spark Configuration', 'Cluster Configuration', 'Docker Configuration', 'GPU Configuration', 'VPC Networking', 'Memory Management',
                                   'Remote Configuration', 'Resource Quota Control', 'TensorFlow Configuration', 'Jupyter Notebook', 'Package Management', 'SDK Management', 'YAML Configuration']},
    # All of these words are related to the deployment and management of machine learning models or web services.
    {'Deployment Management': ['Endpoint Serving', 'Endpoint Deployment', 'Model Serving', 'Model Inference',
                               'REST Payload', 'Web Service', 'Serverless Serving', 'API Invocation']},
    # All of these words are related to monitoring and logging data in various systems.
    # NOTE(review): 'Metrics Logging' is listed twice below; confirm whether the second
    # entry is a leftover duplicate or was meant to be a different topic.
    {'Report Management': ['CloudWatch Monitoring',
                           'Metrics Logging', 'TensorBoard Logging', 'Metrics Logging']},
    # All of these words are related to controlling access to information or resources in a system.
    {'Security Management': ['Account Management',
                             'Bucket Access Control', 'Role-based Access Control']},
]

In [None]:
# Output directory (relative to the working directory) for every figure this notebook exports.
path_challenge_so_to = 'Stack Overflow vs Tool-specific'
# Input directories, resolved two levels above the working directory.
path_general = os.path.join(os.getcwd(), '..', '..', 'General')
path_solution = os.path.join(os.getcwd(), '..', '..', 'Solution')

# The figure exports below write straight into the output directory and fail when it
# is missing, so make sure it exists before anything is saved.
os.makedirs(path_challenge_so_to, exist_ok=True)

In [None]:
# Stack Overflow vs tool-specific fora: how challenge metrics are distributed across macro topics

df = pd.read_json(os.path.join(path_general, 'logscale.json'))

# One frame per platform; both are reused by the figures and tests that follow.
_platform_of_challenge = df['Platform']
df_so = df[_platform_of_challenge == 'Stack Overflow']
df_to = df[_platform_of_challenge == 'Tool-specific']

# Challenge topic count: a single split violin whose two halves are the platforms.
_topic_count_label = 'Challenge topic count (higher level)'
fig_challenge_count = go.Figure()
for _frame, _colour, _side, _platform in (
    (df_so, 'blue', 'positive', 'Stack Overflow'),
    (df_to, 'orange', 'negative', 'Tool-specific'),
):
    # Every observation shares the same x label so the two halves line up.
    fig_challenge_count.add_trace(go.Violin(
        x=np.full(len(_frame), _topic_count_label),
        y=_frame['Challenge_topic_macro'],
        meanline_visible=True,
        line_color=_colour,
        side=_side,
        opacity=0.5,
        name=_platform,
    ))
fig_challenge_count.update_layout(
    height=1000,
    width=2000,
    font={'size': 20},
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
)
_topic_count_png = os.path.join(path_challenge_so_to, 'Challenge count.png')
fig_challenge_count.write_image(_topic_count_png)

def _split_violin_by_platform(metric, so_frame, to_frame):
    """Build a split violin figure of `metric` per macro topic, one half per platform.

    Parameters
    ----------
    metric : str
        Name of the column plotted on the y axis.
    so_frame, to_frame : pandas.DataFrame
        Stack Overflow and tool-specific challenges. Both must contain
        `Challenge_topic_macro` (x axis) and `metric`.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a blue positive-side trace for Stack Overflow and an orange
        negative-side trace for tool-specific fora, sized 2000x1000 with no margins.
    """
    figure = go.Figure()
    for frame, colour, side, platform in (
        (so_frame, 'blue', 'positive', 'Stack Overflow'),
        (to_frame, 'orange', 'negative', 'Tool-specific'),
    ):
        figure.add_trace(go.Violin(
            x=frame['Challenge_topic_macro'],
            y=frame[metric],
            meanline_visible=True,
            line_color=colour,
            side=side,
            opacity=0.5,
            # Legend and scale groups are both keyed by the platform name.
            legendgroup=platform,
            scalegroup=platform,
            name=platform,
        ))
    figure.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    return figure


# Challenge score
fig_challenge_score = _split_violin_by_platform(
    'Challenge_score', df_so, df_to)
fig_challenge_score.write_image(os.path.join(
    path_challenge_so_to, 'Challenge score.png'))

# Challenge favorite count
fig_challenge_favorite_count = _split_violin_by_platform(
    'Challenge_favorite_count', df_so, df_to)
fig_challenge_favorite_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge favorite count.png'))

# Challenge view count
fig_challenge_view_count = _split_violin_by_platform(
    'Challenge_view_count', df_so, df_to)
fig_challenge_view_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge view count.png'))

# Challenge link count
fig_challenge_link_count = _split_violin_by_platform(
    'Challenge_link_count', df_so, df_to)
fig_challenge_link_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge link count.png'))

# Challenge sentence count
fig_challenge_sentence_count = _split_violin_by_platform(
    'Challenge_sentence_count', df_so, df_to)
fig_challenge_sentence_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge sentence count.png'))

# Challenge word count
fig_challenge_word_count = _split_violin_by_platform(
    'Challenge_word_count', df_so, df_to)
fig_challenge_word_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge word count.png'))

# Challenge unique word count
fig_challenge_unique_word_count = _split_violin_by_platform(
    'Challenge_unique_word_count', df_so, df_to)
fig_challenge_unique_word_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge unique word count.png'))

# Challenge information entropy
fig_challenge_information_entropy = _split_violin_by_platform(
    'Challenge_information_entropy', df_so, df_to)
fig_challenge_information_entropy.write_image(os.path.join(
    path_challenge_so_to, 'Challenge information entropy.png'))

# Challenge readability
fig_challenge_readability = _split_violin_by_platform(
    'Challenge_readability', df_so, df_to)
fig_challenge_readability.write_image(os.path.join(
    path_challenge_so_to, 'Challenge readability.png'))

# Challenge answer count
fig_challenge_answer_count = _split_violin_by_platform(
    'Challenge_answer_count', df_so, df_to)
fig_challenge_answer_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge answer count.png'))

# Challenge comment count
fig_challenge_comment_count = _split_violin_by_platform(
    'Challenge_comment_count', df_so, df_to)
fig_challenge_comment_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge comment count.png'))

# Challenge participation count
fig_challenge_participation_count = _split_violin_by_platform(
    'Challenge_participation_count', df_so, df_to)
fig_challenge_participation_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge participation count.png'))

# Challenge solved time
fig_challenge_solved_time = _split_violin_by_platform(
    'Challenge_solved_time', df_so, df_to)
fig_challenge_solved_time.write_image(os.path.join(
    path_challenge_so_to, 'Challenge solved time.png'))

# Per macro topic: test whether each challenge metric is distributed differently on
# Stack Overflow than on tool-specific fora, and trace the yearly solved time of each
# platform.

fig_challenge_mean_solved_time_evolution_so = go.Figure()
fig_challenge_median_solved_time_evolution_so = go.Figure()

fig_challenge_mean_solved_time_evolution_to = go.Figure()
fig_challenge_median_solved_time_evolution_to = go.Figure()

# (column, wording used in the printed report), listed in reporting order.
compared_metrics = [
    ('Challenge_score', 'challenge score'),
    ('Challenge_favorite_count', 'challenge favorite count'),
    ('Challenge_link_count', 'challenge link count'),
    ('Challenge_sentence_count', 'challenge sentence count'),
    ('Challenge_word_count', 'challenge word count'),
    ('Challenge_unique_word_count', 'challenge unique word count'),
    ('Challenge_information_entropy', 'challenge information entropy'),
    ('Challenge_readability', 'challenge readability'),
    # Previously reported under the "answer count" wording.
    ('Challenge_view_count', 'challenge view count'),
    ('Challenge_answer_count', 'challenge answer count'),
    ('Challenge_comment_count', 'challenge comment count'),
    # Previously the comment count series were tested under this label.
    ('Challenge_participation_count', 'challenge participation count'),
]


def _report_platform_difference(so_frame, to_frame, column, label, topic):
    """Run a Mann-Whitney U test on `column` between the two platforms and report it.

    Missing values are dropped from both samples, and the test is skipped when either
    sample is empty. A line is printed only when the p-value is below `alpha`.
    """
    so_values = so_frame[column].dropna()
    to_values = to_frame[column].dropna()
    if len(so_values) == 0 or len(to_values) == 0:
        return
    _, p = mannwhitneyu(so_values, to_values)
    if p < alpha:
        print(
            f'p = {p:.2f}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {topic} in {label}')


def _yearly_solved_time_trace(frame, statistic, topic):
    """Return a line trace of the yearly `statistic` ('mean' or 'median') of solved time.

    Challenges are bucketed by the year of `Challenge_created_time`; the trace is named
    after `topic`.
    """
    by_year = frame.groupby(pd.Grouper(key='Challenge_created_time', freq='Y'))[
        'Challenge_solved_time']
    yearly = getattr(by_year, statistic)().reset_index()
    return go.Scatter(
        x=pd.to_datetime(yearly['Challenge_created_time']).values,
        y=yearly['Challenge_solved_time'].values,
        mode='lines',
        name=topic,
    )


for name, group in df.groupby('Challenge_topic_macro'):
    so = group[group['Platform'] == 'Stack Overflow']
    to = group[group['Platform'] == 'Tool-specific']

    for column, label in compared_metrics:
        _report_platform_difference(so, to, column, label, name)

    # Challenge mean solved time evolution
    fig_challenge_mean_solved_time_evolution_so.add_trace(
        _yearly_solved_time_trace(so, 'mean', name))
    fig_challenge_mean_solved_time_evolution_to.add_trace(
        _yearly_solved_time_trace(to, 'mean', name))

    # Challenge median solved time evolution
    fig_challenge_median_solved_time_evolution_so.add_trace(
        _yearly_solved_time_trace(so, 'median', name))
    fig_challenge_median_solved_time_evolution_to.add_trace(
        _yearly_solved_time_trace(to, 'median', name))

# Same canvas settings for the four evolution figures, then export each one.
for figure, file_name in (
    (fig_challenge_mean_solved_time_evolution_so,
     'Challenge mean solved time evolution (Stack Overflow).png'),
    (fig_challenge_median_solved_time_evolution_so,
     'Challenge median solved time evolution (Stack Overflow).png'),
    (fig_challenge_mean_solved_time_evolution_to,
     'Challenge mean solved time evolution (Tool-specific).png'),
    (fig_challenge_median_solved_time_evolution_to,
     'Challenge median solved time evolution (Tool-specific).png'),
):
    figure.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    figure.write_image(os.path.join(path_challenge_so_to, file_name))

# Solved time aggregated per macro topic, one grouping per platform shared by both statistics.
_solved_time_columns = ['Challenge_topic_macro', 'Challenge_solved_time']
_solved_time_by_topic_so = df_so[_solved_time_columns].groupby(
    'Challenge_topic_macro')
_solved_time_by_topic_to = df_to[_solved_time_columns].groupby(
    'Challenge_topic_macro')

# Challenge mean solved time: compare the per-topic means of the two platforms.
challenge_mean_solved_time_so = _solved_time_by_topic_so.mean()[
    'Challenge_solved_time']
challenge_mean_solved_time_to = _solved_time_by_topic_to.mean()[
    'Challenge_solved_time']
_, p = mannwhitneyu(
    challenge_mean_solved_time_so, challenge_mean_solved_time_to)
if p < alpha:
    print(f'p = {p:.2f}, indicating different distribution of Stack Overflow vs tool-specific fora in higher level mean challenge solved time')

# Challenge median solved time: same comparison on the per-topic medians.
challenge_median_solved_time_so = _solved_time_by_topic_so.median()[
    'Challenge_solved_time']
challenge_median_solved_time_to = _solved_time_by_topic_to.median()[
    'Challenge_solved_time']
_, p = mannwhitneyu(
    challenge_median_solved_time_so, challenge_median_solved_time_to)
if p < alpha:
    print(f'p = {p:.2f}, indicating different distribution of Stack Overflow vs tool-specific fora in higher level median challenge solved time')

In [None]:
# Compare the explainability of the challenge solved rate classification model between Stack Overflow vs tool-specific fora


def _solved_rate_dataset(challenges, platform):
    """Select one platform's challenges and split them into features and target.

    Returns
    -------
    tuple
        `(frame, X, y)`: `frame` is the platform subset without the columns whose name
        matches Platform/Tool/Solution/topic/solved_time/edit_time; `X` is `frame`
        without the link and the created/closed timestamps; `y` is True where
        `Challenge_closed_time` is missing.
    """
    frame = challenges[challenges['Platform'] == platform]
    frame = frame[frame.columns.drop(
        list(frame.filter(regex='Platform|Tool|Solution|topic|solved_time|edit_time')))]
    features = frame.drop(['Challenge_link', 'Challenge_closed_time',
                           'Challenge_created_time'], axis=1)
    # NOTE(review): the positive class is "no closed time recorded"; confirm this is
    # the intended orientation for a "solved rate" model.
    target = frame['Challenge_closed_time'].isna()
    return frame, features, target


def _save_importance_barh(feature_names, importances, xlabel, file_name):
    """Draw a horizontal bar chart of `importances` on its own figure and save it."""
    fig, ax = plt.subplots()
    ax.barh(feature_names, importances)
    ax.set_xlabel(xlabel)
    fig.savefig(os.path.join(path_challenge_so_to, file_name),
                bbox_inches='tight')


def _explain_solved_rate_classifier(X, y, platform):
    """Fit the solved rate classifier and export its three feature importance plots.

    Saves the XGBoost built-in importance (ten most important features), the SHAP
    summary plot and the permutation importance into `path_challenge_so_to`, each
    file name suffixed with `platform`. Returns the fitted classifier.
    """
    classifier = XGBClassifier(objective='binary:logistic', eval_metric='auc', tree_method='gpu_hist',
                               random_state=random_state, max_depth=5, n_estimators=1000, eta=0.1483)
    classifier.fit(X, y)

    # argsort is ascending, so the ten most important features are the LAST ten
    # indices; slicing the first ten plotted the least important ones.
    sorted_idx = classifier.feature_importances_.argsort()[-10:]
    _save_importance_barh(
        X.columns[sorted_idx],
        classifier.feature_importances_[sorted_idx],
        "Xgboost Feature Importance",
        f'Challenge_solved_rate xgboost_feature_importance ({platform}).png')

    # Open a fresh figure for SHAP and save whatever figure is current afterwards,
    # so the summary is not drawn over, or saved through, the bar chart above.
    plt.figure()
    explainer = shap.TreeExplainer(classifier)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, X, show=False)
    plt.gcf().savefig(os.path.join(path_challenge_so_to,
                                   f'Challenge_solved_rate SHAP_based_feature_importance ({platform}).png'), bbox_inches='tight')

    # Permutation importance is plotted for every feature, least important first.
    perm_importance = permutation_importance(
        classifier, X, y, random_state=random_state)
    sorted_idx = perm_importance.importances_mean.argsort()
    _save_importance_barh(
        X.columns[sorted_idx],
        perm_importance.importances_mean[sorted_idx],
        "Permutation Importance",
        f'Challenge_solved_rate permutation_based_feature_importance ({platform}).png')

    return classifier


df = pd.read_json(os.path.join(path_general, 'filtered.json'))

# Stack Overflow

df_so, X, y = _solved_rate_dataset(df, 'Stack Overflow')
classifier = _explain_solved_rate_classifier(X, y, 'Stack Overflow')

# Tool-specific fora

df_to, X, y = _solved_rate_dataset(df, 'Tool-specific')
classifier = _explain_solved_rate_classifier(X, y, 'Tool-specific')

In [None]:
# Compare the explainability of the challenge solved time regression model between Stack Overflow vs tool-specific fora

df = pd.read_json(os.path.join(path_solution, 'solved.json'))
# Keep only rows that actually have a solved time to regress on.
df = df[df['Challenge_solved_time'].notna()]
df = df.drop(['Challenge_link', 'Challenge_closed_time',
             'Challenge_created_time'], axis=1)

# Stack Overflow

df_so = df[df['Platform'] == 'Stack Overflow']
# Features: every column whose name matches none of the regex alternatives.
X = df_so[df_so.columns.drop(
    list(df_so.filter(regex='Platform|Tool|topic|solved_time|edit_time')))]
y = df_so['Challenge_solved_time']

regressor = XGBRegressor(objective='reg:squaredlogerror', tree_method='gpu_hist',
                         random_state=random_state, max_depth=5, n_estimators=1000, eta=0.0206)
regressor.fit(X, y)

# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = regressor.feature_importances_.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         regressor.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_solved_time xgboost_feature_importance (Stack Overflow).png'), bbox_inches='tight')

explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X)
# shap.summary_plot draws on the current pyplot figure. Open a fresh one so the
# summary is not painted over the bar chart above and saved together with it.
fig = plt.figure()
shap.summary_plot(shap_values, X, show=False)
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_solved_time SHAP_based_feature_importance (Stack Overflow).png'), bbox_inches='tight')

perm_importance = permutation_importance(
    regressor, X, y, random_state=random_state)
sorted_idx = perm_importance.importances_mean.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_solved_time permutation_based_feature_importance (Stack Overflow).png'), bbox_inches='tight')

# tool-specific fora

df_to = df[df['Platform'] == 'Tool-specific']
# Features: every column whose name matches none of the regex alternatives
# (this list also excludes the favorite_count and view_count columns).
X = df_to[df_to.columns.drop(list(df_to.filter(
    regex='Platform|Tool|topic|solved_time|edit_time|favorite_count|view_count')))]
y = df_to['Challenge_solved_time']

regressor = XGBRegressor(objective='reg:squaredlogerror', tree_method='gpu_hist',
                         random_state=random_state, max_depth=5, n_estimators=1000, eta=0.0206)
regressor.fit(X, y)

# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = regressor.feature_importances_.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         regressor.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_solved_time xgboost_feature_importance (Tool-specific).png'), bbox_inches='tight')

explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X)
# shap.summary_plot draws on the current pyplot figure. Open a fresh one so the
# summary is not painted over the bar chart above and saved together with it.
fig = plt.figure()
shap.summary_plot(shap_values, X, show=False)
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_solved_time SHAP_based_feature_importance (Tool-specific).png'), bbox_inches='tight')

perm_importance = permutation_importance(
    regressor, X, y, random_state=random_state)
sorted_idx = perm_importance.importances_mean.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_solved_time permutation_based_feature_importance (Tool-specific).png'), bbox_inches='tight')

In [None]:
# Compare the explainability of the challenge adjusted solved time regression model between Stack Overflow vs tool-specific fora

df = pd.read_json(os.path.join(path_solution, 'solved.json'))
# Keep only rows that actually have an adjusted solved time to regress on.
df = df[df['Challenge_adjusted_solved_time'].notna()]
df = df.drop(['Challenge_link', 'Challenge_closed_time',
             'Challenge_created_time'], axis=1)

# Stack Overflow

df_so = df[df['Platform'] == 'Stack Overflow']
# Features: every column whose name matches none of the regex alternatives.
X = df_so[df_so.columns.drop(
    list(df_so.filter(regex='Platform|Tool|topic|solved_time|edit_time')))]
y = df_so['Challenge_adjusted_solved_time']

regressor = XGBRegressor(objective='reg:squaredlogerror', tree_method='gpu_hist',
                         random_state=random_state, max_depth=5, n_estimators=1000, eta=0.03353)
regressor.fit(X, y)

# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = regressor.feature_importances_.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         regressor.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_adjusted_solved_time xgboost_feature_importance (Stack Overflow).png'), bbox_inches='tight')

explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X)
# shap.summary_plot draws on the current pyplot figure. Open a fresh one so the
# summary is not painted over the bar chart above and saved together with it.
fig = plt.figure()
shap.summary_plot(shap_values, X, show=False)
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_adjusted_solved_time SHAP_based_feature_importance (Stack Overflow).png'), bbox_inches='tight')

perm_importance = permutation_importance(
    regressor, X, y, random_state=random_state)
sorted_idx = perm_importance.importances_mean.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_adjusted_solved_time permutation_based_feature_importance (Stack Overflow).png'), bbox_inches='tight')

# tool-specific fora

df_to = df[df['Platform'] == 'Tool-specific']
# Features: every column whose name matches none of the regex alternatives
# (this list also excludes the favorite_count and view_count columns).
X = df_to[df_to.columns.drop(list(df_to.filter(
    regex='Platform|Tool|topic|solved_time|edit_time|favorite_count|view_count')))]
y = df_to['Challenge_adjusted_solved_time']

regressor = XGBRegressor(objective='reg:squaredlogerror', tree_method='gpu_hist',
                         random_state=random_state, max_depth=5, n_estimators=1000, eta=0.03353)
regressor.fit(X, y)

# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = regressor.feature_importances_.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         regressor.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_adjusted_solved_time xgboost_feature_importance (Tool-specific).png'), bbox_inches='tight')

explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X)
# shap.summary_plot draws on the current pyplot figure. Open a fresh one so the
# summary is not painted over the bar chart above and saved together with it.
fig = plt.figure()
shap.summary_plot(shap_values, X, show=False)
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_adjusted_solved_time SHAP_based_feature_importance (Tool-specific).png'), bbox_inches='tight')

perm_importance = permutation_importance(
    regressor, X, y, random_state=random_state)
sorted_idx = perm_importance.importances_mean.argsort()
fig, _ = plt.subplots()
plt.barh(X.columns[sorted_idx],
         perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
fig.savefig(os.path.join(path_challenge_so_to,
            'Challenge_adjusted_solved_time permutation_based_feature_importance (Tool-specific).png'), bbox_inches='tight')

In [None]:
# Compare metrics evolution of Stack Overflow vs tool-specific fora challenges across different topics

df_challenge = pd.read_json(os.path.join(path_general, 'filtered.json'))

# One empty figure per metric for the tool-specific fora ("_to" suffix).
# Each figure is created exactly once; the traces are added further down.
fig_challenge_topic_count_to = go.Figure()
fig_challenge_view_count_to = go.Figure()
fig_challenge_favorite_count_to = go.Figure()
fig_challenge_answer_count_to = go.Figure()
fig_challenge_comment_count_to = go.Figure()
fig_challenge_participation_count_to = go.Figure()
fig_challenge_score_to = go.Figure()
fig_challenge_word_count_to = go.Figure()
fig_challenge_unique_word_count_to = go.Figure()
fig_challenge_sentence_count_to = go.Figure()
fig_challenge_link_count_to = go.Figure()
fig_challenge_information_entropy_to = go.Figure()
fig_challenge_readability_to = go.Figure()
fig_challenge_topic_closed_count_to = go.Figure()
fig_challenge_solved_rate_to = go.Figure()

# The same set of figures for Stack Overflow ("_so" suffix).
fig_challenge_topic_count_so = go.Figure()
fig_challenge_view_count_so = go.Figure()
fig_challenge_favorite_count_so = go.Figure()
fig_challenge_answer_count_so = go.Figure()
fig_challenge_comment_count_so = go.Figure()
fig_challenge_participation_count_so = go.Figure()
fig_challenge_score_so = go.Figure()
fig_challenge_word_count_so = go.Figure()
fig_challenge_unique_word_count_so = go.Figure()
fig_challenge_sentence_count_so = go.Figure()
fig_challenge_link_count_so = go.Figure()
fig_challenge_information_entropy_so = go.Figure()
fig_challenge_readability_so = go.Figure()
fig_challenge_topic_closed_count_so = go.Figure()
fig_challenge_solved_rate_so = go.Figure()

def _yearly_increase(values):
    """Return the year-over-year change of ``values``, with 0 prepended for the first year."""
    return np.insert(np.diff(values), 0, 0)


def _add_topic_line(figure, x, y, topic):
    """Append one line trace named ``topic`` to ``figure``."""
    figure.add_trace(go.Scatter(x=x, y=y, mode='lines', name=topic))


def _yearly_count(posts, time_column):
    """Count the rows of ``posts`` per year of ``time_column`` (yearly ``pd.Grouper``)."""
    return posts.groupby(pd.Grouper(key=time_column, freq='Y'))[
        'Challenge_topic_macro'].count()


# Columns whose yearly totals are plotted as year-over-year increases.
increase_columns = ['Challenge_participation_count', 'Challenge_answer_count', 'Challenge_comment_count',
                    'Challenge_view_count', 'Challenge_favorite_count', 'Challenge_score']
# Columns plotted as yearly total divided by the number of challenges created that year.
average_columns = ['Challenge_link_count', 'Challenge_word_count', 'Challenge_sentence_count',
                   'Challenge_unique_word_count', 'Challenge_information_entropy', 'Challenge_readability']

# Target figure for every metric, per platform. Column-based metrics are keyed
# by their column name, the three count/rate charts by a short label.
platform_figures = {
    'Tool-specific': {
        'topic_count': fig_challenge_topic_count_to,
        'topic_closed_count': fig_challenge_topic_closed_count_to,
        'solved_rate': fig_challenge_solved_rate_to,
        'Challenge_participation_count': fig_challenge_participation_count_to,
        'Challenge_answer_count': fig_challenge_answer_count_to,
        'Challenge_comment_count': fig_challenge_comment_count_to,
        'Challenge_view_count': fig_challenge_view_count_to,
        'Challenge_favorite_count': fig_challenge_favorite_count_to,
        'Challenge_score': fig_challenge_score_to,
        'Challenge_link_count': fig_challenge_link_count_to,
        'Challenge_word_count': fig_challenge_word_count_to,
        'Challenge_sentence_count': fig_challenge_sentence_count_to,
        'Challenge_unique_word_count': fig_challenge_unique_word_count_to,
        'Challenge_information_entropy': fig_challenge_information_entropy_to,
        'Challenge_readability': fig_challenge_readability_to,
    },
    'Stack Overflow': {
        'topic_count': fig_challenge_topic_count_so,
        'topic_closed_count': fig_challenge_topic_closed_count_so,
        'solved_rate': fig_challenge_solved_rate_so,
        'Challenge_participation_count': fig_challenge_participation_count_so,
        'Challenge_answer_count': fig_challenge_answer_count_so,
        'Challenge_comment_count': fig_challenge_comment_count_so,
        'Challenge_view_count': fig_challenge_view_count_so,
        'Challenge_favorite_count': fig_challenge_favorite_count_so,
        'Challenge_score': fig_challenge_score_so,
        'Challenge_link_count': fig_challenge_link_count_so,
        'Challenge_word_count': fig_challenge_word_count_so,
        'Challenge_sentence_count': fig_challenge_sentence_count_so,
        'Challenge_unique_word_count': fig_challenge_unique_word_count_so,
        'Challenge_information_entropy': fig_challenge_information_entropy_so,
        'Challenge_readability': fig_challenge_readability_so,
    },
}

# Every figure receives one line per macro topic.
for name, group in df_challenge.groupby('Challenge_topic_macro'):
    for platform, figures in platform_figures.items():
        posts = group[group['Platform'] == platform]

        # challenge topic count over time (by creation year)
        created_count = _yearly_count(posts, 'Challenge_created_time')
        created = created_count.reset_index()
        x_created = pd.to_datetime(created['Challenge_created_time']).values
        y_created = created['Challenge_topic_macro'].values
        _add_topic_line(figures['topic_count'], x_created,
                        _yearly_increase(y_created), name)

        # challenge closed topic count over time (by closing year)
        closed_count = _yearly_count(posts, 'Challenge_closed_time')
        closed = closed_count.reset_index()
        x_closed = pd.to_datetime(closed['Challenge_closed_time']).values
        _add_topic_line(figures['topic_closed_count'], x_closed,
                        _yearly_increase(closed['Challenge_topic_macro'].values), name)

        # yearly totals of all numeric metrics, grouped by creation year
        yearly_sum = posts.groupby(pd.Grouper(key='Challenge_created_time', freq='Y'))[
            average_columns + increase_columns].sum().reset_index()

        # participation / answer / comment / view / favorite / score: yearly increase
        for column in increase_columns:
            _add_topic_line(figures[column], x_created,
                            _yearly_increase(yearly_sum[column].values), name)

        # link / word / sentence / unique word / entropy / readability:
        # yearly total divided by the yearly challenge count
        for column in average_columns:
            _add_topic_line(figures[column], x_created,
                            yearly_sum[column].values / y_created, name)

        # challenge solved rate over time: cumulative closed / cumulative created
        solved_cum = closed_count.cumsum().reset_index().rename(
            columns={'Challenge_closed_time': 'Date', 'Challenge_topic_macro': 'Solved'})
        all_cum = created_count.cumsum().reset_index().rename(
            columns={'Challenge_created_time': 'Date', 'Challenge_topic_macro': 'All'})
        solved_rate = pd.merge(solved_cum, all_cum, on='Date',
                               how='outer').sort_values(by='Date')
        # Both columns are running totals: a year missing from one side keeps the
        # last known total (forward fill); only years before the first event are 0.
        # Filling every gap with 0 would reset the totals and divide by zero.
        solved_rate[['Solved', 'All']] = solved_rate[['Solved', 'All']].ffill().fillna(0)
        # x must come from the merged frame so that it lines up with y.
        _add_topic_line(figures['solved_rate'],
                        pd.to_datetime(solved_rate['Date']).values,
                        solved_rate['Solved'] / solved_rate['All'] * 100, name)

# Give every topic figure the same canvas: 2000x1000 with all margins set to 0.
for topic_figure in (
    # tool-specific fora
    fig_challenge_topic_count_to,
    fig_challenge_view_count_to,
    fig_challenge_favorite_count_to,
    fig_challenge_answer_count_to,
    fig_challenge_comment_count_to,
    fig_challenge_participation_count_to,
    fig_challenge_score_to,
    fig_challenge_word_count_to,
    fig_challenge_unique_word_count_to,
    fig_challenge_sentence_count_to,
    fig_challenge_link_count_to,
    fig_challenge_information_entropy_to,
    fig_challenge_readability_to,
    fig_challenge_topic_closed_count_to,
    fig_challenge_solved_rate_to,
    # Stack Overflow
    fig_challenge_topic_count_so,
    fig_challenge_view_count_so,
    fig_challenge_favorite_count_so,
    fig_challenge_answer_count_so,
    fig_challenge_comment_count_so,
    fig_challenge_participation_count_so,
    fig_challenge_score_so,
    fig_challenge_word_count_so,
    fig_challenge_unique_word_count_so,
    fig_challenge_sentence_count_so,
    fig_challenge_link_count_so,
    fig_challenge_information_entropy_so,
    fig_challenge_readability_so,
    fig_challenge_topic_closed_count_so,
    fig_challenge_solved_rate_so,
):
    topic_figure.update_layout(
        width=2000,
        height=1000,
        margin=dict(l=0, r=0, t=0, b=0))

# (file name stem, tool-specific figure, Stack Overflow figure), in export order.
image_exports = [
    ('Challenge_topic_count_increase_rate',
     fig_challenge_topic_count_to, fig_challenge_topic_count_so),
    ('Challenge_view_count_increase_rate',
     fig_challenge_view_count_to, fig_challenge_view_count_so),
    ('Challenge_favorite_count_increase_rate',
     fig_challenge_favorite_count_to, fig_challenge_favorite_count_so),
    ('Challenge_answer_count_increase_rate',
     fig_challenge_answer_count_to, fig_challenge_answer_count_so),
    ('Challenge_comment_count_increase_rate',
     fig_challenge_comment_count_to, fig_challenge_comment_count_so),
    ('Challenge_participation_count_increase_rate',
     fig_challenge_participation_count_to, fig_challenge_participation_count_so),
    ('Challenge_score_increase_rate',
     fig_challenge_score_to, fig_challenge_score_so),
    ('Challenge_link_count',
     fig_challenge_link_count_to, fig_challenge_link_count_so),
    ('Challenge_word_count',
     fig_challenge_word_count_to, fig_challenge_word_count_so),
    ('Challenge_unique_word_count',
     fig_challenge_unique_word_count_to, fig_challenge_unique_word_count_so),
    ('Challenge_sentence_count',
     fig_challenge_sentence_count_to, fig_challenge_sentence_count_so),
    ('Challenge_information_entropy',
     fig_challenge_information_entropy_to, fig_challenge_information_entropy_so),
    ('Challenge_readability',
     fig_challenge_readability_to, fig_challenge_readability_so),
    ('Challenge_topic_closed_count_increase_rate',
     fig_challenge_topic_closed_count_to, fig_challenge_topic_closed_count_so),
    ('Challenge_solved_rate',
     fig_challenge_solved_rate_to, fig_challenge_solved_rate_so),
]

# Write all tool-specific images first, then all Stack Overflow images.
for stem, figure_to, figure_so in image_exports:
    figure_to.write_image(os.path.join(
        path_challenge_so_to, f'{stem} (Tool-specific).png'))

for stem, figure_to, figure_so in image_exports:
    figure_so.write_image(os.path.join(
        path_challenge_so_to, f'{stem} (Stack Overflow).png'))