In [None]:
import os
import shap
import pandas as pd
import plotly.graph_objects as go
import numpy as np

from xgboost import XGBRegressor, XGBClassifier
from matplotlib import pyplot as plt
from sklearn.inspection import permutation_importance
from scipy.stats import mannwhitneyu

In [None]:
# Significance level compared against the Mann-Whitney U p-values in this
# notebook: the probability of rejecting the null hypothesis when it is
# actually true.
alpha = 0.05

# Seed handed to the XGBoost models and to permutation_importance so that
# repeated runs use the same random state.
random_state = 42

In [None]:
# Mapping from each higher-level (macro) topic to the fine-grained topics it
# groups together. Every entry is a single-key dict: {macro topic: [topics]}.
# A fine-grained topic must appear only once across the whole structure.
macro_topic_ensemble_inverse = [
    # Tracking changes to software code over time.
    {'Code Management': ['Code Versioning']},
    # Organizing, manipulating, storing, transferring, and analyzing data.
    {'Data Management': ['Artifact Management', 'Columnar Manipulation', 'CSV Manipulation', 'Data Labeling', 'Data Storage',
                         'Data Transfer', 'Data Visualization', 'Database Connectivity', 'Dataset Versioning', 'Pandas Dataframe', 'Batch Processing']},
    # Developing and managing machine learning models.
    {'Model Management': ['Hyperparameter Tuning',
                          'Model Evaluation', 'Model Exporting', 'Model Registry']},
    # Configuring, running, and orchestrating pipelines.
    {'Lifecycle Management': ['Pipeline Configuration',
                              'Pipeline Configuration (Data)', 'Pipeline Configuration (Model)', 'Run Management', 'Kubernetes Orchestration']},
    # Setting up and tuning infrastructure: compute, memory, networking, and
    # the software environment they run.
    {'Infrastructure Management': ['Apache Spark Configuration', 'Cluster Configuration', 'Docker Configuration', 'GPU Configuration', 'VPC Networking', 'Memory Management',
                                   'Remote Configuration', 'Resource Quota Control', 'TensorFlow Configuration', 'Jupyter Notebook', 'Package Management', 'SDK Management', 'YAML Configuration']},
    # Deploying and serving machine learning models or web services.
    {'Deployment Management': ['Endpoint Serving', 'Endpoint Deployment', 'Model Serving', 'Model Inference',
                               'REST Payload', 'Web Service', 'Serverless Serving', 'API Invocation']},
    # Monitoring and logging.
    # NOTE(review): 'Metrics Logging' used to be listed twice here; the
    # duplicate was dropped. Confirm no other topic was intended in its place.
    {'Report Management': ['CloudWatch Monitoring',
                           'Metrics Logging', 'TensorBoard Logging']},
    # Controlling access to information or resources in a system.
    {'Security Management': ['Account Management',
                             'Bucket Access Control', 'Role-based Access Control']},
]

In [None]:
# Output folder (relative to the working directory) that receives every figure
# written by this notebook.
path_challenge_git_qa = 'QA vs Git'
# Input folders, located two levels above the working directory.
path_general = os.path.join(os.getcwd(), '..', '..', 'General')
path_solution = os.path.join(os.getcwd(), '..', '..', 'Solution')

# Neither plotly's write_image nor matplotlib's savefig creates missing parent
# folders, so make sure the output folder exists before any figure is saved.
os.makedirs(path_challenge_git_qa, exist_ok=True)

In [None]:
# Compare metrics distribution of Q&A forum and Git repo challenges across different topics

df = pd.read_json(os.path.join(path_general, 'logscale.json'))

# Partition the challenges by the kind of platform they were collected from.
is_qa_platform = df['Platform'].isin(['Stack Overflow', 'Tool-specific'])
is_git_platform = df['Platform'].isin(['Github', 'Gitlab'])
df_qa = df[is_qa_platform]
df_git = df[is_git_platform]

def save_split_violin(x_qa, y_qa, x_git, y_git, filename):
    """Draw a QA-vs-Git split violin figure, write it as an image, and return it.

    The QA sample is drawn in blue on the positive side and the Git sample in
    orange on the negative side. The figure is written to `filename` inside
    `path_challenge_git_qa`.

    Parameters:
        x_qa, y_qa: x (category) and y (value) data of the QA trace.
        x_git, y_git: x (category) and y (value) data of the Git trace.
        filename: name of the image file to write.

    Returns:
        The plotly figure that was written.
    """
    fig = go.Figure()
    halves = (
        (x_qa, y_qa, 'blue', 'positive', 'QA'),
        (x_git, y_git, 'orange', 'negative', 'Git'),
    )
    for x, y, color, side, label in halves:
        fig.add_trace(
            go.Violin(
                x=x,
                y=y,
                meanline_visible=True,
                line_color=color,
                side=side,
                opacity=0.5,
                name=label,
            ))
    fig.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    fig.write_image(os.path.join(path_challenge_git_qa, filename))
    return fig


def save_metric_violin(column, filename):
    """Split violin of `column` per higher-level topic, QA (df_qa) vs Git (df_git)."""
    return save_split_violin(
        df_qa['Challenge_topic_macro'], df_qa[column],
        df_git['Challenge_topic_macro'], df_git[column],
        filename)


# Challenge topic count
# All rows share one x label, so a single violin pair is drawn.
# NOTE(review): the y values are the topic labels themselves - confirm this
# is the intended encoding for a "count" figure.
topic_count_label = 'Challenge topic count (higher level)'
fig_challenge_count = save_split_violin(
    np.full(len(df_qa), topic_count_label), df_qa['Challenge_topic_macro'],
    np.full(len(df_git), topic_count_label), df_git['Challenge_topic_macro'],
    'Challenge count.png')

# Challenge score
fig_challenge_score = save_metric_violin(
    'Challenge_score', 'Challenge score.png')

# Challenge sentence count
fig_challenge_sentence_count = save_metric_violin(
    'Challenge_sentence_count', 'Challenge sentence count.png')

# Challenge word count
fig_challenge_word_count = save_metric_violin(
    'Challenge_word_count', 'Challenge word count.png')

# Challenge unique word count
# Fixed: this figure previously plotted 'Challenge_sentence_count', so it was
# a copy of the sentence-count figure saved under the unique-word-count name.
fig_challenge_unique_word_count = save_metric_violin(
    'Challenge_unique_word_count', 'Challenge unique word count.png')

# Challenge link count
fig_challenge_link_count = save_metric_violin(
    'Challenge_link_count', 'Challenge link count.png')

# Challenge information entropy
fig_challenge_information_entropy = save_metric_violin(
    'Challenge_information_entropy', 'Challenge information entropy.png')

# Challenge readability
fig_challenge_readability = save_metric_violin(
    'Challenge_readability', 'Challenge readability.png')

# Challenge answer count
fig_challenge_answer_count = save_metric_violin(
    'Challenge_answer_count', 'Challenge answer count.png')

# Challenge solved time
fig_challenge_solved_time = save_metric_violin(
    'Challenge_solved_time', 'Challenge solved time.png')

fig_challenge_mean_solved_time_evolution_qa = go.Figure()
fig_challenge_median_solved_time_evolution_qa = go.Figure()

fig_challenge_mean_solved_time_evolution_git = go.Figure()
fig_challenge_median_solved_time_evolution_git = go.Figure()

# (label used in the printed message, column holding the metric), in the order
# the tests are reported.
# Fixed: the unique-word-count and information-entropy tests previously both
# read 'Challenge_word_count', so they only repeated the word-count test.
compared_metrics = [
    ('challenge score', 'Challenge_score'),
    ('challenge link count', 'Challenge_link_count'),
    ('challenge sentence count', 'Challenge_sentence_count'),
    ('challenge word count', 'Challenge_word_count'),
    ('challenge unique word count', 'Challenge_unique_word_count'),
    ('challenge information entropy', 'Challenge_information_entropy'),
    ('challenge readability', 'Challenge_readability'),
    ('challenge answer count', 'Challenge_answer_count'),
]


def add_yearly_solved_time_trace(fig, frame, aggregate, name):
    """Add one line trace of the yearly aggregated solved time to `fig`.

    Parameters:
        fig: plotly figure receiving the trace.
        frame: challenges to aggregate; grouped on 'Challenge_created_time'
            with a yearly frequency.
        aggregate: name of the aggregation ('mean' or 'median') applied to
            'Challenge_solved_time'.
        name: legend name of the trace.
    """
    yearly = frame.groupby(pd.Grouper(key='Challenge_created_time', freq='Y'))[
        'Challenge_solved_time'].agg(aggregate).reset_index()
    fig.add_trace(
        go.Scatter(
            x=pd.to_datetime(yearly['Challenge_created_time']).values,
            y=yearly['Challenge_solved_time'].values,
            mode='lines',
            name=name))


for name, group in df.groupby('Challenge_topic_macro'):
    qa = group[group['Platform'].isin(['Stack Overflow', 'Tool-specific'])]
    git = group[group['Platform'].isin(['Github', 'Gitlab'])]

    # Mann-Whitney U test of QA vs Git for every metric within this topic.
    for label, column in compared_metrics:
        # Missing values are dropped for every metric (previously only for
        # three of them): a NaN left in a sample can turn the p-value into NaN,
        # which silently fails the `p < alpha` check.
        sample_qa = qa[column].dropna()
        sample_git = git[column].dropna()
        # The test needs at least one observation on each side.
        if sample_qa.empty or sample_git.empty:
            continue
        _, p = mannwhitneyu(sample_qa, sample_git)
        if p < alpha:
            print(
                f'p = {p:.2f}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in {label}')

    # Challenge mean solved time evolution
    add_yearly_solved_time_trace(
        fig_challenge_mean_solved_time_evolution_qa, qa, 'mean', name)
    add_yearly_solved_time_trace(
        fig_challenge_mean_solved_time_evolution_git, git, 'mean', name)

    # Challenge median solved time evolution
    add_yearly_solved_time_trace(
        fig_challenge_median_solved_time_evolution_qa, qa, 'median', name)
    add_yearly_solved_time_trace(
        fig_challenge_median_solved_time_evolution_git, git, 'median', name)

# Solved time of every higher-level topic, grouped once per platform kind and
# then summarised by mean and by median below.
solved_time_by_topic_qa = df_qa.groupby('Challenge_topic_macro')[
    'Challenge_solved_time']
solved_time_by_topic_git = df_git.groupby('Challenge_topic_macro')[
    'Challenge_solved_time']

# Challenge mean solved time: compare the per-topic means of QA and Git.
challenge_mean_solved_time_qa = solved_time_by_topic_qa.mean()
challenge_mean_solved_time_git = solved_time_by_topic_git.mean()
_, p = mannwhitneyu(
    challenge_mean_solved_time_qa, challenge_mean_solved_time_git)
if p < alpha:
    print(f'p = {p:.2f}, indicating different distribution of Q&A fora vs Git repos in higher level mean challenge solved time')

# Challenge median solved time: same comparison on the per-topic medians.
challenge_median_solved_time_qa = solved_time_by_topic_qa.median()
challenge_median_solved_time_git = solved_time_by_topic_git.median()
_, p = mannwhitneyu(
    challenge_median_solved_time_qa, challenge_median_solved_time_git)
if p < alpha:
    print(f'p = {p:.2f}, indicating different distribution of Q&A fora vs Git repos in higher level median challenge solved time')

# Apply the shared layout to the four solved-time evolution figures and write
# each one to the output folder (QA figures first, then Git).
evolution_figures = (
    (fig_challenge_mean_solved_time_evolution_qa,
     'Challenge mean solved time evolution (QA).png'),
    (fig_challenge_median_solved_time_evolution_qa,
     'Challenge median solved time evolution (QA).png'),
    (fig_challenge_mean_solved_time_evolution_git,
     'Challenge mean solved time evolution (Git).png'),
    (fig_challenge_median_solved_time_evolution_git,
     'Challenge median solved time evolution (Git).png'),
)
for evolution_fig, evolution_filename in evolution_figures:
    evolution_fig.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    evolution_fig.write_image(
        os.path.join(path_challenge_git_qa, evolution_filename))

In [None]:
# Compare the explainability of the challenge solved rate classification model between Q&A forum and Git repo


def save_feature_importance_plots(model, X, y, target, platform, top_n=None):
    """Save the XGBoost, SHAP and permutation feature-importance plots of `model`.

    Three images are written to `path_challenge_git_qa`, named
    '<target> <kind>_feature_importance (<platform>).png'.

    Parameters:
        model: fitted XGBoost estimator.
        X: feature frame the model was fitted on.
        y: target the model was fitted on (used by the permutation importance).
        target: prefix of the output file names.
        platform: platform label placed in parentheses in the file names.
        top_n: when given, the built-in importance plot is limited to the
            `top_n` most important features; otherwise all are shown.
    """
    # Built-in importance. argsort() is ascending, so the most important
    # features sit at the END of `order`; barh then draws them at the top.
    order = model.feature_importances_.argsort()
    if top_n is not None:
        order = order[-top_n:]
    fig, ax = plt.subplots()
    ax.barh(X.columns[order], model.feature_importances_[order])
    ax.set_xlabel("Xgboost Feature Importance")
    fig.savefig(os.path.join(path_challenge_git_qa,
                f'{target} xgboost_feature_importance ({platform}).png'), bbox_inches='tight')

    # SHAP summary. summary_plot draws through pyplot, so start from a fresh
    # figure and save whatever figure is current afterwards; otherwise the
    # summary lands on top of the bar chart above.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    plt.figure()
    shap.summary_plot(shap_values, X, show=False)
    plt.gcf().savefig(os.path.join(path_challenge_git_qa,
                      f'{target} SHAP_based_feature_importance ({platform}).png'), bbox_inches='tight')

    # Permutation importance.
    perm_importance = permutation_importance(
        model, X, y, random_state=random_state)
    order = perm_importance.importances_mean.argsort()
    fig, ax = plt.subplots()
    ax.barh(X.columns[order], perm_importance.importances_mean[order])
    ax.set_xlabel("Permutation Importance")
    fig.savefig(os.path.join(path_challenge_git_qa,
                f'{target} permutation_based_feature_importance ({platform}).png'), bbox_inches='tight')


df = pd.read_json(os.path.join(path_general, 'filtered.json'))

# QA forum

df_qa = df[df['Platform'].isin(['Stack Overflow', 'Tool-specific'])]
df_qa = df_qa[df_qa.columns.drop(
    list(df_qa.filter(regex='Platform|Tool|Solution|topic|solved_time|edit_time')))]
X = df_qa.drop(['Challenge_link', 'Challenge_closed_time',
               'Challenge_created_time'], axis=1)
# The label is True when the challenge has no closed time.
y = df_qa['Challenge_closed_time'].isna()

classifier = XGBClassifier(objective='binary:logistic', eval_metric='auc', tree_method='gpu_hist',
                           random_state=random_state, max_depth=5, n_estimators=1000, eta=0.1483)
classifier.fit(X, y)

# Fixed: the built-in importance plot used to show the 10 LEAST important
# features ([:10] of an ascending argsort); it now shows the top 10.
save_feature_importance_plots(
    classifier, X, y, 'Challenge_solved_rate', 'QA', top_n=10)

# Git repo

df_git = df[df['Platform'].isin(['Github', 'Gitlab'])]
df_git = df_git[df_git.columns.drop(list(df_git.filter(
    regex='Platform|Tool|Solution|topic|solved_time|edit_time')))]
X = df_git.drop(['Challenge_link', 'Challenge_closed_time',
                'Challenge_created_time'], axis=1)
y = df_git['Challenge_closed_time'].isna()

classifier = XGBClassifier(objective='binary:logistic', eval_metric='auc', tree_method='gpu_hist',
                           random_state=random_state, max_depth=5, n_estimators=1000, eta=0.1483)
classifier.fit(X, y)

save_feature_importance_plots(
    classifier, X, y, 'Challenge_solved_rate', 'Git', top_n=10)

In [None]:
# Compare the explainability of the challenge solved time regression model between Q&A forum and Git repo


def save_feature_importance_plots(model, X, y, target, platform, top_n=None):
    """Save the XGBoost, SHAP and permutation feature-importance plots of `model`.

    Three images are written to `path_challenge_git_qa`, named
    '<target> <kind>_feature_importance (<platform>).png'.

    Parameters:
        model: fitted XGBoost estimator.
        X: feature frame the model was fitted on.
        y: target the model was fitted on (used by the permutation importance).
        target: prefix of the output file names.
        platform: platform label placed in parentheses in the file names.
        top_n: when given, the built-in importance plot is limited to the
            `top_n` most important features; otherwise all are shown.
    """
    # Built-in importance. argsort() is ascending, so the most important
    # features sit at the END of `order`; barh then draws them at the top.
    order = model.feature_importances_.argsort()
    if top_n is not None:
        order = order[-top_n:]
    fig, ax = plt.subplots()
    ax.barh(X.columns[order], model.feature_importances_[order])
    ax.set_xlabel("Xgboost Feature Importance")
    fig.savefig(os.path.join(path_challenge_git_qa,
                f'{target} xgboost_feature_importance ({platform}).png'), bbox_inches='tight')

    # SHAP summary. summary_plot draws through pyplot, so start from a fresh
    # figure and save whatever figure is current afterwards; otherwise the
    # summary lands on top of the bar chart above.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    plt.figure()
    shap.summary_plot(shap_values, X, show=False)
    plt.gcf().savefig(os.path.join(path_challenge_git_qa,
                      f'{target} SHAP_based_feature_importance ({platform}).png'), bbox_inches='tight')

    # Permutation importance.
    perm_importance = permutation_importance(
        model, X, y, random_state=random_state)
    order = perm_importance.importances_mean.argsort()
    fig, ax = plt.subplots()
    ax.barh(X.columns[order], perm_importance.importances_mean[order])
    ax.set_xlabel("Permutation Importance")
    fig.savefig(os.path.join(path_challenge_git_qa,
                f'{target} permutation_based_feature_importance ({platform}).png'), bbox_inches='tight')


df = pd.read_json(os.path.join(path_solution, 'solved.json'))
# Only challenges with a known solved time can be used as regression samples.
df = df[df['Challenge_solved_time'].notna()]
df = df.drop(['Challenge_link', 'Challenge_closed_time',
             'Challenge_created_time'], axis=1)

# QA forum

df_qa = df[df['Platform'].isin(['Stack Overflow', 'Tool-specific'])]
X = df_qa[df_qa.columns.drop(
    list(df_qa.filter(regex='Platform|Tool|topic|solved_time|edit_time')))]
y = df_qa['Challenge_solved_time']

regressor = XGBRegressor(objective='reg:squaredlogerror', tree_method='gpu_hist',
                         random_state=random_state, max_depth=5, n_estimators=1000, eta=0.0206)
regressor.fit(X, y)

save_feature_importance_plots(
    regressor, X, y, 'Challenge_solved_time', 'QA')

# Git repo

df_git = df[df['Platform'].isin(['Github', 'Gitlab'])]
# The Git model additionally drops the favorite/view/comment count columns.
X = df_git[df_git.columns.drop(list(df_git.filter(
    regex='Platform|Tool|topic|solved_time|edit_time|favorite_count|view_count|comment_count')))]
y = df_git['Challenge_solved_time']

# NOTE(review): unlike the QA model, no `objective` is passed here, so the
# library default is used - confirm the difference is intentional.
regressor = XGBRegressor(tree_method='gpu_hist', random_state=random_state,
                         max_depth=5, n_estimators=1000, eta=0.0206)
regressor.fit(X, y)

save_feature_importance_plots(
    regressor, X, y, 'Challenge_solved_time', 'Git')

In [None]:
# Compare the explainability of the challenge adjusted solved time regression model between Q&A forum and Git repo


def save_feature_importance_plots(model, X, y, target, platform, top_n=None):
    """Save the XGBoost, SHAP and permutation feature-importance plots of `model`.

    Three images are written to `path_challenge_git_qa`, named
    '<target> <kind>_feature_importance (<platform>).png'.

    Parameters:
        model: fitted XGBoost estimator.
        X: feature frame the model was fitted on.
        y: target the model was fitted on (used by the permutation importance).
        target: prefix of the output file names.
        platform: platform label placed in parentheses in the file names.
        top_n: when given, the built-in importance plot is limited to the
            `top_n` most important features; otherwise all are shown.
    """
    # Built-in importance. argsort() is ascending, so the most important
    # features sit at the END of `order`; barh then draws them at the top.
    order = model.feature_importances_.argsort()
    if top_n is not None:
        order = order[-top_n:]
    fig, ax = plt.subplots()
    ax.barh(X.columns[order], model.feature_importances_[order])
    ax.set_xlabel("Xgboost Feature Importance")
    fig.savefig(os.path.join(path_challenge_git_qa,
                f'{target} xgboost_feature_importance ({platform}).png'), bbox_inches='tight')

    # SHAP summary. summary_plot draws through pyplot, so start from a fresh
    # figure and save whatever figure is current afterwards; otherwise the
    # summary lands on top of the bar chart above.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    plt.figure()
    shap.summary_plot(shap_values, X, show=False)
    plt.gcf().savefig(os.path.join(path_challenge_git_qa,
                      f'{target} SHAP_based_feature_importance ({platform}).png'), bbox_inches='tight')

    # Permutation importance.
    perm_importance = permutation_importance(
        model, X, y, random_state=random_state)
    order = perm_importance.importances_mean.argsort()
    fig, ax = plt.subplots()
    ax.barh(X.columns[order], perm_importance.importances_mean[order])
    ax.set_xlabel("Permutation Importance")
    fig.savefig(os.path.join(path_challenge_git_qa,
                f'{target} permutation_based_feature_importance ({platform}).png'), bbox_inches='tight')


df = pd.read_json(os.path.join(path_solution, 'solved.json'))
# Only challenges with a known adjusted solved time can be used as samples.
df = df[df['Challenge_adjusted_solved_time'].notna()]
df = df.drop(['Challenge_link', 'Challenge_closed_time',
             'Challenge_created_time'], axis=1)

# QA forum

df_qa = df[df['Platform'].isin(['Stack Overflow', 'Tool-specific'])]
X = df_qa[df_qa.columns.drop(
    list(df_qa.filter(regex='Platform|Tool|topic|solved_time|edit_time')))]
y = df_qa['Challenge_adjusted_solved_time']

regressor = XGBRegressor(objective='reg:squaredlogerror', tree_method='gpu_hist',
                         random_state=random_state, max_depth=5, n_estimators=1000, eta=0.03353)
regressor.fit(X, y)

save_feature_importance_plots(
    regressor, X, y, 'Challenge_adjusted_solved_time', 'QA')

# Git repo

df_git = df[df['Platform'].isin(['Github', 'Gitlab'])]
# The Git model additionally drops the favorite/view/comment count columns.
X = df_git[df_git.columns.drop(list(df_git.filter(
    regex='Platform|Tool|topic|solved_time|edit_time|favorite_count|view_count|comment_count')))]
y = df_git['Challenge_adjusted_solved_time']

# NOTE(review): unlike the QA model, no `objective` is passed here, so the
# library default is used - confirm the difference is intentional.
regressor = XGBRegressor(tree_method='gpu_hist', random_state=random_state,
                         max_depth=5, n_estimators=1000, eta=0.03353)
regressor.fit(X, y)

save_feature_importance_plots(
    regressor, X, y, 'Challenge_adjusted_solved_time', 'Git')

In [None]:
# Compare metrics evolution of Q&A forum and Git repo challenges across different topics

df_challenge = pd.read_json(os.path.join(path_general, 'filtered.json'))

# One empty figure per metric for the Q&A forums ...
(
    fig_challenge_topic_count_qa,
    fig_challenge_answer_count_qa,
    fig_challenge_score_qa,
    fig_challenge_word_count_qa,
    fig_challenge_unique_word_count_qa,
    fig_challenge_sentence_count_qa,
    fig_challenge_link_count_qa,
    fig_challenge_information_entropy_qa,
    fig_challenge_readability_qa,
    fig_challenge_topic_closed_count_qa,
    fig_challenge_solved_rate_qa,
) = (go.Figure() for _ in range(11))

# ... and the same set for the Git repositories.
(
    fig_challenge_topic_count_git,
    fig_challenge_answer_count_git,
    fig_challenge_score_git,
    fig_challenge_word_count_git,
    fig_challenge_unique_word_count_git,
    fig_challenge_sentence_count_git,
    fig_challenge_link_count_git,
    fig_challenge_information_entropy_git,
    fig_challenge_readability_git,
    fig_challenge_topic_closed_count_git,
    fig_challenge_solved_rate_git,
) = (go.Figure() for _ in range(11))

# Metrics plotted as the year-over-year change of their yearly sum.
_SUMMED_METRICS = ('answer_count', 'score')
# Metrics plotted as the yearly sum divided by the number of challenges created that year.
_AVERAGED_METRICS = ('link_count', 'word_count', 'sentence_count',
                     'unique_word_count', 'information_entropy', 'readability')


def _yearly_change(values):
    """Return the change between consecutive yearly values.

    A 0 is placed in front so the result has one entry per yearly bin and
    stays aligned with the x axis built from the same grouping.
    """
    return np.insert(np.diff(values), 0, 0)


def _add_line(figure, x, y, name):
    """Append one line trace called ``name`` to ``figure``."""
    figure.add_trace(go.Scatter(x=x, y=y, mode='lines', name=name))


def _cumulative_solved_rate(frame):
    """Return ``(dates, rate)``: the cumulative share of closed challenges in percent.

    ``frame`` must provide the columns ``Challenge_created_time``,
    ``Challenge_closed_time`` and ``Challenge_topic_macro``. Both arrays come
    from the same merged table, so they always have equal length.
    """
    def _cumulative_count(time_column, label):
        return (frame.groupby(pd.Grouper(key=time_column, freq='Y'))['Challenge_topic_macro']
                .count().cumsum().reset_index()
                .rename(columns={time_column: 'Date', 'Challenge_topic_macro': label}))

    solved = _cumulative_count('Challenge_closed_time', 'Solved')
    created = _cumulative_count('Challenge_created_time', 'All')
    merged = pd.merge(solved, created, on='Date',
                      how='outer').sort_values(by='Date')
    # Both counts are cumulative: a year that is missing on one side keeps the
    # last known total (forward fill). Only years before the first entry are 0.
    merged[['Solved', 'All']] = merged[['Solved', 'All']].ffill().fillna(0)
    # Mask years without any created challenge instead of dividing by zero.
    total = merged['All'].where(merged['All'] > 0)
    rate = merged['Solved'] / total * 100
    return pd.to_datetime(merged['Date']).values, rate.values


def _add_topic_traces(frame, name, figures):
    """Add one trace per metric figure for the challenges in ``frame``.

    ``figures`` maps a metric key (e.g. ``'topic_count'``) to the figure that
    receives the trace; ``name`` is used as the trace name.
    """
    by_created = frame.groupby(
        pd.Grouper(key='Challenge_created_time', freq='Y'))

    # challenge topic count over time
    created_count = by_created['Challenge_topic_macro'].count().reset_index()
    years = pd.to_datetime(created_count['Challenge_created_time']).values
    counts = created_count['Challenge_topic_macro'].values
    _add_line(figures['topic_count'], years, _yearly_change(counts), name)

    # yearly sums of every numeric metric
    columns = [f'Challenge_{metric}' for metric in _SUMMED_METRICS + _AVERAGED_METRICS]
    totals = by_created[columns].sum().reset_index()

    # challenge answer count / score over time
    for metric in _SUMMED_METRICS:
        change = _yearly_change(totals[f'Challenge_{metric}'].values)
        _add_line(figures[metric], years, change, name)

    # challenge closed topic count over time
    closed_count = frame.groupby(pd.Grouper(key='Challenge_closed_time', freq='Y'))[
        'Challenge_topic_macro'].count().reset_index()
    closed_years = pd.to_datetime(closed_count['Challenge_closed_time']).values
    _add_line(figures['topic_closed_count'], closed_years,
              _yearly_change(closed_count['Challenge_topic_macro'].values), name)

    # per-challenge averages over time; a year without challenges is 0 / 0,
    # which is kept as NaN (a gap in the line) without emitting a warning
    for metric in _AVERAGED_METRICS:
        with np.errstate(divide='ignore', invalid='ignore'):
            average = totals[f'Challenge_{metric}'].values / counts
        _add_line(figures[metric], years, average, name)

    # challenge solved rate over time
    solved_dates, solved_rate = _cumulative_solved_rate(frame)
    _add_line(figures['solved_rate'], solved_dates, solved_rate, name)


figures_qa = {
    'topic_count': fig_challenge_topic_count_qa,
    'answer_count': fig_challenge_answer_count_qa,
    'score': fig_challenge_score_qa,
    'topic_closed_count': fig_challenge_topic_closed_count_qa,
    'link_count': fig_challenge_link_count_qa,
    'word_count': fig_challenge_word_count_qa,
    'sentence_count': fig_challenge_sentence_count_qa,
    'unique_word_count': fig_challenge_unique_word_count_qa,
    'information_entropy': fig_challenge_information_entropy_qa,
    'readability': fig_challenge_readability_qa,
    'solved_rate': fig_challenge_solved_rate_qa,
}
figures_git = {
    'topic_count': fig_challenge_topic_count_git,
    'answer_count': fig_challenge_answer_count_git,
    'score': fig_challenge_score_git,
    'topic_closed_count': fig_challenge_topic_closed_count_git,
    'link_count': fig_challenge_link_count_git,
    'word_count': fig_challenge_word_count_git,
    'sentence_count': fig_challenge_sentence_count_git,
    'unique_word_count': fig_challenge_unique_word_count_git,
    'information_entropy': fig_challenge_information_entropy_git,
    'readability': fig_challenge_readability_git,
    'solved_rate': fig_challenge_solved_rate_git,
}

# One trace per macro topic in every figure, split by platform family.
for name, group in df_challenge.groupby('Challenge_topic_macro'):
    qa = group[group['Platform'].isin(['Stack Overflow', 'Tool-specific'])]
    git = group[group['Platform'].isin(['Github', 'Gitlab'])]

    _add_topic_traces(qa, name, figures_qa)
    _add_topic_traces(git, name, figures_git)

# Give every figure the same 2000x1000 canvas with all margins removed.
for _figure in (
    # Q&A forums
    fig_challenge_topic_count_qa,
    fig_challenge_answer_count_qa,
    fig_challenge_score_qa,
    fig_challenge_word_count_qa,
    fig_challenge_unique_word_count_qa,
    fig_challenge_sentence_count_qa,
    fig_challenge_link_count_qa,
    fig_challenge_information_entropy_qa,
    fig_challenge_readability_qa,
    fig_challenge_topic_closed_count_qa,
    fig_challenge_solved_rate_qa,
    # Git repositories
    fig_challenge_topic_count_git,
    fig_challenge_answer_count_git,
    fig_challenge_score_git,
    fig_challenge_word_count_git,
    fig_challenge_unique_word_count_git,
    fig_challenge_sentence_count_git,
    fig_challenge_link_count_git,
    fig_challenge_information_entropy_git,
    fig_challenge_readability_git,
    fig_challenge_topic_closed_count_git,
    fig_challenge_solved_rate_git,
):
    _figure.update_layout(
        width=2000, height=1000, margin=dict(l=0, r=0, t=0, b=0))

# Figure -> output file name, written in this order into path_challenge_git_qa.
_figure_exports = (
    # Q&A forums
    (fig_challenge_topic_count_qa, 'Challenge_topic_count_increase_rate (QA).png'),
    (fig_challenge_answer_count_qa, 'Challenge_answer_count_increase_rate (QA).png'),
    (fig_challenge_score_qa, 'Challenge_score_increase_rate (QA).png'),
    (fig_challenge_link_count_qa, 'Challenge_link_count (QA).png'),
    (fig_challenge_word_count_qa, 'Challenge_word_count (QA).png'),
    (fig_challenge_unique_word_count_qa, 'Challenge_unique_word_count (QA).png'),
    (fig_challenge_sentence_count_qa, 'Challenge_sentence_count (QA).png'),
    (fig_challenge_information_entropy_qa, 'Challenge_information_entropy (QA).png'),
    (fig_challenge_readability_qa, 'Challenge_readability (QA).png'),
    (fig_challenge_topic_closed_count_qa, 'Challenge_topic_closed_count_increase_rate (QA).png'),
    (fig_challenge_solved_rate_qa, 'Challenge_solved_rate (QA).png'),
    # Git repositories
    (fig_challenge_topic_count_git, 'Challenge_topic_count_increase_rate (Git).png'),
    (fig_challenge_answer_count_git, 'Challenge_answer_count_increase_rate (Git).png'),
    (fig_challenge_score_git, 'Challenge_score_increase_rate (Git).png'),
    (fig_challenge_link_count_git, 'Challenge_link_count (Git).png'),
    (fig_challenge_word_count_git, 'Challenge_word_count (Git).png'),
    (fig_challenge_unique_word_count_git, 'Challenge_unique_word_count (Git).png'),
    (fig_challenge_sentence_count_git, 'Challenge_sentence_count (Git).png'),
    (fig_challenge_information_entropy_git, 'Challenge_information_entropy (Git).png'),
    (fig_challenge_readability_git, 'Challenge_readability (Git).png'),
    (fig_challenge_topic_closed_count_git, 'Challenge_topic_closed_count_increase_rate (Git).png'),
    (fig_challenge_solved_rate_git, 'Challenge_solved_rate (Git).png'),
)

for _figure, _file_name in _figure_exports:
    _target = os.path.join(path_challenge_git_qa, _file_name)
    _figure.write_image(_target)