In [47]:
import openai
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import n_colors
from matplotlib import pyplot as plt
import numpy as np
import pickle
from scipy.stats import mannwhitneyu
from scipy.stats import pearsonr
from scipy.stats import shapiro
import warnings
warnings.filterwarnings("ignore")


pd.set_option("display.max_rows", None, "display.max_columns",
              None, 'display.max_colwidth', None)

os.environ["TOKENIZERS_PARALLELISM"] = "true"

openai.api_key = os.getenv('OPENAI_API_KEY')

In [2]:
path_dataset = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')

path_result = os.path.join(os.path.dirname(os.getcwd()), 'Result')
if not os.path.exists(path_result):
    os.makedirs(path_result)

path_general = os.path.join(path_result, 'General')
if not os.path.exists(path_general):
    os.makedirs(path_general)

path_challenge = os.path.join(path_result, 'Challenge')
if not os.path.exists(path_challenge):
    os.makedirs(path_challenge)

path_solution = os.path.join(path_result, 'Solution')
if not os.path.exists(path_solution):
    os.makedirs(path_solution)

path_challenge_git_qa = os.path.join(path_challenge, 'Git vs QA')
if not os.path.exists(path_challenge_git_qa):
    os.makedirs(path_challenge_git_qa)

path_challenge_open_closed = os.path.join(path_challenge, 'Open vs Closed')
if not os.path.exists(path_challenge_open_closed):
    os.makedirs(path_challenge_open_closed)

path_challenge_so_to = os.path.join(
    path_challenge, 'Stack Overflow vs Tool-specific')
if not os.path.exists(path_challenge_so_to):
    os.makedirs(path_challenge_so_to)

path_challenge_azureml_sagemaker = os.path.join(
    path_challenge, 'AzureML vs SageMaker')
if not os.path.exists(path_challenge_azureml_sagemaker):
    os.makedirs(path_challenge_azureml_sagemaker)

path_challenge_evolution = os.path.join(path_challenge, 'Evolution')
if not os.path.exists(path_challenge_evolution):
    os.makedirs(path_challenge_evolution)

path_solution_evolution = os.path.join(path_solution, 'Evolution')
if not os.path.exists(path_solution_evolution):
    os.makedirs(path_solution_evolution)

In [3]:
# The significance level is the probability of rejecting the null hypothesis when it is true.
alpha = 0.05


In [39]:
# Create challenge topic distribution tree map

df_topics = pd.read_json(os.path.join(path_general, 'original.json'))
df_topics = df_topics[df_topics['Challenge_topic'] > -1]
df_topics['Challenge_comment_count'] = df_topics['Challenge_comment_count'].fillna(0)
df_topics['Challenge_answer_count'] = df_topics['Challenge_answer_count'].fillna(0)
df_topics['Challenge_participation_count'] = df_topics['Challenge_answer_count'] + \
    df_topics['Challenge_comment_count']

fig = px.treemap(
    df_topics,
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Challenge_topic',
    width=2000,
    height=1000,
)
fig.write_image(os.path.join(
    path_challenge, 'Challenge_topic_distribution.png'))

In [37]:
prompt_topic = '''You will be given a list of keywords for each topic, I want you to provide a description of each topic in a two-word phrase but guarantee that each description is exclusive to the other. Also, for each description, you need to attach short comments on what these keywords are talking about in general.
###\n'''

with open(os.path.join(path_challenge, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

topic_term_list = []
for index, topic in enumerate(topic_terms):
    terms = ', '.join([term[0] for term in topic])
    topic_term = f'Topic {index}: {terms}'
    topic_term_list.append(topic_term)

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt_topic +
               '\n'.join(topic_term_list) + '\n###\n'}],
    temperature=0,
    max_tokens=1500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=100,
    stream=False)

topic_challenge = completion.choices[0].message.content
print(topic_challenge)

Topic 0: Environment Setup - Setting up software environments for development and execution
Topic 1: Pipeline Automation - Automating the execution of data processing pipelines
Topic 2: Docker - Containerization platform for building, shipping, and running applications
Topic 3: Hyperparameter Tuning - Optimizing model performance by tuning hyperparameters
Topic 4: Git Version Control - Tracking changes to code and collaborating with others
Topic 5: GPU Acceleration - Using graphics processing units to speed up machine learning tasks
Topic 6: Artifact Management - Managing and storing artifacts such as models, datasets, and code
Topic 7: Model Deployment - Deploying machine learning models for use in production environments
Topic 8: Data Labeling - Assigning labels to data for use in supervised learning tasks
Topic 9: Data Visualization - Creating visual representations of data for analysis and communication
Topic 10: Logging Metrics - Recording and tracking performance metrics during m

In [24]:
topic_mapping_challenge = {
    0: ('Package Management', 'Installing and configuring software packages and dependencies'),
    1: ('Pipeline Configuration', 'Automating the execution of data processing pipelines'),
    2: ('Docker Configuration', 'Containerization platform for building, shipping, and running applications'),
    3: ('Hyperparameter Tuning', 'Optimizing model performance by tuning hyperparameters'),
    4: ('Code Versioning', 'Managing and tracking changes in a repository using Git'),
    5: ('GPU Configuration', 'Using graphics processing units to speed up machine learning tasks'),
    6: ('Artifact Management', 'Uploading, downloading, and storing artifacts'),
    7: ('Endpoint Deployment', 'Deploying machine learning models for use in production environments'),
    8: ('Data Labeling', 'Assigning labels to data for use in supervised learning tasks'),
    9: ('Data Visualization', 'Creating visual representations of data for analysis and communication'),
    10: ('Metrics Logging', 'Recording and tracking performance metrics during model training and evaluation'),
    11: ('Account Management', 'Managing user accounts and access to resources'),
    12: ('Apache Spark Configuration', 'Installing and configuring Apache Spark distributed computing system for big data processing'),
    13: ('TensorFlow Configuration', 'Installing and configuring the TensorFlow machine learning framework'),
    14: ('Text Processing', 'Analyzing and manipulating text data'),
    15: ('Pandas Dataframe', 'Manipulating and analyzing tabular data using the Pandas library'),
    16: ('Model Exporting', 'Saving and exporting trained machine learning models'),
    17: ('Role-based Access Control', 'Controlling access to resources based on user roles and permissions'),
    18: ('Batch Processing', 'Processing large amounts of data in batches'),
    19: ('Model Registry', 'Registering, managing, and versioning models'),
    20: ('Database Connectivity', 'Connecting to and interacting with databases'),
    21: ('Resource Quota Control', 'Setting and managing limits on resource usage'),
    22: ('API Invocation', 'Calling APIs to perform tasks or retrieve data'),
    23: ('Forecasting', 'Using automated machine learning to generate forecasts'),
    24: ('Columnar Manipulation', 'Working with and manipulating columns in datasets'),
    25: ('Object Detection', 'Using machine learning to analyze and interpret visual data'),
    26: ('Web Service', 'Deploying machine learning models as web services'),
    27: ('Kubernetes Orchestration', 'Open-source container orchestration platform for managing containerized applications'),
    28: ('Tree-based Model', 'Building, training, and cutting tree-like structure to make predictions'),
    29: ('CSV Manipulation', 'Reading, writing, and manipulating CSV files'),
    30: ('TensorBoard Logging', 'Visualizing and tracking model training and evaluation using TensorBoard'),
    31: ('Feature Roadmap', 'Planning and implementing new features for a platform or product'),
    32: ('Dataset Versioning', 'Managing and versioning datasets'),
    33: ('CloudWatch Monitoring', 'Monitoring and logging AWS resources and applications'),
    34: ('Speech-to-Text', 'Converting audio speech to text'),
    35: ('YAML Configuration', 'Configuring and defining stages in a pipeline using YAML files'),
    36: ('Data Storage', 'Storing and accessing data in cloud-based storage solutions'),
    37: ('VPC Neworking', 'Connecting to AWS services privately through a VPC'),
    38: ('Model Evaluation', 'Evaluating and improving the accuracy of machine learning models'),
    39: ('Model Serving', 'Preparing and querying input data for machine learning models'),
    40: ('Bucket Access Control', 'Managing access to cloud-based storage buckets'),
    41: ('Run Management', 'Managing and monitoring the execution of jobs and tasks'),
    42: ('Model Inference', 'Using trained machine learning models to make predictions'),
    43: ('Jupyter Notebook', 'Creating and running interactive notebooks for data analysis and visualization'),
}

In [None]:
prompt_topic = '''You will be given a list of keywords for each topic, I want you to provide a description of each topic in a two-word phrase but guarantee that each description is exclusive to the other. Also, for each description, you need to attach short comments on what these keywords are talking about in general.
###\n'''

with open(os.path.join(path_solution, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

topic_term_list = []
for index, topic in enumerate(topic_terms):
    terms = ', '.join([term[0] for term in topic])
    topic_term = f'Topic {index}: {terms}'
    topic_term_list.append(topic_term)

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt_topic +
               '\n'.join(topic_term_list) + '\n###\n'}],
    temperature=0,
    max_tokens=1500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=100,
    stream=False)

topic_solution = completion.choices[0].message.content
print(topic_solution)

Topic 0: Git Tracking - Managing and tracking changes in a repository using Git.
Topic 1: Access Control - Managing user roles and permissions for accessing resources.
Topic 2: Environment Setup - Installing and configuring software packages and dependencies.
Topic 3: Logging Metrics - Capturing and analyzing data related to system performance and behavior.
Topic 4: Dataset Management - Organizing and manipulating data for use in machine learning models.
Topic 5: Docker Deployment - Packaging and deploying applications in containers using Docker.
Topic 6: Parameter Configuration - Setting and adjusting parameters for software programs.
Topic 7: YAML Configuration - Configuring and defining stages in a pipeline using YAML files.
Topic 8: Endpoint Deployment - Deploying and managing endpoints for accessing APIs and web services.
Topic 9: Jupyter Notebooks - Creating and running interactive notebooks for data analysis and visualization.
Topic 10: Pandas Dataframes - Manipulating and analy

In [23]:
topic_mapping_solution = {
    0: ('Code Versioning', 'Managing and tracking changes in a repository using Git'),
    1: ('Role-based Access Control', 'Controlling access to resources based on user roles and permissions'),
    2: ('Package Management', 'Installing and configuring software packages and dependencies'),
    3: ('Metrics Logging', 'Recording and tracking performance metrics during model training and evaluation'),
    4: ('Columnar Manipulation', 'Working with and manipulating columns in datasets'),
    5: ('Docker Configuration', 'Containerization platform for building, shipping, and running applications'),
    6: ('Hyperparameter Tuning', 'Optimizing model performance by tuning hyperparameters'),
    7: ('YAML Configuration', 'Configuring and defining stages in a pipeline using YAML files'),
    8: ('Endpoint Serving', 'Creating, deploying, and managing endpoints for REST and web services'),
    9: ('Jupyter Notebook', 'Creating and running interactive notebooks for data analysis and visualization'),
    10: ('Pandas Dataframe', 'Manipulating and analyzing tabular data using the Pandas library'),
    11: ('TensorFlow Configuration', 'Installing and configuring the TensorFlow machine learning framework'),
    12: ('Artifact Management', 'Uploading, downloading, and storing artifacts'),
    13: ('Endpoint Deployment', 'Deploying machine learning models for use in production environments'),
    14: ('Tree-based Model', 'Building, training, and cutting tree-like structure to make predictions'),
    15: ('Pipeline Configuration (Model)', 'Building, inputting, and parameterizing pipelines for API and object use'),
    16: ('JSON Payload', 'Formatting, serializing, and loading data'),
    17: ('Remote Configuration', 'Adding, modifying, and running remote URLs and resources'),
    18: ('Apache Spark Configuration', 'Installing and configuring Apache Spark distributed computing system for big data processing'),
    19: ('Model Wrapper', 'Using PyFunc and PythonModel interfaces, importing models, and loading models'),
    20: ('Data Transfer', 'Transferring data between cloud-based storage solutions'),
    21: ('Cluster Configuration', 'Running and managing distributed computing jobs'),
    22: ('Pipeline Configuration (Data)', 'Creating and managing data pipelines'),
    23: ('CSV Manipulation', 'Reading, writing, and manipulating CSV files'),
    24: ('Model Registry', 'Registering, managing, and versioning models'),
    25: ('Memory Management', 'Managing memory and distributing training for large datasets'),
    26: ('Model Application', 'Using neural networks for machine learning'),
    27: ('SDK Management', 'Managing SDK versions'),
    28: ('Serverless Serving', 'Invoking endpoints and APIs using Lambda functions and API gateways'),
}


In [50]:
topic_ensemble = [
    'Account Management',
    'Apache Spark Configuration',
    'API Invocation',
    'Artifact Management',
    'Batch Processing',
    'Bucket Access Control',
    'CloudWatch Monitoring',
    'Cluster Configuration',
    'Code Versioning',
    'Columnar Manipulation',
    'CSV Manipulation',
    'Data Labeling',
    'Data Storage',
    'Data Transfer',
    'Data Visualization',
    'Database Connectivity',
    'Dataset Versioning',
    'Docker Configuration',
    'GPU Configuration',
    'Hyperparameter Tuning',
    'JSON Payload',
    'Jupyter Notebook',
    'Kubernetes Orchestration',
    'Memory Management',
    'Metrics Logging',
    'Model Evaluation',
    'Model Exporting',
    'Model Inference',
    'Model Registry',
    'Model Serving',
    'Endpoint Serving',
    'Endpoint Deployment',
    'Serverless Serving',
    'Pandas Dataframe',
    'Pipeline Configuration',
    'Pipeline Configuration (Data)',
    'Pipeline Configuration (Model)',
    'Package Management',
    'Remote Configuration',
    'Resource Quota Control',
    'Role-based Access Control',
    'Run Management',
    'SDK Management',
    'TensorBoard Logging',
    'TensorFlow Configuration',
    'VPC Networking',
    'Web Service',
    'YAML Configuration',
]

topic_ensemble_high_level = [
    # All of these words relate to the configuration and management of infrastructure aspects of computer systems and networks. Specifically, they involve setting up and optimizing different components such as processing power, memory, network connections, and software to ensure that they work together efficiently and effectively.
    {'Infrastructure Management': ['Apache Spark Configuration', 'Cluster Configuration', 'Docker Configuration', 'GPU Configuration', 'VPC Networking', 'Memory Management',
                                   'Remote Configuration', 'Resource Quota Control', 'TensorFlow Configuration', 'Jupyter Notebook', 'Package Management', 'SDK Management', 'YAML Configuration']},
    # Code versioning refers to the practice of tracking changes to software code over time.
    {'Code Management': ['Code Versioning']},
    # These words are all related to data management and analysis. They refer to various tasks and techniques used to organize, manipulate, store, transfer, and analyze data.
    {'Data Management': ['Artifact Management', 'Columnar Manipulation', 'CSV Manipulation', 'Data Labeling', 'Data Storage',
                         'Data Transfer', 'Data Visualization', 'Database Connectivity', 'Dataset Versioning', 'Pandas Dataframe', 'Batch Processing']},
    # All of these words are related to the development and management of machine learning models.
    {'Model Management': ['Hyperparameter Tuning',
                          'Model Evaluation', 'Model Exporting', 'Model Registry']},
    # All of these words are related to the deployment and management of machine learning models or web services.
    {'Deployment Management': ['Endpoint Serving', 'Endpoint Deployment', 'Model Serving', 'Model Inference',
                               'JSON Payload', 'Web Service', 'Serverless Serving', 'API Invocation']},
    # All of these words are related to controlling access to information or resources in a system.
    {'Security Management': ['Account Management',
                             'Bucket Access Control', 'Role-based Access Control']},
    # All of these words are related to monitoring and logging data in various systems.
    {'Report Management': ['CloudWatch Monitoring',
                           'Metrics Logging', 'TensorBoard Logging', 'Metrics Logging']},
    # These words are all related to the management and optimization of data pipelines in software development.
    {'Lifecycle Management': ['Pipeline Configuration',
                              'Pipeline Configuration (Data)', 'Pipeline Configuration (Model)', 'Run Management', 'Kubernetes Orchestration']},
]

topic_ensemble_inverse = {}
for dictionary in topic_ensemble_high_level:
    for key, values in dictionary.items():
        for inner_value in values:
            topic_ensemble_inverse[inner_value] = key

colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', len(topic_ensemble_high_level), colortype='rgb')

In [25]:
# assign human-readable & high-level topics to challenges & solutions

df_topics = pd.read_json(os.path.join(path_general, 'original.json'))

for index, row in df_topics.iterrows():
    if row['Challenge_topic'] in topic_mapping_challenge:
        topic = topic_mapping_challenge[row['Challenge_topic']][0]
        df_topics.at[index, 'Challenge_topic'] = topic
        df_topics.at[index, 'Challenge_topic_higher_level'] = topic_ensemble_inverse[topic] if topic in topic_ensemble_inverse else np.nan
    else:
        df_topics.at[index, 'Challenge_topic'] = np.nan
        df_topics.at[index, 'Challenge_topic_higher_level'] = np.nan
    
    if row['Solution_topic'] in topic_mapping_solution:
        topic = topic_mapping_solution[row['Solution_topic']][0]
        df_topics.at[index, 'Solution_topic'] = topic
        df_topics.at[index, 'Solution_topic_higher_level'] = topic_ensemble_inverse[topic] if topic in topic_ensemble_inverse else np.nan
    else:
        df_topics.at[index, 'Solution_topic'] = np.nan
        df_topics.at[index, 'Solution_topic_higher_level'] = np.nan

df_topics.to_json(os.path.join(path_general, 'assigned.json'),
                  indent=4, orient='records')


In [45]:
df_topics = pd.read_json(os.path.join(path_general, 'assigned.json'))
df_topics = df_topics[df_topics['Challenge_topic_higher_level'].notna(
) & df_topics['Solution_topic_higher_level'].notna()]

categories = ['Challenge_topic_higher_level', 'Solution_topic_higher_level']
df_topics = df_topics.groupby(categories).size().reset_index(name='value')

# we only visualize strong connection
df_topics = df_topics[df_topics['value'] > 50]

newDf = pd.DataFrame()
for i in range(len(categories)-1):
    tempDf = df_topics[[categories[i], categories[i+1], 'value']]
    tempDf.columns = ['source', 'target', 'value']
    newDf = pd.concat([newDf, tempDf])
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

label = list(np.unique(df_topics[categories].values))
source = newDf['source'].apply(lambda x: label.index(x))
target = newDf['target'].apply(lambda x: label.index(x))
value = newDf['value']

link = dict(source=source, target=target, value=value)
node = dict()
data = go.Sankey(
    link=link,
    node=dict(
        label=label,
        thickness=100,
        pad=30,
    ))

fig = go.Figure(data)
fig.update_layout(
    height=2000,
    width=2000,
    font=dict(size=20),
)
fig.write_image(os.path.join(path_general,
                'Challenge solution sankey.png'))

In [None]:
# topic_all = set(['Topic: ' + topic[0] + ', Description: ' + topic[1] for topic in ] + ['Topic: ' + topic[0] + ', Description: ' + topic[1] for topic in topic_mapping_challenge.values()])
# topic_all


In [None]:
# topic_all = list(set(list(topic_mapping_solution.values()) + list(topic_mapping_challenge.values())))
# for item in topic_ensemble:
#     for item2 in topic_all:
#         if item == item2[0]:
#             topic_all.remove(item2)
#             break
# topic_all


In [None]:
# prompt_classify = '''Please classify the following concepts as different categories based on their application scenarios and provide a short description for each category:\n'''

# completion = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=[
#         {"role": "user", "content": prompt_classify + ', '.join(topic_ensemble) + '\n'},
#         ],
#     temperature=0,
#     max_tokens=500,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0,
#     timeout=50,
#     stream=False
# )
# completion.choices[0].message.content


In [None]:
# # Collect general challenge statistics information

# df_challenge = pd.read_json(os.path.join(path_general, 'original.json'))
# df_challenge = df_challenge[df_challenge['Challenge_topic'] > -1]

# # total_count = df_challenge['Challenge_topic'].count()
# df_topics = []

# for name, group in df_challenge.groupby('Challenge_topic'):
#     count = group['Challenge_topic'].count()
#     Solved_ratio = group['Challenge_closed_time'].notna().sum() / count
#     Mean_score = group['Challenge_score'].mean()
#     Mean_favorite_count = group['Challenge_follower_count'].mean()
#     Mean_follower_count = group['Challenge_follower_count'].mean()
#     Mean_link_count = group['Challenge_link_count'].mean()
#     Mean_information_entropy = group['Challenge_information_entropy'].mean()
#     Mean_readability = group['Challenge_readability'].mean()
#     Mean_sentence_count = group['Challenge_sentence_count'].mean()
#     Mean_word_count = group['Challenge_word_count'].mean()
#     Mean_unique_word_count = group['Challenge_unique_word_count'].mean()
#     Mean_view_count = group['Challenge_view_count'].mean()
#     Mean_answer_count = group['Challenge_answer_count'].mean()
#     Mean_comment_count = group['Challenge_comment_count'].mean()
#     # Mean_participation_count = Mean_answer_count + Mean_comment_count
#     # Score_participation_ratio = Mean_score / Mean_participation_count
#     # Score_participation_weighted_product = (
#     #     group['Challenge_score'] * group['Challenge_participation_count']).mean()
#     # Mean_solved_time = group['Challenge_solved_time'].mean(
#     # ) / pd.Timedelta(hours=1)
#     # Median_solved_time = group['Challenge_solved_time'].median(
#     # ) / pd.Timedelta(hours=1)
#     # Mean_solved_time_edited = group['Challenge_solved_time_edited'].mean(
#     # ) / pd.Timedelta(hours=1)
#     # Median_solved_time_edited = group['Challenge_solved_time_edited'].median(
#     # ) / pd.Timedelta(hours=1)
#     topic_info = {
#         'Topic': name,
#         'Count': count,
#         'Solved ratio': Solved_ratio,
#         'Mean score': Mean_score,
#         'Mean follower count': Mean_favorite_count,
#         'Mean follower count': Mean_follower_count,
#         'Mean link count': Mean_link_count,
#         'Mean information entropy': Mean_information_entropy,
#         'Mean readability': Mean_readability,
#         'Mean sentence count': Mean_sentence_count,
#         'Mean word count': Mean_word_count,
#         'Mean unique word count': Mean_unique_word_count,
#         'Mean view count': Mean_view_count,
#         'Mean answer count': Mean_answer_count,
#         'Mean comment count': Mean_comment_count,
#         # 'Score participation ratio': Score_participation_ratio,
#         # 'Score participation weighted product': Score_participation_weighted_product,
#         # 'Mean solved time': Mean_solved_time,
#         # 'Median solved time': Median_solved_time,
#         # 'Mean solved time edited': Mean_solved_time_edited,
#         # 'Median solved time edited': Median_solved_time_edited,
#     }
#     df_topics.append(topic_info)

# df_topics = pd.DataFrame(df_topics)
# df_topics.to_json(os.path.join(path_challenge_open_closed,
#                   'general.json'), indent=4, orient='records')

# # df_topics['Topic'] = df_topics['Topic'].astype(str)

# # # plot count
# # df_topics = df_topics.sort_values('Count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_count.png'))
# # plt.close()

# # # plot solved ratio
# # df_topics = df_topics.sort_values('Solved ratio', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Solved ratio'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge solved ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_solved_ratio.png'))
# # plt.close()

# # # plot mean score
# # df_topics = df_topics.sort_values('Mean score', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean score'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean score',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_score.png'))
# # plt.close()

# # # plot mean favorite count
# # df_topics = df_topics.sort_values('Mean favorite count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean favorite count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean favorite count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_favorite_count.png'))
# # plt.close()

# # # plot mean follower count
# # df_topics = df_topics.sort_values('Mean follower count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean follower count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean follower count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_follower_count.png'))
# # plt.close()

# # # plot mean link count
# # df_topics = df_topics.sort_values('Mean link count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean link count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean link count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_link_count.png'))
# # plt.close()

# # # plot mean information entropy
# # df_topics = df_topics.sort_values('Mean information entropy', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean information entropy'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean info entropy',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_information_entropy.png'))
# # plt.close()

# # # plot mean readability
# # df_topics = df_topics.sort_values('Mean readability', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean readability'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_readability.png'))
# # plt.close()

# # # plot mean sentence count
# # df_topics = df_topics.sort_values('Mean sentence count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean sentence count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_sentence_count.png'))
# # plt.close()

# # # plot mean word count
# # df_topics = df_topics.sort_values('Mean word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean word count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_word_count.png'))
# # plt.close()

# # # plot mean unique word count
# # df_topics = df_topics.sort_values('Mean unique word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean unique word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean unique word count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_unique_word_count.png'))
# # plt.close()

# # # plot mean view count
# # df_topics = df_topics.sort_values('Mean view count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean view count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean view count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_view_count.png'))
# # plt.close()

# # # plot mean answer count
# # df_topics = df_topics.sort_values('Mean answer count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean answer count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean answer count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_answer_count.png'))
# # plt.close()

# # # plot mean comment count
# # df_topics = df_topics.sort_values('Mean comment count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean comment count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean comment count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_comment_count.png'))
# # plt.close()

# # # plot score participation ratio
# # df_topics = df_topics.sort_values('Score participation ratio', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Score participation ratio'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge score participation ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_score_participation_ratio.png'))
# # plt.close()

# # # plot score participation weighted product
# # df_topics = df_topics.sort_values('Score participation weighted product', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Score participation weighted product'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge score participation weighted product',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_score_participation_weighted_product.png'))
# # plt.close()

# # # plot mean solved time
# # df_topics = df_topics.sort_values('Mean solved time', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean solved time'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean solved time',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_solved_time.png'))
# # plt.close()

# # # plot median solved time
# # df_topics = df_topics.sort_values('Median solved time', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Median solved time'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge median solved time',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_median_solved_time.png'))
# # plt.close()

# # # plot Mean solved time edited
# # df_topics = df_topics.sort_values('Mean solved time edited', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean solved time edited'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge Mean solved time edited',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_Mean_solved_time_edited.png'))
# # plt.close()

# # # plot Median solved time edited
# # df_topics = df_topics.sort_values('Median solved time edited', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Median solved time edited'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge Median solved time edited',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_Median_solved_time_edited.png'))
# # plt.close()

In [None]:
# # Collect general solution statistics information

# df_solution = pd.read_json(os.path.join(path_general, 'original.json'))
# df_solution = df_solution[df_solution['Solution_topic'] > -1]
# # df_solution['Solution_topic'] = df_solution['Solution_topic'].astype(str)

# # total_count = df_solution['Solution_topic'].count()
# df_topics = []

# for name, group in df_solution.groupby('Solution_topic'):
#     count = group['Solution_topic'].count()
#     Mean_score = group['Solution_score'].mean()
#     Mean_link_count = group['Solution_link_count'].mean()
#     Mean_information_entropy = group['Solution_information_entropy'].mean()
#     Mean_readability = group['Solution_readability'].mean()
#     Mean_sentence_count = group['Solution_sentence_count'].mean()
#     Mean_word_count = group['Solution_word_count'].mean()
#     Mean_unique_word_count = group['Solution_unique_word_count'].mean()
#     Mean_comment_count = group['Solution_comment_count'].mean()
#     topic_info = {
#         'Topic': name,
#         'Count ratio': count,
#         'Mean score': Mean_score,
#         'Mean link count': Mean_link_count,
#         'Mean information entropy': Mean_information_entropy,
#         'Mean readability': Mean_readability,
#         'Mean sentence count': Mean_sentence_count,
#         'Mean word count': Mean_word_count,
#         'Mean unique word count': Mean_unique_word_count,
#         'Mean comment count': Mean_comment_count,
#     }
#     df_topics.append(topic_info)

# df_topics = pd.DataFrame(df_topics)
# df_topics.to_json(os.path.join(path_solution_information,
#                   'general.json'), indent=4, orient='records')

# # # plot count ratio
# # df_topics = df_topics.sort_values('Count ratio', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Count ratio'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge count ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_count_ratio.png'))
# # plt.close()

# # # plot solved ratio
# # df_topics = df_topics.sort_values('Mean score', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean score'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge solved ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_score.png'))
# # plt.close()

# # # plot mean score
# # df_topics = df_topics.sort_values('Mean link count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean link count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean score',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_link_count.png'))
# # plt.close()

# # # plot mean favorite count
# # df_topics = df_topics.sort_values('Mean information entropy', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean information entropy'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean favorite count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_information_entropy.png'))
# # plt.close()

# # # plot mean follower count
# # df_topics = df_topics.sort_values('Mean readability', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean readability'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean follower count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_readability.png'))
# # plt.close()

# # # plot mean link count
# # df_topics = df_topics.sort_values('Mean sentence count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean sentence count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean link count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_sentence_count.png'))
# # plt.close()

# # # plot mean information entropy
# # df_topics = df_topics.sort_values('Mean word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean info entropy',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_word_count.png'))
# # plt.close()

# # # plot mean readability
# # df_topics = df_topics.sort_values('Mean unique word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean unique word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_unique_word_count.png'))
# # plt.close()

# # # plot mean sentence count
# # df_topics = df_topics.sort_values('Mean comment count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean comment count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_comment_count.png'))
# # plt.close()

In [None]:
# test if the challenge sample comes from a normal distribution

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

_, p = shapiro(df['Challenge_answer_count'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge answer count: {result}Gaussian')

_, p = shapiro(df['Challenge_comment_count'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge comment count: {result}Gaussian')

_, p = shapiro(df['Challenge_favorite_count'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge favorite count: {result}Gaussian')

_, p = shapiro(df['Challenge_follower_count'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge follower count: {result}Gaussian')

_, p = shapiro(df['Challenge_link_count'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge link count: {result}Gaussian')

_, p = shapiro(df['Challenge_readability'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge readability: {result}Gaussian')

_, p = shapiro(df['Challenge_score'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge score: {result}Gaussian')

_, p = shapiro(df['Challenge_view_count'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge view count: {result}Gaussian')

_, p = shapiro(df['Challenge_solved_time'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge solved time: {result}Gaussian')

_, p = shapiro(df['Challenge_adjusted_solved_time'])
result = 'non-' if p < alpha else ''
print(f'p = {p}, indicating challenge adjusted solved time: {result}Gaussian')

p = 0.0, indicating challenge answer count: non-Gaussian
p = 1.0, indicating challenge comment count: Gaussian
p = 1.0, indicating challenge favorite count: Gaussian
p = 1.0, indicating challenge follower count: Gaussian
p = 0.0, indicating challenge link count: non-Gaussian
p = 0.0, indicating challenge readability: non-Gaussian
p = 1.0, indicating challenge score: Gaussian
p = 1.0, indicating challenge view count: Gaussian
p = 1.0, indicating challenge solved time: Gaussian
p = 1.0, indicating challenge adjusted solved time: Gaussian


In [None]:
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

# 12 sets of normal distributed random data, with increasing mean and standard deviation
data = (np.linspace(1, 2, 12)[:, np.newaxis] * np.random.randn(12, 200) +
            (np.arange(12) + 2 * np.random.random(12))[:, np.newaxis])

colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', 12, colortype='rgb')

fig = go.Figure()
for data_line, color in zip(data, colors):
    fig.add_trace(go.Violin(x=data_line, line_color=color))

fig.update_traces(orientation='h', side='positive', width=3, points=False)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False)
fig.show()

In [56]:
# Collect and compared Q&A forum and Git repo challenges across different topics

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]
df_challenge['Challenge_topic_higher_level'] = df_challenge['Challenge_topic'].map(
    topic_ensemble_inverse)

df_qa = df_challenge[df_challenge['Platform'].isin(
    ['Stack Overflow', 'Tool-specific'])]
df_git = df_challenge[df_challenge['Platform'].isin(['Github', 'Gitlab'])]

# Challenge topic count
fig_challenge_count = go.Figure()
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_qa), 'Challenge topic count (higher level)'),
        y=df_qa['Challenge_topic_higher_level'],
        opacity=0.5,
        name='QA',
    ))
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_git), 'Challenge topic count (higher level)'),
        y=df_git['Challenge_topic_higher_level'],
        opacity=0.5,
        name='Git',
    ))
fig_challenge_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge count.png'))

# Challenge score
fig_challenge_score = go.Figure()
fig_challenge_score.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_score'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_score.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_score'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_score.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_score.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge score.png'))

# Challenge favorite count
fig_challenge_favorite_count = go.Figure()
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_favorite_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_favorite_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge favorite count.png'))

# Challenge follower count
fig_challenge_follower_count = go.Figure()
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_follower_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_follower_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_follower_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_follower_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge follower count.png'))

# Challenge link count
fig_challenge_link_count = go.Figure()
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_link_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_link_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_link_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_link_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge link count.png'))

# Challenge readability
fig_challenge_readability = go.Figure()
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_readability'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_readability'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_readability.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_readability.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge readability.png'))

# Challenge view count
fig_challenge_view_count = go.Figure()
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_view_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_view_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_view_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_view_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge view count.png'))

# Challenge answer count
fig_challenge_answer_count = go.Figure()
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_answer_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_answer_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_answer_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_answer_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge answer count.png'))

# Challenge comment count
fig_challenge_comment_count = go.Figure()
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_qa['Challenge_topic_higher_level'],
        y=df_qa['Challenge_comment_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='QA',
        scalegroup='QA',
        name='QA',
    ))
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_git['Challenge_topic_higher_level'],
        y=df_git['Challenge_comment_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Git',
        scalegroup='Git',
        name='Git',
    ))
fig_challenge_comment_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_comment_count.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge comment count.png'))

fig_challenge_solved_time_git = go.Figure()
fig_challenge_solved_time_qa = go.Figure()
fig_challenge_adjusted_solved_time_git = go.Figure()
fig_challenge_adjusted_solved_time_qa = go.Figure()

for name_group, color in zip(df_challenge.groupby('Challenge_topic_higher_level'), colors):
    name, group = name_group
    qa = group[group['Platform'].isin(['Stack Overflow', 'Tool-specific'])]
    git = group[group['Platform'].isin(['Github', 'Gitlab'])]

    fig_challenge_solved_time_git.add_trace(go.Violin(x=git['Challenge_solved_time'], y=git['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_solved_time_qa.add_trace(go.Violin(x=qa['Challenge_solved_time'], y=qa['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_git.add_trace(go.Violin(x=git['Challenge_adjusted_solved_time'], y=git['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_qa.add_trace(go.Violin(x=qa['Challenge_adjusted_solved_time'], y=qa['Challenge_topic_higher_level'], line_color=color))

    # Challenge score
    challenge_score_qa = qa[qa['Challenge_score'].notna(
    )]['Challenge_score']
    challenge_score_git = git[git['Challenge_score'].notna(
    )]['Challenge_score']
    if len(challenge_score_qa) * len(challenge_score_git) > 0:
        _, p = mannwhitneyu(challenge_score_qa, challenge_score_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge score')

    # Challenge favorite count
    challenge_favorite_count_qa = qa[qa['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    challenge_favorite_count_git = git[git['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    if len(challenge_favorite_count_qa) * len(challenge_favorite_count_git) > 0:
        _, p = mannwhitneyu(challenge_favorite_count_qa,
                            challenge_favorite_count_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge favorite count')

    # Challenge follower count
    challenge_follower_count_qa = qa[qa['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    challenge_follower_count_git = git[git['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    if len(challenge_follower_count_qa) * len(challenge_follower_count_git) > 0:
        _, p = mannwhitneyu(challenge_follower_count_qa,
                            challenge_follower_count_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge follower count')

    # Challenge link count
    challenge_link_count_qa = qa[qa['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    challenge_link_count_git = git[git['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    if len(challenge_link_count_qa) * len(challenge_link_count_git) > 0:
        _, p = mannwhitneyu(challenge_link_count_qa, challenge_link_count_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge link count')

    # Challenge readability
    challenge_readability_qa = qa[qa['Challenge_readability'].notna(
    )]['Challenge_readability']
    challenge_readability_git = git[git['Challenge_readability'].notna(
    )]['Challenge_readability']
    if len(challenge_readability_qa) * len(challenge_readability_git) > 0:
        _, p = mannwhitneyu(challenge_readability_qa,
                            challenge_readability_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge readability')

    # Challenge view count
    challenge_view_count_qa = qa[qa['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    challenge_view_count_git = git[git['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    if len(challenge_view_count_qa) * len(challenge_view_count_git) > 0:
        _, p = mannwhitneyu(challenge_view_count_qa,
                            challenge_view_count_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge answer count')

    # Challenge answer count
    challenge_answer_count_qa = qa[qa['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    challenge_answer_count_git = git[git['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    if len(challenge_answer_count_qa) * len(challenge_answer_count_git) > 0:
        _, p = mannwhitneyu(challenge_answer_count_qa,
                            challenge_answer_count_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge answer count')

    # Challenge comment count
    challenge_comment_count_qa = qa[qa['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    challenge_comment_count_git = git[git['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    if len(challenge_comment_count_qa) * len(challenge_comment_count_git) > 0:
        _, p = mannwhitneyu(challenge_comment_count_qa,
                            challenge_comment_count_git)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic {name} in challenge comment count')

fig_challenge_solved_time_git.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_git.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - Git',
)
fig_challenge_solved_time_git.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge solved time (Git).png'))

fig_challenge_solved_time_qa.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_qa.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - QA',
)
fig_challenge_solved_time_qa.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge solved time (QA).png'))

fig_challenge_adjusted_solved_time_git.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_git.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - Git',
)
fig_challenge_adjusted_solved_time_git.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge adjusted solved time (Git).png'))

fig_challenge_adjusted_solved_time_qa.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_qa.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - QA',
)
fig_challenge_adjusted_solved_time_qa.write_image(os.path.join(
    path_challenge_git_qa, 'Challenge adjusted solved time (QA).png'))

# Challenge hgher level mean solved time
challenge_mean_solved_time_qa = df_qa[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_solved_time']
challenge_mean_solved_time_git = df_git[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_solved_time']
_, p = mannwhitneyu(challenge_mean_solved_time_qa,
                    challenge_mean_solved_time_git)
if p < alpha:
    print(f'p = {p}, indicating different distribution of Q&A fora vs Git repos in higher level mean challenge solved time')

# Challenge hgher level median solved time
challenge_median_solved_time_qa = df_qa[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_solved_time']
challenge_median_solved_time_git = df_git[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_solved_time']
_, p = mannwhitneyu(challenge_median_solved_time_qa,
                    challenge_median_solved_time_git)
if p < alpha:
    print(f'p = {p}, indicating different distribution of Q&A fora vs Git repos in higher level median challenge solved time')

# Challenge hgher level adjusted mean solved time
challenge_adjusted_mean_solved_time_qa = df_qa[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_adjusted_solved_time']
challenge_adjusted_mean_solved_time_git = df_git[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_adjusted_solved_time']
_, p = mannwhitneyu(challenge_adjusted_mean_solved_time_qa,
                    challenge_adjusted_mean_solved_time_git)
if p < alpha:
    print(f'p = {p}, indicating different distribution of Q&A fora vs Git repos in higher level adjusted mean challenge solved time')

# Challenge hgher level adjusted median solved time
challenge_adjusted_median_solved_time_qa = df_qa[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_adjusted_solved_time']
challenge_adjusted_median_solved_time_git = df_git[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_adjusted_solved_time']
_, p = mannwhitneyu(challenge_adjusted_median_solved_time_qa,
                    challenge_adjusted_median_solved_time_git)
if p < alpha:
    print(f'p = {p}, indicating different distribution of Q&A fora vs Git repos in higher level adjusted median challenge solved time')


p = 2.9525342278607865e-08, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic Code Management in challenge score
p = 3.413022979620236e-06, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic Code Management in challenge readability
p = 0.013828080380161572, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic Code Management in challenge answer count
p = 1.3297136900178895e-05, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic Data Management in challenge score
p = 6.799266444675269e-07, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic Data Management in challenge readability
p = 0.01339206263312274, indicating different distribution of Q&A fora vs Git repos challenge regarding higher level topic Deployment Management in challenge score
p = 0.0149760230

In [58]:
# Collect and compared open vs closed challenges across different topics

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]
df_challenge['Challenge_topic_higher_level'] = df_challenge['Challenge_topic'].map(
    topic_ensemble_inverse)

df_open = df_challenge[df_challenge['Challenge_closed_time'].isna()]
df_closed = df_challenge[df_challenge['Challenge_closed_time'].notna()]

# Challenge topic count
fig_challenge_count = go.Figure()
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_open), 'Challenge topic count (higher level)'),
        y=df_open['Challenge_topic_higher_level'],
        opacity=0.5,
        name='Open',
    ))
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_closed), 'Challenge topic count (higher level)'),
        y=df_closed['Challenge_topic_higher_level'],
        opacity=0.5,
        name='Closed',
    ))
fig_challenge_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge count.png'))

# Challenge score
fig_challenge_score = go.Figure()
fig_challenge_score.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_score'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_score.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_score'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_score.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_score.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge score.png'))

# Challenge favorite count
fig_challenge_favorite_count = go.Figure()
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_favorite_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_favorite_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge favorite count.png'))

# Challenge follower count
fig_challenge_follower_count = go.Figure()
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_follower_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_follower_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_follower_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_follower_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge follower count.png'))

# Challenge link count
fig_challenge_link_count = go.Figure()
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_link_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_link_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_link_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_link_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge link count.png'))

# Challenge readability
fig_challenge_readability = go.Figure()
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_readability'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_readability'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_readability.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_readability.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge readability.png'))

# Challenge view count
fig_challenge_view_count = go.Figure()
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_view_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_view_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_view_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_view_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge view count.png'))

# Challenge answer count
fig_challenge_answer_count = go.Figure()
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_answer_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_answer_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_answer_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_answer_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge answer count.png'))

# Challenge comment count
fig_challenge_comment_count = go.Figure()
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_open['Challenge_topic_higher_level'],
        y=df_open['Challenge_comment_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Open',
        scalegroup='Open',
        name='Open',
    ))
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_closed['Challenge_topic_higher_level'],
        y=df_closed['Challenge_comment_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Closed',
        scalegroup='Closed',
        name='Closed',
    ))
fig_challenge_comment_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_comment_count.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge comment count.png'))

fig_challenge_solved_time_closed = go.Figure()
fig_challenge_adjusted_solved_time_closed = go.Figure()

for name_group, color in zip(df_challenge.groupby('Challenge_topic_higher_level'), colors):
    name, group = name_group
    open = group[group['Challenge_closed_time'].isna()]
    closed = group[group['Challenge_closed_time'].notna()]

    fig_challenge_solved_time_closed.add_trace(go.Violin(x=closed['Challenge_solved_time'], y=closed['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_closed.add_trace(go.Violin(x=closed['Challenge_adjusted_solved_time'], y=closed['Challenge_topic_higher_level'], line_color=color))

    # Challenge score
    challenge_score_open = open[open['Challenge_score'].notna(
    )]['Challenge_score']
    challenge_score_closed = closed[closed['Challenge_score'].notna(
    )]['Challenge_score']
    if len(challenge_score_open) * len(challenge_score_closed) > 0:
        _, p = mannwhitneyu(challenge_score_open, challenge_score_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge score')

    # Challenge favorite count
    challenge_favorite_count_open = open[open['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    challenge_favorite_count_closed = closed[closed['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    if len(challenge_favorite_count_open) * len(challenge_favorite_count_closed) > 0:
        _, p = mannwhitneyu(challenge_favorite_count_open,
                            challenge_favorite_count_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge favorite count')

    # Challenge follower count
    challenge_follower_count_open = open[open['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    challenge_follower_count_closed = closed[closed['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    if len(challenge_follower_count_open) * len(challenge_follower_count_closed) > 0:
        _, p = mannwhitneyu(challenge_follower_count_open,
                            challenge_follower_count_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge follower count')

    # Challenge link count
    challenge_link_count_open = open[open['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    challenge_link_count_closed = closed[closed['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    if len(challenge_link_count_open) * len(challenge_link_count_closed) > 0:
        _, p = mannwhitneyu(challenge_link_count_open, challenge_link_count_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge link count')

    # Challenge readability
    challenge_readability_open = open[open['Challenge_readability'].notna(
    )]['Challenge_readability']
    challenge_readability_closed = closed[closed['Challenge_readability'].notna(
    )]['Challenge_readability']
    if len(challenge_readability_open) * len(challenge_readability_closed) > 0:
        _, p = mannwhitneyu(challenge_readability_open,
                            challenge_readability_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge readability')

    # Challenge view count
    challenge_view_count_open = open[open['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    challenge_view_count_closed = closed[closed['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    if len(challenge_view_count_open) * len(challenge_view_count_closed) > 0:
        _, p = mannwhitneyu(challenge_view_count_open,
                            challenge_view_count_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge answer count')

    # Challenge answer count
    challenge_answer_count_open = open[open['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    challenge_answer_count_closed = closed[closed['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    if len(challenge_answer_count_open) * len(challenge_answer_count_closed) > 0:
        _, p = mannwhitneyu(challenge_answer_count_open,
                            challenge_answer_count_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge answer count')

    # Challenge comment count
    challenge_comment_count_open = open[open['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    challenge_comment_count_closed = closed[closed['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    if len(challenge_comment_count_open) * len(challenge_comment_count_closed) > 0:
        _, p = mannwhitneyu(challenge_comment_count_open,
                            challenge_comment_count_closed)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of open vs closed challenge regarding higher level topic {name} in challenge comment count')

fig_challenge_solved_time_closed.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_closed.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - Closed',
)
fig_challenge_solved_time_closed.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge solved time (Closed).png'))

fig_challenge_adjusted_solved_time_closed.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_closed.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - Closed',
)
fig_challenge_adjusted_solved_time_closed.write_image(os.path.join(
    path_challenge_open_closed, 'Challenge adjusted solved time (Closed).png'))


p = 0.003619997129911227, indicating different distribution of open and closed challenge regarding higher level topic Code Management in challenge readability
p = 0.0008067213000916159, indicating different distribution of open and closed challenge regarding higher level topic Code Management in challenge answer count
p = 0.0031505139398174107, indicating different distribution of open and closed challenge regarding higher level topic Code Management in challenge comment count
p = 0.0014299391886427608, indicating different distribution of open and closed challenge regarding higher level topic Data Management in challenge score
p = 0.029040710211535332, indicating different distribution of open and closed challenge regarding higher level topic Data Management in challenge favorite count
p = 1.135883397836137e-11, indicating different distribution of open and closed challenge regarding higher level topic Data Management in challenge answer count
p = 0.0010072633160112027, indicating dif

In [61]:
# Collect and compared Stack Overflow vs tool-specific fora challenges across different topics

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]
df_challenge['Challenge_topic_higher_level'] = df_challenge['Challenge_topic'].map(
    topic_ensemble_inverse)

df_so = df_challenge[df_challenge['Platform'] == 'Stack Overflow']
df_to = df_challenge[df_challenge['Platform'] == 'Tool-specific']

# Challenge topic count
fig_challenge_count = go.Figure()
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_so), 'Challenge topic count (higher level)'),
        y=df_so['Challenge_topic_higher_level'],
        opacity=0.5,
        name='Stack Overflow',
    ))
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_to), 'Challenge topic count (higher level)'),
        y=df_to['Challenge_topic_higher_level'],
        opacity=0.5,
        name='Tool-specific',
    ))
fig_challenge_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge count.png'))

# Challenge score
fig_challenge_score = go.Figure()
fig_challenge_score.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_score'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_score.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_score'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_score.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_score.write_image(os.path.join(
    path_challenge_so_to, 'Challenge score.png'))

# Challenge favorite count
fig_challenge_favorite_count = go.Figure()
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_favorite_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_favorite_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge favorite count.png'))

# Challenge follower count
fig_challenge_follower_count = go.Figure()
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_follower_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_follower_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_follower_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_follower_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge follower count.png'))

# Challenge link count
fig_challenge_link_count = go.Figure()
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_link_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_link_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_link_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_link_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge link count.png'))

# Challenge readability
fig_challenge_readability = go.Figure()
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_readability'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_readability'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_readability.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_readability.write_image(os.path.join(
    path_challenge_so_to, 'Challenge readability.png'))

# Challenge view count
fig_challenge_view_count = go.Figure()
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_view_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_view_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_view_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_view_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge view count.png'))

# Challenge answer count
fig_challenge_answer_count = go.Figure()
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_answer_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_answer_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_answer_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_answer_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge answer count.png'))

# Challenge comment count
fig_challenge_comment_count = go.Figure()
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_so['Challenge_topic_higher_level'],
        y=df_so['Challenge_comment_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='Stack Overflow',
        scalegroup='Stack Overflow',
        name='Stack Overflow',
    ))
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_to['Challenge_topic_higher_level'],
        y=df_to['Challenge_comment_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='Tool-specific',
        scalegroup='Tool-specific',
        name='Tool-specific',
    ))
fig_challenge_comment_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_comment_count.write_image(os.path.join(
    path_challenge_so_to, 'Challenge comment count.png'))

fig_challenge_solved_time_to = go.Figure()
fig_challenge_solved_time_so = go.Figure()
fig_challenge_adjusted_solved_time_to = go.Figure()
fig_challenge_adjusted_solved_time_so = go.Figure()

for name_group, color in zip(df_challenge.groupby('Challenge_topic_higher_level'), colors):
    name, group = name_group
    so = group[group['Platform'] == 'Stack Overflow']
    to = group[group['Platform'] == 'Tool-specific']

    fig_challenge_solved_time_to.add_trace(go.Violin(x=to['Challenge_solved_time'], y=to['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_solved_time_so.add_trace(go.Violin(x=so['Challenge_solved_time'], y=so['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_to.add_trace(go.Violin(x=to['Challenge_adjusted_solved_time'], y=to['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_so.add_trace(go.Violin(x=so['Challenge_adjusted_solved_time'], y=so['Challenge_topic_higher_level'], line_color=color))

    # Challenge score
    challenge_score_so = so[so['Challenge_score'].notna(
    )]['Challenge_score']
    challenge_score_to = to[to['Challenge_score'].notna(
    )]['Challenge_score']
    if len(challenge_score_so) * len(challenge_score_to) > 0:
        _, p = mannwhitneyu(challenge_score_so, challenge_score_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge score')

    # Challenge favorite count
    challenge_favorite_count_so = so[so['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    challenge_favorite_count_to = to[to['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    if len(challenge_favorite_count_so) * len(challenge_favorite_count_to) > 0:
        _, p = mannwhitneyu(challenge_favorite_count_so,
                            challenge_favorite_count_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge favorite count')

    # Challenge follower count
    challenge_follower_count_so = so[so['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    challenge_follower_count_to = to[to['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    if len(challenge_follower_count_so) * len(challenge_follower_count_to) > 0:
        _, p = mannwhitneyu(challenge_follower_count_so,
                            challenge_follower_count_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge follower count')

    # Challenge link count
    challenge_link_count_so = so[so['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    challenge_link_count_to = to[to['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    if len(challenge_link_count_so) * len(challenge_link_count_to) > 0:
        _, p = mannwhitneyu(challenge_link_count_so, challenge_link_count_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge link count')

    # Challenge readability
    challenge_readability_so = so[so['Challenge_readability'].notna(
    )]['Challenge_readability']
    challenge_readability_to = to[to['Challenge_readability'].notna(
    )]['Challenge_readability']
    if len(challenge_readability_so) * len(challenge_readability_to) > 0:
        _, p = mannwhitneyu(challenge_readability_so,
                            challenge_readability_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge readability')

    # Challenge view count
    challenge_view_count_so = so[so['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    challenge_view_count_to = to[to['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    if len(challenge_view_count_so) * len(challenge_view_count_to) > 0:
        _, p = mannwhitneyu(challenge_view_count_so,
                            challenge_view_count_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge answer count')

    # Challenge answer count
    challenge_answer_count_so = so[so['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    challenge_answer_count_to = to[to['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    if len(challenge_answer_count_so) * len(challenge_answer_count_to) > 0:
        _, p = mannwhitneyu(challenge_answer_count_so,
                            challenge_answer_count_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge answer count')

    # Challenge comment count
    challenge_comment_count_so = so[so['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    challenge_comment_count_to = to[to['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    if len(challenge_comment_count_so) * len(challenge_comment_count_to) > 0:
        _, p = mannwhitneyu(challenge_comment_count_so,
                            challenge_comment_count_to)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic {name} in challenge comment count')

fig_challenge_solved_time_to.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_to.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - Tool-specific',
)
fig_challenge_solved_time_to.write_image(os.path.join(
    path_challenge_so_to, 'Challenge solved time (Tool-specific).png'))

fig_challenge_solved_time_so.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_so.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - Stack Overflow',
)
fig_challenge_solved_time_so.write_image(os.path.join(
    path_challenge_so_to, 'Challenge solved time (Stack Overflow).png'))

fig_challenge_adjusted_solved_time_to.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_to.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - Tool-specific',
)
fig_challenge_adjusted_solved_time_to.write_image(os.path.join(
    path_challenge_so_to, 'Challenge adjusted solved time (Tool-specific).png'))

fig_challenge_adjusted_solved_time_so.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_so.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - Stack Overflow',
)
fig_challenge_adjusted_solved_time_so.write_image(os.path.join(
    path_challenge_so_to, 'Challenge adjusted solved time (Stack Overflow).png'))


p = 0.0004776617304235873, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic Code Management in challenge score
p = 0.026554652338517958, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic Code Management in challenge link count
p = 1.476590063394374e-17, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic Code Management in challenge answer count
p = 1.3870752530871918e-16, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic Code Management in challenge comment count
p = 8.44174508118179e-31, indicating different distribution of Stack Overflow vs tool-specific fora challenge regarding higher level topic Data Management in challenge score
p = 1.423483460088152e-12, indicating different distribution of Stack Overflow vs tool-specific fora chall

In [62]:
# Collect and compared SageMaker and AzureML challenges across different topics

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]
df_challenge['Challenge_topic_higher_level'] = df_challenge['Challenge_topic'].map(
    topic_ensemble_inverse)

df_sagemaker = df_challenge[df_challenge['Tool'] == 'Amazon SageMaker']
df_azureml = df_challenge[df_challenge['Tool'] == 'Azure Machine Learning']

# Challenge topic count
fig_challenge_count = go.Figure()
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_azureml), 'Challenge topic count (higher level)'),
        y=df_azureml['Challenge_topic_higher_level'],
        opacity=0.5,
        name='SageMaker',
    ))
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_sagemaker), 'Challenge topic count (higher level)'),
        y=df_sagemaker['Challenge_topic_higher_level'],
        opacity=0.5,
        name='AzureML',
    ))
fig_challenge_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge count.png'))

# Challenge score
fig_challenge_score = go.Figure()
fig_challenge_score.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_score'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_score.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_score'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_score.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_score.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge score.png'))

# Challenge favorite count
fig_challenge_favorite_count = go.Figure()
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_favorite_count.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_favorite_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_favorite_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_favorite_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge favorite count.png'))

# Challenge follower count
fig_challenge_follower_count = go.Figure()
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_follower_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_follower_count.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_follower_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_follower_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_follower_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge follower count.png'))

# Challenge link count
fig_challenge_link_count = go.Figure()
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_link_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_link_count.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_link_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_link_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_link_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge link count.png'))

# Challenge readability
fig_challenge_readability = go.Figure()
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_readability'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_readability.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_readability'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_readability.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_readability.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge readability.png'))

# Challenge view count
fig_challenge_view_count = go.Figure()
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_view_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_view_count.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_view_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_view_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_view_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge view count.png'))

# Challenge answer count
fig_challenge_answer_count = go.Figure()
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_answer_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_answer_count.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_answer_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_answer_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_answer_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge answer count.png'))

# Challenge comment count
fig_challenge_comment_count = go.Figure()
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_azureml['Challenge_topic_higher_level'],
        y=df_azureml['Challenge_comment_count'],
        meanline_visible=True,
        line_color='blue',
        side='positive',
        opacity=0.5,
        legendgroup='SageMaker',
        scalegroup='SageMaker',
        name='SageMaker',
    ))
fig_challenge_comment_count.add_trace(
    go.Violin(
        x=df_sagemaker['Challenge_topic_higher_level'],
        y=df_sagemaker['Challenge_comment_count'],
        meanline_visible=True,
        line_color='orange',
        side='negative',
        opacity=0.5,
        legendgroup='AzureML',
        scalegroup='AzureML',
        name='AzureML',
    ))
fig_challenge_comment_count.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
)
fig_challenge_comment_count.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge comment count.png'))

fig_challenge_solved_time_sagemaker = go.Figure()
fig_challenge_solved_time_azureml = go.Figure()
fig_challenge_adjusted_solved_time_sagemaker = go.Figure()
fig_challenge_adjusted_solved_time_azureml = go.Figure()

for name_group, color in zip(df_challenge.groupby('Challenge_topic_higher_level'), colors):
    name, group = name_group
    sagemaker = group[group['Tool'] == 'Amazon SageMaker']
    azureml = group[group['Tool'] == 'Azure Machine Learning']

    fig_challenge_solved_time_sagemaker.add_trace(go.Violin(x=sagemaker['Challenge_solved_time'], y=sagemaker['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_solved_time_azureml.add_trace(go.Violin(x=azureml['Challenge_solved_time'], y=azureml['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_sagemaker.add_trace(go.Violin(x=sagemaker['Challenge_adjusted_solved_time'], y=sagemaker['Challenge_topic_higher_level'], line_color=color))
    fig_challenge_adjusted_solved_time_azureml.add_trace(go.Violin(x=azureml['Challenge_adjusted_solved_time'], y=azureml['Challenge_topic_higher_level'], line_color=color))

    # Challenge score
    challenge_score_azureml = azureml[azureml['Challenge_score'].notna(
    )]['Challenge_score']
    challenge_score_sagemaker = sagemaker[sagemaker['Challenge_score'].notna(
    )]['Challenge_score']
    if len(challenge_score_azureml) * len(challenge_score_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_score_azureml, challenge_score_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge score')

    # Challenge favorite count
    challenge_favorite_count_azureml = azureml[azureml['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    challenge_favorite_count_sagemaker = sagemaker[sagemaker['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    if len(challenge_favorite_count_azureml) * len(challenge_favorite_count_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_favorite_count_azureml,
                            challenge_favorite_count_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge favorite count')

    # Challenge follower count
    challenge_follower_count_azureml = azureml[azureml['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    challenge_follower_count_sagemaker = sagemaker[sagemaker['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    if len(challenge_follower_count_azureml) * len(challenge_follower_count_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_follower_count_azureml,
                            challenge_follower_count_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge follower count')

    # Challenge link count
    challenge_link_count_azureml = azureml[azureml['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    challenge_link_count_sagemaker = sagemaker[sagemaker['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    if len(challenge_link_count_azureml) * len(challenge_link_count_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_link_count_azureml, challenge_link_count_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge link count')

    # Challenge readability
    challenge_readability_azureml = azureml[azureml['Challenge_readability'].notna(
    )]['Challenge_readability']
    challenge_readability_sagemaker = sagemaker[sagemaker['Challenge_readability'].notna(
    )]['Challenge_readability']
    if len(challenge_readability_azureml) * len(challenge_readability_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_readability_azureml,
                            challenge_readability_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge readability')

    # Challenge view count
    challenge_view_count_azureml = azureml[azureml['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    challenge_view_count_sagemaker = sagemaker[sagemaker['Challenge_view_count'].notna(
    )]['Challenge_view_count']
    if len(challenge_view_count_azureml) * len(challenge_view_count_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_view_count_azureml,
                            challenge_view_count_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge answer count')

    # Challenge answer count
    challenge_answer_count_azureml = azureml[azureml['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    challenge_answer_count_sagemaker = sagemaker[sagemaker['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    if len(challenge_answer_count_azureml) * len(challenge_answer_count_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_answer_count_azureml,
                            challenge_answer_count_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge answer count')

    # Challenge comment count
    challenge_comment_count_azureml = azureml[azureml['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    challenge_comment_count_sagemaker = sagemaker[sagemaker['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    if len(challenge_comment_count_azureml) * len(challenge_comment_count_sagemaker) > 0:
        _, p = mannwhitneyu(challenge_comment_count_azureml,
                            challenge_comment_count_sagemaker)
        if p < alpha:
            print(
                f'p = {p}, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic {name} in challenge comment count')

fig_challenge_solved_time_sagemaker.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_sagemaker.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - AzureML',
)
fig_challenge_solved_time_sagemaker.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge solved time (AzureML).png'))

fig_challenge_solved_time_azureml.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_solved_time_azureml.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge solved time (hours) - SageMaker',
)
fig_challenge_solved_time_azureml.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge solved time (SageMaker).png'))

fig_challenge_adjusted_solved_time_sagemaker.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_sagemaker.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - AzureML',
)
fig_challenge_adjusted_solved_time_sagemaker.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge adjusted solved time (AzureML).png'))

fig_challenge_adjusted_solved_time_azureml.update_traces(orientation='h', meanline_visible=True, side='positive', width=3, points=False)
fig_challenge_adjusted_solved_time_azureml.update_layout(
    height=1000,
    width=2000,
    font=dict(size=20),
    margin=dict(l=0, r=0, t=0, b=0),
    violingap=0,
    violinmode='overlay',
    xaxis_showgrid=False, 
    xaxis_zeroline=False, 
    showlegend=False, 
    xaxis_title='Challenge adjusted solved time (hours) - SageMaker',
)
fig_challenge_adjusted_solved_time_azureml.write_image(os.path.join(
    path_challenge_azureml_sagemaker, 'Challenge adjusted solved time (SageMaker).png'))

# Challenge hgher level mean solved time
challenge_mean_solved_time_azureml = df_azureml[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_solved_time']
challenge_mean_solved_time_sagemaker = df_sagemaker[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_solved_time']
_, p = mannwhitneyu(challenge_mean_solved_time_azureml,
                    challenge_mean_solved_time_sagemaker)
if p < alpha:
    print(f'p = {p}, indicating different distribution of SageMaker vs AzureML in higher level mean challenge solved time')

# Challenge hgher level median solved time
challenge_median_solved_time_azureml = df_azureml[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_solved_time']
challenge_median_solved_time_sagemaker = df_sagemaker[['Challenge_topic_higher_level', 'Challenge_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_solved_time']
_, p = mannwhitneyu(challenge_median_solved_time_azureml,
                    challenge_median_solved_time_sagemaker)
if p < alpha:
    print(f'p = {p}, indicating different distribution of SageMaker vs AzureML in higher level median challenge solved time')

# Challenge hgher level adjusted mean solved time
challenge_adjusted_mean_solved_time_azureml = df_azureml[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_adjusted_solved_time']
challenge_adjusted_mean_solved_time_sagemaker = df_sagemaker[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').mean()['Challenge_adjusted_solved_time']
_, p = mannwhitneyu(challenge_adjusted_mean_solved_time_azureml,
                    challenge_adjusted_mean_solved_time_sagemaker)
if p < alpha:
    print(f'p = {p}, indicating different distribution of SageMaker vs AzureML in higher level adjusted mean challenge solved time')

# Challenge hgher level adjusted median solved time
challenge_adjusted_median_solved_time_azureml = df_azureml[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_adjusted_solved_time']
challenge_adjusted_median_solved_time_sagemaker = df_sagemaker[['Challenge_topic_higher_level', 'Challenge_adjusted_solved_time']].groupby(
    'Challenge_topic_higher_level').median()['Challenge_adjusted_solved_time']
_, p = mannwhitneyu(challenge_adjusted_median_solved_time_azureml,
                    challenge_adjusted_median_solved_time_sagemaker)
if p < alpha:
    print(f'p = {p}, indicating different distribution of SageMaker vs AzureML in higher level adjusted median challenge solved time')


p = 0.0422680677963813, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic Code Management in challenge comment count
p = 0.04173684390482671, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic Data Management in challenge link count
p = 7.298861022444564e-07, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic Data Management in challenge comment count
p = 0.004391090520645429, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic Deployment Management in challenge score
p = 0.029168849670553245, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic Deployment Management in challenge link count
p = 4.1922522268918286e-07, indicating different distribution of SageMaker vs AzureML challenge regarding higher level topic Deployment Management in challenge readability
p 

In [None]:
df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]

df_topics = []

for name, group in df_challenge.groupby('Challenge_topic'):
    Challenge_count = group['Challenge_topic'].count()
    Challenge_solved_ratio = group['Challenge_closed_time'].notna(
    ).sum() / Challenge_count
    Challenge_mean_score = group['Challenge_score'].mean()
    Challenge_mean_favorite_count = group['Challenge_follower_count'].mean()
    Challenge_mean_follower_count = group['Challenge_follower_count'].mean()
    Challenge_mean_link_count = group['Challenge_link_count'].mean()
    Challenge_mean_information_entropy = group['Challenge_information_entropy'].mean(
    )
    Challenge_mean_readability = group['Challenge_readability'].mean()
    Challenge_mean_sentence_count = group['Challenge_sentence_count'].mean()
    Challenge_mean_word_count = group['Challenge_word_count'].mean()
    Challenge_mean_unique_word_count = group['Challenge_unique_word_count'].mean(
    )
    Challenge_mean_view_count = group['Challenge_view_count'].mean()
    Challenge_mean_answer_count = group['Challenge_answer_count'].mean()
    Challenge_mean_comment_count = group['Challenge_comment_count'].mean()

    Solution_mean_score = group['Solution_score'].mean()
    Solution_mean_link_count = group['Solution_link_count'].mean()
    Solution_mean_information_entropy = group['Solution_information_entropy'].mean(
    )
    Solution_mean_readability = group['Solution_readability'].mean()
    Solution_mean_sentence_count = group['Solution_sentence_count'].mean()
    Solution_mean_word_count = group['Solution_word_count'].mean()
    Solution_mean_unique_word_count = group['Solution_unique_word_count'].mean(
    )
    Solution_mean_comment_count = group['Solution_comment_count'].mean()

    Challenge_mean_solved_time = group['Challenge_solved_time'].mean()
    Challenge_median_solved_time = group['Challenge_solved_time'].median()
    Challenge_adjusted_mean_solved_time = group['Challenge_adjusted_solved_time'].mean(
    )
    Challenge_adjusted_meadian_solved_time = group['Challenge_adjusted_solved_time'].median(
    )

    topic_info = {
        'Challenge topic': name,
        'Challenge count': Challenge_count,
        'Challenge solved ratio': Challenge_solved_ratio,
        'Challenge mean score': Challenge_mean_score,
        'Challenge mean favorite count': Challenge_mean_favorite_count,
        'Challenge mean follower count': Challenge_mean_follower_count,
        'Challenge mean link count': Challenge_mean_link_count,
        'Challenge mean information entropy': Challenge_mean_information_entropy,
        'Challenge mean readability': Challenge_mean_readability,
        'Challenge mean sentence count': Challenge_mean_sentence_count,
        'Challenge mean word count': Challenge_mean_word_count,
        'Challenge mean unique word count': Challenge_mean_unique_word_count,
        'Challenge mean view count': Challenge_mean_view_count,
        'Challenge mean answer count': Challenge_mean_answer_count,
        'Challenge mean comment count': Challenge_mean_comment_count,

        'Solution mean score': Solution_mean_score,
        'Solution mean link count': Solution_mean_link_count,
        'Solution mean information entropy': Solution_mean_information_entropy,
        'Solution mean readability': Solution_mean_readability,
        'Solution mean sentence count': Solution_mean_sentence_count,
        'Solution mean word count': Solution_mean_word_count,
        'Solution mean unique word count': Solution_mean_unique_word_count,
        'Solution mean comment count': Solution_mean_comment_count,

        'Challenge mean solved time': Challenge_mean_solved_time,
        'Challenge median solved time': Challenge_median_solved_time,
        'Challenge adjusted mean solved time': Challenge_adjusted_mean_solved_time,
        'Challenge adjusted median solved time': Challenge_adjusted_meadian_solved_time,
    }
    df_topics.append(topic_info)

df_topics = pd.DataFrame(df_topics)
df_topics.to_json(os.path.join(path_challenge, 'general.json'),
                  indent=4, orient='records')

_, p = pearsonr(df_topics['Challenge mean solved time'],
                df_topics['Challenge median solved time'])
if p < alpha:
    print(f'p = {p}, indicating there is a linear dependence between challenge median solved time vs mean solved time')

_, p = pearsonr(df_topics['Challenge adjusted mean solved time'],
                df_topics['Challenge adjusted median solved time'])
if p < alpha:
    print(f'p = {p}, indicating there is a linear dependence between challenge adjusted median solved time vs mean solved time')

# Plot median solved time against mean solved time for each topic using count ratio as size
fig = px.scatter(df_topics, y="Challenge median solved time", x="Challenge mean solved time",
                 color="Challenge topic", hover_name="Challenge topic", size="Challenge count", trendline="ols")
# fig.update_xaxes(title_text="Challenge mean solved time (log scale)")
fig.update_layout(
    width=1000,
    height=500,
    margin=dict(l=0, r=0, t=0, b=0))
fig.show()


p = 0.0005924311311671464, indicating there is a linear dependence between challenge median solved time vs mean solved time
p = 0.03621067065285101, indicating there is a linear dependence between challenge adjusted median solved time vs mean solved time


In [None]:
df_challenge = pd.read_json(os.path.join(
    path_challenge_open_closed, 'general.json'))

# Plot median solved time against mean solved time for each topic using count ratio as size
fig = px.scatter(df_challenge, y="Median solved time", x="Mean solved time", color="Topic",
                 hover_name="Topic", size="Count ratio", trendline="ols", log_x=True, trendline_options=dict(log_x=True))
fig.update_xaxes(title_text="Mean solved time (log scale)")
fig.update_layout(
    width=1000,
    height=500,
    margin=dict(l=0, r=0, t=0, b=0),
    coloraxis_colorbar=dict(
        title="Challenge topic",
    ))
fig.show()

# Plot mean score against mean solved time for each topic using mean view count as size
fig = px.scatter(df_challenge, y="Mean score", x="Median solved time",
                 color="Topic", hover_name="Topic", size="Mean view count")
fig.update_layout(
    width=1000,
    height=500,
    margin=dict(l=0, r=0, t=0, b=0),
    coloraxis_colorbar=dict(
        title="Challenge topic",
    ))
fig.show()

# Plot mean favorite count against median solved time for each topic using mean answer count as size
fig = px.scatter(df_challenge, y="Mean favorite count", x="Median solved time",
                 color="Topic", hover_name="Topic", size="Mean answer count")
fig.update_layout(
    width=1000,
    height=500,
    margin=dict(l=0, r=0, t=0, b=0),
    coloraxis_colorbar=dict(
        title="Challenge topic",
    ))
fig.show()

In [None]:
import scipy.interpolate
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess


def smooth(x, y, xgrid, lowess_kw=None):
    samples = np.random.choice(len(x), 50, replace=True)
    y_s = y[samples]
    x_s = x[samples]
    y_sm = sm_lowess(y_s, x_s, **lowess_kw)
    # regularly sample it onto the grid
    y_grid = scipy.interpolate.interp1d(
        x_s, y_sm, fill_value='extrapolate')(xgrid)
    return y_grid


def lowess_with_confidence_bounds(x, y, conf_interval=0.95, lowess_kw=None):
    """
    Perform Lowess regression and determine a confidence interval by bootstrap resampling
    """
    xgrid = np.linspace(x.min(), x.max())

    K = 100
    smooths = np.stack([smooth(x, y, xgrid, lowess_kw) for _ in range(K)]).T

    mean = np.nanmean(smooths, axis=1)
    stderr = scipy.stats.sem(smooths, axis=1)

    clower = np.nanpercentile(smooths, (1-conf_interval)*50, axis=1)
    cupper = np.nanpercentile(smooths, (1+conf_interval)*50, axis=1)

    return xgrid, mean, stderr, clower, cupper


In [None]:
df_all = pd.read_json(os.path.join(path_general, 'assigned.json'))
df_challenge = df_all[df_all['Challenge_topic'] > -1]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
min(df_challenge['Challenge_creation_time']), max(
    df_challenge['Challenge_creation_time'])

(Timestamp('2014-08-08 14:04:22.160000'),
 Timestamp('2023-02-22 01:36:03.995000'))

In [None]:
# Explore challenge topics evolution

df_challenge = pd.read_json(os.path.join(path_general, 'assigned.json'))
df_challenge = df_challenge[df_challenge['Challenge_topic'] > -1]
df_challenge = df_challenge[(df_challenge['Challenge_creation_time'] > '2014-09-14')
                            & (df_challenge['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_challenge.groupby('Challenge_topic'):
    group = group.groupby(pd.Grouper(key='Challenge_creation_time', freq='2W')).agg(
        Count=('Challenge_topic', 'count')).reset_index()
    x = pd.to_datetime(group['Challenge_creation_time']).values
    x = np.array([i.astype('datetime64[D]').astype(int) for i in x])
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    x = pd.to_datetime(group['Challenge_creation_time']).values
    fig, ax = plt.subplots(figsize=(20, 10))
    plt.plot(x, y, 'k.', label='Observations')
    plt.plot(xgrid, mean, color='tomato', label='LOWESS')
    plt.fill_between(xgrid, clower, cupper, alpha=0.3,
                     label='LOWESS uncertainty')
    plt.legend(loc='best')
    fig.savefig(os.path.join(path_challenge_evolution,
                f'Topic_{name}'), bbox_inches="tight")
    plt.close()


In [None]:
df_all = pd.read_json(os.path.join(path_general, 'assigned.json'))
df_solution = df_all[df_all['Solution_topic'] > -1]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
min(df_solution['Challenge_creation_time']), max(
    df_solution['Challenge_creation_time'])


(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [None]:
# Explore solution topics evolution

df_solution = pd.read_json(os.path.join(path_general, 'assigned.json'))
df_solution = df_solution[df_solution['Solution_topic'] > -1]
df_solution = df_solution[(df_solution['Challenge_creation_time'] > '2014-09-14')
                          & (df_solution['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_solution.groupby('Solution_topic'):
    group = group.groupby(pd.Grouper(key='Challenge_closed_time', freq='W')).agg(
        Count=('Solution_topic', 'count')).reset_index()
    x = pd.to_datetime(group['Challenge_closed_time']).values
    x = np.array([i.astype('datetime64[D]').astype(int) for i in x])
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    x = pd.to_datetime(group['Challenge_closed_time']).values
    fig, ax = plt.subplots(figsize=(20, 10))
    plt.plot(x, y, 'k.', label='Observations')
    plt.plot(xgrid, mean, color='tomato', label='LOWESS')
    plt.fill_between(xgrid, clower, cupper, alpha=0.3,
                     label='LOWESS uncertainty')
    plt.legend(loc='best')
    fig.savefig(os.path.join(path_solution_evolution,
                f'Topic_{name}'), bbox_inches="tight")
    plt.close()
