In [5]:
# Imports and session-wide configuration for the whole notebook.
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

from scipy.stats import mannwhitneyu
from matplotlib import pyplot as plt

import pickle
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

import pandas as pd
# Lift pandas' display limits on rows, columns and column width.
pd.set_option("display.max_rows", None, "display.max_columns",
              None, 'display.max_colwidth', None)

import os
# NOTE(review): presumably consumed by a tokenizers library used elsewhere;
# nothing in the visible cells reads this variable - confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import openai
# The API key is read from the environment (None when the variable is unset).
openai.api_key = os.getenv('OPENAI_API_KEY')

In [18]:
def _ensure_dir(*parts):
    """Join ``parts`` into one path, create that directory if missing, return the path."""
    directory = os.path.join(*parts)
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of `if not os.path.exists(...)`.
    os.makedirs(directory, exist_ok=True)
    return directory


# Both top-level folders live next to the directory the notebook is run from.
_project_root = os.path.dirname(os.getcwd())

# Only the path is built for the dataset folder; it is not created here.
path_dataset = os.path.join(_project_root, 'Dataset')

# Result tree: Result/{General, Challenge, Solution}.
path_result = _ensure_dir(_project_root, 'Result')
path_general = _ensure_dir(path_result, 'General')
path_challenge = _ensure_dir(path_result, 'Challenge')
path_solution = _ensure_dir(path_result, 'Solution')

# Sub-folders of Result/Challenge.
path_challenge_git_qa = _ensure_dir(path_challenge, 'Git vs QA')
path_challenge_open_closed = _ensure_dir(path_challenge, 'Open vs Closed')
path_challenge_so_to = _ensure_dir(path_challenge, 'Stack Overflow vs Tool-specific')

# path_solution_open_closed = _ensure_dir(path_solution, 'Open Closed')

# "Evolution" sub-folders for challenges and solutions.
path_challenge_evolution = _ensure_dir(path_challenge, 'Evolution')
path_solution_evolution = _ensure_dir(path_solution, 'Evolution')

In [328]:
# Create challenge topic distribution tree map

df_topics = pd.read_json(os.path.join(path_general, 'original.json'))

# Keep rows whose challenge topic id is greater than -1 and derive the
# participation count (answers + comments). `.loc` + `.assign` builds a new
# frame instead of writing a column onto a filtered slice, which is what
# triggers pandas' chained-assignment warning.
df_topics = df_topics.loc[df_topics['Challenge_topic'] > -1].assign(
    Challenge_participation_count=lambda frame: (
        frame['Challenge_answer_count'] + frame['Challenge_comment_count']
    )
)

# Tiles are nested Tool -> Platform, sized by participation and coloured by
# the challenge topic id.
fig = px.treemap(
    df_topics,
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Challenge_topic',
    width=2000,
    height=1000,
)
fig.write_image(os.path.join(path_challenge, 'Challenge_topic_distribution.png'))


In [329]:
# Create solution topic distribution tree map

df_topics = pd.read_json(os.path.join(path_general, 'original.json'))

# Keep rows whose solution topic id is greater than -1 and derive the
# participation count (answers + comments). `.loc` + `.assign` builds a new
# frame instead of writing a column onto a filtered slice, which is what
# triggers pandas' chained-assignment warning.
df_topics = df_topics.loc[df_topics['Solution_topic'] > -1].assign(
    Challenge_participation_count=lambda frame: (
        frame['Challenge_answer_count'] + frame['Challenge_comment_count']
    )
)

# Tiles are nested Tool -> Platform, sized by participation and coloured by
# the solution topic id.
fig = px.treemap(
    df_topics,
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Solution_topic',
    width=2000,
    height=1000,
)
fig.write_image(os.path.join(path_solution, 'Solution_topic_distribution.png'))


In [37]:
# Ask the chat model for a short name and comment for every challenge topic.
prompt_topic = (
    'You will be given a list of keywords for each topic, I want you to provide '
    'a description of each topic in a two-word phrase but guarantee that each '
    'description is exclusive to the other. Also, for each description, you need '
    'to attach short comments on what these keywords are talking about in general.\n'
    '###\n'
)

# NOTE(review): pickle.load can execute code embedded in the file - only load
# pickles that were produced by this project.
with open(os.path.join(path_challenge, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

# One "Topic <i>: kw1, kw2, ..." line per topic; the keyword is the first
# element of each entry in a topic.
topic_term_list = [
    f"Topic {index}: {', '.join(term[0] for term in topic)}"
    for index, topic in enumerate(topic_terms)
]

# The keyword lines are appended to the instructions and closed with '###'.
request_content = prompt_topic + '\n'.join(topic_term_list) + '\n###\n'

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request_content}],
    temperature=0,
    max_tokens=1500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=100,
    stream=False,
)

topic_challenge = completion.choices[0].message.content
print(topic_challenge)


Topic 0: Environment Setup - Setting up software environments for development and execution
Topic 1: Pipeline Automation - Automating the execution of data processing pipelines
Topic 2: Docker - Containerization platform for building, shipping, and running applications
Topic 3: Hyperparameter Tuning - Optimizing model performance by tuning hyperparameters
Topic 4: Git Version Control - Tracking changes to code and collaborating with others
Topic 5: GPU Acceleration - Using graphics processing units to speed up machine learning tasks
Topic 6: Artifact Management - Managing and storing artifacts such as models, datasets, and code
Topic 7: Model Deployment - Deploying machine learning models for use in production environments
Topic 8: Data Labeling - Assigning labels to data for use in supervised learning tasks
Topic 9: Data Visualization - Creating visual representations of data for analysis and communication
Topic 10: Logging Metrics - Recording and tracking performance metrics during m

In [260]:
# Hand-curated names for the challenge topics:
#     topic id -> (label, description)
# The label (first tuple element) is what replaces the numeric topic id when
# topics are assigned to challenges.
topic_mapping_challenge = {
    0: ('Package Management', 'Installing and configuring software packages and dependencies'),
    1: ('Pipeline Configuration', 'Automating the execution of data processing pipelines'),
    2: ('Docker Configuration', 'Containerization platform for building, shipping, and running applications'),
    3: ('Hyperparameter Tuning', 'Optimizing model performance by tuning hyperparameters'),
    4: ('Code Versioning', 'Managing and tracking changes in a repository using Git'),
    5: ('GPU Configuration', 'Using graphics processing units to speed up machine learning tasks'),
    6: ('Artifact Management', 'Uploading, downloading, and storing artifacts'),
    7: ('Endpoint Deployment', 'Deploying machine learning models for use in production environments'),
    8: ('Data Labeling', 'Assigning labels to data for use in supervised learning tasks'),
    9: ('Data Visualization', 'Creating visual representations of data for analysis and communication'),
    10: ('Metrics Logging', 'Recording and tracking performance metrics during model training and evaluation'),
    11: ('Account Management', 'Managing user accounts and access to resources'),
    12: ('Apache Spark Configuration', 'Installing and configuring Apache Spark distributed computing system for big data processing'),
    13: ('TensorFlow Configuration', 'Installing and configuring the TensorFlow machine learning framework'),
    14: ('Text Processing', 'Analyzing and manipulating text data'),
    15: ('Pandas Dataframe', 'Manipulating and analyzing tabular data using the Pandas library'),
    16: ('Model Exporting', 'Saving and exporting trained machine learning models'),
    17: ('Role-based Access Control', 'Controlling access to resources based on user roles and permissions'),
    18: ('Batch Processing', 'Processing large amounts of data in batches'),
    19: ('Model Registry', 'Registering, managing, and versioning models'),
    20: ('Database Connectivity', 'Connecting to and interacting with databases'),
    21: ('Resource Quota Control', 'Setting and managing limits on resource usage'),
    22: ('API Invocation', 'Calling APIs to perform tasks or retrieve data'),
    23: ('Forecasting', 'Using automated machine learning to generate forecasts'),
    24: ('Columnar Manipulation', 'Working with and manipulating columns in datasets'),
    25: ('Object Detection', 'Using machine learning to analyze and interpret visual data'),
    26: ('Web Service', 'Deploying machine learning models as web services'),
    27: ('Kubernetes Orchestration', 'Open-source container orchestration platform for managing containerized applications'),
    28: ('Tree-based Model', 'Building, training, and cutting tree-like structure to make predictions'),
    29: ('CSV Manipulation', 'Reading, writing, and manipulating CSV files'),
    30: ('TensorBoard Logging', 'Visualizing and tracking model training and evaluation using TensorBoard'),
    31: ('Feature Roadmap', 'Planning and implementing new features for a platform or product'),
    32: ('Dataset Versioning', 'Managing and versioning datasets'),
    33: ('CloudWatch Monitoring', 'Monitoring and logging AWS resources and applications'),
    34: ('Speech-to-Text', 'Converting audio speech to text'),
    35: ('YAML Configuration', 'Configuring and defining stages in a pipeline using YAML files'),
    36: ('Data Storage', 'Storing and accessing data in cloud-based storage solutions'),
    # Label typo fixed: was 'VPC Neworking'.
    37: ('VPC Networking', 'Connecting to AWS services privately through a VPC'),
    38: ('Model Evaluation', 'Evaluating and improving the accuracy of machine learning models'),
    39: ('Model Service', 'Preparing and querying input data for machine learning models'),
    40: ('Bucket Access Control', 'Managing access to cloud-based storage buckets'),
    41: ('Run Management', 'Managing and monitoring the execution of jobs and tasks'),
    42: ('Model Inference', 'Using trained machine learning models to make predictions'),
    43: ('Jupyter Notebook', 'Creating and running interactive notebooks for data analysis and visualization'),
}

In [151]:
# Ask the chat model for a short name and comment for every solution topic.
prompt_topic = (
    'You will be given a list of keywords for each topic, I want you to provide '
    'a description of each topic in a two-word phrase but guarantee that each '
    'description is exclusive to the other. Also, for each description, you need '
    'to attach short comments on what these keywords are talking about in general.\n'
    '###\n'
)

# NOTE(review): pickle.load can execute code embedded in the file - only load
# pickles that were produced by this project.
with open(os.path.join(path_solution, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

# One "Topic <i>: kw1, kw2, ..." line per topic; the keyword is the first
# element of each entry in a topic.
topic_term_list = [
    f"Topic {index}: {', '.join(term[0] for term in topic)}"
    for index, topic in enumerate(topic_terms)
]

# The keyword lines are appended to the instructions and closed with '###'.
request_content = prompt_topic + '\n'.join(topic_term_list) + '\n###\n'

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request_content}],
    temperature=0,
    max_tokens=1500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=100,
    stream=False,
)

topic_solution = completion.choices[0].message.content
print(topic_solution)


Topic 0: Git Tracking - Managing and tracking changes in a repository using Git.
Topic 1: Access Control - Managing user roles and permissions for accessing resources.
Topic 2: Environment Setup - Installing and configuring software packages and dependencies.
Topic 3: Logging Metrics - Capturing and analyzing data related to system performance and behavior.
Topic 4: Dataset Management - Organizing and manipulating data for use in machine learning models.
Topic 5: Docker Deployment - Packaging and deploying applications in containers using Docker.
Topic 6: Parameter Configuration - Setting and adjusting parameters for software programs.
Topic 7: YAML Configuration - Configuring and defining stages in a pipeline using YAML files.
Topic 8: Endpoint Deployment - Deploying and managing endpoints for accessing APIs and web services.
Topic 9: Jupyter Notebooks - Creating and running interactive notebooks for data analysis and visualization.
Topic 10: Pandas Dataframes - Manipulating and analy

In [259]:
# Hand-curated names for the solution topics:
#     topic id -> (label, description)
# The label (first tuple element) is what replaces the numeric topic id when
# topics are assigned to solutions. A label that also exists in the challenge
# mapping carries the same description there.
topic_mapping_solution = {
    0: ('Code Versioning', 'Managing and tracking changes in a repository using Git'),
    1: ('Role-based Access Control', 'Controlling access to resources based on user roles and permissions'),
    2: ('Package Management', 'Installing and configuring software packages and dependencies'),
    3: ('Metrics Logging', 'Recording and tracking performance metrics during model training and evaluation'),
    # Description aligned with the challenge mapping's 'Columnar Manipulation'
    # entry; the previous text described a horizontally scaling schema model.
    4: ('Columnar Manipulation', 'Working with and manipulating columns in datasets'),
    5: ('Docker Configuration', 'Containerization platform for building, shipping, and running applications'),
    6: ('Hyperparameter Tuning', 'Optimizing model performance by tuning hyperparameters'),
    7: ('YAML Configuration', 'Configuring and defining stages in a pipeline using YAML files'),
    8: ('Endpoint Service', 'Creating, deploying, and managing endpoints for REST and web services'),
    9: ('Jupyter Notebook', 'Creating and running interactive notebooks for data analysis and visualization'),
    10: ('Pandas Dataframe', 'Manipulating and analyzing tabular data using the Pandas library'),
    11: ('TensorFlow Configuration', 'Installing and configuring the TensorFlow machine learning framework'),
    12: ('Artifact Management', 'Uploading, downloading, and storing artifacts'),
    13: ('Endpoint Deployment', 'Deploying machine learning models for use in production environments'),
    14: ('Tree-based Model', 'Building, training, and cutting tree-like structure to make predictions'),
    15: ('Pipeline Configuration (Model)', 'Building, inputting, and parameterizing pipelines for API and object use'),
    16: ('JSON Payload', 'Formatting, serializing, and loading data'),
    17: ('Remote Configuration', 'Adding, modifying, and running remote URLs and resources'),
    18: ('Apache Spark Configuration', 'Installing and configuring Apache Spark distributed computing system for big data processing'),
    19: ('Model Wrapper', 'Using PyFunc and PythonModel interfaces, importing models, and loading models'),
    20: ('Data Transfer', 'Transferring data between cloud-based storage solutions'),
    21: ('Cluster Configuration', 'Running and managing distributed computing jobs'),
    22: ('Pipeline Configuration (Data)', 'Creating and managing data pipelines'),
    23: ('CSV Manipulation', 'Reading, writing, and manipulating CSV files'),
    24: ('Model Registry', 'Registering, managing, and versioning models'),
    25: ('Memory Management', 'Managing memory and distributing training for large datasets'),
    26: ('Model Application', 'Using neural networks for machine learning'),
    27: ('SDK Management', 'Managing SDK versions'),
    28: ('Serverless Service', 'Invoking endpoints and APIs using Lambda functions and API gateways'),
}

In [231]:
# assign human-readable & high-level topics to challenges & solutions

df_topics = pd.read_json(os.path.join(path_general, 'original.json'))

# Swap each numeric topic id for the label (first tuple element) of its
# mapping; ids without an entry in the mapping become NaN.
for topic_column, topic_mapping in (
    ('Challenge_topic', topic_mapping_challenge),
    ('Solution_topic', topic_mapping_solution),
):
    label_by_id = {topic_id: names[0] for topic_id, names in topic_mapping.items()}
    df_topics[topic_column] = df_topics[topic_column].map(
        lambda topic_id, lookup=label_by_id: lookup.get(topic_id, np.nan)
    )

df_topics.to_json(os.path.join(path_general, 'assigned.json'), indent=4, orient='records')


In [None]:
# add solved time to each challenge

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

# Raw resolution time: closing time minus creation time. Computed on whole
# columns rather than row by row, so both new columns exist even when the
# frame is empty.
df['Challenge_solved_time'] = df['Challenge_closed_time'] - df['Challenge_creation_time']

# Adjusted resolution time: start from the challenge's last edit when there
# is one (otherwise its creation) and end at the solution's last edit when
# there is one (otherwise the closing time).
adjusted_start = df['Challenge_last_edit_time'].fillna(df['Challenge_creation_time'])
adjusted_end = df['Solution_last_edit_time'].fillna(df['Challenge_closed_time'])
df['Challenge_solved_time_adjusted'] = adjusted_end - adjusted_start

# The enriched frame overwrites the file it was read from.
df.to_json(os.path.join(path_general, 'assigned.json'), indent=4, orient='records')

In [327]:
# Sankey diagram of how challenge topics flow into solution topics.

df_topics = pd.read_json(os.path.join(path_general, 'assigned.json'))
df_topics = df_topics[df_topics['Challenge_topic'].notna() & df_topics['Solution_topic'].notna()]

# Diagram columns, left to right.
categories = ['Challenge_topic', 'Solution_topic']
df_topics = df_topics.groupby(categories).size().reset_index(name='value')

# we only visualize large topics
df_topics = df_topics[df_topics['value'] > 30]

# Every diagram column gets its own block of node ids. Several labels exist
# as both a challenge and a solution topic; with one shared label list those
# would collapse into a single node and their flows would become self-loops
# or cycles instead of left-to-right links.
labels_per_level = [sorted(df_topics[column].unique()) for column in categories]
label = [name for names in labels_per_level for name in names]

node_ids = []
offset = 0
for names in labels_per_level:
    node_ids.append({name: offset + position for position, name in enumerate(names)})
    offset += len(names)

# One link table per pair of neighbouring columns, concatenated once.
link_frames = []
for level in range(len(categories) - 1):
    left, right = categories[level], categories[level + 1]
    flows = df_topics.groupby([left, right], as_index=False)['value'].sum()
    link_frames.append(pd.DataFrame({
        'source': flows[left].map(node_ids[level]),
        'target': flows[right].map(node_ids[level + 1]),
        'value': flows['value'],
    }))
newDf = pd.concat(link_frames, ignore_index=True)

source = newDf['source']
target = newDf['target']
value = newDf['value']

link = dict(source=source, target=target, value=value)
data = go.Sankey(
    link=link,
    node=dict(
        label=label,
        thickness=200,
        pad=30,
    ),
)

fig = go.Figure(data)
fig.update_layout(
    height=2000,
    width=2000,
    font=dict(size=20),
)
fig.write_image(os.path.join(path_general,
                'Challenge solution sankey.png'))

In [3]:
# Subset of topic labels taken from the challenge and solution mappings.
# In the visible part of the notebook it is referenced only by commented-out cells.
topic_ensemble = [
    'Account Management', 'Apache Spark Configuration', 'Artifact Management',
    'Bucket Access Control', 'CloudWatch Monitoring', 'Cluster Configuration',
    'Code Versioning', 'Dataset Versioning', 'Docker Configuration',
    'GPU Configuration', 'Hyperparameter Tuning', 'Kubernetes Orchestration',
    'Memory Management', 'Metrics Logging', 'Model Evaluation',
    'Model Registry', 'Pipeline Configuration', 'Pipeline Configuration (Data)',
    'Pipeline Configuration (Model)', 'Package Management', 'Remote Configuration',
    'Resource Quota Control', 'Role-based Access Control', 'Run Management',
    'SDK Management', 'TensorBoard Logging', 'TensorFlow Configuration',
    'YAML Configuration',
]


In [None]:
# topic_all = set(['Topic: ' + topic[0] + ', Description: ' + topic[1] for topic in ] + ['Topic: ' + topic[0] + ', Description: ' + topic[1] for topic in topic_mapping_challenge.values()])
# topic_all

In [None]:
# topic_all = list(set(list(topic_mapping_solution.values()) + list(topic_mapping_challenge.values())))
# for item in topic_ensemble:
#     for item2 in topic_all:
#         if item == item2[0]:
#             topic_all.remove(item2)
#             break
# topic_all

In [None]:
# prompt_classify = '''Please classify the following concepts as different categories based on their application scenarios and provide a short description for each category:\n'''

# completion = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=[
#         {"role": "user", "content": prompt_classify + ', '.join(topic_ensemble) + '\n'},
#         ],
#     temperature=0,
#     max_tokens=500,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0,
#     timeout=50,
#     stream=False
# )
# completion.choices[0].message.content

In [96]:
# # Collect general challenge statistics information

# df_challenge = pd.read_json(os.path.join(path_general, 'original.json'))
# df_challenge = df_challenge[df_challenge['Challenge_topic'] > -1]

# # total_count = df_challenge['Challenge_topic'].count()
# df_topics = []

# for name, group in df_challenge.groupby('Challenge_topic'):
#     count = group['Challenge_topic'].count()
#     Solved_ratio = group['Challenge_closed_time'].notna().sum() / count
#     Mean_score = group['Challenge_score'].mean()
#     Mean_favorite_count = group['Challenge_follower_count'].mean()
#     Mean_follower_count = group['Challenge_follower_count'].mean()
#     Mean_link_count = group['Challenge_link_count'].mean()
#     Mean_information_entropy = group['Challenge_information_entropy'].mean()
#     Mean_readability = group['Challenge_readability'].mean()
#     Mean_sentence_count = group['Challenge_sentence_count'].mean()
#     Mean_word_count = group['Challenge_word_count'].mean()
#     Mean_unique_word_count = group['Challenge_unique_word_count'].mean()
#     Mean_view_count = group['Challenge_view_count'].mean()
#     Mean_answer_count = group['Challenge_answer_count'].mean()
#     Mean_comment_count = group['Challenge_comment_count'].mean()
#     # Mean_participation_count = Mean_answer_count + Mean_comment_count
#     # Score_participation_ratio = Mean_score / Mean_participation_count
#     # Score_participation_weighted_product = (
#     #     group['Challenge_score'] * group['Challenge_participation_count']).mean()
#     # Mean_solved_time = group['Challenge_solved_time'].mean(
#     # ) / pd.Timedelta(hours=1)
#     # Median_solved_time = group['Challenge_solved_time'].median(
#     # ) / pd.Timedelta(hours=1)
#     # Mean_solved_time_edited = group['Challenge_solved_time_edited'].mean(
#     # ) / pd.Timedelta(hours=1)
#     # Median_solved_time_edited = group['Challenge_solved_time_edited'].median(
#     # ) / pd.Timedelta(hours=1)
#     topic_info = {
#         'Topic': name,
#         'Count': count,
#         'Solved ratio': Solved_ratio,
#         'Mean score': Mean_score,
#         'Mean follower count': Mean_favorite_count,
#         'Mean follower count': Mean_follower_count,
#         'Mean link count': Mean_link_count,
#         'Mean information entropy': Mean_information_entropy,
#         'Mean readability': Mean_readability,
#         'Mean sentence count': Mean_sentence_count,
#         'Mean word count': Mean_word_count,
#         'Mean unique word count': Mean_unique_word_count,
#         'Mean view count': Mean_view_count,
#         'Mean answer count': Mean_answer_count,
#         'Mean comment count': Mean_comment_count,
#         # 'Score participation ratio': Score_participation_ratio,
#         # 'Score participation weighted product': Score_participation_weighted_product,
#         # 'Mean solved time': Mean_solved_time,
#         # 'Median solved time': Median_solved_time,
#         # 'Mean solved time edited': Mean_solved_time_edited,
#         # 'Median solved time edited': Median_solved_time_edited,
#     }
#     df_topics.append(topic_info)

# df_topics = pd.DataFrame(df_topics)
# df_topics.to_json(os.path.join(path_challenge_open_closed,
#                   'general.json'), indent=4, orient='records')

# # df_topics['Topic'] = df_topics['Topic'].astype(str)

# # # plot count
# # df_topics = df_topics.sort_values('Count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_count.png'))
# # plt.close()

# # # plot solved ratio  
# # df_topics = df_topics.sort_values('Solved ratio', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Solved ratio'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge solved ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_solved_ratio.png'))
# # plt.close()

# # # plot mean score  
# # df_topics = df_topics.sort_values('Mean score', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean score'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean score',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_score.png'))
# # plt.close()

# # # plot mean favorite count 
# # df_topics = df_topics.sort_values('Mean favorite count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean favorite count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean favorite count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_favorite_count.png'))
# # plt.close()

# # # plot mean follower count 
# # df_topics = df_topics.sort_values('Mean follower count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean follower count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean follower count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_follower_count.png'))
# # plt.close()

# # # plot mean link count 
# # df_topics = df_topics.sort_values('Mean link count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean link count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean link count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_link_count.png'))
# # plt.close()

# # # plot mean information entropy 
# # df_topics = df_topics.sort_values('Mean information entropy', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean information entropy'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean info entropy',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_information_entropy.png'))
# # plt.close()

# # # plot mean readability  
# # df_topics = df_topics.sort_values('Mean readability', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean readability'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_readability.png'))
# # plt.close()

# # # plot mean sentence count 
# # df_topics = df_topics.sort_values('Mean sentence count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean sentence count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_sentence_count.png'))
# # plt.close()

# # # plot mean word count 
# # df_topics = df_topics.sort_values('Mean word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean word count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_word_count.png'))
# # plt.close()

# # # plot mean unique word count
# # df_topics = df_topics.sort_values('Mean unique word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean unique word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean unique word count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_unique_word_count.png'))
# # plt.close()

# # # plot mean view count 
# # df_topics = df_topics.sort_values('Mean view count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean view count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean view count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_view_count.png'))
# # plt.close()

# # # plot mean answer count 
# # df_topics = df_topics.sort_values('Mean answer count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean answer count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean answer count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_answer_count.png'))
# # plt.close()

# # # plot mean comment count 
# # df_topics = df_topics.sort_values('Mean comment count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean comment count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean comment count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_comment_count.png'))
# # plt.close()

# # # plot score participation ratio 
# # df_topics = df_topics.sort_values('Score participation ratio', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Score participation ratio'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge score participation ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_score_participation_ratio.png'))
# # plt.close()

# # # plot score participation weighted product
# # df_topics = df_topics.sort_values('Score participation weighted product', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Score participation weighted product'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge score participation weighted product',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_score_participation_weighted_product.png'))
# # plt.close()

# # # plot mean solved time 
# # df_topics = df_topics.sort_values('Mean solved time', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean solved time'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean solved time',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_mean_solved_time.png'))
# # plt.close()

# # # plot median solved time 
# # df_topics = df_topics.sort_values('Median solved time', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Median solved time'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge median solved time',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_median_solved_time.png'))
# # plt.close()

# # # plot Mean solved time edited
# # df_topics = df_topics.sort_values('Mean solved time edited', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean solved time edited'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge Mean solved time edited',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_Mean_solved_time_edited.png'))
# # plt.close()

# # # plot Median solved time edited
# # df_topics = df_topics.sort_values('Median solved time edited', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Median solved time edited'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge Median solved time edited',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_challenge_open_closed, 'Challenge_Median_solved_time_edited.png'))
# # plt.close()


In [None]:
# # Collect general solution statistics information

# df_solution = pd.read_json(os.path.join(path_general, 'original.json'))
# df_solution = df_solution[df_solution['Solution_topic'] > -1]
# # df_solution['Solution_topic'] = df_solution['Solution_topic'].astype(str)

# # total_count = df_solution['Solution_topic'].count()
# df_topics = []

# for name, group in df_solution.groupby('Solution_topic'):
#     count = group['Solution_topic'].count()
#     Mean_score = group['Solution_score'].mean()
#     Mean_link_count = group['Solution_link_count'].mean()
#     Mean_information_entropy = group['Solution_information_entropy'].mean()
#     Mean_readability = group['Solution_readability'].mean()
#     Mean_sentence_count = group['Solution_sentence_count'].mean()
#     Mean_word_count = group['Solution_word_count'].mean()
#     Mean_unique_word_count = group['Solution_unique_word_count'].mean()
#     Mean_comment_count = group['Solution_comment_count'].mean()
#     topic_info = {
#         'Topic': name,
#         'Count ratio': count,
#         'Mean score': Mean_score,
#         'Mean link count': Mean_link_count,
#         'Mean information entropy': Mean_information_entropy,
#         'Mean readability': Mean_readability,
#         'Mean sentence count': Mean_sentence_count,
#         'Mean word count': Mean_word_count,
#         'Mean unique word count': Mean_unique_word_count,
#         'Mean comment count': Mean_comment_count,
#     }
#     df_topics.append(topic_info)

# df_topics = pd.DataFrame(df_topics)
# df_topics.to_json(os.path.join(path_solution_information,
#                   'general.json'), indent=4, orient='records')

# # # plot count ratio  
# # df_topics = df_topics.sort_values('Count ratio', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Count ratio'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge count ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_count_ratio.png'))
# # plt.close()

# # # plot mean score
# # df_topics = df_topics.sort_values('Mean score', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean score'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge solved ratio',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_score.png'))
# # plt.close()

# # # plot mean link count
# # df_topics = df_topics.sort_values('Mean link count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean link count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean score',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_link_count.png'))
# # plt.close()

# # # plot mean information entropy
# # df_topics = df_topics.sort_values('Mean information entropy', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean information entropy'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean favorite count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_information_entropy.png'))
# # plt.close()

# # # plot mean readability
# # df_topics = df_topics.sort_values('Mean readability', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean readability'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean follower count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_readability.png'))
# # plt.close()

# # # plot mean sentence count
# # df_topics = df_topics.sort_values('Mean sentence count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean sentence count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean link count',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_sentence_count.png'))
# # plt.close()

# # # plot mean word count
# # df_topics = df_topics.sort_values('Mean word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean info entropy',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_word_count.png'))
# # plt.close()

# # # plot mean unique word count
# # df_topics = df_topics.sort_values('Mean unique word count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean unique word count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_unique_word_count.png'))
# # plt.close()

# # # plot mean comment count
# # df_topics = df_topics.sort_values('Mean comment count', ascending=False)
# # fig = go.Figure()
# # fig.add_trace(go.Scatter(
# #     x=df_topics['Topic'],
# #     y=df_topics['Mean comment count'],
# #     mode='lines+markers'
# # ))
# # fig.update_layout(
# #     width=1000,
# #     height=500,
# #     margin=dict(l=0, r=0, t=0, b=0),
# #     # title='Challenge mean readability',
# #     xaxis=dict(
# #         tickmode='linear'
# #     )
# # )
# # fig.write_image(os.path.join(path_solution_information, 'Solution_mean_comment_count.png'))
# # plt.close()


In [20]:
# Collect and compare Q&A forum and Git repo challenges across different topics.
#
# For every topic, each challenge metric is compared between the Q&A platforms
# and the Git platforms with a two-sided Mann-Whitney U test. Whenever the test
# rejects at `alpha`, the topic is reported on stdout and a QA/Git violin pair
# for that topic is added to the metric's figure. One PNG per figure is then
# written to `path_challenge_git_qa`.

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]

qa_platforms = ['Stack Overflow', 'Tool-specific']
git_platforms = ['Github', 'Gitlab']
df_qa = df_challenge[df_challenge['Platform'].isin(qa_platforms)]
df_git = df_challenge[df_challenge['Platform'].isin(git_platforms)]

# Significance threshold for each individual test.
# NOTE(review): p-values are compared against alpha as-is; no multiple-comparison
# correction is applied across topics and metrics.
alpha = 0.05

# Metrics tested per topic: label used in messages/file names -> dataframe column.
# The order fixes the order of the printed messages within a topic.
# To re-enable the information-entropy comparison, add
# 'information entropy': 'Challenge_information_entropy' here and to `figure_labels`.
compared_metrics = {
    'score': 'Challenge_score',
    'favorite count': 'Challenge_favorite_count',
    'follower count': 'Challenge_follower_count',
    'link count': 'Challenge_link_count',
    'readability': 'Challenge_readability',
    'answer count': 'Challenge_answer_count',
    'comment count': 'Challenge_comment_count',
}

# One figure per output image, in the order the images are written.
# NOTE(review): no view-count comparison is run in this cell, so
# 'Challenge view count.png' is written as a blank figure (unchanged from the
# previous behaviour). Either add 'view count' to `compared_metrics` or drop it.
figure_labels = [
    'count',
    'score',
    'favorite count',
    'follower count',
    'link count',
    'readability',
    'view count',
    'answer count',
    'comment count',
]
figures = {label: go.Figure() for label in figure_labels}

for name, group in df_challenge.groupby('Challenge_topic'):
    # Split the topic's rows by platform family once, instead of re-filtering
    # the full frames for every topic.
    qa = group[group['Platform'].isin(qa_platforms)]
    git = group[group['Platform'].isin(git_platforms)]

    for label, column in compared_metrics.items():
        qa_values = qa[column].dropna()
        git_values = git[column].dropna()

        # The test needs at least one observation on each side.
        if qa_values.empty or git_values.empty:
            continue

        # Be explicit about the alternative so the result does not depend on
        # the installed SciPy version's default.
        p = mannwhitneyu(qa_values, git_values,
                         alternative='two-sided').pvalue
        if p < alpha:
            print(
                f'Different distribution of Q&A fora vs Git repos challenge regarding topic {name} in challenge {label}')
            # The violins are drawn from the full per-platform rows (missing
            # values included), exactly as before; only the test drops NaNs.
            for rows, trace_name in ((qa, 'QA'), (git, 'Git')):
                figures[label].add_trace(
                    go.Violin(
                        x=np.full(len(rows), name),
                        y=rows[column],
                        meanline_visible=True,
                        opacity=0.5,
                        name=trace_name,
                    ))

# Challenge topic count: distribution of topics per platform family.
for rows, trace_name in ((df_qa, 'QA'), (df_git, 'Git')):
    figures['count'].add_trace(
        go.Violin(
            x=np.full(len(rows), 'Challenge topic'),
            y=rows['Challenge_topic'],
            opacity=0.5,
            name=trace_name,
        ))

# Apply the shared layout and export every figure.
for label, fig in figures.items():
    fig.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    fig.write_image(os.path.join(
        path_challenge_git_qa, f'Challenge {label}.png'))


Different distribution of Q&A fora vs Git repos challenge regarding topic Account Management in challenge link count
Different distribution of Q&A fora vs Git repos challenge regarding topic Artifact Management in challenge score
Different distribution of Q&A fora vs Git repos challenge regarding topic Artifact Management in challenge readability
Different distribution of Q&A fora vs Git repos challenge regarding topic CloudWatch Monitoring in challenge answer count
Different distribution of Q&A fora vs Git repos challenge regarding topic Code Versioning in challenge score
Different distribution of Q&A fora vs Git repos challenge regarding topic Code Versioning in challenge readability
Different distribution of Q&A fora vs Git repos challenge regarding topic Code Versioning in challenge answer count
Different distribution of Q&A fora vs Git repos challenge regarding topic Docker Configuration in challenge score
Different distribution of Q&A fora vs Git repos challenge regarding topic D

In [21]:
# Collect and compare open and closed challenges across different topics

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]

df_open = df_challenge[df_challenge['Challenge_closed_time'].isna()]
df_closed = df_challenge[df_challenge['Challenge_closed_time'].notna()]

alpha = 0.05

fig_challenge_count = go.Figure()
fig_challenge_score = go.Figure()
fig_challenge_favorite_count = go.Figure()
fig_challenge_follower_count = go.Figure()
fig_challenge_link_count = go.Figure()
# fig_challenge_information_entropy = go.Figure()
fig_challenge_readability = go.Figure()
fig_challenge_view_count = go.Figure()
fig_challenge_answer_count = go.Figure()
fig_challenge_comment_count = go.Figure()
fig_challenge_solved_time = go.Figure()
fig_challenge_solved_time_adjusted = go.Figure()

for name, group in df_challenge.groupby('Challenge_topic'):
    closed = df_closed[df_closed['Challenge_topic'] == name]
    open = df_open[df_open['Challenge_topic'] == name]

    # Challenge score
    challenge_score_open = open[open['Challenge_score'].notna(
    )]['Challenge_score']
    challenge_score_closed = closed[closed['Challenge_score'].notna(
    )]['Challenge_score']
    if len(challenge_score_open) * len(challenge_score_closed) > 0:
        _, p = mannwhitneyu(challenge_score_open, challenge_score_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge score')
            fig_challenge_score.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_score'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_score.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_score'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

    # Challenge favorite count
    challenge_favorite_count_open = open[open['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    challenge_favorite_count_closed = closed[closed['Challenge_favorite_count'].notna(
    )]['Challenge_favorite_count']
    if len(challenge_favorite_count_open) * len(challenge_favorite_count_closed) > 0:
        _, p = mannwhitneyu(challenge_favorite_count_open,
                            challenge_favorite_count_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge favorite count')
            fig_challenge_favorite_count.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_favorite_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_favorite_count.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_favorite_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

    # Challenge follower count
    challenge_follower_count_open = open[open['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    challenge_follower_count_closed = closed[closed['Challenge_follower_count'].notna(
    )]['Challenge_follower_count']
    if len(challenge_follower_count_open) * len(challenge_follower_count_closed) > 0:
        _, p = mannwhitneyu(challenge_follower_count_open,
                            challenge_follower_count_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge follower count')
            fig_challenge_follower_count.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_follower_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_follower_count.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_follower_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

    # Challenge link count
    challenge_link_count_open = open[open['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    challenge_link_count_closed = closed[closed['Challenge_link_count'].notna(
    )]['Challenge_link_count']
    if len(challenge_link_count_open) * len(challenge_link_count_closed) > 0:
        _, p = mannwhitneyu(challenge_link_count_open,
                            challenge_link_count_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge link count')
            fig_challenge_link_count.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_link_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_link_count.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_link_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

    # # Challenge information entropy
    # challenge_information_entropy_open = open[open['Challenge_information_entropy'].notna(
    # )]['Challenge_information_entropy']
    # challenge_information_entropy_closed = closed[closed['Challenge_information_entropy'].notna(
    # )]['Challenge_information_entropy']
    # if len(challenge_information_entropy_open) * len(challenge_information_entropy_closed) > 0:
    #     _, p = mannwhitneyu(challenge_information_entropy_open,
    #                         challenge_information_entropy_closed)
    #     if p < alpha:
    #         print(
    #             f'Different distribution of open vs closed challenge regarding topic {name} in challenge information entropy')
    #         fig_challenge_information_entropy.add_trace(
    #             go.Violin(
    #                 x=np.full(len(open), name),
    #                 y=open['Challenge_information_entropy'],
    #                 meanline_visible=True,
    #                 opacity=0.5,
    #                 name='Open',
    #             ))
    #         fig_challenge_information_entropy.add_trace(
    #             go.Violin(
    #                 x=np.full(len(closed), name),
    #                 y=closed['Challenge_information_entropy'],
    #                 meanline_visible=True,
    #                 opacity=0.5,
    #                 name='Closed',
    #             ))

    # Challenge readability
    challenge_readability_open = open[open['Challenge_readability'].notna(
    )]['Challenge_readability']
    challenge_readability_closed = closed[closed['Challenge_readability'].notna(
    )]['Challenge_readability']
    if len(challenge_readability_open) * len(challenge_readability_closed) > 0:
        _, p = mannwhitneyu(challenge_readability_open,
                            challenge_readability_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge readability')
            fig_challenge_readability.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_readability'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_readability.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_readability'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

    # Challenge answer count
    challenge_answer_count_open = open[open['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    challenge_answer_count_closed = closed[closed['Challenge_answer_count'].notna(
    )]['Challenge_answer_count']
    if len(challenge_answer_count_open) * len(challenge_answer_count_closed) > 0:
        _, p = mannwhitneyu(challenge_answer_count_open,
                            challenge_answer_count_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge answer count')
            fig_challenge_answer_count.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_answer_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_answer_count.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_answer_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

    # Challenge comment count
    challenge_comment_count_open = open[open['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    challenge_comment_count_closed = closed[closed['Challenge_comment_count'].notna(
    )]['Challenge_comment_count']
    if len(challenge_comment_count_open) * len(challenge_comment_count_closed) > 0:
        _, p = mannwhitneyu(challenge_comment_count_open,
                            challenge_comment_count_closed)
        if p < alpha:
            print(
                f'Different distribution of open vs closed challenge regarding topic {name} in challenge comment count')
            fig_challenge_comment_count.add_trace(
                go.Violin(
                    x=np.full(len(open), name),
                    y=open['Challenge_comment_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Open',
                ))
            fig_challenge_comment_count.add_trace(
                go.Violin(
                    x=np.full(len(closed), name),
                    y=closed['Challenge_comment_count'],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Closed',
                ))

# Challenge topic count
# One violin per group, showing how that group's challenges are spread over
# the topic values. The traces are built from df_open / df_closed, so they are
# labelled 'Open' / 'Closed' (the labels used for every other trace in this
# comparison) rather than with the platform names.
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_open), 'Challenge topic'),
        y=df_open['Challenge_topic'],
        opacity=0.5,
        name='Open',
    ))
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_closed), 'Challenge topic'),
        y=df_closed['Challenge_topic'],
        opacity=0.5,
        name='Closed',
    ))

# Output file name -> figure, in export order.
# The information-entropy figure is currently disabled and therefore absent.
open_closed_figures = {
    'Challenge count.png': fig_challenge_count,
    'Challenge score.png': fig_challenge_score,
    'Challenge favorite count.png': fig_challenge_favorite_count,
    'Challenge follower count.png': fig_challenge_follower_count,
    'Challenge link count.png': fig_challenge_link_count,
    'Challenge readability.png': fig_challenge_readability,
    'Challenge view count.png': fig_challenge_view_count,
    'Challenge answer count.png': fig_challenge_answer_count,
    'Challenge comment count.png': fig_challenge_comment_count,
}

# Every figure shares the same canvas size, font size and zero margins.
for file_name, figure in open_closed_figures.items():
    figure.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    figure.write_image(os.path.join(path_challenge_open_closed, file_name))


Different distribution of open vs closed challenge regarding topic Apache Spark Configuration in challenge score
Different distribution of open vs closed challenge regarding topic Apache Spark Configuration in challenge answer count
Different distribution of open vs closed challenge regarding topic Bucket Access Control in challenge answer count
Different distribution of open vs closed challenge regarding topic CloudWatch Monitoring in challenge answer count
Different distribution of open vs closed challenge regarding topic Code Versioning in challenge readability
Different distribution of open vs closed challenge regarding topic Code Versioning in challenge answer count
Different distribution of open vs closed challenge regarding topic Code Versioning in challenge comment count
Different distribution of open vs closed challenge regarding topic Dataset Versioning in challenge readability
Different distribution of open vs closed challenge regarding topic Docker Configuration in challeng

In [22]:
# Collect and compare Stack Overflow vs Tool-specific fora challenges across different topics
#
# For every topic and every metric below, the Stack Overflow and Tool-specific
# samples are compared with a Mann-Whitney U test. When p < alpha a message is
# printed and one violin per platform is added to that metric's figure.

df = pd.read_json(os.path.join(path_general, 'assigned.json'))

df_challenge = df[df['Challenge_topic'].notna()]
df_challenge = df_challenge[df_challenge['Challenge_topic'].isin(
    topic_ensemble)]

df_so = df_challenge[df_challenge['Platform'] == 'Stack Overflow']
df_to = df_challenge[df_challenge['Platform'] == 'Tool-specific']

alpha = 0.05

fig_challenge_count = go.Figure()
fig_challenge_score = go.Figure()
fig_challenge_favorite_count = go.Figure()
fig_challenge_follower_count = go.Figure()
fig_challenge_link_count = go.Figure()
# fig_challenge_information_entropy = go.Figure()
fig_challenge_readability = go.Figure()
# NOTE(review): no comparison in this cell adds traces to
# fig_challenge_view_count, yet it is still exported below -- confirm whether
# a view-count comparison is missing.
fig_challenge_view_count = go.Figure()
fig_challenge_answer_count = go.Figure()
fig_challenge_comment_count = go.Figure()
# NOTE(review): the two solved-time figures are created but neither filled
# nor exported in this cell.
fig_challenge_solved_time = go.Figure()
fig_challenge_solved_time_adjusted = go.Figure()

# (column to compare, wording used in the printed message, target figure).
# The order here fixes the order of the printed messages within a topic.
compared_metrics = [
    ('Challenge_score', 'challenge score', fig_challenge_score),
    ('Challenge_favorite_count', 'challenge favorite count',
     fig_challenge_favorite_count),
    ('Challenge_follower_count', 'challenge follower count',
     fig_challenge_follower_count),
    ('Challenge_link_count', 'challenge link count',
     fig_challenge_link_count),
    # ('Challenge_information_entropy', 'challenge information entropy',
    #  fig_challenge_information_entropy),
    ('Challenge_readability', 'challenge readability',
     fig_challenge_readability),
    ('Challenge_answer_count', 'challenge answer count',
     fig_challenge_answer_count),
    ('Challenge_comment_count', 'challenge comment count',
     fig_challenge_comment_count),
]

for name, group in df_challenge.groupby('Challenge_topic'):
    # Split this topic's rows by platform (same rows as filtering df_so/df_to).
    so = group[group['Platform'] == 'Stack Overflow']
    to = group[group['Platform'] == 'Tool-specific']

    for column, label, figure in compared_metrics:
        values_so = so[column].dropna()
        values_to = to[column].dropna()
        # The test needs at least one observation on each side.
        if values_so.empty or values_to.empty:
            continue

        _, p = mannwhitneyu(values_so, values_to)
        if p < alpha:
            print(
                f'Different distribution of Stack Overflow vs Tool-specific fora challenge regarding topic {name} in {label}')
            # The violins receive the unfiltered column (missing values
            # included); only the test uses the NaN-free samples.
            figure.add_trace(
                go.Violin(
                    x=np.full(len(so), name),
                    y=so[column],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Stack Overflow',
                ))
            figure.add_trace(
                go.Violin(
                    x=np.full(len(to), name),
                    y=to[column],
                    meanline_visible=True,
                    opacity=0.5,
                    name='Tool-specific',
                ))

# Challenge topic count
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_so), 'Challenge topic'),
        y=df_so['Challenge_topic'],
        opacity=0.5,
        name='Stack Overflow',
    ))
fig_challenge_count.add_trace(
    go.Violin(
        x=np.full(len(df_to), 'Challenge topic'),
        y=df_to['Challenge_topic'],
        opacity=0.5,
        name='Tool-specific',
    ))

# Output file name -> figure, in export order.
so_to_figures = {
    'Challenge count.png': fig_challenge_count,
    'Challenge score.png': fig_challenge_score,
    'Challenge favorite count.png': fig_challenge_favorite_count,
    'Challenge follower count.png': fig_challenge_follower_count,
    'Challenge link count.png': fig_challenge_link_count,
    # 'Challenge information entropy.png': fig_challenge_information_entropy,
    'Challenge readability.png': fig_challenge_readability,
    'Challenge view count.png': fig_challenge_view_count,
    'Challenge answer count.png': fig_challenge_answer_count,
    'Challenge comment count.png': fig_challenge_comment_count,
}

# Every figure shares the same canvas size, font size and zero margins.
for file_name, figure in so_to_figures.items():
    figure.update_layout(
        height=1000,
        width=2000,
        font=dict(size=20),
        margin=dict(l=0, r=0, t=0, b=0),
    )
    figure.write_image(os.path.join(path_challenge_so_to, file_name))


Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Account Management in challenge score
Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Account Management in challenge link count
Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Account Management in challenge answer count
Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Apache Spark Configuration in challenge score
Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Apache Spark Configuration in challenge answer count
Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Artifact Management in challenge score
Different distribution of StackOverflow vs Tool-specific fora challenge regarding topic Artifact Management in challenge link count
Different distribution of StackOverflow vs Tool-specific fora challenge rega

In [93]:
df_challenge = pd.read_json(
    os.path.join(path_challenge_open_closed, 'general.json'))

# Layout settings shared by the three scatter plots in this cell.
scatter_layout = {
    'width': 1000,
    'height': 500,
    'margin': {'l': 0, 'r': 0, 't': 0, 'b': 0},
    'coloraxis_colorbar': {'title': "Challenge topic"},
}

# Median solved time against mean solved time (log-scaled x axis) per topic,
# marker size given by the count ratio, with an OLS trendline fitted on log(x).
fig = px.scatter(
    df_challenge,
    x="Mean solved time",
    y="Median solved time",
    color="Topic",
    hover_name="Topic",
    size="Count ratio",
    trendline="ols",
    trendline_options={'log_x': True},
    log_x=True,
)
fig.update_xaxes(title_text="Mean solved time (log scale)")
fig.update_layout(**scatter_layout)
fig.show()

# Mean score against median solved time per topic, marker size given by the
# mean view count.
fig = px.scatter(
    df_challenge,
    x="Median solved time",
    y="Mean score",
    color="Topic",
    hover_name="Topic",
    size="Mean view count",
)
fig.update_layout(**scatter_layout)
fig.show()

# Mean favorite count against median solved time per topic, marker size given
# by the mean answer count.
fig = px.scatter(
    df_challenge,
    x="Median solved time",
    y="Mean favorite count",
    color="Topic",
    hover_name="Topic",
    size="Mean answer count",
)
fig.update_layout(**scatter_layout)
fig.show()

In [12]:
import scipy.interpolate
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess


def smooth(x, y, xgrid, lowess_kw=None):
    """Fit LOWESS to one bootstrap resample of (x, y) and evaluate it on ``xgrid``.

    Parameters
    ----------
    x, y : array-like
        Observations of equal length; converted to numpy arrays so they can be
        indexed with the resampled positions.
    xgrid : array-like
        Points at which the smoothed curve is evaluated.
    lowess_kw : dict, optional
        Extra keyword arguments forwarded to the LOWESS smoother (for example
        ``frac`` or ``it``). ``None`` means no extra arguments. Any
        ``return_sorted`` entry is overridden with ``False``.

    Returns
    -------
    numpy.ndarray
        The smoothed values linearly interpolated (and extrapolated outside
        the resampled range) onto ``xgrid``.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Work on a copy: tolerates the None default and never mutates the
    # caller's dict.
    kw = dict(lowess_kw or {})
    # The fitted values must stay in the same order as x_s so that they can be
    # paired with x_s in the interpolation below.
    kw["return_sorted"] = False

    # Bootstrap: draw 50 positions with replacement.
    samples = np.random.choice(len(x), 50, replace=True)
    y_s = y[samples]
    x_s = x[samples]
    y_sm = sm_lowess(y_s, x_s, **kw)
    # regularly sample it onto the grid
    # NOTE(review): resampling with replacement can repeat x values, which may
    # yield non-finite interpolated values; lowess_with_confidence_bounds
    # aggregates with NaN-aware reductions -- confirm this is sufficient.
    y_grid = scipy.interpolate.interp1d(
        x_s, y_sm, fill_value='extrapolate')(xgrid)
    return y_grid


def lowess_with_confidence_bounds(x, y, conf_interval=0.95, lowess_kw=None):
    """
    Perform Lowess regression and determine a confidence interval by bootstrap resampling

    Parameters
    ----------
    x, y : numpy.ndarray
        Observations; ``x`` must provide ``min()`` and ``max()``.
    conf_interval : float
        Central coverage of the returned band, e.g. 0.95 gives the 2.5th and
        97.5th percentiles of the bootstrap curves.
    lowess_kw : dict, optional
        Passed through unchanged to ``smooth``.

    Returns
    -------
    xgrid, mean, stderr, clower, cupper : numpy.ndarray
        The evaluation grid (``np.linspace`` default number of points between
        the smallest and largest x) followed by, for each grid point, the
        mean, standard error, lower and upper percentile of the bootstrap
        curves. All four statistics ignore NaN entries.
    """
    xgrid = np.linspace(x.min(), x.max())

    # Number of bootstrap curves.
    K = 100
    # After the transpose: one row per grid point, one column per curve.
    smooths = np.stack([smooth(x, y, xgrid, lowess_kw) for _ in range(K)]).T

    mean = np.nanmean(smooths, axis=1)
    # Standard error of the mean (sample std with ddof=1 over sqrt(n)),
    # computed over the non-NaN curves only so it is consistent with the
    # NaN-aware mean and percentiles.
    n_valid = np.sum(~np.isnan(smooths), axis=1)
    stderr = np.nanstd(smooths, axis=1, ddof=1) / np.sqrt(n_valid)

    clower = np.nanpercentile(smooths, (1-conf_interval)*50, axis=1)
    cupper = np.nanpercentile(smooths, (1+conf_interval)*50, axis=1)

    return xgrid, mean, stderr, clower, cupper

In [13]:
# Show the earliest and latest creation time among the rows whose
# Challenge_topic is greater than -1.
assigned_path = os.path.join(path_general, 'assigned.json')
df_all = pd.read_json(assigned_path)
df_challenge = df_all.loc[df_all['Challenge_topic'].gt(-1)]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
creation_times = df_challenge['Challenge_creation_time']
(min(creation_times), max(creation_times))


(Timestamp('2014-08-08 14:04:22.160000'),
 Timestamp('2023-02-22 01:36:03.995000'))

In [14]:
# Explore challenge topics evolution
#
# For each challenge topic: count challenges per two-week bin of creation
# time, smooth the counts with bootstrap LOWESS and save one figure per topic.

df_challenge = pd.read_json(os.path.join(path_general, 'assigned.json'))
has_topic = df_challenge['Challenge_topic'] > -1
created = df_challenge['Challenge_creation_time']
in_window = (created > '2014-09-14') & (created < '2022-11-21')
df_challenge = df_challenge[has_topic & in_window]

for topic, topic_rows in df_challenge.groupby('Challenge_topic'):
    binned = (
        topic_rows
        .groupby(pd.Grouper(key='Challenge_creation_time', freq='2W'))
        .agg(Count=('Challenge_topic', 'count'))
        .reset_index()
    )
    bin_times = pd.to_datetime(binned['Challenge_creation_time']).values
    counts = binned['Count'].values
    # The smoother works on plain numbers: whole days since the Unix epoch.
    day_numbers = bin_times.astype('datetime64[D]').astype(int)

    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        day_numbers, counts, conf_interval=0.95,
        lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})

    # NOTE(review): observations are plotted against datetimes while the
    # LOWESS curve and band use day numbers on the same axes -- this relies on
    # matplotlib placing both on a common date scale; confirm.
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(bin_times, counts, 'k.', label='Observations')
    ax.plot(xgrid, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.legend(loc='best')
    fig.savefig(os.path.join(path_challenge_evolution,
                f'Topic_{topic}'), bbox_inches="tight")
    plt.close(fig)

In [15]:
# Show the earliest and latest challenge creation time among the rows whose
# Solution_topic is greater than -1.
assigned_path = os.path.join(path_general, 'assigned.json')
df_all = pd.read_json(assigned_path)
df_solution = df_all.loc[df_all['Solution_topic'].gt(-1)]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
creation_times = df_solution['Challenge_creation_time']
(min(creation_times), max(creation_times))

(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [16]:
# Explore solution topics evolution
#
# For each solution topic: count solutions per weekly bin of the challenge's
# closed time, smooth the counts with bootstrap LOWESS and save one figure
# per topic.

df_solution = pd.read_json(os.path.join(path_general, 'assigned.json'))
has_topic = df_solution['Solution_topic'] > -1
created = df_solution['Challenge_creation_time']
# NOTE(review): rows are selected by creation time but binned by closed time
# below -- confirm this is intended.
in_window = (created > '2014-09-14') & (created < '2022-11-21')
df_solution = df_solution[has_topic & in_window]

for topic, topic_rows in df_solution.groupby('Solution_topic'):
    binned = (
        topic_rows
        .groupby(pd.Grouper(key='Challenge_closed_time', freq='W'))
        .agg(Count=('Solution_topic', 'count'))
        .reset_index()
    )
    bin_times = pd.to_datetime(binned['Challenge_closed_time']).values
    counts = binned['Count'].values
    # The smoother works on plain numbers: whole days since the Unix epoch.
    day_numbers = bin_times.astype('datetime64[D]').astype(int)

    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        day_numbers, counts, conf_interval=0.95,
        lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})

    # NOTE(review): observations are plotted against datetimes while the
    # LOWESS curve and band use day numbers on the same axes -- this relies on
    # matplotlib placing both on a common date scale; confirm.
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(bin_times, counts, 'k.', label='Observations')
    ax.plot(xgrid, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.legend(loc='best')
    fig.savefig(os.path.join(path_solution_evolution,
                f'Topic_{topic}'), bbox_inches="tight")
    plt.close(fig)