In [28]:
# Imports and session-wide configuration for the notebook.
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas chained-assignment warnings.
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt

import time
import pickle

import numpy as np

import plotly.graph_objects as go
import plotly.express as px

import pandas as pd
# Lift pandas' display truncation: no cap on rows, columns or cell width.
pd.set_option("display.max_rows", None, "display.max_columns",
              None, 'display.max_colwidth', None)

import os
# NOTE(review): presumably read by a tokenizer library; nothing in the visible
# cells consumes this variable - confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import openai
# Take the OpenAI key from the environment; os.getenv yields None when it is unset.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [16]:
# Input/output locations. Everything is resolved against the parent directory
# of the current working directory.
project_root = os.path.dirname(os.getcwd())

# Input data directory (read-only here, therefore not created).
path_dataset = os.path.join(project_root, 'Dataset')

# Output tree: Result/{General, Challenge, Solution}, with Information and
# Evolution sub-directories under both Challenge and Solution.
path_result = os.path.join(project_root, 'Result')
path_general = os.path.join(path_result, 'General')
path_challenge = os.path.join(path_result, 'Challenge')
path_solution = os.path.join(path_result, 'Solution')
path_challenge_information = os.path.join(path_challenge, 'Information')
path_solution_information = os.path.join(path_solution, 'Information')
path_challenge_evolution = os.path.join(path_challenge, 'Evolution')
path_solution_evolution = os.path.join(path_solution, 'Evolution')

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists test; makedirs also creates missing parents.
for _output_dir in (
    path_result,
    path_general,
    path_challenge,
    path_solution,
    path_challenge_information,
    path_solution_information,
    path_challenge_evolution,
    path_solution_evolution,
):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# SECURITY: a literal OpenAI secret key used to be assigned on this line.
# Secrets must never live in a notebook; the key that was committed here should
# be revoked and replaced. The key is now taken from the environment only.
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    # Fail early with a clear message instead of an authentication error
    # from the first API call further down.
    raise RuntimeError(
        'OPENAI_API_KEY is not set; export it before running the OpenAI cells.')


In [37]:
# Ask the chat model for a two-word label and a short comment for every
# challenge topic, based on the topic's keyword list.
prompt_topic = '''You will be given a list of keywords for each topic, I want you to provide a description of each topic in a two-word phrase but guarantee that each description is exclusive to the other. Also, for each description, you need to attach short comments on what these keywords are talking about in general.'''

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(os.path.join(path_challenge, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

# One line per topic: "Topic <position>: kw1, kw2, ...". The keyword is the
# first element of each entry in a topic.
keyword_strings = (', '.join(term[0] for term in topic) for topic in topic_terms)
topic_term_list = [
    f'Topic {index}: {terms}' for index, terms in enumerate(keyword_strings)
]

# The keyword block is fenced with ### markers and appended to the instructions.
user_message = {
    "role": "user",
    "content": prompt_topic + '\n###' + '\n'.join(topic_term_list) + '###\n',
}

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[user_message],
    temperature=0,
    max_tokens=1500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=100,
    stream=False,
)

topic_challenge = completion.choices[0].message.content
print(topic_challenge)


Topic 0: Environment Setup - Setting up software environments for development and execution
Topic 1: Pipeline Automation - Automating the execution of data processing pipelines
Topic 2: Docker - Containerization platform for building, shipping, and running applications
Topic 3: Hyperparameter Tuning - Optimizing model performance by tuning hyperparameters
Topic 4: Git Version Control - Tracking changes to code and collaborating with others
Topic 5: GPU Acceleration - Using graphics processing units to speed up machine learning tasks
Topic 6: Artifact Management - Managing and storing artifacts such as models, datasets, and code
Topic 7: Model Deployment - Deploying machine learning models for use in production environments
Topic 8: Data Labeling - Assigning labels to data for use in supervised learning tasks
Topic 9: Data Visualization - Creating visual representations of data for analysis and communication
Topic 10: Logging Metrics - Recording and tracking performance metrics during m

In [38]:
# Challenge topic id -> curated topic name.
# The names are listed in topic-id order, so the position in the list is the
# topic id (the number at the start of each comment). Id -1 maps to NaN
# (presumably the "no topic" / outlier id - confirm against the topic model).
topic_mapping_challenge = {
    -1: np.nan,
    **dict(enumerate([
        # 0: Setting up software environments for development and execution
        'Environment Setup',
        # 1: Automating the execution of data processing pipelines
        'Pipeline Automation',
        # 2: Containerization platform for building, shipping, and running applications
        'Docker',
        # 3: Optimizing model performance by tuning hyperparameters
        'Hyperparameter Tuning',
        # 4: Tracking changes to code and collaborating with others
        'Git Version Control',
        # 5: Using graphics processing units to speed up machine learning tasks
        'GPU Acceleration',
        # 6: Managing and storing artifacts such as models, datasets, and code
        'Artifact Management',
        # 7: Deploying machine learning models for use in production environments
        'Model Deployment',
        # 8: Assigning labels to data for use in supervised learning tasks
        'Data Labeling',
        # 9: Creating visual representations of data for analysis and communication
        'Data Visualization',
        # 10: Recording and tracking performance metrics during model training and evaluation
        'Logging Metrics',
        # 11: Managing user accounts and access to resources
        'Account Management',
        # 12: Open-source distributed computing system for big data processing
        'Apache Spark',
        # 13: Open-source machine learning framework for building and training models
        'TensorFlow',
        # 14: Analyzing and manipulating text data
        'Text Processing',
        # 15: Data structure for manipulating and analyzing tabular data
        'Pandas DataFrames',
        # 16: Saving and exporting trained machine learning models
        'Model Export',
        # 17: Controlling access to resources based on user roles and permissions
        'Role-Based Access Control',
        # 18: Processing large amounts of data in batches
        'Batch Processing',
        # 19: Managing and versioning machine learning models
        'Model Registry',
        # 20: Connecting to and interacting with databases
        'Database Connectivity',
        # 21: Setting and managing limits on resource usage
        'Resource Quotas',
        # 22: Calling APIs to perform tasks or retrieve data
        'API Invocation',
        # 23: Using automated machine learning to generate forecasts
        'AutoML Forecasting',
        # 24: Working with and manipulating columns in datasets
        'Column Manipulation',
        # 25: Using machine learning to analyze and interpret visual data
        'Computer Vision',
        # 26: Deploying machine learning models as web services
        'Web Service Deployment',
        # 27: Open-source container orchestration platform for managing containerized applications
        'Kubernetes Management',
        # 28: Ensemble learning method for classification and regression tasks
        'Random Forest',
        # 29: File format for storing and exchanging tabular data
        'CSV Files',
        # 30: Visualizing and tracking model training and evaluation using TensorBoard
        'TensorBoard Logging',
        # 31: Planning and implementing new features for a platform or product
        'Feature Roadmap',
        # 32: Managing and versioning datasets
        'Dataset Versioning',
        # 33: Monitoring and logging AWS resources and applications
        'CloudWatch Logging',
        # 34: Converting audio speech to text
        'Speech-to-Text',
        # 35: Using YAML files to configure applications and services
        'YAML Configuration',
        # 36: Storing and accessing data in cloud-based storage solutions
        'Data Storage',
        # 37: Connecting to AWS services privately through a VPC
        'VPC Endpoints',
        # 38: Evaluating and improving the accuracy of machine learning models
        'Model Accuracy',
        # 39: Preparing and querying input data for machine learning models
        'Model Input',
        # 40: Managing access to cloud-based storage buckets
        'Bucket Access',
        # 41: Managing and monitoring the execution of jobs and tasks
        'Run Management',
        # 42: Using trained machine learning models to make predictions
        'Model Inference',
        # 43: Creating and managing cloud-based notebook instances for data analysis and experimentation
        'Notebook Instances',
    ])),
}

In [39]:
# Ask the chat model for a two-word label and a short comment for every
# solution topic, based on the topic's keyword list.
prompt_topic = '''You will be given a list of keywords for each topic, I want you to provide a description of each topic in a two-word phrase but guarantee that each description is exclusive to the other. Also, for each description, you need to attach short comments on what these keywords are talking about in general.'''

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(os.path.join(path_solution, 'Topic terms.pickle'), 'rb') as handle:
    topic_terms = pickle.load(handle)

# One line per topic: "Topic <position>: kw1, kw2, ...". The keyword is the
# first element of each entry in a topic.
keyword_strings = (', '.join(term[0] for term in topic) for topic in topic_terms)
topic_term_list = [
    f'Topic {index}: {terms}' for index, terms in enumerate(keyword_strings)
]

# The keyword block is fenced with ### markers and appended to the instructions.
user_message = {
    "role": "user",
    "content": prompt_topic + '\n###' + '\n'.join(topic_term_list) + '###\n',
}

completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[user_message],
    temperature=0,
    max_tokens=1500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    timeout=100,
    stream=False,
)

topic_solution = completion.choices[0].message.content
print(topic_solution)


Topic 0: Git Management - This topic is about managing files and repositories using Git, including tracking changes, pushing updates, and managing repositories.
Topic 1: Role-Based Access Control - This topic is about controlling access to resources based on user roles and permissions, including creating, attaching, and executing roles.
Topic 2: Package Management - This topic is about managing software packages and environments using tools like Conda and Pip, including installation, updating, and environment management.
Topic 3: Logging and Metrics - This topic is about logging and tracking metrics in a pipeline, including logging data, creating tables, and using loggers.
Topic 4: Dataset Management - This topic is about managing datasets and columns, including primary datasets, target columns, and scoring models.
Topic 5: Docker Management - This topic is about managing Docker containers and images, including creating, running, and managing Docker environments.
Topic 6: Parameter Man

In [None]:
# Solution topic id -> curated topic name.
# The names are listed in topic-id order, so the position in the list is the
# topic id (the number at the start of each comment). Id -1 maps to NaN
# (presumably the "no topic" / outlier id - confirm against the topic model).
topic_mapping_solution = {
    -1: np.nan,
    **dict(enumerate([
        # 0: Managing files and repositories using Git, including tracking changes, pushing updates, and managing repositories.
        'Git Version Control',
        # 1: Controlling access to resources based on user roles and permissions, including creating, attaching, and executing roles.
        'Role-Based Access Control',
        # 2: Managing software packages and environments using tools like Conda and Pip, including installation, updating, and environment management.
        'Package Management',
        # 3: Logging and tracking metrics in a pipeline, including logging data, creating tables, and using loggers.
        'Logging and Metrics',
        # 4: Managing datasets and columns, including primary datasets, target columns, and scoring models.
        'Dataset Management',
        # 5: Managing Docker containers and images, including creating, running, and managing Docker environments.
        'Docker',
        # 6: Managing programmatic parameters, including setting and configuring hyperparameters, global parameters, and standard Python parameter types.
        'Parameter Management',
        # 7: Configuring pipelines and stages using YAML files, including specifying directories, paths, and output stages.
        'YAML Configuration',
        # 8: Managing endpoints and APIs, including creating, deploying, and configuring endpoints for REST and web services.
        'Endpoint Management',
        # 9: Running and managing Jupyter Notebooks, including opening files, restarting kernels, and managing directories.
        'Jupyter Notebook',
        # 10: Managing tabular datasets using Pandas dataframes, including creating, scoring, and manipulating datasets.
        'Pandas Dataframe',
        # 11: Managing TensorFlow models, including installation, training, and logging.
        'TensorFlow',
        # 12: Managing artifacts, including uploading, downloading, and storing
        'Artifact Management',
        # 13: Deploying models to endpoints, including creating, deploying, and managing endpoints for cloud and model services.
        'Model Deployment',
        # 14: Using the random forest algorithm for machine learning, including building, training, and cutting forests.
        'Random Forest',
        # 15: Modeling pipelines, including building, inputting, and parameterizing pipelines for API and object use.
        'Pipeline Modeling',
        # 16: Managing JSON payloads, including formatting, serializing, and loading data.
        'JSON Payload',
        # 17: Configuring remote resources, including adding, modifying, and running remote URLs and resources.
        'Remote Configuration',
        # 18: Managing Spark clusters and datasets, including running, testing, and using Spark for machine learning.
        'Apache Spark',
        # 19: Implementing Python models, including using PyFunc and PythonModel interfaces, importing models, and loading models.
        'Python Model',
        # 20: Uploading and downloading data and files, including saving CSV files and uploading file objects.
        'Data Transfer',
        # 21: Using parallel computing and clusters for machine learning, including exploring, creating, and running clusters.
        'Cluster Computing',
        # 22: Managing pipeline data, including inputting datasets, creating pipeline classes, and using file inputs.
        'Pipeline Data',
        # 23: Managing CSV files and formatting, including writing, converting, and importing CSV data.
        'CSV Files',
        # 24: Registering and managing models, including registering models, managing metadata, and versioning models.
        'Model Registry',
        # 25: Managing memory and distributing training for large datasets, including using CPUs and notebooks for training.
        'Memory Management',
        # 26: Using neural networks for machine learning, including NLP, vision, OCR, and speech-to-text analysis.
        'Model Training',
        # 27: Managing SDK versions, including updating, upgrading, and installing SDKs like PyTorch.
        'SDK Versioning',
        # 28: Invoking endpoints and APIs using Lambda functions and API gateways, including waiting for responses and using SDKs.
        'API Invocation',
    ])),
}

In [5]:
# Sankey diagram linking each challenge topic to the solution topics found on
# the same rows. Rows with Solution_topic == -1 are excluded.
# The file name uses the same capitalisation ('Topics.json') as every other
# cell; the lower-case spelling fails on case-sensitive file systems.
df_topics = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_topics = df_topics[df_topics['Solution_topic'] > -1]

# Topic ids are used as node labels. They are prefixed with 'c_' / 's_' so a
# challenge topic and a solution topic sharing an id stay distinct nodes.
label_challenge_original = df_topics['Challenge_topic'].unique().tolist()
label_challenge_refined = [f'c_{label}' for label in label_challenge_original]
label_challenge_map = dict(
    zip(label_challenge_original, label_challenge_refined))

label_solution_original = df_topics['Solution_topic'].unique().tolist()
label_solution_refined = [f's_{label}' for label in label_solution_original]
label_solution_map = dict(zip(label_solution_original, label_solution_refined))

df_topics = df_topics.replace(
    {'Challenge_topic': label_challenge_map, 'Solution_topic': label_solution_map})

# Number of rows per (challenge topic, solution topic) pair.
categories = ['Challenge_topic', 'Solution_topic']
df_topics = df_topics.groupby(categories).size().reset_index(name='value')

# we only visualize large topics
df_topics = df_topics[df_topics['value'] > 20]

# One source/target/value frame per pair of adjacent category columns,
# concatenated once rather than grown from an empty frame.
link_frames = []
for i in range(len(categories) - 1):
    tempDf = df_topics[[categories[i], categories[i + 1], 'value']].copy()
    tempDf.columns = ['source', 'target', 'value']
    link_frames.append(tempDf)
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Sankey links refer to nodes by position in `label`; a dict lookup replaces
# the repeated linear list.index() search.
label = list(np.unique(df_topics[categories].values))
label_position = {name: position for position, name in enumerate(label)}
source = newDf['source'].map(label_position)
target = newDf['target'].map(label_position)
value = newDf['value']

link = dict(source=source, target=target, value=value)
node = dict(label=label)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(height=1000, width=1000, font=dict(size=30))
fig.write_image(os.path.join(path_challenge_information,
                'Challenge solution sankey.png'))

In [8]:
# Treemap of challenge participation (answers + comments), nested by Tool then
# Platform and coloured by challenge topic; saved as a PNG.

df_topics = pd.read_json(os.path.join(path_general, 'Topics.json')).assign(
    Challenge_participation_count=lambda frame: (
        frame['Challenge_answer_count'] + frame['Challenge_comment_count']))

treemap_options = dict(
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Challenge_topic',
    width=2000,
    height=1000,
)
fig = px.treemap(df_topics, **treemap_options)

challenge_treemap_file = os.path.join(
    path_challenge_information, 'Challenge_topic_distribution.png')
fig.write_image(challenge_treemap_file)


In [9]:
# Treemap of challenge participation (answers + comments) restricted to rows
# with Solution_topic > -1, nested by Tool then Platform and coloured by
# solution topic; saved as a PNG.

df_topics = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_topics = df_topics.loc[df_topics['Solution_topic'] > -1].assign(
    Challenge_participation_count=lambda frame: (
        frame['Challenge_answer_count'] + frame['Challenge_comment_count']))

treemap_options = dict(
    path=['Tool', 'Platform'],
    values='Challenge_participation_count',
    color='Solution_topic',
    width=2000,
    height=1000,
)
fig = px.treemap(df_topics, **treemap_options)

solution_treemap_file = os.path.join(
    path_solution_information, 'Solution_topic_distribution.png')
fig.write_image(solution_treemap_file)


In [10]:
# Collect challenge statistics information
#
# For every challenge topic (rows with Challenge_topic > -1) this cell computes
# per-topic summary statistics, writes them to Information/general.json and
# saves one bar chart per statistic.

df_challenge = pd.read_json(os.path.join(path_general, 'Topics.json'))
# .copy() so the column assignments below act on an independent frame rather
# than on a slice of the frame that was just read.
df_challenge = df_challenge[df_challenge['Challenge_topic'] > -1].copy()

df_challenge['Challenge_comment_count'] = df_challenge['Challenge_comment_count'].fillna(0)
df_challenge['Challenge_solved_time'] = df_challenge['Challenge_closed_time'] - \
    df_challenge['Challenge_creation_time']
df_challenge['Challenge_adjusted_solved_time'] = df_challenge['Solution_last_edit_time'] - \
    df_challenge['Challenge_last_edit_time']
# NOTE(review): only the comment count is NaN-filled; a missing answer count
# would make the participation count NaN - confirm that column is complete.
df_challenge['Challenge_participation_count'] = df_challenge['Challenge_answer_count'] + \
    df_challenge['Challenge_comment_count']

total_count = df_challenge['Challenge_topic'].count()

# Each 'Mean_<metric>' statistic is the group mean of 'Challenge_<metric>'.
# The order fixes the column order of the JSON output.
challenge_mean_metrics = [
    'score',
    'favorite_count',
    'follower_count',
    'link_count',
    'information_entropy',
    'readability',
    'sentence_count',
    'word_count',
    'unique_word_count',
    'view_count',
    'answer_count',
    'comment_count',
]
one_hour = pd.Timedelta(hours=1)  # durations are reported in hours

topic_records = []
for name, group in df_challenge.groupby('Challenge_topic'):
    topic_info = {'Challenge_topic': name}
    for metric in challenge_mean_metrics:
        topic_info[f'Mean_{metric}'] = group[f'Challenge_{metric}'].mean()

    mean_participation_count = (
        topic_info['Mean_answer_count'] + topic_info['Mean_comment_count'])
    count = group['Challenge_topic'].count()

    topic_info.update({
        'Score_participation_ratio': topic_info['Mean_score'] / mean_participation_count,
        'Score_participation_weighted_product': (
            group['Challenge_score'] * group['Challenge_participation_count']).mean(),
        # Share of all challenge rows, in percent.
        'Count_ratio': count / total_count * 100,
        # Fraction (not percent) of rows that have a closed time.
        'Solved_ratio': group['Challenge_closed_time'].notna().sum() / count,
        'Mean_solved_time': group['Challenge_solved_time'].mean() / one_hour,
        'Median_solved_time': group['Challenge_solved_time'].median() / one_hour,
        'Mean_adjusted_solved_time': group['Challenge_adjusted_solved_time'].mean() / one_hour,
        'Median_adjusted_solved_time': group['Challenge_adjusted_solved_time'].median() / one_hour,
    })
    topic_records.append(topic_info)

df_topics = pd.DataFrame(topic_records)
df_topics.to_json(os.path.join(path_challenge_information,
                  'general.json'), indent=4, orient='records')
df_topics = df_topics.set_index('Challenge_topic')

# (statistic column, chart title, output file name, sort ascending?)
# The titles of the mean/median solved-time charts used to be swapped; each
# title now matches the column that is plotted. File names are unchanged.
challenge_charts = [
    ('Mean_score', 'Challenge mean score', 'Mean_score.png', False),
    ('Mean_favorite_count', 'Challenge mean favorite count', 'Mean_favorite_count.png', False),
    ('Mean_follower_count', 'Challenge mean follower count', 'Mean_follower_count.png', False),
    ('Mean_link_count', 'Challenge mean link count', 'Mean_link_count.png', False),
    ('Mean_information_entropy', 'Challenge mean info entropy', 'Mean_information_entropy.png', False),
    ('Mean_readability', 'Challenge mean readability', 'Mean_readability.png', False),
    ('Mean_sentence_count', 'Challenge mean sentence count', 'Mean_sentence_count.png', False),
    ('Mean_word_count', 'Challenge mean word count', 'Mean_word_count.png', False),
    ('Mean_unique_word_count', 'Challenge mean unique word count', 'Mean_unique_word_count.png', False),
    ('Mean_view_count', 'Challenge mean view count', 'Mean_view_count.png', False),
    ('Mean_answer_count', 'Challenge mean answer count', 'Mean_answer_count.png', False),
    ('Mean_comment_count', 'Challenge mean comment count', 'Mean_comment_count.png', False),
    ('Score_participation_ratio', 'Challenge score participation ratio',
     'Score_participation_ratio.png', False),
    ('Score_participation_weighted_product', 'Challenge score participation weighted product',
     'Score_participation_weighted_product.png', False),
    ('Count_ratio', 'Challenge count ratio', 'Count_ratio.png', False),
    # The solved ratio is the only chart sorted in ascending order.
    ('Solved_ratio', 'Challenge Solved ratio', 'solved_ratio.png', True),
    ('Mean_solved_time', 'Challenge mean solved time', 'mean_solved_time.png', False),
    ('Median_solved_time', 'Challenge median solved time', 'median_solved_time.png', False),
    ('Mean_adjusted_solved_time', 'Challenge mean adjusted solved time',
     'Mean_adjusted_solved_time.png', False),
    ('Median_adjusted_solved_time', 'Challenge median adjusted solved time',
     'Median_adjusted_solved_time.png', False),
]

for column, title, file_name, ascending in challenge_charts:
    fig = df_topics.sort_values(column, ascending=ascending)[column].plot(
        kind='bar', figsize=(15, 8), title=title, rot=15).get_figure()
    fig.savefig(os.path.join(path_challenge_information, file_name))
    # Close this specific figure so figures do not pile up in memory.
    plt.close(fig)


In [11]:
# Collect solution statistics information
#
# For every solution topic (rows with Solution_topic > -1) this cell computes
# per-topic summary statistics, writes them to Information/general.json and
# saves one bar chart per statistic.

df_solution = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_solution = df_solution[df_solution['Solution_topic'] > -1]

# The statistics are computed on df_solution. The earlier version used
# df_challenge here, which relied on a frame left over from another cell and
# did not apply the Solution_topic > -1 filter.
total_count = df_solution['Solution_topic'].count()

# Each 'Mean_<metric>' statistic is the group mean of 'Solution_<metric>'.
# The order fixes the column order of the JSON output.
solution_mean_metrics = [
    'score',
    'link_count',
    'information_entropy',
    'readability',
    'sentence_count',
    'word_count',
    'unique_word_count',
    'comment_count',
]

topic_records = []
for name, group in df_solution.groupby('Solution_topic'):
    topic_info = {'Solution_topic': name}
    for metric in solution_mean_metrics:
        topic_info[f'Mean_{metric}'] = group[f'Solution_{metric}'].mean()
    # Share of all solution rows, in percent.
    topic_info['Count_ratio'] = group['Solution_topic'].count() / total_count * 100
    topic_records.append(topic_info)

df_topics = pd.DataFrame(topic_records)
df_topics.to_json(os.path.join(path_solution_information,
                  'general.json'), indent=4, orient='records')
df_topics = df_topics.set_index('Solution_topic')

# (statistic column, chart title, output file name); all sorted descending.
solution_charts = [
    ('Mean_score', 'Solution mean score', 'Mean_score.png'),
    ('Mean_link_count', 'Solution mean link count', 'Mean_link_count.png'),
    ('Mean_information_entropy', 'Solution mean info entropy', 'Mean_information_entropy.png'),
    ('Mean_readability', 'Solution mean readability', 'Mean_readability.png'),
    ('Mean_sentence_count', 'Solution mean sentence count', 'Mean_sentence_count.png'),
    ('Mean_word_count', 'Solution mean word count', 'Mean_word_count.png'),
    ('Mean_unique_word_count', 'Solution mean unique word count', 'Mean_unique_word_count.png'),
    ('Mean_comment_count', 'Solution mean comment count', 'Mean_comment_count.png'),
    ('Count_ratio', 'Solution count ratio', 'Count_ratio.png'),
]

for column, title, file_name in solution_charts:
    fig = df_topics.sort_values(column, ascending=False)[column].plot(
        kind='bar', figsize=(15, 8), title=title, rot=15).get_figure()
    fig.savefig(os.path.join(path_solution_information, file_name))
    # Close this specific figure so figures do not pile up in memory.
    plt.close(fig)


In [12]:
import scipy.interpolate
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess


def smooth(x, y, xgrid, lowess_kw=None):
    """Fit LOWESS to one bootstrap resample of (x, y) and evaluate it on xgrid.

    Parameters
    ----------
    x, y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Observations; 50 points are drawn from them with replacement.
    xgrid : array-like
        Positions at which the smoothed curve is evaluated.
    lowess_kw : dict, optional
        Extra keyword arguments forwarded to statsmodels' lowess. ``None``
        (the default) means no extra arguments.

    Returns
    -------
    The smoothed values linearly interpolated (and extrapolated) onto xgrid.
    """
    # The default None used to crash on ``**lowess_kw``; copy so the caller's
    # dict is never modified.
    kw = dict(lowess_kw or {})
    # The interpolation below pairs y_sm with x_s element by element, so the
    # smoother has to return values in input order.
    kw['return_sorted'] = False

    samples = np.random.choice(len(x), 50, replace=True)
    y_s = y[samples]
    x_s = x[samples]
    y_sm = sm_lowess(y_s, x_s, **kw)
    # regularly sample it onto the grid
    # NOTE(review): sampling with replacement repeats x values, which may leave
    # the interpolant undefined (NaN) at some grid points - the caller
    # aggregates with NaN-aware functions.
    y_grid = scipy.interpolate.interp1d(
        x_s, y_sm, fill_value='extrapolate')(xgrid)
    return y_grid


def lowess_with_confidence_bounds(x, y, conf_interval=0.95, lowess_kw=None, n_boot=100):
    """
    Perform Lowess regression and determine a confidence interval by bootstrap resampling

    Parameters
    ----------
    x, y : numpy arrays
        Observations. ``x`` must provide ``.min()`` / ``.max()``.
    conf_interval : float
        Central coverage of the returned band (0.95 -> 2.5th to 97.5th percentile).
    lowess_kw : dict, optional
        Keyword arguments handed to ``smooth`` for the LOWESS fit.
    n_boot : int
        Number of bootstrap curves (defaults to the previously hard-coded 100).

    Returns
    -------
    xgrid, mean, stderr, clower, cupper
        The evaluation grid (50 evenly spaced points between min and max of x)
        and, per grid point, the mean, standard error and lower/upper
        percentile bounds across the bootstrap curves.
    """
    xgrid = np.linspace(x.min(), x.max())

    # Rows are grid points, columns are bootstrap replicates.
    smooths = np.stack([smooth(x, y, xgrid, lowess_kw)
                        for _ in range(n_boot)]).T

    mean = np.nanmean(smooths, axis=1)
    # NaN-aware standard error (sample std with ddof=1 over sqrt(n)), in line
    # with the NaN-aware mean and percentiles. Plain numpy also removes the
    # dependency on scipy.stats, which this notebook never imports explicitly.
    valid_count = np.sum(~np.isnan(smooths), axis=1)
    stderr = np.nanstd(smooths, axis=1, ddof=1) / np.sqrt(valid_count)

    clower = np.nanpercentile(smooths, (1-conf_interval)*50, axis=1)
    cupper = np.nanpercentile(smooths, (1+conf_interval)*50, axis=1)

    return xgrid, mean, stderr, clower, cupper

In [13]:
# Earliest and latest creation time among rows that have a challenge topic.
df_all = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_challenge = df_all.loc[df_all['Challenge_topic'].gt(-1)]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
(
    min(df_challenge['Challenge_creation_time']),
    max(df_challenge['Challenge_creation_time']),
)


(Timestamp('2014-08-08 14:04:22.160000'),
 Timestamp('2023-02-22 01:36:03.995000'))

In [14]:
# Explore challenge topics evolution
#
# For every challenge topic: count posts per two-week bin of creation time,
# fit a bootstrapped LOWESS trend and save the chart as Topic_<id>.

df_challenge = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_challenge = df_challenge[df_challenge['Challenge_topic'] > -1]
# NOTE(review): the lower bound matches the earliest *solution* row printed
# further down, while the earliest challenge printed above is from 2014-08 -
# confirm that dropping the first weeks is intended.
df_challenge = df_challenge[(df_challenge['Challenge_creation_time'] > '2014-09-14')
                            & (df_challenge['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_challenge.groupby('Challenge_topic'):
    group = group.groupby(pd.Grouper(key='Challenge_creation_time', freq='2W')).agg(
        Count=('Challenge_topic', 'count')).reset_index()
    dates = pd.to_datetime(group['Challenge_creation_time']).values
    # LOWESS needs numeric x: whole days since the Unix epoch (vectorised).
    x = dates.astype('datetime64[D]').astype(int)
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    # Convert the numeric grid back to timestamps so the trend and the
    # observations share the same axis units, independent of Matplotlib's
    # date epoch setting.
    xgrid_dates = pd.to_datetime(xgrid, unit='D')
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, y, 'k.', label='Observations')
    ax.plot(xgrid_dates, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid_dates, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.legend(loc='best')
    fig.savefig(os.path.join(path_challenge_evolution,
                f'Topic_{name}'), bbox_inches="tight")
    plt.close(fig)

In [15]:
# Earliest and latest challenge creation time among rows that have a solution topic.
df_all = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_solution = df_all.loc[df_all['Solution_topic'].gt(-1)]
# BigQuery Stack Overflow public dataset is updated until Nov 24, 2022, 1:39:22 PM UTC-5
(
    min(df_solution['Challenge_creation_time']),
    max(df_solution['Challenge_creation_time']),
)

(Timestamp('2014-09-14 22:12:24.493000'),
 Timestamp('2023-02-21 18:36:06.284000'))

In [16]:
# Explore solution topics evolution
#
# For every solution topic: count rows per weekly bin of the challenge's
# closed time, fit a bootstrapped LOWESS trend and save the chart as Topic_<id>.

df_solution = pd.read_json(os.path.join(path_general, 'Topics.json'))
df_solution = df_solution[df_solution['Solution_topic'] > -1]
# NOTE(review): rows are filtered on creation time but binned on closed time
# below - confirm this asymmetry is intended.
df_solution = df_solution[(df_solution['Challenge_creation_time'] > '2014-09-14')
                          & (df_solution['Challenge_creation_time'] < '2022-11-21')]

for name, group in df_solution.groupby('Solution_topic'):
    group = group.groupby(pd.Grouper(key='Challenge_closed_time', freq='W')).agg(
        Count=('Solution_topic', 'count')).reset_index()
    dates = pd.to_datetime(group['Challenge_closed_time']).values
    # LOWESS needs numeric x: whole days since the Unix epoch (vectorised).
    x = dates.astype('datetime64[D]').astype(int)
    y = group['Count'].values
    # 95% confidence interval
    xgrid, mean, stderr, clower, cupper = lowess_with_confidence_bounds(
        x, y, conf_interval=0.95, lowess_kw={"frac": 0.5, "it": 5, "return_sorted": False})
    # Convert the numeric grid back to timestamps so the trend and the
    # observations share the same axis units, independent of Matplotlib's
    # date epoch setting.
    xgrid_dates = pd.to_datetime(xgrid, unit='D')
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(dates, y, 'k.', label='Observations')
    ax.plot(xgrid_dates, mean, color='tomato', label='LOWESS')
    ax.fill_between(xgrid_dates, clower, cupper, alpha=0.3,
                    label='LOWESS uncertainty')
    ax.legend(loc='best')
    fig.savefig(os.path.join(path_solution_evolution,
                f'Topic_{name}'), bbox_inches="tight")
    plt.close(fig)