In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from scipy.stats import kruskal, shapiro, mannwhitneyu, kendalltau, chi2_contingency, f_oneway

In [2]:
path_dataset = '../../Dataset'
path_result = '../../Result'

path_rq12 = os.path.join(path_result, 'RQ12')
path_rq3 = os.path.join(path_result, 'RQ3')
path_rq4 = os.path.join(path_result, 'RQ4')

warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", None, "display.max_columns",
              None, 'display.max_colwidth', None)

macro_topic_indexing = {
    0: 'Code Development',
    1: 'Cost Management',
    2: 'Compute Management',
    3: 'Data Development',
    4: 'Data Management',
    5: 'Environment Management',
    6: 'Experiment Management',
    7: 'File Management',
    8: 'Model Development',
    9: 'Model Management',
    10: 'Model Serving',
    11: 'Network Management',
    12: 'Observability Management',
    13: 'Pipeline Management',
    14: 'Quality Assurance Management',
    15: 'Security Management'
}


In [24]:
def remove_nan_columns(arr):
    """Removes columns from a 2D nd-array that contain any NaN values."""
    # Get a boolean mask for columns without NaN values
    mask = ~np.isnan(arr).any(axis=0)
    
    # Use the mask to filter out columns with NaN values
    return arr[:, mask]

In [44]:
df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))
df['Challenge_contributor_issue_ratio'] = df['Challenge_repo_contributor_count'] / df['Challenge_repo_issue_count']
df['Challenge_watch_issue_ratio'] = df['Challenge_repo_watch_count'] / df['Challenge_repo_issue_count']

for index, row in df.iterrows():
    if 'Stack' in row['Platform']:
        df.loc[index, 'Platform'] = 'General Discussion Forum'
    elif 'Tool' in row['Platform']:
        df.loc[index, 'Platform'] = 'Tool-specific Discussion Forum'
    elif 'Issue' in row['Platform']:
        df.loc[index, 'Platform'] = 'Repo-specific Discussion Forum'
    else:
        df.drop(index, inplace=True)

df.to_json(os.path.join(path_rq4, 'forums.json'), orient='records', indent=4)

In [12]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    df_inquiry = []

    for name, group in forum_posts.groupby('Challenge_type'):
        info = {
            'Type': name,
            'Prevalence (%)': round(len(group) / len(df) * 100, 2),
            'Unresolved rate (%)': round(len(group[group['Challenge_closed_time'].isna()]) / len(group) * 100, 2),
            'Median resolution time (hours)': round(group['Challenge_resolved_time'].median(), 2),
        }
        df_inquiry.append(info)

    df_inquiry = pd.DataFrame(df_inquiry)
    print(forum_name)
    print(df_inquiry.to_latex(index=False, float_format="%.2f"))

General Discussion Forum
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 22.76 & 64.23 & 22.32 \\
problem & 27.37 & 69.23 & 25.87 \\
\bottomrule
\end{tabular}

Repo-specific Discussion Forum
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 3.16 & 37.67 & 209.17 \\
problem & 14.10 & 29.51 & 224.79 \\
\bottomrule
\end{tabular}

Tool-specific Discussion Forum
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 15.56 & 77.25 & 11.51 \\
problem & 17.04 & 85.07 & 14.83 \\
\bottomrule
\end{tabular}



In [13]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    forum_posts = forum_posts[forum_posts['Challenge_resolved_time'].notna()]

    df_problem = forum_posts[forum_posts['Challenge_type'] == 'problem']['Challenge_resolved_time'].tolist()
    df_knowledge = forum_posts[forum_posts['Challenge_type'] == 'knowledge']['Challenge_resolved_time'].tolist()

    print("Forum:", forum_name)
    print("P-value of problem inquiry:", shapiro(df_problem)[1])
    print("P-value of knowledge inquiry:", shapiro(df_knowledge)[1])
    print("P-value of two type of inquiries:", mannwhitneyu(df_problem, df_knowledge)[1])
    print()


Forum: General Discussion Forum
P-value of problem inquiry: 0.0
P-value of knowledge inquiry: 0.0
P-value of two type of inquiries: 0.5493045237750223

Forum: Repo-specific Discussion Forum
P-value of problem inquiry: 0.0
P-value of knowledge inquiry: 2.0156321067808223e-24
P-value of two type of inquiries: 0.7707157340555867

Forum: Tool-specific Discussion Forum
P-value of problem inquiry: 1.1686826365657608e-33
P-value of knowledge inquiry: 2.0624422901809804e-39
P-value of two type of inquiries: 0.1713358974500302



In [16]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    values_k = []
    values_p = []

    for index, group in forum_posts.groupby('Challenge_topic'):
        group_k = group[group['Challenge_type'] == 'knowledge']
        group_p = group[group['Challenge_type'] == 'problem']
        values_k.append(len(group_k))
        values_p.append(len(group_p))
    
    contingency_table = np.array([values_k, values_p])
    print("Forum:", forum_name)
    print("P-value of two type of inquiries:", chi2_contingency(contingency_table)[1])
    
    for index, (p, k) in enumerate(zip(values_p, values_k)):
        if abs(p-k) > 40:
            print(f'{index+1}: p: {p}, k: {k}, diff: {abs(p-k)}')
    print()

Forum: General Discussion Forum
P-value of two type of inquiries: 2.799237376237899e-26
14: p: 72, k: 20, diff: 52

Forum: Repo-specific Discussion Forum
P-value of two type of inquiries: 0.02758134265144551
1: p: 63, k: 10, diff: 53
2: p: 62, k: 19, diff: 43
7: p: 50, k: 5, diff: 45

Forum: Tool-specific Discussion Forum
P-value of two type of inquiries: 6.312495048647691e-14



In [46]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))
df = df[df['Challenge_resolved_time'].notna()]

basic_columns = ['Challenge_score_count', 'Challenge_readability', 'Challenge_reading_time', 'Challenge_link_count', 'Challenge_code_count']
extra_columns = {
    'General Discussion Forum': ['Challenge_comment_count', 'Challenge_comment_readability', 'Challenge_comment_reading_time', 'Challenge_comment_link_count', 'Challenge_favorite_count', 'Challenge_view_count', 'Poster_reputation_count', 'Poster_view_count'],
    'Repo-specific Discussion Forum': ['Challenge_comment_count', 'Challenge_comment_readability', 'Challenge_comment_reading_time', 'Challenge_comment_link_count', 'Challenge_contributor_issue_ratio', 'Challenge_repo_contributor_count', 'Challenge_repo_fork_count', 'Challenge_repo_issue_count', 'Challenge_repo_star_count', 'Challenge_repo_watch_count', 'Challenge_watch_issue_ratio'],
    'Tool-specific Discussion Forum': [],
}

for forum_name, forum_posts in df.groupby('Platform'):
    resolved_time = forum_posts['Challenge_resolved_time']
    subset_columns = basic_columns + extra_columns[forum_name]
    df_test = []

    for column in subset_columns:
        tau, pvalue = kendalltau(resolved_time, forum_posts[column])
        column_name = ' '.join(column.split('_')[1:])
        kendall_tau = pd.Series({'Metrics': column_name, 'tau': tau, 'p-value': pvalue})
        df_test.append(kendall_tau)

    df_test = pd.DataFrame(df_test)
    print(f'Forum: {forum_name}')
    print(df_test.to_latex(index=False, float_format="%.3f"))

Forum: General Discussion Forum
\begin{tabular}{lrr}
\toprule
Metrics & tau & p-value \\
\midrule
Challenge_score_count & 0.028 & 0.105 \\
Challenge_answer_count & 0.083 & 0.000 \\
Challenge_comment_count & 0.081 & 0.000 \\
Challenge_favorite_count & NaN & NaN \\
Challenge_view_count & 0.036 & 0.018 \\
Poster_reputation_count & NaN & NaN \\
Poster_view_count & NaN & NaN \\
\bottomrule
\end{tabular}

Forum: Repo-specific Discussion Forum
\begin{tabular}{lrr}
\toprule
Metrics & tau & p-value \\
\midrule
Challenge_score_count & 0.094 & 0.000 \\
Challenge_comment_count & 0.121 & 0.000 \\
Challenge_contributor_issue_ratio & NaN & NaN \\
Challenge_repo_contributor_count & 0.083 & 0.000 \\
Challenge_repo_fork_count & 0.082 & 0.000 \\
Challenge_repo_issue_count & 0.116 & 0.000 \\
Challenge_repo_star_count & 0.060 & 0.001 \\
Challenge_repo_watch_count & 0.098 & 0.000 \\
Challenge_watch_issue_ratio & NaN & NaN \\
\bottomrule
\end{tabular}

Forum: Tool-specific Discussion Forum
\begin{tabular}{lr

In [17]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    forum_posts = forum_posts[forum_posts['Challenge_resolved_time'].notna()]
    resolved_time_list = []

    for name, group in forum_posts.groupby('Challenge_topic_macro'):
        resolved_time = group['Challenge_resolved_time'].apply(lambda x: np.log(x+1))
        resolved_time_list.append(resolved_time)

    print(f'Forum: {forum_name}')
    print(f'Kruskal-Wallis H-test: p-value = {kruskal(*resolved_time_list).pvalue:.3f}')
    print(f'ANOVA: p-value = {f_oneway(*resolved_time_list).pvalue:.3f}')
    print()

Forum: General Discussion Forum
Kruskal-Wallis H-test: p-value = 0.093
ANOVA: p-value = 0.236

Forum: Repo-specific Discussion Forum
Kruskal-Wallis H-test: p-value = 0.359
ANOVA: p-value = 0.475

Forum: Tool-specific Discussion Forum
Kruskal-Wallis H-test: p-value = 0.071
ANOVA: p-value = 0.137



In [29]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))
df['Challenge_resolved_time'] = df['Challenge_resolved_time'].apply(lambda x: np.log(x+1))

for forum_name, forum_posts in df.groupby('Platform'):
    resolved_time_list = []

    for name, group in df.groupby('Challenge_topic_macro'):
        resolved_time_list.append(group['Challenge_resolved_time'])

    traces = [go.Box(y=lst, name=f"{macro_topic_indexing[i]}") for i, lst in enumerate(resolved_time_list)]
    fig = go.Figure(data=traces)
    fig.update_layout(
        title_text=f'{forum_name}',
        xaxis_title="Macro-topics",
        yaxis_title="Resolved time (hours) in log scale",
        showlegend=False
    )
    fig.show()
    fig.write_image(os.path.join(path_rq4, f'Resolved time in log scale - {forum_name}.pdf'))

In [30]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    forum_posts['Challenge_resolved_time'] = forum_posts['Challenge_resolved_time'].apply(lambda x: np.log(x+1))
    forum_posts['Challenge_topic_macro'] = forum_posts['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])
    
    fig = px.box(forum_posts, x="Challenge_topic_macro", y="Challenge_resolved_time", color="Challenge_type")
    # fig.update_traces(quartilemethod="inclusive") # or "inclusive", or "linear" by default
    fig.update_layout(
        title_text=f'{forum_name}',
        xaxis_title="Macro-topics",
        yaxis_title="Resolved time (hours) in log scale",
        legend_title='Inquiry type',
    )
    fig.show()
    fig.write_image(os.path.join(path_rq4, f'Resolved time in log scale (based on inquiry types) - {forum_name}.pdf'))

In [28]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    values_k = []
    values_p = []

    for index, group in forum_posts.groupby('Challenge_topic_macro'):
        group_k = group[group['Challenge_type'] == 'knowledge']
        group_p = group[group['Challenge_type'] == 'problem']
        values_k.append(group_k['Challenge_resolved_time'].isna().sum()/len(group_k))
        values_p.append(group_p['Challenge_resolved_time'].isna().sum()/len(group_p))

    contingency_table = np.array([values_k, values_p])
    contingency_table = remove_nan_columns(contingency_table)

    print("Forum:", forum_name)
    print("P-value of two type of inquiries:", chi2_contingency(contingency_table)[1])
    print()

Forum: General Discussion Forum
P-value of two type of inquiries: 0.9999999999999925

Forum: Repo-specific Discussion Forum
P-value of two type of inquiries: 0.9999999930577732

Forum: Tool-specific Discussion Forum
P-value of two type of inquiries: 0.9999999999999578

