In [19]:
import os
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from scipy.stats import kruskal, shapiro, mannwhitneyu, kendalltau, chi2_contingency, f_oneway

In [55]:
path_dataset = '../../Dataset'
path_result = '../../Result'

path_rq12 = os.path.join(path_result, 'RQ12')
path_rq3 = os.path.join(path_result, 'RQ3')
path_rq4 = os.path.join(path_result, 'RQ4')

warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", None, "display.max_columns",
              None, 'display.max_colwidth', None)

macro_topic_indexing = {
    0: 'Code Development',
    1: 'Cost Management',
    2: 'Compute Management',
    3: 'Data Development',
    4: 'Data Management',
    5: 'Environment Management',
    6: 'Experiment Management',
    7: 'File Management',
    8: 'Model Development',
    9: 'Model Management',
    10: 'Model Serving',
    11: 'Network Management',
    12: 'Observability Management',
    13: 'Pipeline Management',
    14: 'Quality Assurance Management',
    15: 'Security Management'
}

topic_indexing = {
    0: 'Pipeline Step',
    1: 'Log',
    2: 'Docker Image',
    3: 'Model Tar',
    4: 'File Directory',
    5: 'Plot',
    6: 'Runs',
    7: 'Artifacts',
    8: 'Predict',
    9: 'Labeling',
    10: 'Model Training',
    11: 'Sweep',
    12: 'Workspace',
    13: 'Install Package',
    14: 'Managed Notebook',
    15: 'Deploying Model',
    16: 'Experiment',
    17: 'Parameters',
    18: 'Compute Instance',
    19: 'Batch Transform',
    20: 'Python Install',
    21: 'Permissions Role',
    22: 'Web Service',
    23: 'Metrics',
    24: 'Server',
    25: 'Authentication',
    26: 'Creating Studio',
    27: 'Loading Data',
    28: 'Inference Endpoint',
    29: 'Saving Model',
    30: 'Model Endpoint',
    31: 'Environment',
    32: 'Spark',
    33: 'Log Metrics',
    34: 'Bucket',
    35: 'Scoring',
    36: 'Training',
    37: 'Limit Exceeded',
    38: 'Deployment',
    39: 'Model Registry',
    40: 'Custom Job',
    41: 'Storage',
    42: 'Push',
    43: 'Cluster',
    44: 'Notebook Instance',
    45: 'Service Timeout',
    46: 'Loading Model',
    47: 'Tracking',
    48: 'Optimization',
    49: 'Object Attribute',
    50: 'Studio Lab',
    51: 'Upload',
    52: 'Train Model',
    53: 'File Notebook',
    54: 'Batch Prediction',
    55: 'Connect',
    56: 'Executing Script',
    57: 'Validation',
    58: 'Import',
    59: 'Handle Columns',
    60: 'Drift Monitoring',
    61: 'Job',
    62: 'Pandas',
    63: 'Installation',
    64: 'Index Range',
    65: 'Data Versioning',
    66: 'Service',
    67: 'Train File',
    68: 'Encoding Mismatch',
    69: 'Search',
    70: 'Memory Leak',
    71: 'Training Stuck',
    72: 'Download',
    73: 'Models',
    74: 'Distributed Training',
    75: 'Model Deployment',
    76: 'Running Script',
    77: 'Model Inference',
    78: 'Notebook',
    79: 'Version Conflict',
    80: 'Studio Domain',
    81: 'Deploying Endpoint',
    82: 'Pipeline',
    83: 'Register Model',
    84: 'Input Type',
    85: 'Component',
    86: 'Clone',
    87: 'Custom Training',
    88: 'Endpoint',
    89: 'Invoke Endpoint',
    90: 'File Pickle',
    91: 'Feature Store',
    92: 'Estimator',
    93: 'Account',
    94: 'Update',
    95: 'Table',
    96: 'Installing Package',
    97: 'Import Data',
    98: 'Face Transformers',
    99: 'Training Job',
    100: 'Report',
    101: 'Output',
    102: 'Export',
    103: 'Trained Object',
    104: 'Run',
    105: 'Version',
    106: 'Auto Scaling',
    107: 'Batch Size',
    108: 'Deploy Model',
    109: 'Blob Storage',
    110: 'Cost',
    111: 'Access',
    112: 'Scheduling Notebook',
    113: 'Tuning Model',
    114: 'Tests',
    115: 'Tracked Directory',
    116: 'Create Endpoint',
    117: 'Designer',
    118: 'Request Endpoint',
    119: 'Resource Group',
    120: 'Model Serve',
    121: 'File Size'
}

In [None]:
def remove_nan_columns(arr):
    """Removes columns from a 2D nd-array that contain any NaN values."""
    # Get a boolean mask for columns without NaN values
    mask = ~np.isnan(arr).any(axis=0)
    
    # Use the mask to filter out columns with NaN values
    return arr[:, mask]

In [24]:
df = pd.read_json(os.path.join(path_rq12, 'macro-topics.json'))

for index, row in df.iterrows():
    if 'Stack' in row['Platform']:
        df.loc[index, 'Platform'] = 'General Discussion Forum'
    elif 'Tool' in row['Platform']:
        df.loc[index, 'Platform'] = 'Tool-specific Discussion Forum'
    elif 'Issue' in row['Platform']:
        df.loc[index, 'Platform'] = 'Repo-specific Discussion Forum (Issue)'
    else:
        df.loc[index, 'Platform'] = 'Repo-specific Discussion Forum (Discussion)'

df.to_json(os.path.join(path_rq4, 'forums.json'), orient='records', indent=4)

In [11]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    df_inquiry = []

    for name, group in forum_posts.groupby('Challenge_type'):
        info = {
            'Type': name,
            'Prevalence (%)': round(len(group) / len(df) * 100, 2),
            'Unresolved rate (%)': round(len(group[group['Challenge_closed_time'].isna()]) / len(group) * 100, 2),
            'Median resolution time (hours)': round(group['Challenge_resolved_time'].median(), 2),
        }
        df_inquiry.append(info)

    df_inquiry = pd.DataFrame(df_inquiry)
    print(forum_name)
    print(df_inquiry.to_latex(index=False, float_format="%.2f"))

Discussion
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 0.13 & 53.33 & 15.35 \\
problem & 0.14 & 56.25 & 27.25 \\
\bottomrule
\end{tabular}

General
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 22.76 & 64.23 & 22.32 \\
problem & 27.37 & 69.23 & 25.87 \\
\bottomrule
\end{tabular}

Issue
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 3.03 & 36.99 & 226.32 \\
problem & 13.96 & 29.24 & 228.81 \\
\bottomrule
\end{tabular}

Tool
\begin{tabular}{lrrr}
\toprule
Type & Prevalence (%) & Unresolved rate (%) & Median resolution time (hours) \\
\midrule
knowledge & 15.56 & 77.25 & 11.51 \\
problem & 17.04 & 85.07 & 14.83 \\
\bottomrule
\end{tabular}



In [13]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    forum_posts = forum_posts[forum_posts['Challenge_resolved_time'].notna()]

    df_problem = forum_posts[forum_posts['Challenge_type'] == 'problem']['Challenge_resolved_time'].tolist()
    df_knowledge = forum_posts[forum_posts['Challenge_type'] == 'knowledge']['Challenge_resolved_time'].tolist()

    print("Forum:", forum_name)
    print("P-value of problem inquiry:", shapiro(df_problem)[1])
    print("P-value of knowledge inquiry:", shapiro(df_knowledge)[1])
    print("P-value of two type of inquiries:", mannwhitneyu(df_problem, df_knowledge)[1])
    print()


Forum: Discussion
P-value of problem inquiry: 0.020418530330061913
P-value of knowledge inquiry: 0.00021079121506772935
P-value of two type of inquiries: 0.534965034965035

Forum: General
P-value of problem inquiry: 0.0
P-value of knowledge inquiry: 0.0
P-value of two type of inquiries: 0.5493045237750223

Forum: Issue
P-value of problem inquiry: 0.0
P-value of knowledge inquiry: 6.837003823008954e-24
P-value of two type of inquiries: 0.4715740159473104

Forum: Tool
P-value of problem inquiry: 1.1686826365657608e-33
P-value of knowledge inquiry: 2.0624422901809804e-39
P-value of two type of inquiries: 0.1713358974500302



In [17]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    values_k = []
    values_p = []

    for index, group in forum_posts.groupby('Challenge_topic'):
        group_k = group[group['Challenge_type'] == 'knowledge']
        group_p = group[group['Challenge_type'] == 'problem']
        values_k.append(len(group_k))
        values_p.append(len(group_p))
    
    contingency_table = np.array([values_k, values_p])
    print("Forum:", forum_name)
    print("P-value of two type of inquiries:", chi2_contingency(contingency_table)[1])
    
    for index, (p, k) in enumerate(zip(values_p, values_k)):
        if abs(p-k) > 50:
            print(f'{index+1}: p: {p}, k: {k}, diff: {abs(p-k)}')
    print()

Forum: Discussion
P-value of two type of inquiries: 0.41263359569895813

Forum: General
P-value of two type of inquiries: 2.799237376237899e-26
14: p: 72, k: 20, diff: 52

Forum: Issue
P-value of two type of inquiries: 0.01606882428343408
1: p: 63, k: 10, diff: 53

Forum: Tool
P-value of two type of inquiries: 6.312495048647691e-14



In [18]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    df_topics = []

    for name, group in df.groupby('Challenge_topic_macro'):
        topic_info = {
            'Prevalence': group['Challenge_topic_macro'].count(),
            'Unresolved rate (%)': group['Challenge_resolved_time'].isna().sum() / len(group) * 100,
            'Median resolved time (h)': group['Challenge_resolved_time'].median(),
        }
        df_topics.append(topic_info)

    df_topics = pd.DataFrame(df_topics)
    df_correlation = pd.DataFrame(index=df_topics.columns, columns=df_topics.columns)

    # Iterate over each pair of columns
    for i, col1 in enumerate(df_topics.columns):
        for j, col2 in enumerate(df_topics.columns):
            if i < j:
                tau, pvalue = kendalltau(df_topics[col1], df_topics[col2])
                df_correlation.loc[col1, col2] = f'{tau:.3f}/{pvalue:.3f}'

    print(f'Forum: {forum_name}')
    print(df_correlation.to_latex())

Forum: Discussion
\begin{tabular}{llll}
\toprule
 & Prevalence & Unresolved rate (%) & Median resolved time (h) \\
\midrule
Prevalence & NaN & -0.083/0.690 & 0.150/0.450 \\
Unresolved rate (%) & NaN & NaN & -0.367/0.052 \\
Median resolved time (h) & NaN & NaN & NaN \\
\bottomrule
\end{tabular}

Forum: General
\begin{tabular}{llll}
\toprule
 & Prevalence & Unresolved rate (%) & Median resolved time (h) \\
\midrule
Prevalence & NaN & -0.083/0.690 & 0.150/0.450 \\
Unresolved rate (%) & NaN & NaN & -0.367/0.052 \\
Median resolved time (h) & NaN & NaN & NaN \\
\bottomrule
\end{tabular}

Forum: Issue
\begin{tabular}{llll}
\toprule
 & Prevalence & Unresolved rate (%) & Median resolved time (h) \\
\midrule
Prevalence & NaN & -0.083/0.690 & 0.150/0.450 \\
Unresolved rate (%) & NaN & NaN & -0.367/0.052 \\
Median resolved time (h) & NaN & NaN & NaN \\
\bottomrule
\end{tabular}

Forum: Tool
\begin{tabular}{llll}
\toprule
 & Prevalence & Unresolved rate (%) & Median resolved time (h) \\
\midrule
Pr

In [21]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    forum_posts = forum_posts[forum_posts['Challenge_resolved_time'].notna()]
    resolved_time_list = []

    for name, group in forum_posts.groupby('Challenge_topic_macro'):
        resolved_time = group['Challenge_resolved_time'].apply(lambda x: np.log(x+1))
        resolved_time_list.append(resolved_time)

    print(f'Forum: {forum_name}')
    print(f'Kruskal-Wallis H-test: p-value = {kruskal(*resolved_time_list).pvalue:.3f}')
    print(f'ANOVA: p-value = {f_oneway(*resolved_time_list).pvalue:.3f}')
    print()

Forum: Discussion
Kruskal-Wallis H-test: p-value = 0.628
ANOVA: p-value = 0.886

Forum: General
Kruskal-Wallis H-test: p-value = 0.093
ANOVA: p-value = 0.236

Forum: Issue
Kruskal-Wallis H-test: p-value = 0.383
ANOVA: p-value = 0.503

Forum: Tool
Kruskal-Wallis H-test: p-value = 0.071
ANOVA: p-value = 0.137



In [25]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    df['Challenge_resolved_time'] = df['Challenge_resolved_time'].apply(lambda x: np.log(x+1))
    resolved_time_list = []

    for name, group in df.groupby('Challenge_topic_macro'):
        resolved_time_list.append(group['Challenge_resolved_time'])

    traces = [go.Box(y=lst, name=f"{macro_topic_indexing[i]}") for i, lst in enumerate(resolved_time_list)]
    fig = go.Figure(data=traces)
    fig.update_layout(
        title_text=f'{forum_name}',
        xaxis_title="Macro-topics",
        yaxis_title="Resolved time (hours) in log scale",
        showlegend=False
    )
    fig.show()
    fig.write_image(os.path.join(path_rq4, f'Resolved time in log scale ({forum_name}).pdf'))

In [29]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    forum_posts['Challenge_resolved_time'] = forum_posts['Challenge_resolved_time'].apply(lambda x: np.log(x+1))
    forum_posts['Challenge_topic_macro'] = forum_posts['Challenge_topic_macro'].apply(lambda x: macro_topic_indexing[x])
    
    fig = px.box(forum_posts, x="Challenge_topic_macro", y="Challenge_resolved_time", color="Challenge_type")
    # fig.update_traces(quartilemethod="inclusive") # or "inclusive", or "linear" by default
    fig.update_layout(
        title_text=f'{forum_name}',
        xaxis_title="Macro-topics",
        yaxis_title="Resolved time (hours) in log scale",
        legend_title='Inquiry type',
    )
    fig.show()
    fig.write_image(os.path.join(path_rq12, f'Resolved time in log scale (based on inquiry types) ({forum_name}).pdf'))

In [None]:
# import numpy as np

# def remove_nan_rows(arr):
#     if arr.ndim == 1:
#         return arr[~np.isnan(arr)]

#     # Recursively handle higher dimensions
#     return np.array([remove_nan_rows(subarr) for subarr in arr if not np.isnan(subarr).any()])

# # Example usage:
# arr = np.array([
#     [[1, 2], [3, np.nan], [5, 6]],
#     [[7, 8], [9, 10], [np.nan, np.nan]],
#     [[11, 12], [13, 14], [15, 16]]
# ])

# print(remove_nan_rows(arr))


In [47]:
df = pd.read_json(os.path.join(path_rq4, 'forums.json'))

for forum_name, forum_posts in df.groupby('Platform'):
    values_k = []
    values_p = []

    for index, group in forum_posts.groupby('Challenge_topic_macro'):
        group_k = group[group['Challenge_type'] == 'knowledge']
        group_p = group[group['Challenge_type'] == 'problem']
        values_k.append(group_k['Challenge_resolved_time'].isna().sum()/len(group_k))
        values_p.append(group_p['Challenge_resolved_time'].isna().sum()/len(group_p))

    contingency_table = np.array([values_k, values_p])
    contingency_table = remove_nan_columns(contingency_table)

    print("Forum:", forum_name)
    print("P-value of two type of inquiries:", chi2_contingency(contingency_table)[1])
    print()

Forum: General Discussion Forum
P-value of two type of inquiries: 0.9999999999999925

Forum: Repo-specific Discussion Forum (Discussion)
P-value of two type of inquiries: 0.8995242032487154

Forum: Repo-specific Discussion Forum (Issue)
P-value of two type of inquiries: 0.9999999926145932

Forum: Tool-specific Discussion Forum
P-value of two type of inquiries: 0.9999999999999578

