In [1]:
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner
import pandas as pd
import random
import nltk
import glob
import json
import os


In [26]:
# Display the value of every top-level expression in a cell, not only the last
# one (several cells below end with two bare expressions).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [61]:
# Tool display name -> GitHub repository ('owner/name') that is scraped for the
# tool's metadata via github_miner.scrape_repo.
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

# Manually specified first-release dates (YYYY-MM-DD strings).
# For tools that are also in tools_repo the date is passed to scrape_repo as
# `release_date`; for tools in tools_link it is stored directly.
tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'cnvrg.io': '2020-03-31',
    'Comet': '2017-01-01',
    'Iterative Studio': '2021-05-12',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01'
}

# Tools that are not scraped from a repository: display name -> link recorded
# in Tools.json. Every key here must also exist in tools_release_date, because
# the catalogue cell indexes that dict with these names.
tools_link = {
    'cnvrg.io': 'https://github.com/cnvrg',
    'Comet': 'https://github.com/comet-ml',
    'Iterative Studio': 'https://studio.iterative.ai',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai'
}

# Tool display name -> lowercase keyword. The keys name the raw dependents
# files ('<tool name>.json'); the keyword must appear as a token of the
# lower-cased issue title for an issue to be attributed to the tool.
tools_keywords = {
    'Aim': 'aim',
    'Amazon SageMaker': 'sagemaker',
    'Azure Machine Learning': 'azure',
    'ClearML': 'clearml',
    'cnvrg.io': 'cnvrg',
    'Codalab': 'codalab',
    'Comet': 'comet',
    'Determined': 'determined',
    'Domino': 'domino',
    'DVC': 'dvc',
    'Guild AI': 'guildai',
    'Kedro': 'kedro',
    'MLflow': 'mlflow',
    'MLRun': 'mlrun',
    'ModelDB': 'modeldb',
    'Neptune': 'neptune',
    'Polyaxon': 'polyaxon',
    'Sacred': 'sacred',
    'SigOpt': 'sigopt',
    'Valohai': 'valohai',
    'Vertex AI': 'vertex',
    'Weights & Biases': 'wandb'
}

# Names of tools whose issues are dropped after the keyword filtering step
# (currently none). An explicit set() is used because an empty `{}` literal
# creates a dict, not a set.
ignore_tools = set()

# Lowercase substrings searched for inside each (lower-cased) issue label; an
# issue with at least one matching label, or with no labels at all, is kept.
# NOTE(review): matching is by substring, so 'questions' does not match a label
# spelled 'question' -- confirm whether 'question' was intended.
issue_labels = {
    'bug',
    'error',
    'invalid',
    'questions',
    'triage'
}

In [28]:
# Root folder of the dataset, relative to the notebook's working directory.
path_dataset = '../Dataset'

# One sub-tree per hosting platform.
path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')

# Repository-level data: raw search results and scraped repository metadata.
path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_repo_raw = os.path.join(path_github_repo, 'Raw')
path_gitlab_repo_raw = os.path.join(path_gitlab_repo, 'Raw')
path_github_repo_scraped = os.path.join(path_github_repo, 'Scraped')
path_gitlab_repo_scraped = os.path.join(path_gitlab_repo, 'Scraped')
# Folder holding the per-tool GitLab dependents files used for labelling.
# The labelling cells reference this name, which was previously undefined.
# NOTE(review): the folder name 'Labelled' follows the Raw/Scraped naming
# pattern but is an assumption -- confirm against the existing dataset layout.
path_gitlab_repo_labelled = os.path.join(path_gitlab_repo, 'Labelled')

# Issue-level data: raw issues, label-filtered issues and the drawn samples.
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')
path_github_issue_raw = os.path.join(path_github_issue, 'Raw')
path_gitlab_issue_raw = os.path.join(path_gitlab_issue, 'Raw')
path_github_issue_filtered = os.path.join(path_github_issue, 'Filtered')
path_gitlab_issue_filtered = os.path.join(path_gitlab_issue, 'Filtered')
path_github_issue_sampled = os.path.join(path_github_issue, 'Sampled')
path_gitlab_issue_sampled = os.path.join(path_gitlab_issue, 'Sampled')

# Create every leaf folder; os.makedirs also creates the missing parents and
# exist_ok=True makes the cell safe to re-run.
for _dataset_dir in (
    path_github_repo_raw,
    path_gitlab_repo_raw,
    path_github_repo_scraped,
    path_gitlab_repo_scraped,
    path_gitlab_repo_labelled,
    path_github_issue_raw,
    path_gitlab_issue_raw,
    path_github_issue_filtered,
    path_gitlab_issue_filtered,
    path_github_issue_sampled,
    path_gitlab_issue_sampled,
):
    os.makedirs(_dataset_dir, exist_ok=True)

In [2]:
# API tokens are read from the environment; a missing variable raises KeyError
# here rather than failing later inside a scraping cell.
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

# Miner clients used by all scraping cells below, constructed with the tokens.
github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)


In [12]:
# Build the tool catalogue (Tools.json): scraped metadata for the tools listed
# in tools_repo plus hand-written records for the tools listed in tools_link.
# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loops.
tool_frames = []

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    scrape_kwargs = {'repo_name': tool_repo, 'real_name': tool_name}
    # pass a release date only when one was specified manually
    if tool_name in tools_release_date:
        scrape_kwargs['release_date'] = pd.to_datetime(
            tools_release_date[tool_name])
    tool_data, error_data = github_miner.scrape_repo(**scrape_kwargs)

    if not tool_data.empty:
        tool_frames.append(tool_data)
    else:
        # nothing was scraped for this tool: show what the miner reported
        print(error_data)

# add closed-source asset-management tools
for tool_name, tool_link in tools_link.items():
    tool_frames.append(pd.DataFrame([{
        'Name': tool_name,
        'Link': tool_link,
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }]))

# pd.concat rejects an empty list, so fall back to an empty frame
tools_data = pd.concat(
    tool_frames, ignore_index=True) if tool_frames else pd.DataFrame()

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')
tools_data


Unnamed: 0,Repo,Link,Repo Creation Date,Last Commit Date,Topics,Language,Size,#Star,#Watch,#Fork,#Contributors,#Branches,#Releases,#Commits,#Pull Requests,#Pull Requests (Open),#Issues,#Issues (Open),Name,First Release Date
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2023-01-13 11:15:25,"[python, ai, data-science, data-visualization,...",TypeScript,58809.0,2994.0,36.0,186.0,49.0,83.0,46.0,1998.0,1688.0,25.0,2455.0,215.0,Aim,2022-01-22 13:45:58
1,aws/sagemaker-python-sdk,https://github.com/aws/sagemaker-python-sdk,2017-11-14 01:03:33,2023-01-14 23:34:37,"[aws, mxnet, tensorflow, machine-learning, pyt...",Python,108363.0,1760.0,130.0,918.0,299.0,15.0,476.0,2825.0,2275.0,55.0,3427.0,453.0,Amazon SageMaker,2017-11-19 00:00:00
2,Azure/azure-sdk-for-python,https://github.com/Azure/azure-sdk-for-python,2012-04-24 16:46:12,2023-01-16 01:45:20,"[python, azure, azure-sdk, hacktoberfest]",Python,534097.0,3455.0,360.0,2167.0,399.0,559.0,2683.0,13412.0,20706.0,125.0,28272.0,888.0,Azure Machine Learning,2015-02-18 00:00:00
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2023-01-11 08:30:46,"[version-control, experiment-manager, version,...",Python,38911.0,3992.0,82.0,534.0,58.0,3.0,74.0,1957.0,172.0,5.0,875.0,334.0,ClearML,2019-06-11 17:27:11
4,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2022-12-27 07:05:33,[],Python,28021.0,136.0,18.0,79.0,54.0,131.0,113.0,4537.0,2235.0,27.0,4350.0,375.0,Codalab,2017-05-14 00:32:55
5,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2023-01-16 02:52:15,"[data-science, machine-learning, reproducibili...",Python,17430.0,10905.0,136.0,1017.0,253.0,15.0,412.0,8302.0,4481.0,6.0,8546.0,616.0,DVC,2017-05-04 08:03:08
6,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:17:59,2023-01-15 21:13:44,"[datasets, deep-learning, machine-learning, da...",Python,63700.0,5155.0,66.0,417.0,88.0,123.0,81.0,6722.0,1677.0,21.0,2080.0,67.0,Deep Lake,2020-12-15 18:46:17
7,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2023-01-13 21:31:05,"[deep-learning, machine-learning, ml-platform,...",TypeScript,105854.0,2008.0,61.0,271.0,68.0,177.0,75.0,4792.0,5495.0,78.0,5745.0,96.0,Determined,2020-04-08 20:01:20
8,dominodatalab/python-domino,https://github.com/dominodatalab/python-domino,2016-05-16 22:58:02,2023-01-10 14:01:50,[],Python,474.0,50.0,27.0,50.0,32.0,49.0,13.0,199.0,128.0,3.0,163.0,14.0,Domino,2020-08-05 05:16:39
9,tensorchord/envd,https://github.com/tensorchord/envd,2022-04-11 09:04:19,2023-01-13 08:24:18,"[developer-tools, development-environment, mlo...",Go,2420.0,1428.0,20.0,117.0,58.0,4.0,90.0,792.0,845.0,13.0,1326.0,135.0,envd,2022-04-26 08:49:03


In [14]:
def _sourcegraph_repos(results, host):
    """Extract repository names for one host from Sourcegraph-style results.

    Args:
        results: iterable of result dicts, each carrying a '__typename' key.
            'FileMatch' entries hold the repository under
            ['repository']['name'], 'Repository' entries under ['name'];
            other types are ignored.
        host: host name without the '.com' suffix, e.g. 'github' or 'gitlab'.

    Returns:
        List of repository names with the '<host>.com/' prefix removed, in
        input order. Names that do not start with `host` are skipped.
    """
    prefix = f'{host}.com/'
    repo_names = []
    for result in results:
        typename = result['__typename']
        if typename == 'FileMatch':
            full_name = result['repository']['name']
        elif typename == 'Repository':
            full_name = result['name']
        else:
            continue
        if full_name.startswith(host):
            repo_names.append(full_name.removeprefix(prefix))
    return repo_names


# One record per tool that has at least one dependent; the frame is built once
# after the loop instead of being grown by repeated concatenation.
dependent_records = []

# collect dependents for tools with coding patterns
for tool_name in tools_keywords:
    github_dependents = []
    gitlab_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_repo_raw, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf-8') as json_file:
            json_data = json.load(json_file)
        # either search by sourcegraph
        if 'Results' in json_data:
            github_dependents = _sourcegraph_repos(
                json_data['Results'], 'github')
        # or search by dependent graph
        elif 'all_public_dependent_repos' in json_data:
            github_dependents = [
                repo_file['name']
                for repo_file in json_data['all_public_dependent_repos']]

    # collect Gitlab dependents
    file_name = os.path.join(path_gitlab_repo_raw, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf-8') as json_file:
            json_data = json.load(json_file)
        # search by sourcegraph exclusively; a file without 'Results' simply
        # contributes no dependents instead of raising KeyError
        if 'Results' in json_data:
            gitlab_dependents = _sourcegraph_repos(
                json_data['Results'], 'gitlab')

    # remove tool repo from dependents if any; every occurrence is dropped,
    # whereas list.remove() would only drop the first one
    own_repo = tools_repo.get(tool_name)
    if own_repo is not None:
        github_dependents = [
            repo_name for repo_name in github_dependents
            if repo_name != own_repo]

    # no need to add tools without dependents
    if not github_dependents and not gitlab_dependents:
        continue

    # NOTE(review): names are not de-duplicated; a repository listed several
    # times in the raw results is counted several times -- confirm intent.
    dependent_records.append({
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    })

dependents = pd.DataFrame(dependent_records)

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')

In [15]:
# Number of GitHub / GitLab dependents per tool. The frame is built in a single
# call so the count columns get an integer dtype; passing `columns` keeps the
# expected header even when `dependents` is empty.
dependents_summary = pd.DataFrame(
    [
        {
            'Tool': row['Tool'],
            '#GitHub Dependents': len(row['GitHub Dependents']),
            '#GitLab Dependents': len(row['GitLab Dependents'])
        }
        for _, row in dependents.iterrows()
    ],
    columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])
# dependents_summary.sort_values(by=['#GitHub Dependents', '#GitLab Dependents'], ascending=False, inplace=True)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,Aim,92,1
1,Amazon SageMaker,931,3
2,Azure Machine Learning,689,0
3,ClearML,303,0
4,Codalab,30,0
5,Comet,480,0
6,Deep Lake,3,0
7,Determined,44,0
8,Domino,2,0
9,DVC,4229,0


In [28]:
# Reload the persisted dependents table and tool catalogue from the dataset
# folder so the cells below can run without repeating the collection steps.
df_dependents = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))
df_tools = pd.read_json(os.path.join(path_dataset, 'Tools.json'))


In [33]:
# Gitlab dependents retrieval for labelling
# Writes one '<tool>.json' file per tool that has GitLab dependents; each file
# lists the dependents as 'gitlab.com/<repo>' links under a 'Link' column.
# NOTE(review): the next cell reads a 'Label' column that is not written here
# (presumably added by hand); re-running this cell overwrites those files.
for _, dependents_row in df_dependents.iterrows():
    gitlab_repos = dependents_row['GitLab Dependents']
    if gitlab_repos:
        links = pd.DataFrame(
            {'Link': ['gitlab.com/' + repo_name for repo_name in gitlab_repos]})
        labelling_file = os.path.join(
            path_gitlab_repo_labelled, f'{dependents_row["Tool"]}.json')
        links.to_json(labelling_file, indent=4, orient='records')


In [6]:
# scrape Gitlab valid dependents general information for each tool
# Only dependents labelled with one of these categories are scraped.
project_categories = {'Project', 'Toolkit', 'Research'}
for file_name in glob.glob(os.path.join(path_gitlab_repo_labelled, '*.json')):
    # Strip only the trailing '.json': splitting on the first '.' would
    # truncate tool names that contain a dot (e.g. 'cnvrg.io' -> 'cnvrg').
    tool_name = os.path.splitext(os.path.basename(file_name))[0]

    repos = pd.read_json(file_name)
    # Files that have not been labelled yet carry no 'Label' column; skip them
    # with a message instead of failing with KeyError.
    if 'Label' not in repos.columns:
        print(f'{tool_name}: no "Label" column found, skipped')
        continue

    labelled_repos = repos[repos['Label'].isin(project_categories)]
    repos_name = [
        link.removeprefix('gitlab.com/') for link in labelled_repos['Link']]

    if not repos_name:
        continue

    repos_data, errors_data = gitlab_miner.scrape_repo_list(repos_name)

    if not repos_data.empty:
        repos_data = repos_data.sort_values(by='#Issues', ascending=False)
        repos_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'{tool_name}.json'), indent=4, orient='records')
    if not errors_data.empty:
        # repositories the miner could not scrape are stored separately
        errors_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'Discarded.{tool_name}.json'), indent=4, orient='records')


In [None]:
# scrape issues of Gitlab dependents for each tool
# Tools without a scraped dependents file are skipped.
for _, tool_row in df_tools.iterrows():
    scraped_file = os.path.join(
        path_gitlab_repo_scraped, f'{tool_row["Name"]}.json')
    if not os.path.exists(scraped_file):
        continue

    candidate_repos = pd.read_json(scraped_file)
    # filter out repos without any issues
    has_issues = candidate_repos['#Issues'] > 0
    candidate_repos = candidate_repos.loc[has_issues]
    # keep only repos created after the tool's first release date
    created_after_release = (
        candidate_repos['Repo Creation Date'] > tool_row['First Release Date'])
    candidate_repos = candidate_repos.loc[created_after_release]
    print(f'{tool_row["Name"]}: {candidate_repos["#Issues"].sum()}')

    # scrape issues for the current tool
    issues = gitlab_miner.scrape_issue_list(candidate_repos['Repo'].tolist())
    raw_issue_file = os.path.join(
        path_gitlab_issue_raw, f'{tool_row["Name"]}.json')
    issues.to_json(raw_issue_file, indent=4, orient='records')


In [71]:
# exclude Gitlab issues that are not related to each tool
# An issue is attributed to a tool when the tool's keyword is one of the tokens
# of the lower-cased issue title; issues with a closing time are additionally
# collected as "fixes".
issue_frames = []
fix_frames = []

for file_name in glob.glob(os.path.join(path_gitlab_issue_raw, '*.json')):
    # Strip only the trailing '.json' so dotted tool names (e.g. 'cnvrg.io')
    # still match their entry in tools_keywords.
    tool_name = os.path.splitext(os.path.basename(file_name))[0]
    issues = pd.read_json(file_name)
    if issues.empty:
        continue

    keyword = tools_keywords[tool_name]
    title_mentions_tool = issues['Issue_title'].map(
        lambda title: keyword in nltk.word_tokenize(title.lower())).astype(bool)
    valid_issues = issues[title_mentions_tool].assign(Tool=tool_name)
    if valid_issues.empty:
        continue
    issue_frames.append(valid_issues)

    valid_fixes = valid_issues[valid_issues['Issue_closed_time'].notna()]
    if not valid_fixes.empty:
        fix_frames.append(valid_fixes)

# Fall back to an empty frame that still has a 'Tool' column so the filtering
# below does not raise KeyError when nothing matched.
valid_issues_all = pd.concat(
    issue_frames, ignore_index=True) if issue_frames else pd.DataFrame(columns=['Tool'])
valid_fixes_all = pd.concat(
    fix_frames, ignore_index=True) if fix_frames else pd.DataFrame(columns=['Tool'])

valid_issues_all = valid_issues_all[~valid_issues_all['Tool'].isin(ignore_tools)]
valid_fixes_all = valid_fixes_all[~valid_fixes_all['Tool'].isin(ignore_tools)]
valid_issues_all
valid_fixes_all

Unnamed: 0,Issue_link,Issue_title,Issue_label,Issue_creation_time,Issue_closed_time,Issue_upvote_count,Issue_downvote_count,Issue_body,Tool
2,https://gitlab.com/librespacefoundation/polari...,Saving behave logs in MLflow,[],2021-07-06 19:33:14.309,NaT,0,0,"Like we do in learn, we should also save the p...",MLflow
3,https://gitlab.com/librespacefoundation/polari...,ML Database vs. MLflow,[learn],2020-04-25 17:51:29.061,NaT,1,0,I am mainly working on the feature selection p...,MLflow
4,https://gitlab.com/librespacefoundation/polari...,Namespacing polaris runs for logging purposes ...,"[improvement, learn]",2020-01-31 21:57:59.518,NaT,0,0,Background\n==========\n\nEverytime analysis i...,MLflow


Unnamed: 0,Issue_link,Issue_title,Issue_label,Issue_creation_time,Issue_closed_time,Issue_upvote_count,Issue_downvote_count,Issue_body,Tool


In [72]:
# Disabled helper kept inside a string literal (it is not executed): it would
# collect the union of all labels found in valid_issues_all['Issue_label'].
'''
final = set()
for _, row in valid_issues_all['Issue_label'].map(set).items():
    final = final.union(row)
final
'''

{'improvement', 'learn'}

In [64]:
# filter out issues that are not related to challenges
# An issue is kept when it has no labels at all, or when at least one of its
# labels contains (case-insensitively) one of the substrings in issue_labels.
# any() marks each issue at most once; the previous nested loops appended an
# issue once per matching keyword and so produced duplicate rows.
challenge_mask = pd.Series(
    [
        len(labels) == 0 or any(
            keyword in label.lower()
            for label in labels
            for keyword in issue_labels
        )
        for labels in valid_issues_all['Issue_label']
    ],
    index=valid_issues_all.index,
    dtype=bool,
)
valid_issues_filtered = valid_issues_all[challenge_mask]

valid_issues_filtered.to_json(os.path.join(
    path_gitlab_issue_filtered, 'issues.json'), indent=4, orient='records')

In [36]:
# Count the filtered GitLab issues per tool and persist the counts as CSV.
# NOTE(review): this overwrites summary.csv, including any columns that were
# added to the file after an earlier run.
summary_gitlab = (
    valid_issues_filtered
    .groupby('Tool')['Issue_title']
    .count()
    .reset_index()
    .rename(columns={'Issue_title': '#Issue'})
)
summary_gitlab.to_csv(
    os.path.join(path_gitlab_issue, 'summary.csv'), index=False)
summary_gitlab

Unnamed: 0,Tool,#Issue
0,MLflow,1


In [38]:
# After having the population for each tool and discussion channel, the minimum
# number of necessary samples is determined with the
# [calculator](https://www.calculator.net/sample-size-calculator.html).
# NOTE(review): the sampling cell below reads a '#Sample Issue' column that the
# cell writing summary.csv does not produce -- presumably it is added to the
# file by hand before this cell is run; confirm.
df_summary = pd.read_csv(os.path.join(path_gitlab_issue, 'summary.csv'))
df_summary

Unnamed: 0,Tool,#Issue,#Sample Issue
0,MLflow,1,1


In [52]:
# sample Gitlab issues and fixes accordingly
# For every tool in the summary, draw '#Sample Issue' issues with a fixed seed.
df_fix_samples = pd.DataFrame()

issue_sample_frames = [
    valid_issues_filtered[valid_issues_filtered['Tool'] == summary_row['Tool']].sample(
        n=summary_row['#Sample Issue'], random_state=0)
    for _, summary_row in df_summary.iterrows()
]
# the leading empty frame keeps the result an empty DataFrame when the summary
# has no rows
df_issue_samples = pd.concat(
    [pd.DataFrame(), *issue_sample_frames], ignore_index=True)

sampled_issue_file = os.path.join(path_gitlab_issue_sampled, 'issues.json')
df_issue_samples.to_json(sampled_issue_file, indent=4, orient='records')

In [None]:
# scrape Github dependents general information for each tool
# Scraped repositories are stored sorted by issue count; repositories reported
# in the miner's error frame go to a separate 'Discarded.<tool>.json' file.
for position, dependents_row in df_dependents.iterrows():
    tool = dependents_row['Tool']
    print(f'{position}: {tool}')

    repos_data, errors_data = github_miner.scrape_repo_list(
        dependents_row['GitHub Dependents'])

    if not repos_data.empty:
        repos_data = repos_data.sort_values(by='#Issues', ascending=False)
        scraped_file = os.path.join(path_github_repo_scraped, f'{tool}.json')
        repos_data.to_json(scraped_file, indent=4, orient='records')

    if not errors_data.empty:
        discarded_file = os.path.join(
            path_github_repo_scraped, f'Discarded.{tool}.json')
        errors_data.to_json(discarded_file, indent=4, orient='records')

In [None]:
# scrape issues of Github dependents for each tool
# Tools without a scraped dependents file are skipped; raw issues are written
# only when the miner returned at least one.
for _, tool_row in df_tools.iterrows():
    scraped_file = os.path.join(
        path_github_repo_scraped, f'{tool_row["Name"]}.json')
    if not os.path.exists(scraped_file):
        continue

    candidate_repos = pd.read_json(scraped_file)
    # filter out repos with only pr-based issues
    has_real_issues = candidate_repos['#Issues'] > candidate_repos['#Pull Requests']
    candidate_repos = candidate_repos.loc[has_real_issues]
    # keep only repos created after the tool's first release date
    created_after_release = (
        candidate_repos['Repo Creation Date'] > tool_row['First Release Date'])
    candidate_repos = candidate_repos.loc[created_after_release]
    print(
        f'{tool_row["Name"]}: {candidate_repos["#Issues"].sum() - candidate_repos["#Pull Requests"].sum()}')

    # scrape issues for the current tool
    issues = github_miner.scrape_issue_list(candidate_repos['Repo'].tolist())
    if issues.empty:
        continue
    raw_issue_file = os.path.join(
        path_github_issue_raw, f'{tool_row["Name"]}.json')
    issues.to_json(raw_issue_file, indent=4, orient='records')


In [65]:
# exclude Github issues that are not related to each tool
# An issue is attributed to a tool when the tool's keyword is one of the tokens
# of the lower-cased issue title; issues with a closing time are additionally
# collected as "fixes".
issue_frames = []
fix_frames = []

for file_name in glob.glob(os.path.join(path_github_issue_raw, '*.json')):
    # Strip only the trailing '.json' so dotted tool names (e.g. 'cnvrg.io')
    # still match their entry in tools_keywords.
    tool_name = os.path.splitext(os.path.basename(file_name))[0]
    issues = pd.read_json(file_name)
    if issues.empty:
        continue

    keyword = tools_keywords[tool_name]
    title_mentions_tool = issues['Issue_title'].map(
        lambda title: keyword in nltk.word_tokenize(title.lower())).astype(bool)
    valid_issues = issues[title_mentions_tool].assign(Tool=tool_name)
    if valid_issues.empty:
        continue
    issue_frames.append(valid_issues)

    valid_fixes = valid_issues[valid_issues['Issue_closed_time'].notna()]
    if not valid_fixes.empty:
        fix_frames.append(valid_fixes)

# Fall back to an empty frame that still has a 'Tool' column so the filtering
# below does not raise KeyError when nothing matched.
valid_issues_all = pd.concat(
    issue_frames, ignore_index=True) if issue_frames else pd.DataFrame(columns=['Tool'])
valid_fixes_all = pd.concat(
    fix_frames, ignore_index=True) if fix_frames else pd.DataFrame(columns=['Tool'])

valid_issues_all = valid_issues_all[~valid_issues_all['Tool'].isin(ignore_tools)]
valid_fixes_all = valid_fixes_all[~valid_fixes_all['Tool'].isin(ignore_tools)]
valid_issues_all
valid_fixes_all

Unnamed: 0,Issue_link,Issue_title,Issue_label,Issue_creation_time,Issue_closed_time,Issue_upvote_count,Issue_downvote_count,Issue_comment_count,Issue_body,Tool,Issue_state
0,https://github.com/BiomedSciAI/fuse-med-ml/iss...,integrate clearml to track experiments,[enhancement],2022-10-06 12:50:08,NaT,0,0,0,,ClearML,
1,https://github.com/sillsdev/silnlp/issues/131,'use_test_set_from' parameter is not supported...,[enhancement],2022-03-15 15:40:54,NaT,0,0,0,The 'data:use_test_set_from' parameter allows ...,ClearML,
2,https://github.com/sillsdev/silnlp/issues/130,'parent' folder not copied to ClearML after pr...,[enhancement],2022-03-15 12:12:15,NaT,0,0,0,"If the preprocessing step is done locally, and...",ClearML,
3,https://github.com/sillsdev/silnlp/issues/125,Child models need to copy the dict.*.txt files...,[bug],2022-02-13 20:58:01,NaT,0,0,0,If a parent model was trained with the Alignme...,ClearML,
4,https://github.com/sillsdev/silnlp/issues/120,Execute translate script without creating Clea...,"[bug, pipeline 6: infer]",2022-01-07 09:03:27,2022-07-16 14:07:12,0,0,3,"Currently, the `silnlp.nmt.translate` script a...",ClearML,
...,...,...,...,...,...,...,...,...,...,...,...
2674,https://github.com/equinor/gordo/issues/938,Terminate and remove stale MLFlow runs,[],2020-02-18 11:22:11,NaT,0,0,1,"Currently, if MLFlow logging fails, the newly ...",MLflow,
2675,https://github.com/equinor/gordo/issues/931,Error in mlflow reporting Enum Error code ...,[],2020-02-14 06:56:40,2020-02-20 08:16:57,0,0,2,Seems to be something with the mlflow logging....,MLflow,
2676,https://github.com/equinor/gordo/issues/861,Add notebook that demos Mlflow querying,[],2020-01-13 12:10:42,NaT,0,0,0,This was never added to the repo. Need to clea...,MLflow,
2677,https://github.com/equinor/gordo/issues/843,AzureML / MLFlow remote logging seems to not work,[],2020-01-09 07:07:35,2020-01-16 13:48:43,0,0,2,Using version 0.46.0 of Gordo I don't seem to ...,MLflow,


Unnamed: 0,Issue_link,Issue_title,Issue_label,Issue_creation_time,Issue_closed_time,Issue_upvote_count,Issue_downvote_count,Issue_comment_count,Issue_body,Tool
0,https://github.com/sillsdev/silnlp/issues/120,Execute translate script without creating Clea...,"[bug, pipeline 6: infer]",2022-01-07 09:03:27,2022-07-16 14:07:12,0,0,3,"Currently, the `silnlp.nmt.translate` script a...",ClearML
1,https://github.com/sillsdev/silnlp/issues/109,Translate is trying to use ClearML even though...,"[bug, pipeline 6: infer]",2021-11-22 13:00:10,2021-11-22 17:10:38,0,0,2,I tried to translate with the following comman...,ClearML
2,https://github.com/sillsdev/silnlp/issues/102,ClearML: failure in set_memory_growth(),[],2021-11-04 18:44:29,2022-01-20 15:26:04,0,0,3,When launching a ClearML [experiment ](https:/...,ClearML
3,https://github.com/sillsdev/silnlp/issues/81,Add Test to clearml experiment,[],2021-10-04 14:44:56,2022-01-20 15:28:00,0,0,2,Add the test step. Fix any S3 bucket things.\...,ClearML
4,https://github.com/sillsdev/silnlp/issues/79,Add parent child capability for ClearML,[],2021-09-21 10:19:50,2021-09-22 14:09:41,0,0,0,Important functions/data:\r\n* parent_config\r...,ClearML
...,...,...,...,...,...,...,...,...,...,...
1902,https://github.com/microsoft/qlib/issues/185,Would you introduce about library of mlflow?,"[question, stale]",2021-01-11 08:52:41,2021-06-08 15:06:29,0,0,17,## ‚ùì Questions and Help\r\n\r\nWhat is the fun...,MLflow
1903,https://github.com/microsoft/qlib/issues/127,"Failed to download ""mlflow"" lib","[question, stale]",2020-12-16 14:59:10,2021-05-15 18:11:11,0,0,2,,MLflow
1904,https://github.com/equinor/gordo/issues/931,Error in mlflow reporting Enum Error code ...,[],2020-02-14 06:56:40,2020-02-20 08:16:57,0,0,2,Seems to be something with the mlflow logging....,MLflow
1905,https://github.com/equinor/gordo/issues/843,AzureML / MLFlow remote logging seems to not work,[],2020-01-09 07:07:35,2020-01-16 13:48:43,0,0,2,Using version 0.46.0 of Gordo I don't seem to ...,MLflow


In [62]:
# Disabled helper kept inside a string literal (it is not executed): it would
# collect the union of all labels found in valid_issues_all['Issue_label'].
'''
final = set()
for _, row in valid_issues_all['Issue_label'].map(set).items():
    final = final.union(row)
final
'''

{'"experiments"',
 '0.4.6',
 '1.1',
 '1.4',
 '1.6',
 '1.7',
 '3 - Quality of Life',
 '3rd party',
 '3rd party update',
 ':bridge_at_night:  Bridge',
 ':bug: bug',
 '? - Needs Triage',
 'A: example-get-started',
 'ADO',
 'API',
 'Auto\u202fML',
 'Cloud',
 'Community',
 'DRL',
 'ERRATA_CANDIDATE',
 'Evaluation',
 'Feature - Medium Priority',
 'HIGH',
 'HPO',
 'Important',
 'Ingestion',
 'Issue: Feature Request',
 'L',
 'MLOps',
 'NLP',
 'NUM',
 'Not related to PyCaret',
 'Optional',
 'P0',
 'P1',
 'P2',
 'Priority 1',
 'RepoOfficiel',
 'Stage: Technical Design üé®',
 'TODO',
 'VISION',
 'WIP',
 'accelerator: tpu',
 'ai',
 'air',
 'alonet',
 'app-ui',
 'area / SDK-storage',
 'area / integrations',
 'area/registry',
 'august-rewrite',
 'automations',
 'awaiting response',
 'aws',
 'azure',
 'backlog',
 'benchmark',
 'bittensor',
 'blocked',
 'breakdown',
 'breaking',
 'bug',
 'build',
 'checkpointing',
 'chore',
 'cleanup',
 'code',
 'concerns: agents',
 'concerns: documentation',
 'conce

In [66]:
# filter out issues that are not related to challenges
def _keep_challenge_issues(frame):
    """Return the rows of `frame` whose labels indicate a challenge.

    A row is kept when its 'Issue_label' list is empty, or when at least one
    label contains (case-insensitively) one of the substrings in issue_labels.
    Each row is kept at most once: the previous nested loops appended a row
    once per matching keyword and so produced duplicate rows.

    Args:
        frame: DataFrame with an 'Issue_label' column holding lists of strings.

    Returns:
        DataFrame with the kept rows, original index labels preserved.
    """
    keep = pd.Series(
        [
            len(labels) == 0 or any(
                keyword in label.lower()
                for label in labels
                for keyword in issue_labels
            )
            for labels in frame['Issue_label']
        ],
        index=frame.index,
        dtype=bool,
    )
    return frame[keep]


valid_issues_filtered = _keep_challenge_issues(valid_issues_all)
valid_fixes_filtered = _keep_challenge_issues(valid_fixes_all)

valid_issues_filtered.to_json(os.path.join(
    path_github_issue_filtered, 'issues.json'), indent=4, orient='records')
valid_fixes_filtered.to_json(os.path.join(
    path_github_issue_filtered, 'fixes.json'), indent=4, orient='records')

In [68]:
# Per-tool counts of filtered GitHub issues and of the closed ones among them.
summary_issues = (
    valid_issues_filtered
    .groupby('Tool')['Issue_title']
    .count()
    .reset_index()
    .rename(columns={'Issue_title': '#Issue'})
)
summary_fixes = (
    valid_fixes_filtered
    .groupby('Tool')['Issue_title']
    .count()
    .reset_index()
    .rename(columns={'Issue_title': '#Closed'})
)

# The outer merge keeps tools that appear in only one table; their missing
# count becomes 0 before both columns are cast to int32.
summary_github = (
    summary_issues
    .merge(summary_fixes, on='Tool', how='outer')
    .fillna(0)
    .astype({'#Issue': 'int32', '#Closed': 'int32'})
)
summary_github.to_csv(
    os.path.join(path_github_issue, 'summary.csv'), index=False)
summary_github


Unnamed: 0,Tool,#Issue,#Closed
0,ClearML,15,11
1,Comet,23,19
2,DVC,135,82
3,Determined,1,0
4,Kedro,63,35
5,MLflow,256,181
6,Neptune,22,17
7,Pachyderm,1,0
8,Sacred,34,13
9,SigOpt,7,5


In [69]:
# After having the population for each tool and discussion channel, the minimum
# number of necessary samples is determined with the
# [calculator](https://www.calculator.net/sample-size-calculator.html).
# NOTE(review): the sampling cell below reads '#Sample Issue' and
# '#Sample Closed' columns that the cell writing summary.csv does not produce
# -- presumably they are added to the file by hand before this cell is run;
# confirm.
df_summary = pd.read_csv(os.path.join(path_github_issue, 'summary.csv'))
df_summary

Unnamed: 0,Tool,#Issue,#Closed,#Sample Issue,#Sample Closed
0,ClearML,15,11,15,11
1,Comet,23,19,22,19
2,DVC,135,82,101,68
3,Determined,1,0,1,0
4,Kedro,63,35,55,33
5,MLflow,256,181,154,124
6,Neptune,22,17,21,17
7,Pachyderm,1,0,1,0
8,Sacred,34,13,32,13
9,SigOpt,7,5,7,5


In [70]:
# sample Github issues and fixes accordingly
# For every tool in the summary, draw '#Sample Issue' issues and
# '#Sample Closed' fixes with a fixed seed.
issue_sample_frames = []
fix_sample_frames = []

for _, summary_row in df_summary.iterrows():
    tool = summary_row['Tool']
    tool_issues = valid_issues_filtered[valid_issues_filtered['Tool'] == tool]
    tool_fixes = valid_fixes_filtered[valid_fixes_filtered['Tool'] == tool]
    issue_sample_frames.append(
        tool_issues.sample(n=summary_row['#Sample Issue'], random_state=0))
    fix_sample_frames.append(
        tool_fixes.sample(n=summary_row['#Sample Closed'], random_state=0))

# the leading empty frame keeps each result an empty DataFrame when the summary
# has no rows
df_issue_samples = pd.concat(
    [pd.DataFrame(), *issue_sample_frames], ignore_index=True)
df_fix_samples = pd.concat(
    [pd.DataFrame(), *fix_sample_frames], ignore_index=True)

sampled_issue_file = os.path.join(path_github_issue_sampled, 'issues.json')
sampled_fix_file = os.path.join(path_github_issue_sampled, 'fixes.json')
df_issue_samples.to_json(sampled_issue_file, indent=4, orient='records')
df_fix_samples.to_json(sampled_fix_file, indent=4, orient='records')
