In [1]:
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner
import pandas as pd
import random
import nltk
import glob
import json
import os


In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'


In [None]:
# Tool display name -> GitHub repository ('owner/name'). Each entry is passed as
# `repo_name` to `github_miner.scrape_repo` when Tools.json is built, and is also
# the repo that gets removed from the tool's own list of GitHub dependents.
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Deep Lake': 'activeloopai/deeplake',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'envd': 'tensorchord/envd',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Pachyderm': 'pachyderm/pachyderm',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

# Tool display name -> explicitly configured first release date ('YYYY-MM-DD').
# For tools in `tools_repo` the date is passed to `scrape_repo` as `release_date`;
# for tools in `tools_link` it becomes the 'First Release Date' column directly,
# so every key of `tools_link` must also appear here.
tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'Comet': '2017-01-01',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01',
    'cnvrg.io': '2020-03-31'
}

# Tools without an entry in `tools_repo` (added to Tools.json as "closed-source"
# tools): display name -> link stored in the 'Link' column.
tools_link = {
    'Comet': 'https://github.com/comet-ml',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai',
    'cnvrg.io': 'https://github.com/cnvrg'
}

# Tool display name -> lowercase keyword. An issue is attributed to a tool only if
# this keyword is one of the tokens of its lowercased title. The keys also decide
# which '<tool>.json' raw dependents files are looked up.
tools_keywords = {
    'Aim': 'aim',
    'Amazon SageMaker': 'sagemaker',
    'Azure Machine Learning': 'azure',
    'ClearML': 'clearml',
    'cnvrg.io': 'cnvrg',
    'Codalab': 'codalab',
    'Comet': 'comet',
    'Deep Lake': 'deeplake',
    'Determined': 'determined',
    'Domino': 'domino',
    'DVC': 'dvc',
    'envd': 'envd',
    'Guild AI': 'guildai',
    'Kedro': 'kedro',
    'MLflow': 'mlflow',
    'MLRun': 'mlrun',
    'ModelDB': 'modeldb',
    'Neptune': 'neptune',
    'Pachyderm': 'pachyderm',
    'Polyaxon': 'polyaxon',
    'Sacred': 'sacred',
    'SigOpt': 'sigopt',
    'Valohai': 'valohai',
    'Vertex AI': 'vertex',
    'Weights & Biases': 'wandb'
}

# Tools whose sampled issues/fixes are excluded when the 'lean_*.json' files are
# written (their issues are mostly not related to asset management).
ignore_tools = {
    'Amazon SageMaker',
    'Azure Machine Learning',
    'Vertex AI'
}

In [4]:
# Root of the on-disk dataset; every other path in this notebook is derived from it.
path_dataset = '../Dataset'

# One sub-tree per hosting platform.
path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')

# Repository-level data: raw search results, scraped metadata, and (GitLab only)
# the files that carry the manual labels.
path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_repo_raw = os.path.join(path_github_repo, 'Raw')
path_gitlab_repo_raw = os.path.join(path_gitlab_repo, 'Raw')
path_github_repo_scraped = os.path.join(path_github_repo, 'Scraped')
path_gitlab_repo_scraped = os.path.join(path_gitlab_repo, 'Scraped')
path_gitlab_repo_labelled = os.path.join(path_gitlab_repo, 'labelled')

# Issue-level data: raw scrape -> keyword-filtered -> sampled.
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')
path_github_issue_raw = os.path.join(path_github_issue, 'Raw')
path_gitlab_issue_raw = os.path.join(path_gitlab_issue, 'Raw')
path_github_issue_filtered = os.path.join(path_github_issue, 'Filtered')
path_gitlab_issue_filtered = os.path.join(path_gitlab_issue, 'Filtered')
path_github_issue_sampled = os.path.join(path_github_issue, 'Sampled')
path_gitlab_issue_sampled = os.path.join(path_gitlab_issue, 'Sampled')

# Create the whole tree up front. `exist_ok=True` makes the cell idempotent and
# avoids the check-then-create race of `if not os.path.exists(...)`.
for _directory in (
    path_github,
    path_gitlab,
    path_github_repo,
    path_gitlab_repo,
    path_github_issue,
    path_gitlab_issue,
    path_github_repo_raw,
    path_gitlab_repo_raw,
    path_github_issue_raw,
    path_gitlab_issue_raw,
    path_github_issue_sampled,
    path_gitlab_issue_sampled,
    path_github_issue_filtered,
    path_gitlab_issue_filtered,
    path_github_repo_scraped,
    path_gitlab_repo_scraped,
    path_gitlab_repo_labelled,
):
    os.makedirs(_directory, exist_ok=True)

In [5]:
# API tokens are read from the environment so they never end up in the notebook;
# a missing variable raises KeyError here.
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

# Miner clients used by every scraping cell below.
github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)


In [12]:
# Build Tools.json: one row per tool. Per-tool frames are collected in a list and
# concatenated once at the end, instead of growing a DataFrame with pd.concat on
# every iteration (quadratic, and seeding the concat with an empty frame is
# deprecated in recent pandas).
tool_frames = []

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    scrape_kwargs = {'repo_name': tool_repo, 'real_name': tool_name}
    # pass an explicit release date only for tools that have one configured
    if tool_name in tools_release_date:
        scrape_kwargs['release_date'] = pd.to_datetime(
            tools_release_date[tool_name])
    tool_data, error_data = github_miner.scrape_repo(**scrape_kwargs)

    if not tool_data.empty:
        tool_frames.append(tool_data)
    else:
        # nothing was scraped for this tool: surface what the miner reported
        print(error_data)

# add closed-source asset-management tools
for tool_name, tool_link in tools_link.items():
    tool_frames.append(pd.DataFrame([{
        'Name': tool_name,
        'Link': tool_link,
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }]))

# pd.concat raises on an empty list, so fall back to an empty frame
tools_data = (pd.concat(tool_frames, ignore_index=True)
              if tool_frames else pd.DataFrame())

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')
tools_data


Unnamed: 0,Repo,Link,Repo Creation Date,Last Commit Date,Topics,Language,Size,#Star,#Watch,#Fork,#Contributors,#Branches,#Releases,#Commits,#Pull Requests,#Pull Requests (Open),#Issues,#Issues (Open),Name,First Release Date
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2023-01-13 11:15:25,"[python, ai, data-science, data-visualization,...",TypeScript,58809.0,2994.0,36.0,186.0,49.0,83.0,46.0,1998.0,1688.0,25.0,2455.0,215.0,Aim,2022-01-22 13:45:58
1,aws/sagemaker-python-sdk,https://github.com/aws/sagemaker-python-sdk,2017-11-14 01:03:33,2023-01-14 23:34:37,"[aws, mxnet, tensorflow, machine-learning, pyt...",Python,108363.0,1760.0,130.0,918.0,299.0,15.0,476.0,2825.0,2275.0,55.0,3427.0,453.0,Amazon SageMaker,2017-11-19 00:00:00
2,Azure/azure-sdk-for-python,https://github.com/Azure/azure-sdk-for-python,2012-04-24 16:46:12,2023-01-16 01:45:20,"[python, azure, azure-sdk, hacktoberfest]",Python,534097.0,3455.0,360.0,2167.0,399.0,559.0,2683.0,13412.0,20706.0,125.0,28272.0,888.0,Azure Machine Learning,2015-02-18 00:00:00
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2023-01-11 08:30:46,"[version-control, experiment-manager, version,...",Python,38911.0,3992.0,82.0,534.0,58.0,3.0,74.0,1957.0,172.0,5.0,875.0,334.0,ClearML,2019-06-11 17:27:11
4,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2022-12-27 07:05:33,[],Python,28021.0,136.0,18.0,79.0,54.0,131.0,113.0,4537.0,2235.0,27.0,4350.0,375.0,Codalab,2017-05-14 00:32:55
5,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2023-01-16 02:52:15,"[data-science, machine-learning, reproducibili...",Python,17430.0,10905.0,136.0,1017.0,253.0,15.0,412.0,8302.0,4481.0,6.0,8546.0,616.0,DVC,2017-05-04 08:03:08
6,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:17:59,2023-01-15 21:13:44,"[datasets, deep-learning, machine-learning, da...",Python,63700.0,5155.0,66.0,417.0,88.0,123.0,81.0,6722.0,1677.0,21.0,2080.0,67.0,Deep Lake,2020-12-15 18:46:17
7,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2023-01-13 21:31:05,"[deep-learning, machine-learning, ml-platform,...",TypeScript,105854.0,2008.0,61.0,271.0,68.0,177.0,75.0,4792.0,5495.0,78.0,5745.0,96.0,Determined,2020-04-08 20:01:20
8,dominodatalab/python-domino,https://github.com/dominodatalab/python-domino,2016-05-16 22:58:02,2023-01-10 14:01:50,[],Python,474.0,50.0,27.0,50.0,32.0,49.0,13.0,199.0,128.0,3.0,163.0,14.0,Domino,2020-08-05 05:16:39
9,tensorchord/envd,https://github.com/tensorchord/envd,2022-04-11 09:04:19,2023-01-13 08:24:18,"[developer-tools, development-environment, mlo...",Go,2420.0,1428.0,20.0,117.0,58.0,4.0,90.0,792.0,845.0,13.0,1326.0,135.0,envd,2022-04-26 08:49:03


In [14]:
def _sourcegraph_repos(results, host_prefix):
    """Extract repository names for one code host from Sourcegraph search results.

    Args:
        results: list of result dicts. 'FileMatch' results carry the repository
            name under result['repository']['name']; 'Repository' results carry
            it under result['name']. Other result kinds are ignored.
        host_prefix: host part including the trailing slash, e.g. 'github.com/'.

    Returns:
        Repository names with `host_prefix` stripped, in result order. Results
        whose repository name does not start with `host_prefix` are skipped.
    """
    repos = []
    for result in results:
        if result['__typename'] == 'FileMatch':
            full_name = result['repository']['name']
        elif result['__typename'] == 'Repository':
            full_name = result['name']
        else:
            continue
        # test for exactly the prefix that is stripped, so a name that merely
        # starts with 'github'/'gitlab' is never kept with its host still attached
        if full_name.startswith(host_prefix):
            repos.append(full_name.removeprefix(host_prefix))
    return repos


dependent_records = []

# collect dependents for tools with coding patterns
for tool_name in tools_keywords:
    github_dependents = []
    gitlab_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_repo_raw, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name) as json_file:
            json_data = json.load(json_file)
        # either search by sourcegraph
        if 'Results' in json_data:
            github_dependents = _sourcegraph_repos(
                json_data['Results'], 'github.com/')
        # or search by dependent graph
        elif 'all_public_dependent_repos' in json_data:
            github_dependents = [
                repo_file['name']
                for repo_file in json_data['all_public_dependent_repos']]

    # collect Gitlab dependents (search by sourcegraph exclusively)
    file_name = os.path.join(path_gitlab_repo_raw, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name) as json_file:
            json_data = json.load(json_file)
        gitlab_dependents = _sourcegraph_repos(
            json_data['Results'], 'gitlab.com/')

    # remove the tool's own repo from its dependents; it can appear several
    # times, so filter out every occurrence (list.remove only drops the first)
    own_repo = tools_repo.get(tool_name)
    if own_repo is not None:
        github_dependents = [
            repo_name for repo_name in github_dependents
            if repo_name != own_repo]

    # no need to add tools without dependents
    if not github_dependents and not gitlab_dependents:
        continue

    # NOTE(review): repeated repository names are kept as they appear in the raw
    # files; confirm whether the downstream counts expect de-duplicated lists.
    dependent_records.append({
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    })

# build the frame once instead of concatenating inside the loop
dependents = pd.DataFrame(dependent_records)

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')

In [15]:
# Per-tool count of GitHub / GitLab dependents. The frame is built in a single
# constructor call; passing `columns` keeps the three headers even when
# `dependents` is empty, and the counts stay integer-typed.
dependents_summary = pd.DataFrame(
    [
        {
            'Tool': row['Tool'],
            '#GitHub Dependents': len(row['GitHub Dependents']),
            '#GitLab Dependents': len(row['GitLab Dependents'])
        }
        for _, row in dependents.iterrows()
    ],
    columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,Aim,92,1
1,Amazon SageMaker,931,3
2,Azure Machine Learning,689,0
3,ClearML,303,0
4,Codalab,30,0
5,Comet,480,0
6,Deep Lake,3,0
7,Determined,44,0
8,Domino,2,0
9,DVC,4229,0


In [28]:
# Reload the persisted tables; the scraping cells below read these frames rather
# than the in-memory `dependents` / `tools_data` built above.
df_dependents = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))
df_tools = pd.read_json(os.path.join(path_dataset, 'Tools.json'))


In [33]:
# Gitlab dependents retrieval for labelling
for index, row in df_dependents.iterrows():
    if not row['GitLab Dependents']:
        continue
    dependent = ['gitlab.com/' +
                 repo_name for repo_name in row['GitLab Dependents']]
    dependent = pd.DataFrame({'Link': dependent})
    dependent.to_json(os.path.join(path_gitlab_repo_labelled,
                      f'{row["Tool"]}.json'), indent=4, orient='records')


In [6]:
# scrape Gitlab valid dependents general information for each tool
project_categories = {'Project', 'Toolkit', 'Research'}
for file_name in glob.glob(os.path.join(path_gitlab_repo_labelled, '*.json')):
    repos = pd.read_json(file_name)
    repos_name = []
    for index, row in repos.iterrows():
        if row['Label'] in project_categories:
            repo_name = row['Link'].removeprefix('gitlab.com/')
            repos_name.append(repo_name)

    if not repos_name:
        continue

    repos_data, errors_data = gitlab_miner.scrape_repo_list(repos_name)
    tool_name = os.path.split(file_name)[1].split('.')[0]

    if not repos_data.empty:
        repos_data = repos_data.sort_values(by='#Issues', ascending=False)
        repos_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'{tool_name}.json'), indent=4, orient='records')
    if not errors_data.empty:
        errors_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'Discarded.{tool_name}.json'), indent=4, orient='records')


In [None]:
# scrape issues of Gitlab dependents for each tool
for index, row in df_tools.iterrows():
    file_name = os.path.join(path_gitlab_repo_scraped, f'{row["Name"]}.json')
    if os.path.exists(file_name):
        repos = pd.read_json(file_name)
        # filter out repos without any issues
        repos = repos[repos['#Issues'] > 0]
        # filter out repos created before the tool's first release date
        repos = repos[repos['Repo Creation Date'] > row['First Release Date']]
        print(f'{row["Name"]}: {repos["#Issues"].sum()}')
        # scrape issues for the current tool
        issues = gitlab_miner.scrape_issue_list(repos['Repo'].tolist())
        issues.to_json(os.path.join(path_gitlab_issue_raw,
                       f'{row["Name"]}.json'), indent=4, orient='records')


In [43]:
# exclude Gitlab issues that are not related to each tool
valid_issues_all = pd.DataFrame()
valid_fixes_all = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_gitlab_issue_raw, '*.json')):
    issues = pd.read_json(file_name)
    valid_issues = pd.DataFrame()
    valid_fixes = pd.DataFrame()
    tool_name = os.path.split(file_name)[1].split('.')[0]
    for index, issue in issues.iterrows():
        if tools_keywords[tool_name] in nltk.word_tokenize(issue['Issue_title'].lower()):
            valid_issue = pd.DataFrame([issue])
            valid_issues = pd.concat(
                [valid_issues, valid_issue], ignore_index=True)
            if not pd.isnull(issue['Issue_closed_time']):
                valid_fixes = pd.concat(
                    [valid_fixes, valid_issue], ignore_index=True)
    if not valid_issues.empty:
        valid_issues['Tool'] = tool_name
        valid_issues_all = pd.concat(
            [valid_issues_all, valid_issues], ignore_index=True)
        if not valid_fixes.empty:
            valid_fixes['Tool'] = tool_name
            valid_fixes_all = pd.concat(
                [valid_fixes_all, valid_fixes], ignore_index=True)

valid_issues_all.to_json(os.path.join(
    path_gitlab_issue_filtered, 'issues.json'), indent=4, orient='records')
valid_fixes_all.to_json(os.path.join(
    path_gitlab_issue_filtered, 'fixes.json'), indent=4, orient='records')

In [44]:
# Number of filtered GitLab issues per tool, written to summary.csv.
# NOTE(review): re-running this cell rewrites summary.csv with only the 'Tool'
# and '#Issue' columns, while the next cell reads more columns from that file.
issues_per_tool = valid_issues_all.groupby('Tool')['Issue_title'].count()
summary_gitlab = issues_per_tool.reset_index(name='#Issue')
summary_gitlab.to_csv(
    os.path.join(path_gitlab_issue, 'summary.csv'), index=False)
summary_gitlab

Unnamed: 0,Tool,#Issue
0,Amazon SageMaker,2
1,MLflow,3


In [46]:
# After having the population for each tool and discussion channel, we find the minimum number of
# necessary samples with the [calculator](https://www.calculator.net/sample-size-calculator.html).
# NOTE(review): the sampling cell reads '#Sample Issue' and '#Sample Closed' from this file, but the
# cell that writes summary.csv only emits 'Tool' and '#Issue' — presumably the remaining columns
# are added to the CSV by hand before this cell is run; confirm.
df_summary = pd.read_csv(os.path.join(path_gitlab_issue, 'summary.csv'))
df_summary

Unnamed: 0,Tool,#Issue,#Closed,#Sample Issue,#Sample Closed
0,Amazon SageMaker,2,2,2,2
1,MLflow,3,0,3,0


In [48]:
# sample Gitlab issues and fixes accordingly
df_issue_samples = pd.DataFrame()
df_fix_samples = pd.DataFrame()

for index, row in df_summary.iterrows():
    df_issue_sample = valid_issues_all[valid_issues_all['Tool'] == row['Tool']].sample(
        n=row['#Sample Issue'], random_state=0)
    df_fix_sample = valid_fixes_all[valid_fixes_all['Tool'] == row['Tool']].sample(
        n=row['#Sample Closed'], random_state=0)
    df_issue_samples = pd.concat(
        [df_issue_samples, df_issue_sample], ignore_index=True)
    df_fix_samples = pd.concat(
        [df_fix_samples, df_fix_sample], ignore_index=True)


df_issue_samples.to_json(os.path.join(
    path_gitlab_issue_sampled, 'issues.json'), indent=4, orient='records')
df_fix_samples.to_json(os.path.join(
    path_gitlab_issue_sampled, 'fixes.json'), indent=4, orient='records')

In [49]:
# filter out Gitlab issues that are mostly not related to asset management
df_issue_samples = pd.read_json(os.path.join(
    path_gitlab_issue_sampled, 'issues.json'))
df_fixes_samples = pd.read_json(os.path.join(
    path_gitlab_issue_sampled, 'fixes.json'))
df_issue_samples[~df_issue_samples['Tool'].isin(ignore_tools)].to_json(
    os.path.join(path_gitlab_issue_sampled, 'lean_issues.json'), indent=4, orient='records')
df_fixes_samples[~df_fixes_samples['Tool'].isin(ignore_tools)].to_json(
    os.path.join(path_gitlab_issue_sampled, 'lean_fixes.json'), indent=4, orient='records')

In [17]:
# scrape Github dependents general information for each tool
for index, row in df_dependents.iterrows():
    print(f'{index}: {row["Tool"]}')
    repos_data, errors_data = github_miner.scrape_repo_list(
        row['GitHub Dependents'])

    if not repos_data.empty:
        repos_data = repos_data.sort_values(by='#Issues', ascending=False)
        repos_data.to_json(os.path.join(
            path_github_repo_scraped, f'{row["Tool"]}.json'), indent=4, orient='records')

    if not errors_data.empty:
        errors_data.to_json(os.path.join(path_github_repo_scraped,
                            f'Discarded.{row["Tool"]}.json'), indent=4, orient='records')

0: Aim
1: Amazon SageMaker
2: Azure Machine Learning
3: ClearML
4: Codalab
5: Comet
6: Deep Lake
7: Determined
8: Domino
9: DVC
10: Guild AI
11: Kedro
12: Keepsake
13: MLflow
14: MLRun
15: ModelDB
16: Neptune
17: Pachyderm
18: Polyaxon
19: Sacred
20: SigOpt
21: Valohai
22: Vertex AI
23: Weights & Biases


In [21]:
# scrape issues of Github dependents for each tool
for index, row in df_tools.iterrows():
    file_name = os.path.join(path_github_repo_scraped, f'{row["Name"]}.json')
    if os.path.exists(file_name):
        repos = pd.read_json(file_name)
        # filter out repos with only pr-based issues
        repos = repos[repos['#Issues'] > repos['#Pull Requests']]
        # filter out repos created before the tool's first release date
        repos = repos[repos['Repo Creation Date'] > row['First Release Date']]
        print(
            f'{row["Name"]}: {repos["#Issues"].sum() - repos["#Pull Requests"].sum()}')
        # scrape issues for the current tool
        issues = github_miner.scrape_issue_list(repos['Repo'].tolist())
        if not issues.empty:
            issues.to_json(os.path.join(path_github_issue_raw,
                           f'{row["Name"]}.json'), indent=4, orient='records')


Kedro: 755
MLRun: 10


In [23]:
# exclude Github issues that are not related to each tool
valid_issues_all = pd.DataFrame()
valid_fixes_all = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_github_issue_raw, '*.json')):
    issues = pd.read_json(file_name)
    valid_issues = pd.DataFrame()
    valid_fixes = pd.DataFrame()
    tool_name = os.path.split(file_name)[1].split('.')[0]

    for index, issue in issues.iterrows():
        if tools_keywords[tool_name] in nltk.word_tokenize(issue['Issue_title'].lower()):
            valid_issue = pd.DataFrame([issue])
            valid_issues = pd.concat(
                [valid_issues, valid_issue], ignore_index=True)
            if not pd.isnull(issue['Issue_closed_time']):
                valid_fixes = pd.concat(
                    [valid_fixes, valid_issue], ignore_index=True)

    if not valid_issues.empty:
        valid_issues['Tool'] = tool_name
        valid_issues_all = pd.concat(
            [valid_issues_all, valid_issues], ignore_index=True)
        if not valid_fixes.empty:
            valid_fixes['Tool'] = tool_name
            valid_fixes_all = pd.concat(
                [valid_fixes_all, valid_fixes], ignore_index=True)

valid_issues_all.to_json(os.path.join(
    path_github_issue_filtered, 'issues.json'), indent=4, orient='records')
valid_fixes_all.to_json(os.path.join(
    path_github_issue_filtered, 'fixes.json'), indent=4, orient='records')


In [24]:
# Per-tool counts of filtered GitHub issues ('#Issue') and of those that are
# closed ('#Closed'), written to summary.csv. The outer merge keeps tools that
# appear on only one side; their missing count becomes 0.
summary_issues = (
    valid_issues_all.groupby('Tool')['Issue_title']
    .count()
    .reset_index(name='#Issue'))
summary_fixes = (
    valid_fixes_all.groupby('Tool')['Issue_title']
    .count()
    .reset_index(name='#Closed'))

summary_github = (
    summary_issues
    .merge(summary_fixes, on='Tool', how='outer')
    .fillna(0)
    .astype({'#Issue': 'int32', '#Closed': 'int32'}))
summary_github.to_csv(
    os.path.join(path_github_issue, 'summary.csv'), index=False)
summary_github


Unnamed: 0,Tool,#Issue,#Closed
0,Amazon SageMaker,403,258
1,Azure Machine Learning,1029,800
2,ClearML,21,12
3,Comet,29,24
4,DVC,274,157
5,Determined,1,0
6,Kedro,85,47
7,MLRun,1,1
8,MLflow,426,308
9,Neptune,33,25


In [25]:
# After having the population for each tool and discussion channel, we find the minimum number of
# necessary samples with the [calculator](https://www.calculator.net/sample-size-calculator.html).
# NOTE(review): the sampling cell reads '#Sample Issue' and '#Sample Closed' from this file, but the
# cell that writes summary.csv only emits 'Tool', '#Issue' and '#Closed' — presumably the sample-size
# columns are added to the CSV by hand before this cell is run; confirm.
df_summary = pd.read_csv(os.path.join(path_github_issue, 'summary.csv'))
df_summary

Unnamed: 0,Tool,#Issue,#Closed,#Sample Issue,#Sample Closed
0,Amazon SageMaker,403,258,197,155
1,Azure Machine Learning,1029,800,280,260
2,ClearML,21,12,20,12
3,Comet,29,24,28,23
4,Determined,1,0,1,0
5,DVC,274,157,161,112
6,Kedro,85,47,70,42
7,MLflow,426,308,203,172
8,MLRun,1,1,1,1
9,Neptune,33,25,31,24


In [26]:
# sample Github issues and fixes accordingly
df_issue_samples = pd.DataFrame()
df_fix_samples = pd.DataFrame()

for index, row in df_summary.iterrows():
    df_issue_sample = valid_issues_all[valid_issues_all['Tool'] == row['Tool']].sample(
        n=row['#Sample Issue'], random_state=0)
    df_fix_sample = valid_fixes_all[valid_fixes_all['Tool'] == row['Tool']].sample(
        n=row['#Sample Closed'], random_state=0)
    df_issue_samples = pd.concat(
        [df_issue_samples, df_issue_sample], ignore_index=True)
    df_fix_samples = pd.concat(
        [df_fix_samples, df_fix_sample], ignore_index=True)

df_issue_samples.to_json(os.path.join(
    path_github_issue_sampled, 'issues.json'), indent=4, orient='records')
df_fix_samples.to_json(os.path.join(
    path_github_issue_sampled, 'fixes.json'), indent=4, orient='records')


In [27]:
# filter out Github issues that are mostly not related to asset management
df_issue_samples = pd.read_json(os.path.join(
    path_github_issue_sampled, 'issues.json'))
df_fix_samples = pd.read_json(os.path.join(
    path_github_issue_sampled, 'fixes.json'))
df_issue_samples[~df_issue_samples['Tool'].isin(ignore_tools)].to_json(
    os.path.join(path_github_issue_sampled, 'lean_issues.json'), indent=4, orient='records')
df_fix_samples[~df_fix_samples['Tool'].isin(ignore_tools)].to_json(
    os.path.join(path_github_issue_sampled, 'lean_fixes.json'), indent=4, orient='records')