In [1]:
import multiprocessing
import pandas as pd
import pickle
import json
import os

from pandarallel import pandarallel
from Scrape.GHMiner import GitHubMiner
from Scrape.GLMiner import GitLabMiner

In [4]:
path_dataset = '../Dataset'

path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_discussion = os.path.join(path_github, 'Discussion')
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')

if not os.path.exists(path_github):
    os.makedirs(path_github)

if not os.path.exists(path_gitlab):
    os.makedirs(path_gitlab)

if not os.path.exists(path_labeling):
    os.makedirs(path_labeling)

if not os.path.exists(path_github_repo):
    os.makedirs(path_github_repo)

if not os.path.exists(path_gitlab_repo):
    os.makedirs(path_gitlab_repo)

if not os.path.exists(path_github_discussion):
    os.makedirs(path_github_discussion)

if not os.path.exists(path_github_issue):
    os.makedirs(path_github_issue)

if not os.path.exists(path_gitlab_issue):
    os.makedirs(path_gitlab_issue)

In [2]:
pandarallel.initialize(progress_bar=False, nb_workers=multiprocessing.cpu_count())

github_miner = GitHubMiner(private_token=os.getenv('GITHUB_TOKEN'))
gitlab_miner = GitLabMiner(private_token=os.getenv('GITLAB_TOKEN'))

tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'Neptune': 'neptune-ai/neptune-client',
    'Optuna': 'optuna/optuna',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Verta': 'VertaAI/modeldb',
    'Weights & Biases': 'wandb/wandb'
}

tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'cnvrg.io': '2020-03-31',
    'Comet': '2017-01-01',
    'H2O AI Cloud': '2021-01-01',
    'Iterative Studio': '2021-05-12',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01'
}

tools_link = {
    'cnvrg.io': 'https://github.com/cnvrg',
    'Comet': 'https://github.com/comet-ml',
    'H2O AI Cloud': 'https://h2o.ai/platform/ai-cloud',
    'Iterative Studio': 'https://studio.iterative.ai',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai'
}

tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['sagemaker'],
    'Azure Machine Learning': ['aml', 'azure machine learning', 'azure ml', 'azure-ml', 'azureml'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai', 'guild-ai', 'guildai'],
    'H2O AI Cloud': ['h2o ai', 'h2o.ai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'MLRun': ['mlrun'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Verta': ['modeldb', 'verta'],
    'Vertex AI': ['vertex ai', 'vertex-ai', 'vertexai'],
    'Weights & Biases': ['weights and biases', 'wandb', 'weights & biases', 'weights&biases', 'w & b', 'w&b']
}

issue_labels = {
    'bug',
    'crash',
    'error',
    'invalid',
}


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
tools_data = pd.DataFrame()

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    if tool_name in tools_release_date:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name, release_time=pd.to_datetime(tools_release_date[tool_name]))
    else:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name)

    if not tool_data.empty:
        tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

# add closed-source asset-management tools
for tool_name in tools_link.keys():
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }
    tool_data = pd.DataFrame([tool_data])
    tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'), indent=4, orient='records')
tools_data


Unnamed: 0,Repo,Link,Repo Created Date,Last Commit Date,Topic,Language,Size,#Star,#Watch,#Fork,#Contributor,#Branch,#Release,#Commit,#Pull Requests,#Pull Requests (Open),#Issue,#Issue (Open),Name,First Release Date
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2023-08-14 14:02:51,"[python, ai, data-science, data-visualization,...",Python,64682.0,4109.0,45.0,253.0,65.0,97.0,55.0,2390.0,2016.0,15.0,2932.0,295.0,Aim,2022-01-22 13:45:58
1,aws/sagemaker-python-sdk,https://github.com/aws/sagemaker-python-sdk,2017-11-14 01:03:33,2023-08-15 21:08:45,"[aws, mxnet, tensorflow, machine-learning, pyt...",Python,112079.0,1903.0,133.0,1032.0,373.0,17.0,532.0,3213.0,2596.0,27.0,3884.0,486.0,Amazon SageMaker,2017-11-19 00:00:00
2,Azure/azure-sdk-for-python,https://github.com/Azure/azure-sdk-for-python,2012-04-24 16:46:12,2023-08-16 16:03:51,"[python, azure, azure-sdk, hacktoberfest]",Python,610773.0,3779.0,367.0,2469.0,401.0,688.0,3038.0,15223.0,23201.0,133.0,31575.0,1041.0,Azure Machine Learning,2015-02-18 00:00:00
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2023-08-15 10:54:04,"[version-control, experiment-manager, version,...",Python,43142.0,4623.0,87.0,603.0,78.0,4.0,87.0,2181.0,226.0,5.0,1081.0,365.0,ClearML,2019-06-11 17:27:11
4,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2023-08-06 20:02:30,[],Python,27814.0,144.0,18.0,79.0,53.0,145.0,117.0,4595.0,2309.0,19.0,4512.0,384.0,Codalab,2017-05-14 00:32:55
5,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2023-08-16 02:51:20,"[data-science, machine-learning, reproducibili...",Python,19169.0,11866.0,137.0,1087.0,262.0,13.0,474.0,8922.0,5107.0,14.0,9523.0,568.0,DVC,2017-05-04 08:03:08
6,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2023-08-16 17:15:49,"[deep-learning, machine-learning, ml-platform,...",Go,215251.0,2414.0,75.0,317.0,89.0,298.0,90.0,6263.0,7327.0,94.0,7641.0,125.0,Determined,2020-04-08 20:01:20
7,dominodatalab/python-domino,https://github.com/dominodatalab/python-domino,2016-05-16 22:58:02,2023-08-09 15:37:51,[],Python,479.0,54.0,29.0,54.0,34.0,56.0,16.0,206.0,138.0,4.0,177.0,19.0,Domino,2020-08-05 05:16:39
8,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 18:57:50,2023-08-12 20:19:05,[],Python,24057.0,813.0,14.0,78.0,25.0,70.0,2.0,5777.0,77.0,2.0,511.0,218.0,Guild AI,2022-04-28 14:31:07
9,kedro-org/kedro,https://github.com/kedro-org/kedro,2019-04-18 10:29:56,2023-08-14 18:08:55,"[pipeline, kedro, hacktoberfest, mlops, experi...",Python,196681.0,8590.0,106.0,810.0,185.0,39.0,42.0,2356.0,1278.0,25.0,2697.0,347.0,Kedro,2019-06-03 16:15:43


In [5]:
github_dependents = {}
gitlab_dependents = {}

# collect dependents for tools with coding patterns
for tool_name in tools_keywords.keys():
    # collect Github dependents
    file_name = os.path.join(path_github_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # either search by sourcegraph
            if 'Results' in json_data:
                for repo_file in json_data['Results']:
                    # file name match pattern
                    if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('github'):
                        repo_name = repo_file['repository']['name'].removeprefix(
                            'github.com/')
                        if repo_name in github_dependents:
                            github_dependents[repo_name].append(tool_name)
                        else:
                            github_dependents[repo_name] = [tool_name]
                    # code usage match pattern
                    elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('github'):
                        repo_name = repo_file['name'].removeprefix(
                            'github.com/')
                        if repo_name in github_dependents:
                            github_dependents[repo_name].append(tool_name)
                        else:
                            github_dependents[repo_name] = [tool_name]
            # or search by dependent graph
            elif 'all_public_dependent_repos' in json_data:
                for repo_file in json_data['all_public_dependent_repos']:
                    repo_name = repo_file['name']
                    if repo_name in github_dependents:
                        github_dependents[repo_name].append(tool_name)
                    else:
                        github_dependents[repo_name] = [tool_name]

    # collect Gitlab dependents
    file_name = os.path.join(path_gitlab_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # search by sourcegraph exclusively
            for repo_file in json_data['Results']:
                # file name match pattern
                if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('gitlab'):
                    repo_name = repo_file['repository']['name'].removeprefix(
                        'gitlab.com/')
                    if repo_name in gitlab_dependents:
                        gitlab_dependents[repo_name].append(tool_name)
                    else:
                        gitlab_dependents[repo_name] = [tool_name]
                # code usage match pattern
                elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('gitlab'):
                    repo_name = repo_file['name'].removeprefix('gitlab.com/')
                    if repo_name in gitlab_dependents:
                        gitlab_dependents[repo_name].append(tool_name)
                    else:
                        gitlab_dependents[repo_name] = [tool_name]

    # remove tool repo from dependents if any
    if tool_name in tools_repo and tools_repo[tool_name] in github_dependents:
        github_dependents.pop(tools_repo[tool_name], None)

with open(os.path.join(path_github_repo, 'Dependents.pickle'), 'wb') as file:
    pickle.dump(github_dependents, file, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(path_gitlab_repo, 'Dependents.pickle'), 'wb') as file:
    pickle.dump(gitlab_dependents, file, protocol=pickle.HIGHEST_PROTOCOL)

print(len(github_dependents))
print(len(gitlab_dependents))

37782
10


In [16]:
github_repos = {}
gitlab_repos = {}

for repo_name, tool_list in github_dependents.items():
    for tool_name in tool_list:
        github_repos[tool_name] = github_repos.get(tool_name, 0) + 1

for repo_name, tool_list in gitlab_dependents.items():
    for tool_name in tool_list:
        gitlab_repos[tool_name] = gitlab_repos.get(tool_name, 0) + 1

dependents_summary = pd.DataFrame(columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])

for tool_name, repo_num in github_repos.items():
    if tool_name in gitlab_repos:
        entry = {'Tool': tool_name, '#GitHub Dependents': repo_num, '#GitLab Dependents': gitlab_repos[tool_name]}
    else:
        entry = {'Tool': tool_name, '#GitHub Dependents': repo_num, '#GitLab Dependents': 0}
    dependents_summary = pd.concat([dependents_summary, pd.DataFrame([entry])], ignore_index=True)

dependents_summary = dependents_summary.sort_values(by=['#GitHub Dependents', '#GitLab Dependents'], ascending=False)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
2,Weights & Biases,19952,0
4,Optuna,6370,0
9,DVC,6098,0
5,Sacred,1918,0
7,MLflow,1573,4
1,Kedro,1184,0
11,Amazon SageMaker,1113,3
6,Azure Machine Learning,826,0
8,Comet,678,0
3,ClearML,498,0


In [4]:
with open(os.path.join(path_github_repo, 'Dependents.pickle'), 'rb') as file:
    github_dependents = pickle.load(file)
    print(len(github_dependents))

with open(os.path.join(path_gitlab_repo, 'Dependents.pickle'), 'rb') as file:
    gitlab_dependents = pickle.load(file)
    print(len(gitlab_dependents))
    
df_tool = pd.read_json(os.path.join(path_dataset, 'Tools.json'))
tools_release_date = pd.Series(pd.to_datetime(df_tool['First Release Date'].values), index=df_tool['Name']).to_dict()

37782
10


In [13]:
# scrape discussion posts of Github dependents for each tool

df_posts = pd.DataFrame()

for repo_name, tool_list in github_dependents.items():
    repo_data = github_miner.scrape_repo(repo_name)
    if repo_data.empty:
        continue
    invalid_repo = []
    for tool_name in tool_list:
        if repo_data['Repo Created Date'].values[0] < tools_release_date[tool_name]:
            invalid_repo.append(tool_name)
    tool_list = [tool for tool in tool_list if tool not in invalid_repo]
    if tool_list:
        posts = pd.DataFrame()
        try:
            posts = github_miner.scrape_discussion(repo_name)
        except:
            print(f'Crashed repo: {repo_name}')
        if posts.empty:
            continue
        posts['Tools'] = [tool_list for _ in range(len(posts))]
        df_posts = pd.concat([df_posts, posts], ignore_index=True)
        df_posts.to_json(os.path.join(path_github_discussion, 'raw.json'), indent=4, orient='records')
                
df_posts.to_json(os.path.join(path_github_discussion, 'raw.json'), indent=4, orient='records')
print(len(df_posts))

6912

In [6]:
# exclude discussions that are not related to each tool
# create discussion dataset

df = pd.read_json(os.path.join(path_github_discussion, 'raw.json'))
df['Platform'] = 'GitHub Discussion'
df_discussion = pd.DataFrame()

for index, row in df.iterrows():
    tool_list = []
    for tool_name in row['Tools']:
        for keyword in tools_keywords[tool_name]:
            if keyword in row['Question_title'].lower():
                tool_list.append(tool_name)
                break
    if tool_list:
        row['Tools'] = tool_list
        df_discussion = pd.concat([df_discussion, row.to_frame().T], ignore_index=True)

df_discussion.to_json(os.path.join(path_labeling, 'discussions.json'), indent=4, orient='records')
len(df_discussion)

43

In [9]:
# scrape issues of Github dependents for each tool

df_issues = pd.DataFrame()

for repo_name, tool_list in github_dependents.items():
    repo_data = github_miner.scrape_repo(repo_name)
    if repo_data.empty or (repo_data['#Issue'].values[0] - repo_data['#Pull Requests'].values[0] < 1):
        continue
    invalid_repo = []
    for tool_name in tool_list:
        if repo_data['Repo Created Date'].values[0] < tools_release_date[tool_name]:
            invalid_repo.append(tool_name)
    tool_list = [tool for tool in tool_list if tool not in invalid_repo]
    if tool_list:
        issues = github_miner.scrape_issue(repo_name)
        issues['Tools'] = [tool_list for _ in range(len(issues))]
        df_issues = pd.concat([df_issues, issues], ignore_index=True)
        df_issues.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')
    
df_issues.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')
len(df_issues)


437652

In [10]:
# scrape issues of Gitlab dependents for each tool

df_issues = pd.DataFrame()

for repo_name, tool_list in github_dependents.items():
    repo_data = gitlab_miner.scrape_repo(repo_name)
    if repo_data.empty or (repo_data['#Issue'].values[0] < 1):
        continue
    invalid_repo = []
    for tool_name in tool_list:
        if repo_data['Repo Created Date'].values[0] < tools_release_date[tool_name]:
            invalid_repo.append(tool_name)
    tool_list = [tool for tool in tool_list if tool not in invalid_repo]
    if tool_list:
        issues = gitlab_miner.scrape_issue(repo_name)
        issues['Tools'] = [tool_list for _ in range(len(issues))]
        df_issues = pd.concat([df_issues, issues], ignore_index=True)
        df_issues.to_json(os.path.join(path_gitlab_issue, 'raw.json'), indent=4, orient='records')
    
df_issues.to_json(os.path.join(path_gitlab_issue, 'raw.json'), indent=4, orient='records')
len(df_issues)

7675

In [7]:
# exclude issues that are not related to each tool

df_issues_gitlab = pd.read_json(os.path.join(path_gitlab_issue, 'raw.json'))
df_issues_github = pd.read_json(os.path.join(path_github_issue, 'raw.json'))

df_issues_github['Platform'] = 'GitHub Issue'
df_issues_gitlab['Platform'] = 'GitLab Issue'

df_issues = pd.DataFrame()

for index, row in df_issues_github.iterrows():
    tool_list = []
    for tool_name in row['Tools']:
        for keyword in tools_keywords[tool_name]:
            if keyword in row['Issue_title'].lower():
                tool_list.append(tool_name)
                break
    if tool_list:
        row['Tools'] = tool_list
        df_issues = pd.concat([df_issues, row.to_frame().T], ignore_index=True)

for index, row in df_issues_gitlab.iterrows():
    tool_list = []
    for tool_name in row['Tools']:
        for keyword in tools_keywords[tool_name]:
            if keyword in row['Issue_title'].lower():
                tool_list.append(tool_name)
                break
    if tool_list:
        row['Tools'] = tool_list
        df_issues = pd.concat([df_issues, row.to_frame().T], ignore_index=True)

len(df_issues)

5178

In [8]:
final = set()
for _, row in df_issues['Issue_label'].map(set).items():
    final = final.union(row)
final

{'"experiments"',
 '0.4.6',
 '1 - Critic',
 '1.1',
 '1.1.0',
 '1.4',
 '1.6',
 '1.7',
 '2.0',
 '3 - Quality of Life',
 '3rd party',
 '3rd party update',
 '3rd-party',
 ':bridge_at_night:  Bridge',
 ':bug: bug',
 ':rotating_light:',
 'A: example-get-started',
 'A: example-get-started-experiments',
 'ADO',
 'AI\u202fFrameworks/ONNX',
 'AML Compute Instance',
 'API',
 'Auto\u202fML',
 'BF',
 'Bug',
 'CXP Attention',
 'CleanUp',
 'Client',
 'Cloud',
 'Cognitive - Text Analytics',
 'Cognitive Services',
 'Community',
 'Community Contribution Needed',
 'Compute',
 'Contributions welcome',
 'Core UI',
 'DRL',
 'Data Labeling',
 'Data4ML',
 'Data\u202fDrift',
 'Data\u202fPrep\u202fServices',
 'DeepSpeed',
 'Defect',
 'Design',
 'Design Doc',
 'Design: Research',
 'Developer experience',
 'Documentation',
 'Done',
 'ERRATA_CANDIDATE',
 'Enhancement',
 'Environments',
 'Evaluation',
 'Experiment Tracking',
 'Experimentation UI',
 'FAQ',
 'Feature - Medium Priority',
 'Feature request',
 'Good Fir

In [9]:
# filter out issues that are not related to challenges

df = pd.DataFrame()

for index, row in df_issues.iterrows():
    if not row['Issue_label']:
        df = pd.concat([df, row.to_frame().T], ignore_index =True)
    else:
        for label in row['Issue_label']:
            for issue_label in issue_labels:
                if issue_label in label.lower():
                    df = pd.concat([df, row.to_frame().T], ignore_index =True)
                    break
            else:
                continue
            break

print(len(df[df['Platform'] == 'GitHub Issue']))
print(len(df[df['Platform'] == 'GitLab Issue']))


3332
3


In [15]:
# create issue dataset

df['Issue_score_count'] = df['Issue_upvote_count'] - df['Issue_downvote_count']
df['Issue_comment_score'] = df['Issue_comment_upvote'] - df['Issue_comment_downvote']
df['Issue_tag_count'] = df['Issue_label'].map(len)
df['Issue_body'] = df['Issue_body'].fillna('')

df.drop(columns=['Issue_upvote_count', 'Issue_downvote_count', 'Issue_comment_upvote', 'Issue_comment_downvote', 'Issue_label'], inplace=True)
df.to_json(os.path.join(path_labeling, 'issues.json'), indent=4, orient='records')