In [None]:
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner
import pandas as pd
import random
import glob
import json
import os

In [2]:
# Access tokens are read from the environment so that no credential is ever
# stored in the notebook; a KeyError here means the variable is not set.
# NOTE(review): the literal tokens that used to be written on these lines must
# be treated as leaked and revoked on GitLab/GitHub.
gitlab_miner = GitLabMiner(os.environ['GITLAB_TOKEN'])
github_miner = GitHubMiner(os.environ['GITHUB_TOKEN'])

In [3]:
# Tool display name -> GitHub 'owner/repo' whose metadata is scraped for it.
tool_repos = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets',
    'Valohai': 'valohai/valohai-cli',
    'Domino': 'dominodatalab/python-domino',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Neptune': 'neptune-ai/neptune-client'
}

# Fallback metadata for tools that have no entry in tool_repos: the creation
# date (ISO-8601 string) used in place of a scraped repository creation date.
tools_creation_date = {
    'Vertex AI': '2019-03-01T00:00:00',
    'Databricks': '2021-05-27T00:00:00',
    'Comet': '2017-01-01T00:00:00',
    'Spell': '2017-01-01T00:00:00',
    'SigOpt': '2014-11-01T00:00:00'
}

# Fallback metadata for the same tools: the link recorded for each of them.
tools_link = {
    'Vertex AI': 'https://cloud.google.com/vertex-ai',
    'Databricks': 'https://www.databricks.com/product/unity-catalog',
    'Comet': 'http://comet.ml',
    'Spell': 'https://spell.ml',
    'SigOpt': 'https://sigopt.com'
}

# The two fallback tables are always looked up with the same key, so they must
# describe exactly the same tools; and a tool must be either scraped or
# described by hand, never both.  Checking here fails fast instead of raising
# a KeyError in the middle of the mining loop.
assert tools_creation_date.keys() == tools_link.keys(), \
    'tools_creation_date and tools_link must list the same tools'
assert tool_repos.keys().isdisjoint(tools_link), \
    'a tool may not appear in both tool_repos and tools_link'

In [4]:
# Dataset locations, relative to the notebook's working directory.
raw_path = '../Dataset/Raw'    # per-tool *.json search results are read from here
tool_path = '../Dataset/Tool'  # Tools.json is written to / read from here

# exist_ok=True makes the call idempotent without a separate (and racy)
# os.path.exists check; missing parent directories are created as well.
for directory in (raw_path, tool_path):
    os.makedirs(directory, exist_ok=True)

In [4]:
# Both access tokens come from the environment (a KeyError is raised when one
# of the variables is not set); one miner client is then built per platform.
GITHUB_TOKEN, GITLAB_TOKEN = (
    os.environ[variable] for variable in ('GITHUB_TOKEN', 'GITLAB_TOKEN')
)

github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)

In [5]:
def _split_dependents(results):
    """Split search-result entries into GitHub and GitLab repository names.

    Each entry is a dict with a '__typename' key.  'FileMatch' entries carry
    the host-qualified repository name under ['repository']['name'];
    'Repository' entries carry it under ['name'].  Entries of any other type
    are ignored.

    The host is decided by the *prefix* of the name ('github.com/' or
    'gitlab.com/') rather than by a substring test, so a GitLab project whose
    path merely contains the word "github" is not filed under GitHub with its
    host prefix still attached.  Names on any other host are skipped.

    Returns:
        (github_names, gitlab_names): two sets of repository names with the
        host prefix removed.
    """
    github_names, gitlab_names = set(), set()
    for entry in results:
        entry_type = entry['__typename']
        if entry_type == 'FileMatch':
            qualified_name = entry['repository']['name']
        elif entry_type == 'Repository':
            qualified_name = entry['name']
        else:
            continue

        if qualified_name.startswith('github.com/'):
            github_names.add(qualified_name.removeprefix('github.com/'))
        elif qualified_name.startswith('gitlab.com/'):
            gitlab_names.add(qualified_name.removeprefix('gitlab.com/'))
    return github_names, gitlab_names


tools_data = []

# glob returns files in a filesystem-dependent order, and a later cell selects
# rows of the resulting frame by position, so the files are visited in a fixed
# (case-insensitive alphabetical) order.
for file_name in sorted(glob.glob(os.path.join(raw_path, '*.json')), key=str.lower):
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    github_repo_names, gitlab_repo_names = _split_dependents(json_data['Results'])

    # The file stem is the tool name; splitext keeps names that contain dots.
    tool_name = os.path.splitext(os.path.basename(file_name))[0]

    if tool_name in tool_repos:
        # Tool has a GitHub repository: take its metadata from the miner.
        tool_data, error_data = github_miner.scrape(
            repo_name=tool_repos[tool_name], name=tool_name)
        tool_data.pop('Archived')
    else:
        # No repository to scrape: fall back to the hand-maintained tables.
        tool_data = {
            'Name': tool_name,
            'Link': tools_link[tool_name],
            'Creation Date': pd.to_datetime(tools_creation_date[tool_name]).to_datetime64()
        }

    # Sorted so the stored lists do not depend on set iteration order, which
    # varies between interpreter runs for strings.
    tool_data['GitHub Dependents'] = sorted(github_repo_names)
    tool_data['GitLab Dependents'] = sorted(gitlab_repo_names)

    tools_data.append(pd.DataFrame([tool_data]))

# One row per tool; persisted so later cells can reload it without re-mining.
df = pd.concat(tools_data, ignore_index=True)
df.to_json(os.path.join(tool_path, 'Tools.json'), indent=4, orient='records')
df

Unnamed: 0,Repo,Link,Creation Date,First Activity Date,Last Activity Date,Topics,#Star,#Watch,#Fork,#Releases,...,Size,#Issues,#Pull Requests,#Branches,#Commits,#Comments,#Deployments,Name,GitHub Dependents,GitLab Dependents
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2019-05-31 18:25:07,2022-12-05 09:41:17,"[python, ai, data-science, data-visualization,...",2902.0,35.0,177.0,45.0,...,57692.0,2380.0,1644.0,71.0,1973.0,8.0,5.0,Aim,"[yannicnoller/hydiff, GeoscienceAustralia/PF3D...",[lu-ci/sigma/apex-sigma]
1,aws/sagemaker-python-sdk,https://github.com/aws/sagemaker-python-sdk,2017-11-14 01:03:33,2017-11-30 05:59:28,2022-12-06 00:18:10,"[aws, mxnet, tensorflow, machine-learning, pyt...",1740.0,132.0,897.0,465.0,...,108082.0,3347.0,2216.0,13.0,2769.0,23.0,0.0,Amazon SageMaker,[aws-samples/amazon-sagemaker-aws-greengrass-c...,"[fluidattacks/universe, juliensimon/amazon-stu..."
2,Azure/azure-sdk-for-python,https://github.com/Azure/azure-sdk-for-python,2012-04-24 16:46:12,2012-05-02 17:44:40,2022-12-06 04:40:53,"[python, azure, azure-sdk, hacktoberfest]",3331.0,355.0,2106.0,2625.0,...,522458.0,27756.0,20293.0,550.0,13096.0,73.0,2808.0,Azure Machine Learning,"[MicrosoftDocs/azure-docs.zh-cn, csebuetnlp/Cr...",[]
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2019-06-10 08:18:33,2022-12-04 10:57:48,"[version-control, experiment-manager, version,...",3811.0,80.0,515.0,73.0,...,38525.0,835.0,163.0,3.0,1899.0,10.0,3.0,ClearML,"[okojoalg/raft-mlp, fcakyon/yolov5-pip, pytorc...",[]
4,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2013-11-12 02:18:24,2022-12-06 05:05:44,[],135.0,19.0,78.0,112.0,...,27667.0,4317.0,2218.0,123.0,4529.0,23.0,1558.0,Codalab,"[codalab/codalab-worksheets, facebookresearch/...",[]
5,,http://comet.ml,2017-01-01 00:00:00,NaT,NaT,,,,,,...,,,,,,,,Comet,"[joelouismarino/variational_rl, Ahmkel/Keras-P...",[]
6,d6t/d6tflow,https://github.com/d6t/d6tflow,2019-02-03 01:51:22,2019-02-02 21:19:03,2022-10-26 17:04:54,[],947.0,22.0,73.0,0.0,...,271.0,42.0,19.0,1.0,269.0,0.0,0.0,D6tflow,[],[]
7,,https://www.databricks.com/product/unity-catalog,2021-05-27 00:00:00,NaT,NaT,,,,,,...,,,,,,,,Databricks,"[niall-turbitt/e2e-mlops, aminenouira-db/ml-tr...",[]
8,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:17:59,2019-08-09 06:18:49,2022-12-05 12:41:42,"[datasets, deep-learning, machine-learning, da...",5057.0,63.0,409.0,74.0,...,63053.0,2010.0,1613.0,106.0,6531.0,5.0,48.0,Deep Lake,"[activeloopai/examples, activeloopai/deeplake,...",[]
9,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2020-04-07 16:29:08,2022-12-05 19:48:03,"[deep-learning, machine-learning, ml-platform,...",1953.0,62.0,261.0,74.0,...,102666.0,5545.0,5304.0,186.0,4637.0,46.0,35.0,Determined,[MilovanTomasevic/iOS-Swift-The-Complete-iOS-A...,[]


In [None]:
# Disabled one-off patch, kept as a bare string literal so that it never runs.
# If re-enabled it would reload Tools.json, re-extract the GitHub/GitLab
# dependents from 'Amazon SageMaker.json' only, overwrite those two columns in
# the row at index 1, and write Tools.json back.
'''
df = pd.read_json(os.path.join(tool_path, 'Tools.json'))
file_name = os.path.join(raw_path, 'Amazon SageMaker.json')
github_repo_names = set()
gitlab_repo_names = set()
    
with open(file_name) as json_file:
    json_data = json.load(json_file)
    for repo_file in json_data['Results']:
        if repo_file['__typename'] == 'FileMatch':
            if 'github' in repo_file['repository']['name']:
                repo_name = repo_file['repository']['name'].removeprefix('github.com/')
                github_repo_names.add(repo_name)
            elif 'gitlab' in repo_file['repository']['name']:
                repo_name = repo_file['repository']['name'].removeprefix('gitlab.com/')
                gitlab_repo_names.add(repo_name)
        elif repo_file['__typename'] == 'Repository':
            if 'github' in repo_file['name']:
                repo_name = repo_file['name'].removeprefix('github.com/')
                github_repo_names.add(repo_name)
            elif 'gitlab' in repo_file['name']:
                repo_name = repo_file['name'].removeprefix('gitlab.com/')
                gitlab_repo_names.add(repo_name)

    tool_name = os.path.split(file_name)[1].split('.')[0]
    df.at[1, 'GitHub Dependents'] = list(github_repo_names)
    df.at[1, 'GitLab Dependents'] = list(gitlab_repo_names)

df.to_json(os.path.join(tool_path, 'Tools.json'), indent=4, orient='records')
df
'''

In [7]:
# Per-tool count of dependents on each platform.
# Built in a single step: DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row copies it on every iteration.
df_summary = pd.DataFrame(
    {
        'Name': df['Name'],
        '#GitHub Dependents': df['GitHub Dependents'].map(len),
        '#GitLab Dependents': df['GitLab Dependents'].map(len),
    },
    columns=['Name', '#GitHub Dependents', '#GitLab Dependents'],
).reset_index(drop=True)  # 0..n-1 index, as the row-by-row version produced
df_summary

Unnamed: 0,Name,#GitHub Dependents,#GitLab Dependents
0,Aim,359,1
1,Amazon SageMaker,929,3
2,Azure Machine Learning,641,0
3,ClearML,59,0
4,Codalab,6,0
5,Comet,432,0
6,D6tflow,0,0
7,Databricks,12,0
8,Deep Lake,3,0
9,Determined,44,0


In [8]:
# Reload the tool table so the cells below can run without re-mining.
df = pd.read_json(os.path.join(tool_path, 'Tools.json'))

# Tools.json is written with DataFrame.to_json's default settings, which
# serialise datetimes as epoch *milliseconds*, and read_json does not
# auto-detect a column called 'Creation Date' as a date.  pd.to_datetime
# interprets bare numbers as nanoseconds (yielding dates in 1970), so the unit
# is given explicitly whenever the column came back numeric.
creation_date = df['Creation Date']
if pd.api.types.is_numeric_dtype(creation_date):
    df['Creation Date'] = pd.to_datetime(creation_date, unit='ms')
else:
    df['Creation Date'] = pd.to_datetime(creation_date)

In [9]:
# Directory layout for the filtering stage:
#   <filter_path>/GitHub/Round#1[/Error]
#   <filter_path>/GitLab/Round#2[/Error]
filter_path = '../Dataset/Filter'

github_path = os.path.join(filter_path, 'GitHub')
gitlab_path = os.path.join(filter_path, 'GitLab')

github_round1_path = os.path.join(github_path, 'Round#1')
gitlab_round2_path = os.path.join(gitlab_path, 'Round#2')

github_round1_error_path = os.path.join(github_round1_path, 'Error')
gitlab_round2_error_path = os.path.join(gitlab_round2_path, 'Error')

# os.makedirs creates every missing parent, so creating the two deepest
# directories is enough to materialise the whole tree; exist_ok=True makes the
# cell safe to re-run without separate os.path.exists checks.
for directory in (github_round1_error_path, gitlab_round2_error_path):
    os.makedirs(directory, exist_ok=True)

In [11]:
# Print, for every tool, the list of GitLab projects recorded as dependents.
for tool_label, gitlab_dependents in zip(df['Name'], df['GitLab Dependents']):
    print(f'{tool_label}: {gitlab_dependents}')

Aim: ['lu-ci/sigma/apex-sigma']
Amazon SageMaker: ['geeks-accelerator/oss/saas-starter-kit', 'gitlab-org/docker-distribution-pruner', 'postgres-ai/database-lab', 'msvechla/vaultbot', 'gitlab-org/gitlab-elasticsearch-indexer', 'gitlab-org/ci-cd/custom-executor-drivers/fargate', 'gitlab-org/ci-cd/docker-machine', 'minds/minds', 'gitlab-org/container-registry', 'juliensimon/amazon-studio-demos', 'fluidattacks/universe', 'rluna-gitlab/gitlab-ce', 'polarsquad/eks-auth-sync', 'gitlab-org/gitlab-foss', 'juliensimon/huggingface-demos', 'sandlin/aws/terraform_eks', 'gitlab-org/gitlab']
Azure Machine Learning: []
ClearML: []
Codalab: []
Comet: []
D6tflow: []
Databricks: []
Deep Lake: []
Determined: []
Domino: []
DVC: []
Guild AI: ['lexikiq/semicolon', 'litecord/litecord', 'nekokatt/hikari', 'lu-ci/sigma/apex-sigma']
Keepsake: []
LakeFS: []
MLflow: ['jan-teichmann/ml-flow-ds-project', 'librespacefoundation/polaris/polaris', 'juliensimon/huggingface-demos']
ModelDB: []
Neptune: []
Pachyderm: []
Po

In [13]:
# Collect the GitHub dependents for the tools at row index 1 and 19 only;
# every other row is skipped.
# NOTE(review): the hardcoded indices look like a partial re-run - confirm
# before relying on this cell to regenerate Round#1 for all tools.
for index, row in df.iterrows():
    if index in [1, 19]:
        github_repos_tool, github_errors_tool = github_miner.collect(
            repo_names=row['GitHub Dependents'], date=row['Creation Date'])

        # Results and errors share the file name and differ only in directory;
        # nothing is written for an empty frame.
        output_name = f'{row["Name"]}.json'
        for frame, directory in (
            (github_repos_tool, github_round1_path),
            (github_errors_tool, github_round1_error_path),
        ):
            if not frame.empty:
                frame.to_json(
                    os.path.join(directory, output_name), indent=4, orient='records')

In [10]:
# Output locations for the second GitLab filtering round.
gitlab_round2_path = os.path.join(gitlab_path, 'Round#2')
gitlab_round2_error_path = os.path.join(gitlab_round2_path, 'Error')

# Creating the nested 'Error' directory also creates Round#2 itself;
# exist_ok=True makes the cell safe to re-run.
os.makedirs(gitlab_round2_error_path, exist_ok=True)

In [None]:
# For every labelled CSV in Round#2, mine the GitLab projects that were kept
# and store the mined data (with the label attached) next to the CSV.
for file_name in glob.glob(os.path.join(gitlab_round2_path, '*.csv')):
    repos = pd.read_csv(file_name)
    # NOTE(review): 'Examplar' has to match the spelling used in the CSV's
    # Category column - confirm it is not a typo for 'Exemplar'.
    repos = repos[repos['Category'] != 'Examplar']

    if repos.empty:
        continue

    # .assign returns a new frame, so the column is not written into a
    # filtered slice (which raises SettingWithCopyWarning).
    repos = repos.assign(
        Repo=repos['Link'].apply(
            lambda link: link.removeprefix('https://gitlab.com/')))

    gitlab_repos, gitlab_errors = gitlab_miner.collect(
        repo_names=repos['Repo'].to_list())
    # Outer join keeps the label of every project, mined successfully or not.
    gitlab_repos = pd.merge(
        gitlab_repos, repos[['Repo', 'Category']], how='outer', on='Repo')

    # The file stem is the tool name; splitext keeps names that contain dots.
    tool_name = os.path.splitext(os.path.basename(file_name))[0]

    if not gitlab_repos.empty:
        gitlab_repos.to_json(os.path.join(
            gitlab_round2_path, f'{tool_name}.json'), indent=4, orient='records')

    if not gitlab_errors.empty:
        gitlab_errors.to_json(os.path.join(
            gitlab_round2_error_path, f'{tool_name}.json'), indent=4, orient='records')

In [20]:
def random_sample(lst, frac=0.1, lowerbound=50, upperbound=50):
    """Return a random sample of ``lst`` drawn without replacement.

    The target size is ``int(len(lst) * frac)``, raised to ``lowerbound`` when
    it is smaller, and finally capped at ``min(len(lst), upperbound)`` so it
    never exceeds the population or the upper bound.  With the default bounds
    (both 50) the result therefore has ``min(len(lst), 50)`` elements.
    """
    wanted = max(int(len(lst) * frac), lowerbound)
    ceiling = min(len(lst), upperbound)
    return random.sample(lst, min(wanted, ceiling))


# Fixed seed so the samples drawn in later cells are repeatable.
random.seed(0)

In [19]:
# Output locations for the second GitHub filtering round and its first sample.
github_round2_path = os.path.join(github_path, 'Round#2')
sample1_github_round2_path = os.path.join(github_round2_path, 'Sample#1')

# Creating the nested 'Sample#1' directory also creates Round#2 itself;
# exist_ok=True makes the cell safe to re-run.
os.makedirs(sample1_github_round2_path, exist_ok=True)

In [14]:
# Round#2 keeps, for each Round#1 file, only the repositories that have at
# least one issue; the result is written under the same file name.
for file_name in glob.glob(os.path.join(github_round1_path, '*.json')):
    with open(file_name) as json_file:
        repos_valid = [
            repo for repo in json.load(json_file) if repo['#Issues'] > 0
        ]

    # Serialise before opening the target so a failure leaves no partial file.
    json_repos = json.dumps(repos_valid, indent='\t')

    # Tool name = file name up to its first dot.
    tool_name = os.path.basename(file_name).partition('.')[0]
    target = os.path.join(github_round2_path, f'{tool_name}.json')
    with open(target, 'w') as outfile:
        outfile.write(json_repos)

In [24]:
# Re-seed here so that re-running this cell on its own draws the same samples.
random.seed(0)

# Every call to random_sample advances the shared RNG, so which repositories a
# tool gets depends on the order the files are visited in.  glob's order is
# filesystem dependent; a case-insensitive sort pins it down.
for file_name in sorted(glob.glob(os.path.join(github_round2_path, '*.json')), key=str.lower):
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    sample_repos = random_sample(json_data)

    # The file stem is the tool name; splitext keeps names that contain dots.
    tool_name = os.path.splitext(os.path.basename(file_name))[0]

    # A tool may have no repository left after filtering; report 0% for it
    # instead of dividing by zero.
    sampled_pct = len(sample_repos) / len(json_data) * 100 if json_data else 0.0
    print(f'{len(sample_repos):2} out of {len(json_data):4} - {sampled_pct:6.2f}%: {tool_name}')

    json_sample_repos = json.dumps(sample_repos, indent='\t')
    with open(os.path.join(sample1_github_round2_path, f'{tool_name}.json'), 'w') as outfile:
        outfile.write(json_sample_repos)

50 out of  174 -  28.74%: Aim
50 out of  701 -   7.13%: Amazon SageMaker
50 out of  514 -   9.73%: Azure Machine Learning
50 out of   51 -  98.04%: ClearML
 6 out of    6 - 100.00%: Codalab
50 out of  340 -  14.71%: Comet
 4 out of    4 - 100.00%: D6tflow
10 out of   10 - 100.00%: Databricks
 3 out of    3 - 100.00%: Deep Lake
27 out of   27 - 100.00%: Determined
50 out of   69 -  72.46%: Domino
50 out of   85 -  58.82%: DVC
50 out of  148 -  33.78%: Guild AI
 3 out of    3 - 100.00%: Keepsake
 6 out of    6 - 100.00%: LakeFS
50 out of  823 -   6.08%: MLflow
 8 out of    8 - 100.00%: ModelDB
50 out of  213 -  23.47%: Neptune
13 out of   13 - 100.00%: Pachyderm
33 out of   33 - 100.00%: Polyaxon
16 out of   16 - 100.00%: Quilt
50 out of  403 -  12.41%: Sacred
44 out of   44 - 100.00%: SigOpt
50 out of  516 -   9.69%: Spell
31 out of   31 - 100.00%: Valohai
50 out of   70 -  71.43%: Vertex AI
50 out of 2824 -   1.77%: Weights & Biases
