In [25]:
import ast
import glob
import json
import os

import pandas as pd
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner

In [26]:
tool_repos = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets',
    'Valohai': 'valohai/valohai-cli',
    'Domino': 'dominodatalab/python-domino',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Neptune': 'neptune-ai/neptune-client'
}

tools_creation_date = {
    'Vertex AI': '2019-03-01T00:00:00',
    'Databricks': '2021-05-27T00:00:00',
    'Comet': '2017-01-01T00:00:00',
    'Spell': '2017-01-01T00:00:00',
}

tools_link = {
    'Vertex AI': 'https://cloud.google.com/vertex-ai',
    'Databricks': 'https://www.databricks.com',
    'Comet': 'http://comet.ml',
    'Spell': 'https://spell.ml',
}


In [39]:
raw_path = '../Dataset/Raw'
tool_path = '../Dataset/Tool'

if not os.path.exists(raw_path):
    os.makedirs(raw_path)

if not os.path.exists(tool_path):
    os.makedirs(tool_path)


In [40]:
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)


In [41]:
tools_data = []

for file_name in glob.glob(os.path.join(raw_path, '*.json')):
    github_repo_names = set()
    gitlab_repo_names = set()
    with open(file_name) as json_file:
        json_data = json.load(json_file)
        for repo_file in json_data['Results']:
            if repo_file['__typename'] == 'FileMatch':
                if 'github' in repo_file['repository']['name']:
                    repo_name = repo_file['repository']['name'].removeprefix('github.com/')
                    github_repo_names.add(repo_name)
                elif 'gitlab' in repo_file['repository']['name']:
                    repo_name = repo_file['repository']['name'].removeprefix('gitlab.com/')
                    gitlab_repo_names.add(repo_name)
            elif repo_file['__typename'] == 'Repository':
                if 'github' in repo_file['name']:
                    repo_name = repo_file['name'].removeprefix('github.com/')
                    github_repo_names.add(repo_name)
                elif 'gitlab' in repo_file['name']:
                    repo_name = repo_file['name'].removeprefix('gitlab.com/')
                    gitlab_repo_names.add(repo_name)

    tool_name = file_name.split('\\')[-1].split('.')[0]
    if tool_name in tool_repos:
        tool_data, error_data = github_miner.scrape(
            repo_name=tool_repos[tool_name], name=tool_name)
        tool_data.pop('Archived')
    else:
        tool_data = {
            'Name': tool_name,
            'Link': tools_link[tool_name],
            'Creation Date': pd.to_datetime(tools_creation_date[tool_name]).to_datetime64()
        }

    tool_data['GitHub Dependents'] = list(github_repo_names)
    tool_data['GitLab Dependents'] = list(gitlab_repo_names)
    tool_data = pd.DataFrame([tool_data])
    tools_data.append(tool_data)

tools_data = pd.concat(tools_data, ignore_index=True)
tools_data.to_csv(os.path.join(tool_path, 'Tools.csv'), index=False)
tools_data


Unnamed: 0,Repo,Link,Creation Date,First Activity Date,Last Activity Date,Topics,#Star,#Watch,#Fork,#Releases,...,Size,#Issues,#Pull Requests,#Branches,#Commits,#Comments,#Deployments,Name,GitHub Dependents,GitLab Dependents
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2019-05-31 18:25:07,2022-11-18 15:58:44,"[python, ai, data-science, data-visualization,...",2868.0,35.0,178.0,43.0,...,56556.0,2341.0,1620.0,77.0,1942.0,8.0,5.0,Aim,"[crownpku/aiml_chatbot, VlachosGroup/AIMSim, c...",[lu-ci/sigma/apex-sigma]
1,aws/sagemaker-python-sdk,https://github.com/aws/sagemaker-python-sdk,2017-11-14 01:03:33,2017-11-30 05:59:28,2022-11-15 03:24:09,"[aws, mxnet, tensorflow, machine-learning, pyt...",1731.0,130.0,882.0,463.0,...,107853.0,3316.0,2197.0,12.0,2729.0,23.0,0.0,Amazon SageMaker,"[cc4i/mahjong0, ryanprior/guix-packages, Chain...","[msvechla/vaultbot, gitlab-org/ci-cd/custom-ex..."
2,Azure/azure-sdk-for-python,https://github.com/Azure/azure-sdk-for-python,2012-04-24 16:46:12,2012-05-02 17:44:40,2022-11-18 07:43:33,"[python, azure, azure-sdk, hacktoberfest]",3300.0,356.0,2086.0,2591.0,...,517903.0,27528.0,20117.0,536.0,12953.0,72.0,2795.0,Azure Machine Learning,"[Azure/nlp-samples, IBM/watson-openscale-sampl...",[]
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2019-06-10 08:18:33,2022-11-15 13:59:26,"[version-control, experiment-manager, version,...",3764.0,78.0,501.0,70.0,...,37351.0,814.0,158.0,3.0,1876.0,10.0,3.0,ClearML,"[okojoalg/raft-mlp, megvii-research/basecls, x...",[]
4,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2013-11-12 02:18:24,2022-11-16 19:39:27,[],133.0,20.0,78.0,112.0,...,27460.0,4309.0,2211.0,121.0,4522.0,23.0,1551.0,Codalab,"[codalab/codalab-competitions, CharlesPikachu/...",[]
5,,http://comet.ml,2017-01-01 00:00:00,NaT,NaT,,,,,,...,,,,,,,,Comet,[saeyslab/DeepLearning_for_ImagingFlowCytometr...,[]
6,d6t/d6tflow,https://github.com/d6t/d6tflow,2019-02-03 01:51:22,2019-02-02 21:19:03,2022-10-26 17:04:54,[],945.0,22.0,73.0,0.0,...,271.0,42.0,19.0,1.0,269.0,0.0,0.0,D6tflow,[],[]
7,,https://www.databricks.com,2021-05-27 00:00:00,NaT,NaT,,,,,,...,,,,,,,,Databricks,"[rchynoweth/DemoContent, RafiKurlansik/dais202...",[]
8,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:17:59,2019-08-09 06:18:49,2022-11-17 14:18:36,"[datasets, deep-learning, machine-learning, da...",5019.0,63.0,407.0,72.0,...,61055.0,1983.0,1589.0,90.0,6490.0,5.0,48.0,Deep Lake,"[activeloopai/deeplake, arnor-sigurdsson/EIR, ...",[]
9,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2020-04-07 16:29:08,2022-11-17 21:03:23,"[deep-learning, machine-learning, ml-platform,...",1927.0,57.0,257.0,73.0,...,101788.0,5453.0,5213.0,171.0,4568.0,46.0,29.0,Determined,"[mdaniel/virtualbox-org-svn-vbox-trunk, frankf...",[]


In [42]:
df_summary = pd.DataFrame(columns=['Name', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in tools_data.iterrows():
    row_new = {
        'Name': row['Name'],
        '#GitHub Dependents': len(row["GitHub Dependents"]),
        '#GitLab Dependents': len(row["GitLab Dependents"])
    }
    df_summary = df_summary.append(row_new, ignore_index=True)
df_summary.to_csv(os.path.join(tool_path, '#Dependents.csv'), index=False)
df_summary


Unnamed: 0,Name,#GitHub Dependents,#GitLab Dependents
0,Aim,359,1
1,Amazon SageMaker,5364,17
2,Azure Machine Learning,641,0
3,ClearML,59,0
4,Codalab,6,0
5,Comet,432,0
6,D6tflow,0,0
7,Databricks,12,0
8,Deep Lake,3,0
9,Determined,44,0


In [None]:
'''
if not os.path.exists(tool_path):
    os.makedirs(tool_path)

file_name = os.path.join(raw_path, 'Codalab.json')
tool_name = file_name.split('/')[-1].split('.')[0]

github_repo_names = set()
gitlab_repo_names = set()

with open(file_name) as json_file:
    json_data = json.load(json_file)
    for repo_file in json_data['Results']:
        if 'github' in repo_file['name']:
            repo_name = repo_file['name'].removeprefix('github.com/')
            github_repo_names.add(repo_name)
        if 'gitlab' in repo_file['name']:
            repo_name = repo_file['name'].removeprefix('gitlab.com/')
            github_repo_names.add(repo_name)

if tool_name in tool_repos:
    tool_data, error_data = github_miner.scrape(
        repo_name=tool_repos[tool_name], name=tool_name, real_creation_date=False)
    del tool_data['Archived']
else:
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'Creation Date': pd.Timestamp(tools_creation_date[tool_name])
    }
    
tool_data['GitHub Dependents'] = list(github_repo_names)
tool_data['GitLab Dependents'] = list(gitlab_repo_names)
tool_data = pd.DataFrame([tool_data])
tool_data.to_csv(os.path.join(tool_path, f'{tool_name}.csv'), index=False)
print(len(github_repo_names))
print(len(gitlab_repo_names))
'''


In [19]:
'''
for index, row in tools_data.iterrows():
    df_github_dependents = pd.DataFrame(columns=['Link'])
    df_gitlab_dependents = pd.DataFrame(columns=['Link'])
    for repo_name in ast.literal_eval(row['GitHub Dependents']):
        df_repo = pd.DataFrame([{'Link': 'https://github.com/' + repo_name}])
        df_github_dependents = pd.concat([df_github_dependents, df_repo])
    for repo_name in ast.literal_eval(row['GitLab Dependents']):
        df_repo = pd.DataFrame([{'Link': 'https://gitlab.com/' + repo_name}])
        df_gitlab_dependents = pd.concat([df_gitlab_dependents, df_repo])
    df_github_dependents.to_csv(os.path.join(github_round1_path, f'Repo_{tool_name}.csv'), index=False)
    df_gitlab_dependents.to_csv(os.path.join(gitlab_round1_path, f'Repo_{row["Name"]}.csv'), index=False)
'''


In [6]:
'''
def date_parser(x):
    return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'), parse_dates=[
                 'Creation Date'], date_parser=date_parser)
'''

In [43]:
df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'))
df['Creation Date'] = pd.to_datetime(df['Creation Date'])

In [44]:
for index, row in df.iterrows():
    print(f'{row["Name"]}: {row["GitLab Dependents"]}')

Aim: ['lu-ci/sigma/apex-sigma']
Amazon SageMaker: ['msvechla/vaultbot', 'gitlab-org/ci-cd/custom-executor-drivers/fargate', 'gitlab-org/ci-cd/docker-machine', 'rluna-gitlab/gitlab-ce', 'gitlab-org/gitlab', 'gitlab-org/docker-distribution-pruner', 'polarsquad/eks-auth-sync', 'sandlin/aws/terraform_eks', 'minds/minds', 'postgres-ai/database-lab', 'gitlab-org/gitlab-elasticsearch-indexer', 'juliensimon/huggingface-demos', 'gitlab-org/gitlab-foss', 'geeks-accelerator/oss/saas-starter-kit', 'juliensimon/amazon-studio-demos', 'fluidattacks/universe', 'gitlab-org/container-registry']
Azure Machine Learning: []
ClearML: []
Codalab: []
Comet: []
D6tflow: []
Databricks: []
Deep Lake: []
Determined: []
Domino: []
DVC: []
Guild AI: ['lexikiq/semicolon', 'litecord/litecord', 'nekokatt/hikari', 'lu-ci/sigma/apex-sigma']
Keepsake: []
LakeFS: []
MLflow: ['juliensimon/huggingface-demos', 'librespacefoundation/polaris/polaris', 'jan-teichmann/ml-flow-ds-project']
ModelDB: []
Neptune: []
Pachyderm: []
Po

In [45]:
filter_path = '../Dataset/Filter'
    
github_path = os.path.join(filter_path, 'GitHub')
gitlab_path = os.path.join(filter_path, 'GitLab')

github_round1_path = os.path.join(github_path, 'Round#1')
gitlab_round1_path = os.path.join(gitlab_path, 'Round#1')

github_round1_error_path = os.path.join(github_round1_path, 'Error')
gitlab_round1_error_path = os.path.join(gitlab_round1_path, 'Error')

if not os.path.exists(filter_path):
    os.makedirs(filter_path)

if not os.path.exists(github_path):
    os.makedirs(github_path)

if not os.path.exists(github_round1_path):
    os.makedirs(github_round1_path)

if not os.path.exists(github_round1_error_path):
    os.makedirs(github_round1_error_path)

if not os.path.exists(gitlab_round1_error_path):
    os.makedirs(gitlab_round1_error_path)


In [46]:
gitlab_miner = GitLabMiner('glpat-6ejKEymaxpnDHVqv-yk7')
github_miner = GitHubMiner('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')

In [47]:
for index, row in df.iterrows():
    github_repos_tool, github_errors_tool = github_miner.collect(
        repo_names=ast.literal_eval(row['GitHub Dependents']), date=row['Creation Date'])

    if not github_repos_tool.empty:
        github_repos_tool.to_csv(os.path.join(
            github_round1_path, f'{row["Name"]}.csv'), index=False)

    if not github_errors_tool.empty:
        github_errors_tool.to_csv(os.path.join(
            github_round1_error_path, f'{row["Name"]}.csv'), index=False)

In [17]:
df_summary = pd.DataFrame(columns=['Name', '#GitHub Dependents'])
for file_name in glob.glob(os.path.join(github_round1_path, '*.csv')):
    tool_name = file_name.split('\\')[-1].split('.')[0]
    df = pd.read_csv(file_name)
    row_new = {
        'Name': tool_name,
        '#GitHub Dependents': df.shape[0]
    }
    df_summary = df_summary.append(row_new, ignore_index=True)
df_summary


Unnamed: 0,Name,#GitHub Dependents
0,Aim,704
1,Amazon SageMaker,560
2,ClearML,55
3,Codalab,146
4,D6tflow,4
5,Deep Lake,2
6,Guild AI,144
7,Keepsake,3
8,LakeFS,9
9,MLflow,226


In [28]:
gitlab_round2_path = os.path.join(gitlab_path, 'Round#2')
gitlab_round2_error_path = os.path.join(gitlab_round2_path, 'Error')

if not os.path.exists(gitlab_round2_path):
    os.makedirs(gitlab_round2_path)

if not os.path.exists(gitlab_round2_error_path):
    os.makedirs(gitlab_round2_error_path)

In [None]:
for file_name in glob.glob(os.path.join(gitlab_round1_path, '*.csv')):
    tool_name = file_name.split('\\')[-1].split('.')[0]
    repos = pd.read_csv(file_name)
    repos = repos[repos['Category'] != 'Examplar']

    if repos.empty:
        continue

    repos['Repo'] = repos['Link'].apply(
        lambda x: x.removeprefix('https://gitlab.com/'))
    gitlab_repos, gitlab_errors = gitlab_miner.collect(
        repo_names=repos['Repo'].to_list())
    gitlab_repos = pd.merge(
        gitlab_repos, repos[['Repo', 'Category']], how='outer', on='Repo')

    if not gitlab_repos.empty:
        gitlab_repos.to_csv(os.path.join(gitlab_round2_path,
                                         f'{tool_name}.csv'), index=False)

    if not gitlab_errors.empty:
        gitlab_errors.to_csv(os.path.join(
            gitlab_round2_error_path, f'{tool_name}.csv'), index=False)

In [None]:
'''
for file_name in glob.glob(os.path.join(github_round2_path, '*.csv')):
    tool_name = file_name.split('_')[-1].split('.')[0]

    df = pd.read_csv(file_name)
    df['Name'] = df['Link'].apply(
        lambda x: x.removeprefix('https://gitlab.com/'))

    df_tool = df[df['Category'] == 'Tool']
    df_project = df[df['Category'] == 'Project']
    df_research = df[df['Category'] == 'Research']

    repos_tool = df_tool['Name'].to_list()
    repos_project = df_project['Name'].to_list()
    repos_research = df_research['Name'].to_list()

    github_repos_tool, github_errors_tool = github_miner.collect(
        repo_names=repos_tool)
    github_repos_research, github_errors_research = github_miner.collect(
        repo_names=repos_research)
    github_repos_project, github_errors_project = github_miner.collect(
        repo_names=repos_project)

    github_repos_tool.to_csv(os.path.join(
        github_round1_path, f'Tool_{tool_name}.csv'), index=False)
    github_repos_research.to_csv(os.path.join(
        github_round1_path, f'Research_{tool_name}.csv'), index=False)
    github_repos_project.to_csv(os.path.join(
        github_round1_path, f'Project_{tool_name}.csv'), index=False)

    if not github_errors_tool.empty:
        github_errors_tool.to_csv(os.path.join(
            github_round1_error_path, f'Tool_{tool_name}.csv'), index=False)

    if not github_errors_research.empty:
        github_errors_research.to_csv(os.path.join(
            github_round1_error_path, f'Research_{tool_name}.csv'), index=False)

    if not github_errors_project.empty:
        github_errors_project.to_csv(os.path.join(
            github_round1_error_path, f'Project_{tool_name}.csv'), index=False)
'''


In [None]:
'''
github_round3_path = os.path.join(github_path, 'Round#3')
gitlab_round3_path = os.path.join(gitlab_path, 'Round#3')

github_round3_error_path = os.path.join(github_round3_path, 'Error')
gitlab_round3_error_path = os.path.join(gitlab_round3_path, 'Error')

if not os.path.exists(github_round3_path):
    os.makedirs(github_round3_path)

if not os.path.exists(gitlab_round3_path):
    os.makedirs(gitlab_round3_path)

if not os.path.exists(github_round3_error_path):
    os.makedirs(github_round3_error_path)

if not os.path.exists(gitlab_round3_error_path):
    os.makedirs(gitlab_round3_error_path)
'''
