In [1]:
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner
import pandas as pd
import random
import glob
import json
import os

In [2]:
# Tool name -> GitHub 'owner/repo' slug. Each slug is passed to
# github_miner.scrape() by the tool-scraping cell, and the slug is also used to
# drop a tool's own repository from its list of GitHub dependents.
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'D6tflow': 'd6t/d6tflow',
    'DVC': 'iterative/dvc',
    'Deep Lake': 'activeloopai/deeplake',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Keepsake': 'replicate/keepsake',
    'LakeFS': 'treeverse/lakeFS',
    'MLflow': 'mlflow/mlflow',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Pachyderm': 'pachyderm/pachyderm',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

# Tool name -> first release date as an ISO 'YYYY-MM-DD' string. For scraped
# tools listed here the value replaces the scraped 'First Release Date'; for
# the link-only tools in tools_link it is the only source of that date, so
# every key of tools_link must also appear here.
tools_release_date = {
    'Comet': '2017-01-01',
    'D6tflow': '2019-02-02',
    'Databricks': '2021-05-27',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Spell': '2017-01-01',
    'Vertex AI': '2019-03-01',
    'cnvrg.io': '2020-03-31'  
}

# Tool name -> landing page for tools that are added to the tools table
# without being scraped (the scraping cell calls them closed-source).
tools_link = {
    'Comet': 'https://github.com/comet-ml',
    'Databricks': 'https://www.databricks.com/product/unity-catalog',
    'SigOpt': 'https://github.com/sigopt',
    'Spell': 'https://github.com/spellml',
    'Vertex AI': 'https://cloud.google.com/vertex-ai',
    'cnvrg.io': 'https://github.com/cnvrg'  
}

# Tool name -> search keyword. The keys decide which '<tool name>.json' raw
# files the dependents-collection cell opens; the keyword values themselves are
# not read anywhere in the code visible in this notebook.
# NOTE(review): presumably the keywords were used to produce the raw search
# results outside this notebook -- confirm.
tools_keywords = {
    'Aim': 'aim',
    'Amazon SageMaker': 'sagemaker',
    'Azure Machine Learning': 'azure',
    'ClearML': 'clearml',
    'Codalab': 'codalab',
    'Comet': 'comet',
    'D6tflow': 'd6tflow',
    'DVC': 'dvc',
    'Deep Lake': 'deeplake',
    'Determined': 'determined',
    'Domino': 'domino',
    'Guild AI': 'guildai',
    'Keepsake': 'keepsake',
    'LakeFS': 'lakefs',
    'MLflow': 'mlflow',
    'ModelDB': 'modeldb',
    'Neptune': 'neptune',
    'Pachyderm': 'pachyderm',
    'Polyaxon': 'polyaxon',
    'Quilt': 'quilt',
    'Sacred': 'sacred',
    'SigOpt': 'sigopt',
    'Spell': 'spell',
    'Valohai': 'valohai',
    'Vertex AI': 'vertex',
    'Weights & Biases': 'wandb'
}

In [3]:
# Directory layout for everything this notebook reads and writes, rooted at
# the dataset folder next to the notebook's parent directory.
path_dataset = '../Dataset'
path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')
path_github_repo_raw = os.path.join(path_github_repo, 'Raw')
path_gitlab_repo_raw = os.path.join(path_gitlab_repo, 'Raw')
path_github_repo_scraped = os.path.join(path_github_repo, 'Scraped')
path_gitlab_repo_scraped = os.path.join(path_gitlab_repo, 'Scraped')
# NOTE(review): despite its "gitlab" name this folder is created under the
# GitHub repo directory. It looks like a copy-paste slip, but labelled files
# may already live at this location, so the path is deliberately left as-is --
# confirm before relocating it.
path_gitlab_repo_labelled = os.path.join(path_github_repo, 'labelled')

# exist_ok=True makes the cell idempotent without a separate existence check
# (no window between "check" and "create"); makedirs also creates any missing
# parent directories, so the order of the entries does not matter.
for directory in (
    path_github,
    path_gitlab,
    path_github_repo,
    path_gitlab_repo,
    path_github_issue,
    path_gitlab_issue,
    path_github_repo_raw,
    path_gitlab_repo_raw,
    path_github_repo_scraped,
    path_gitlab_repo_scraped,
    path_gitlab_repo_labelled,
):
    os.makedirs(directory, exist_ok=True)

In [4]:
# SECURITY: access tokens must never be stored in the notebook. They are read
# from the environment instead. Any token that was previously committed in
# this cell has to be treated as leaked and revoked on GitHub / GitLab.
#
# GITHUB_TOKEN and GITLAB_TOKEN are required (KeyError if unset); the extra
# GitHub tokens are optional spares and are None when not provided.
github_token1 = os.environ['GITHUB_TOKEN']
github_token2 = os.environ.get('GITHUB_TOKEN_2')
github_token3 = os.environ.get('GITHUB_TOKEN_3')
github_token4 = os.environ.get('GITHUB_TOKEN_4')
gitlab_token1 = os.environ['GITLAB_TOKEN']

# Miner clients used by the scraping/collection cells below.
github_miner = GitHubMiner(github_token1)
gitlab_miner = GitLabMiner(gitlab_token1)

In [4]:
# Read the API tokens from the environment so that no credential lives in the
# notebook; a missing variable raises KeyError before any client is created.
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

# Miner clients used by the scraping/collection cells below.
github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)

In [None]:
# StarTrack-js preload link naming eight of the tool repositories (mlflow, dvc,
# pachyderm, deeplake, wandb, clearml, sacred, polyaxon). It is kept for manual
# use only: no other code visible in this notebook reads this variable.
# NOTE(review): presumably opens a star-history comparison in the browser -- confirm.
star_tracker_url = 'https://seladb.github.io/StarTrack-js/#/preload?r=mlflow,mlflow&r=iterative,dvc&r=pachyderm,pachyderm&r=activeloopai,deeplake&r=wandb,wandb&r=allegroai,clearml&r=IDSIA,sacred&r=polyaxon,polyaxon'

In [5]:
# Build the tools table: one row per tool, scraped from GitHub where a
# repository is known and filled in by hand otherwise. Single-row frames are
# collected in a list and concatenated once at the end; concatenating inside
# the loop re-copies the growing frame on every iteration and relied on
# pd.concat silently dropping the initial None.
tool_frames = []

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    tool_data, error_data = github_miner.scrape(tool_repo, release_date=True)
    if error_data is not None:
        # report the failure and carry on with the remaining tools
        print(error_data)
        continue
    tool_data['Name'] = tool_name
    # a hand-curated release date takes precedence over the scraped one
    if tool_name in tools_release_date:
        tool_data['First Release Date'] = pd.to_datetime(
            tools_release_date[tool_name])
    tool_frames.append(pd.DataFrame([tool_data]))

# add closed-source asset-management tools
for tool_name, tool_link in tools_link.items():
    tool_frames.append(pd.DataFrame([{
        'Name': tool_name,
        'Link': tool_link,
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }]))

# Raises ValueError if nothing at all was collected.
tools_data = pd.concat(tool_frames, ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')
tools_data

Unnamed: 0,Repo,Link,Repo Creation Date,Last Commit Date,Topics,Language,Size,#Star,#Watch,#Fork,#Contributors,#Branches,#Releases,#Commits,#Pull Requests,#Pull Requests (Open),#Issues,#Issues (Open),First Release Date,Name
0,mlflow/mlflow,https://github.com/mlflow/mlflow,2018-06-05 16:05:58,2022-12-15 10:26:27,"[machine-learning, ai, ml, mlflow, apache-spar...",Python,120957.0,13188.0,285.0,3077.0,455.0,203.0,60.0,3568.0,4771.0,139.0,7417.0,1004.0,2018-06-27 16:19:13,MLflow
1,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2022-12-05 02:03:53,"[data-science, machine-learning, reproducibili...",Python,16891.0,10791.0,136.0,1010.0,253.0,8.0,408.0,8232.0,4413.0,15.0,8454.0,656.0,2017-05-04 08:03:08,DVC
2,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2022-12-13 09:26:49,"[python, ai, data-science, data-visualization,...",TypeScript,57914.0,2916.0,35.0,183.0,48.0,80.0,45.0,1978.0,1661.0,27.0,2411.0,206.0,2022-01-22 13:45:58,Aim
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2022-12-13 14:09:42,"[version-control, experiment-manager, version,...",Python,38696.0,3849.0,81.0,517.0,58.0,3.0,73.0,1921.0,164.0,0.0,841.0,306.0,2019-06-11 17:27:11,ClearML
4,pachyderm/pachyderm,https://github.com/pachyderm/pachyderm,2014-09-04 07:50:02,2022-12-15 17:08:46,"[go, pachyderm, docker, analytics, big-data, c...",Go,306635.0,5722.0,167.0,539.0,154.0,1158.0,360.0,21483.0,5363.0,189.0,8439.0,883.0,2014-11-26 22:49:29,Pachyderm
5,treeverse/lakeFS,https://github.com/treeverse/lakeFS,2019-09-12 11:46:28,2022-12-15 13:25:35,"[data-engineering, data-versioning, go, object...",Go,83126.0,3073.0,40.0,260.0,71.0,495.0,100.0,3705.0,2937.0,24.0,4803.0,510.0,2020-08-03 13:01:18,LakeFS
6,IDSIA/sacred,https://github.com/IDSIA/sacred,2014-03-31 18:05:29,2022-11-30 10:00:22,"[python, machine-learning, infrastructure, rep...",Python,6112.0,3960.0,70.0,361.0,90.0,8.0,11.0,1326.0,349.0,2.0,892.0,92.0,2016-01-13 18:56:23,Sacred
7,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 18:57:50,2022-12-07 21:27:40,[],Python,17496.0,758.0,13.0,69.0,19.0,60.0,2.0,5212.0,66.0,1.0,457.0,180.0,2022-04-28 14:31:07,Guild AI
8,VertaAI/modeldb,https://github.com/VertaAI/modeldb,2016-10-19 01:07:26,2022-12-15 16:45:37,"[machine-learning, model-management, modeldb, ...",Java,46868.0,1534.0,71.0,263.0,48.0,541.0,2.0,3580.0,3324.0,114.0,3456.0,194.0,2020-04-01 03:47:14,ModelDB
9,polyaxon/polyaxon,https://github.com/polyaxon/polyaxon,2016-12-26 12:48:47,2022-11-11 00:27:56,"[deep-learning, machine-learning, artificial-i...",Python,126225.0,3215.0,77.0,316.0,90.0,16.0,0.0,10010.0,398.0,2.0,1458.0,117.0,2018-10-16 00:00:00,Polyaxon


In [6]:
def _sourcegraph_repos(results, host):
    """Return the repository names hosted on `host` found in Sourcegraph results.

    Args:
        results: the list stored under 'Results' in a raw search file; each
            entry is a dict with a '__typename' key.
        host: code-hosting domain to keep, e.g. 'github.com' or 'gitlab.com'.

    Returns:
        Repository names with the leading '<host>/' stripped, in result order.
        A repository appears once per matching result, so duplicates are kept.
    """
    prefix = host + '/'
    repos = []
    for result in results:
        typename = result['__typename']
        # file name match pattern
        if 'FileMatch' in typename and host in result['repository']['name']:
            repos.append(result['repository']['name'].removeprefix(prefix))
        # code usage match pattern
        elif 'Repository' in typename and host in result['name']:
            repos.append(result['name'].removeprefix(prefix))
    return repos


# One record per tool that has at least one dependent; the frame is built once
# after the loop instead of being re-concatenated for every tool.
dependent_records = []

# collect dependents for tools with coding patterns
for tool_name in tools_keywords:
    # collect Github dependents
    file_name = os.path.join(path_github_repo_raw, tool_name + '.json')
    with open(file_name, encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    # either search by sourcegraph
    if 'Results' in json_data:
        github_dependents = _sourcegraph_repos(json_data['Results'], 'github.com')
    # or search by dependent graph
    elif 'all_public_dependent_repos' in json_data:
        github_dependents = [
            repo_file['name']
            for repo_file in json_data['all_public_dependent_repos']
        ]
    else:
        github_dependents = []

    # collect Gitlab dependents (search by sourcegraph exclusively)
    file_name = os.path.join(path_gitlab_repo_raw, tool_name + '.json')
    with open(file_name, encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    gitlab_dependents = _sourcegraph_repos(json_data['Results'], 'gitlab.com')

    # Remove the tool's own repo from its dependents. Every occurrence is
    # dropped: a repository can be listed once per matching result, and
    # list.remove() would only delete the first of them.
    own_repo = tools_repo.get(tool_name)
    if own_repo is not None:
        github_dependents = [
            repo_name for repo_name in github_dependents if repo_name != own_repo
        ]
    # NOTE(review): other repositories may still be listed more than once;
    # confirm whether the dependents should be de-duplicated here.

    # no need to add tools without dependents
    if not github_dependents and not gitlab_dependents:
        continue

    dependent_records.append({
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    })

dependents = pd.DataFrame(
    dependent_records,
    columns=['Tool', 'GitHub Dependents', 'GitLab Dependents'])

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')

In [None]:
# Per-tool dependent counts. The columns are derived directly from the
# `dependents` frame instead of concatenating one single-row frame per tool
# onto an initially empty frame, which is quadratic and leaves the count
# columns with object dtype.
dependents_summary = pd.DataFrame({
    'Tool': dependents['Tool'],
    '#GitHub Dependents': dependents['GitHub Dependents'].map(len),
    '#GitLab Dependents': dependents['GitLab Dependents'].map(len),
}).reset_index(drop=True)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,MLflow,1189,3
1,DVC,4229,0
2,Aim,92,1
3,ClearML,303,0
4,Pachyderm,10,0
5,LakeFS,8,0
6,Sacred,1289,0
7,Guild AI,53,4
8,ModelDB,7,0
9,Polyaxon,35,0


In [5]:
# Reload the dependents table from Dependents.json (the file written by the
# dependents-collection cell) so the cells below can run without re-collecting.
df = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))

In [6]:
def slice_per(source, step):
    """Deal `source` into `step` interleaved slices.

    Slice number k holds the elements at positions k, k + step, k + 2 * step,
    and so on. The result always has `step` entries (some may be empty when
    `source` is shorter than `step`); a non-positive `step` yields [].
    """
    slices = []
    for offset in range(step):
        slices.append(source[offset::step])
    return slices

# Number of interleaved shards the Weights & Biases dependents are split into.
num_workers = 100
# Select the row by tool name rather than by a hard-coded row label: tools
# without dependents are skipped when Dependents.json is written, so the
# position of a given tool is not stable across dataset refreshes.
# Raises IndexError if the tool is missing from the table.
wandb = df.loc[df['Tool'] == 'Weights & Biases', 'GitHub Dependents'].iloc[0]
wandb_sliced = slice_per(wandb, num_workers)

In [7]:
# Index of the first shard still to be processed; earlier shards are skipped,
# which lets an interrupted run be resumed. Set to 0 for a full run.
first_worker = 74

# The upper bound follows the number of shards actually produced instead of
# repeating the literal 100, so it cannot drift out of sync with the slicing.
for num_worker in range(first_worker, len(wandb_sliced)):
    print(f'Worker {num_worker + 1} is working')
    repos_data, errors_data = github_miner.collect(wandb_sliced[num_worker])

    # successfully collected repositories of this shard
    if repos_data is not None:
        repos_data.to_json(os.path.join(path_github_repo_scraped, f'Weights & Biases_{num_worker}.json'), indent=4, orient='records')

    # repositories that could not be collected are stored separately
    if errors_data is not None:
        errors_data.to_json(os.path.join(path_github_repo_scraped, f'Discarded.Weights & Biases_{num_worker}.json'), indent=4, orient='records')

Worker 75 is working


In [33]:
# Gitlab dependents retrieval for labelling: for every tool that has GitLab
# dependents, write a '<tool>.json' file listing their 'gitlab.com/...' links.
# NOTE(review): the target folder is the one the GitLab mining cell later reads
# 'taxonomy' labels from, so re-running this cell presumably overwrites files
# that have already been labelled -- confirm before re-executing.
for _, record in df.iterrows():
    gitlab_repos = record['GitLab Dependents']
    if gitlab_repos:
        links = pd.DataFrame(
            {'Link': ['gitlab.com/' + name for name in gitlab_repos]})
        target_file = os.path.join(
            path_gitlab_repo_labelled, f'{record["Tool"]}.json')
        links.to_json(target_file, indent=4, orient='records')

In [6]:
# Gitlab valid dependents general information mining: for every labelled file,
# keep the repositories whose first taxonomy label is one of the accepted
# categories and collect their metadata with the GitLab miner.
project_categories = {'Project', 'Toolkit', 'Research'}
for file_name in glob.glob(os.path.join(path_gitlab_repo_labelled, '*.json')):
    repos = pd.read_json(file_name)
    repos_name = []
    for index, row in repos.iterrows():
        # assumes each row carries a nested 'taxonomy' annotation whose first
        # entry holds the category label -- TODO confirm against the labelling tool
        if row['taxonomy'][0]['taxonomy'][0][0] in project_categories:
            repo_name = row['Link'].removeprefix('gitlab.com/')
            repos_name.append(repo_name)

    if not repos_name:
        continue

    repos_data, errors_data = gitlab_miner.collect(repos_name)
    # Strip only the final '.json' extension. Splitting on the first dot would
    # truncate tool names that themselves contain a dot (e.g. 'cnvrg.io').
    tool_name = os.path.splitext(os.path.basename(file_name))[0]

    if repos_data is not None:
        repos_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'{tool_name}.json'), indent=4, orient='records')
    if errors_data is not None:
        errors_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'Discarded.{tool_name}.json'), indent=4, orient='records')

In [None]:
# Github dependents general information mining: collect metadata for every
# GitHub dependent of each tool and store it per tool, most-issues first.
for row_label, record in df.iterrows():
    tool = record["Tool"]
    print(f'{row_label}: {tool}')
    collected, discarded = github_miner.collect(record['GitHub Dependents'])

    if collected is not None:
        collected = collected.sort_values(by='#Issues', ascending=False)
        scraped_file = os.path.join(path_github_repo_scraped, f'{tool}.json')
        collected.to_json(scraped_file, indent=4, orient='records')

    # repositories that could not be collected are kept in a separate file
    if discarded is not None:
        discarded_file = os.path.join(
            path_github_repo_scraped, f'Discarded.{tool}.json')
        discarded.to_json(discarded_file, indent=4, orient='records')

In [10]:
def random_sample(lst, frac=0.1, lowerbound=50, upperbound=50):
    """Randomly sample a fraction of elements from a list.

    The sample size starts at int(len(lst) * frac), is raised to `lowerbound`
    when smaller, and is finally capped at both `upperbound` and len(lst), so
    the cap wins over the lower bound and the draw never exceeds the list.

    Returns:
        A new list of distinct positions' elements drawn with random.sample.
    """
    wanted = max(int(len(lst) * frac), lowerbound)
    capped = min(wanted, len(lst), upperbound)
    return random.sample(lst, capped)


# Fix the seed of the global `random` generator so that draws made through
# random.sample (used by random_sample) are repeatable on a fresh run.
random.seed(0)

In [23]:
from github import Github

# SECURITY: the token is read from the environment; a credential must never be
# pasted into a cell. Any token previously committed here has to be revoked.
github = Github(os.environ['GITHUB_TOKEN'])
# Ad-hoc probe of a single repository's releases.
repo = github.get_repo('polyaxon/polyaxon')
releases = repo.get_releases()

In [2]:
from gitlab import Gitlab

# SECURITY: the token is read from the environment; a credential must never be
# pasted into a cell. Any token previously committed here has to be revoked.
gitlab = Gitlab(private_token=os.environ['GITLAB_TOKEN'])
# Ad-hoc probe of a single project's commits.
repo = gitlab.projects.get('librespacefoundation/polaris/polaris')
commits = repo.commits.list()