In [1]:
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner
import pandas as pd
import random
import glob
import json
import os


In [2]:
# Open-source tools, keyed by display name, mapped to the GitHub repository
# ("owner/name") that is handed to the GitHub miner for scraping.
tools_repo = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets',
    'Valohai': 'valohai/valohai-cli',
    'Domino': 'dominodatalab/python-domino',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Neptune': 'neptune-ai/neptune-client',
}

# Manually specified first-release dates (ISO 8601 strings). When a tool listed
# here is also scraped, this value is written onto its scraped record.
tools_release_date = {
    'Vertex AI': '2019-03-01T00:00:00.000Z',
    'Databricks': '2021-05-27T00:00:00.000Z',
    'Comet': '2017-01-01T00:00:00.000Z',
    'Spell': '2017-01-01T00:00:00.000Z',
    'SigOpt': '2014-11-01T00:00:00.000Z',
    'D6tflow': '2019-02-02T00:00:00.000Z',
    'Polyaxon': '2018-10-16T00:00:00.000Z',
}

# Tools that are not scraped from a repository; each one needs a matching
# entry in `tools_release_date`.
tools_link = {
    'Vertex AI': 'https://cloud.google.com/vertex-ai',
    'Databricks': 'https://www.databricks.com/product/unity-catalog',
    'Comet': 'https://github.com/comet-ml',
    'Spell': 'https://github.com/spellml',
    'SigOpt': 'https://github.com/sigopt',
}

# The keys select which tools get their raw dependents files processed
# (one "<key>.json" per tool). The values are presumably the search keywords
# used to produce those files -- they are not read by any visible cell.
tools_keywords = {
    'MLflow': 'mlflow',
    'DVC': 'dvc',
    'Aim': 'aim',
    'ClearML': 'clearml',
    'Pachyderm': 'pachyderm',
    'LakeFS': 'lakefs',
    'Sacred': 'sacred',
    'Guild AI': 'guildai',
    'ModelDB': 'modeldb',
    'Polyaxon': 'polyaxon',
    'Quilt': 'quilt',
    'D6tflow': 'd6tflow',
    'Deep Lake': 'deeplake',
    'Keepsake': 'keepsake',
    'Determined': 'determined',
    'Weights & Biases': 'wandb',
    'Codalab': 'codalab',
    'Valohai': 'valohai',
    'Domino': 'domino',
    'Azure Machine Learning': 'azure',
    'Amazon SageMaker': 'sagemaker',
    'Neptune': 'neptune',
    'Vertex AI': 'vertex',
    'Comet': 'comet',
    'Spell': 'spell',
    'SigOpt': 'sigopt',
}


In [10]:
# Dataset directory layout: <dataset>/<host>/Round#<n>/<stage>
path_dataset = '../Dataset'
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab_round1 = os.path.join(path_gitlab, 'Round#1')
path_github_round1 = os.path.join(path_github, 'Round#1')
path_gitlab_round2 = os.path.join(path_gitlab, 'Round#2')
path_github_round2 = os.path.join(path_github, 'Round#2')
path_gitlab_round1_raw = os.path.join(path_gitlab_round1, 'Raw')
path_github_round1_raw = os.path.join(path_github_round1, 'Raw')
path_gitlab_round2_raw = os.path.join(path_gitlab_round2, 'Raw')
path_github_round2_raw = os.path.join(path_github_round2, 'Raw')
path_gitlab_round1_labelled = os.path.join(path_gitlab_round1, 'Labelled')
path_gitlab_round1_filtered = os.path.join(path_gitlab_round1, 'Filtered')
path_github_round1_filtered = os.path.join(path_github_round1, 'Filtered')

# Create the whole tree up front. `exist_ok=True` makes the call idempotent
# without a separate existence check (which is racy), and raises immediately
# if one of the paths exists but is not a directory.
for _path in (
    path_gitlab,
    path_github,
    path_gitlab_round1,
    path_github_round1,
    path_gitlab_round2,
    path_github_round2,
    path_gitlab_round1_raw,
    path_github_round1_raw,
    path_gitlab_round2_raw,
    path_github_round2_raw,
    path_gitlab_round1_labelled,
    path_gitlab_round1_filtered,
    path_github_round1_filtered,
):
    os.makedirs(_path, exist_ok=True)


In [4]:
# SECURITY: access tokens must never be hardcoded in a notebook. They are read
# from the environment instead (a missing variable raises KeyError). Any token
# that was ever committed in this cell must be treated as leaked and revoked.
gitlab_miner = GitLabMiner(os.environ['GITLAB_TOKEN'])
github_miner = GitHubMiner(os.environ['GITHUB_TOKEN'])

In [4]:
# Credentials come from the environment; a missing variable raises KeyError
# here, before either miner is constructed.
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

# One miner per hosting platform, each authenticated with its own token.
github_miner = GitHubMiner(
    private_token=GITHUB_TOKEN,
)
gitlab_miner = GitLabMiner(
    private_token=GITLAB_TOKEN,
)


In [7]:
# One single-row frame per tool; concatenated once at the end instead of
# re-copying the growing table on every iteration.
tool_frames = []

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    # `scrape` returns (record, error); a non-None error means the scrape failed.
    tool_data, error_data = github_miner.scrape(tool_repo)
    if error_data is not None:
        print(error_data)
        continue
    tool_data['Name'] = tool_name
    # A manually specified release date is written onto the scraped record.
    if tool_name in tools_release_date:
        tool_data['First Release Date'] = pd.to_datetime(
            tools_release_date[tool_name]).to_datetime64()
    tool_frames.append(pd.DataFrame([tool_data]))

# add closed-source asset-management tools
for tool_name, tool_link in tools_link.items():
    tool_frames.append(pd.DataFrame([{
        'Name': tool_name,
        'Link': tool_link,
        'First Release Date': pd.to_datetime(tools_release_date[tool_name]).to_datetime64()
    }]))

# Columns missing from a row (e.g. repository metrics for closed-source tools)
# are filled with NaN by the concatenation.
tools_data = pd.concat(tool_frames, ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')
tools_data

Unnamed: 0,Repo,Link,Repo Creation Date,Last Commit Date,Topics,Language,Size,#Star,#Watch,#Fork,...,#Commits,#Comments,#Deployments,#Pull Requests,#Pull Requests (Open),#Issues,#Issues (Open),#Issues (All),First Release Date,Name
0,mlflow/mlflow,https://github.com/mlflow/mlflow,2018-06-05 16:05:58,2022-12-10 09:24:43,"[machine-learning, ai, ml, mlflow, apache-spar...",Python,119088.0,13169.0,285.0,3072.0,...,3558.0,39.0,0.0,4759.0,142.0,7391.0,998.0,7504.0,2018-06-27 16:19:13,MLflow
1,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2022-12-09 08:39:32,"[data-science, machine-learning, reproducibili...",Python,16902.0,10760.0,136.0,1006.0,...,8222.0,45.0,0.0,4402.0,17.0,8431.0,649.0,8678.0,2017-05-04 08:03:08,DVC
2,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2022-12-08 21:14:58,"[python, ai, data-science, data-visualization,...",TypeScript,56797.0,2909.0,35.0,181.0,...,1977.0,8.0,5.0,1656.0,24.0,2399.0,197.0,2423.0,2022-01-22 13:45:58,Aim
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2022-12-08 05:59:32,"[version-control, experiment-manager, version,...",Python,38580.0,3829.0,80.0,516.0,...,1906.0,10.0,3.0,164.0,0.0,837.0,303.0,847.0,2019-06-11 17:27:11,ClearML
4,pachyderm/pachyderm,https://github.com/pachyderm/pachyderm,2014-09-04 07:50:02,2022-12-09 23:05:45,"[go, pachyderm, docker, analytics, big-data, c...",Go,305563.0,5707.0,167.0,539.0,...,21472.0,31.0,1.0,5350.0,188.0,8426.0,882.0,8434.0,2014-11-26 22:49:29,Pachyderm
5,treeverse/lakeFS,https://github.com/treeverse/lakeFS,2019-09-12 11:46:28,2022-12-10 07:40:58,"[data-engineering, data-versioning, go, object...",Go,82779.0,3065.0,40.0,259.0,...,3679.0,10.0,2598.0,2908.0,30.0,4754.0,537.0,4777.0,2020-08-03 13:01:18,LakeFS
6,IDSIA/sacred,https://github.com/IDSIA/sacred,2014-03-31 18:05:29,2022-11-30 10:00:22,"[python, machine-learning, infrastructure, rep...",Python,6112.0,3958.0,70.0,361.0,...,1326.0,3.0,0.0,349.0,2.0,892.0,92.0,892.0,2016-01-13 18:56:23,Sacred
7,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 18:57:50,2022-12-07 21:27:40,[],Python,17331.0,756.0,13.0,68.0,...,5212.0,5.0,0.0,66.0,1.0,457.0,180.0,463.0,2022-04-28 14:31:07,Guild AI
8,VertaAI/modeldb,https://github.com/VertaAI/modeldb,2016-10-19 01:07:26,2022-12-09 11:08:22,"[machine-learning, model-management, modeldb, ...",Java,47303.0,1533.0,71.0,263.0,...,3569.0,7.0,43.0,3308.0,112.0,3440.0,192.0,3440.0,2020-04-01 03:47:14,ModelDB
9,polyaxon/polyaxon,https://github.com/polyaxon/polyaxon,2016-12-26 12:48:47,2022-11-11 00:27:56,"[deep-learning, machine-learning, artificial-i...",Python,126225.0,3213.0,77.0,315.0,...,10010.0,2.0,0.0,398.0,2.0,1457.0,116.0,1521.0,2018-10-16 00:00:00,Polyaxon


In [30]:
def _sourcegraph_repos(results, host):
    """Return the repository paths hosted on `host` found in Sourcegraph results.

    `results` is the list stored under the 'Results' key of a raw search file.
    Two result shapes are recognised: file matches (repository name nested
    under 'repository') and repository matches (name at the top level). The
    leading '<host>/' is stripped from every returned name.
    """
    prefix = host + '/'
    repos = []
    for result in results:
        # file name match pattern
        if 'FileMatch' in result['__typename'] and host in result['repository']['name']:
            repos.append(result['repository']['name'].removeprefix(prefix))
        # code usage match pattern
        elif 'Repository' in result['__typename'] and host in result['name']:
            repos.append(result['name'].removeprefix(prefix))
    return repos


# Map tool name -> first release date.
# pd.Series(<Series>, index=names) would *reindex* the integer-indexed column by
# the names and yield NaT for every tool, so the pairs are zipped explicitly.
# A new name is used so the configuration dict `tools_release_date` stays intact
# and the scraping cell can be re-run safely.
first_release_by_tool = dict(
    zip(tools_data['Name'], tools_data['First Release Date']))

dependent_records = []
for tool_name in tools_keywords.keys():
    github_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_round1_raw, tool_name + '.json')
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    # either search by sourcegraph
    if 'Results' in json_data:
        github_dependents = _sourcegraph_repos(
            json_data['Results'], 'github.com')
    # or search by dependent graph
    elif 'all_public_dependent_repos' in json_data:
        github_dependents = [
            repo_file['name'] for repo_file in json_data['all_public_dependent_repos']]

    # collect Gitlab dependents (search by sourcegraph exclusively)
    file_name = os.path.join(path_gitlab_round1_raw, tool_name + '.json')
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    gitlab_dependents = _sourcegraph_repos(json_data['Results'], 'gitlab.com')

    dependent_records.append({
        'Tool': tool_name,
        'First Release Date': first_release_by_tool[tool_name],
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    })

# Build the table once instead of concatenating inside the loop.
dependents_data = pd.DataFrame(dependent_records)

# NOTE: to_json writes datetimes as epoch milliseconds by default; readers of
# Dependents.json must parse 'First Release Date' with that unit.
dependents_data.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')

In [31]:
# Per-tool count of dependents on each platform. The frame is built in a single
# construction rather than concatenated row by row onto an empty frame.
dependents_summary = pd.DataFrame({
    'Tool': dependents_data['Tool'].tolist(),
    '#GitHub Dependents': [len(repos) for repos in dependents_data['GitHub Dependents']],
    '#GitLab Dependents': [len(repos) for repos in dependents_data['GitLab Dependents']],
})
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,MLflow,1190,3
1,DVC,4229,0
2,Aim,93,1
3,ClearML,304,0
4,Pachyderm,11,0
5,LakeFS,9,0
6,Sacred,1289,0
7,Guild AI,54,4
8,ModelDB,8,0
9,Polyaxon,36,0


In [12]:
# Dependents.json is written with pandas' default date encoding (epoch
# milliseconds), and read_json does not auto-parse a column with this name.
# Without an explicit unit, to_datetime would read the integers as nanoseconds
# and place every date in January 1970. Null entries become NaT.
df = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))
df['First Release Date'] = pd.to_datetime(df['First Release Date'], unit='ms')

In [33]:
# Write one "<Tool>.json" per tool holding the links of its GitLab dependents;
# tools without any GitLab dependents produce no file.
for _, tool_row in df.iterrows():
    gitlab_repos = tool_row['GitLab Dependents']
    if gitlab_repos:
        links = pd.DataFrame(
            {'Link': ['gitlab.com/' + repo_name for repo_name in gitlab_repos]})
        target = os.path.join(path_gitlab_round1_filtered,
                              f'{tool_row["Tool"]}.json')
        links.to_json(target, indent=4, orient='records')

In [6]:
# Gitlab dependents general information mining
# Only repositories whose first taxonomy label is one of these categories are mined.
project_categories = {'Project', 'Toolkit', 'Research'}
for labelled_file in glob.glob(os.path.join(path_gitlab_round1_labelled, '*.json')):
    labelled_repos = pd.read_json(labelled_file)
    repos_name = [
        entry['Link'].removeprefix('gitlab.com/')
        for _, entry in labelled_repos.iterrows()
        if entry['taxonomy'][0]['taxonomy'][0][0] in project_categories
    ]

    if repos_name:
        repos_data, errors_data = gitlab_miner.collect(repos_name)
        # The tool name is the file name up to its first dot.
        tool_name = os.path.split(labelled_file)[1].split('.')[0]

        # Successfully mined repositories and failures go to separate files.
        if repos_data is not None:
            mined_path = os.path.join(
                path_gitlab_round2_raw, f'{tool_name}.json')
            repos_data.to_json(mined_path, indent=4, orient='records')
        if errors_data is not None:
            discarded_path = os.path.join(
                path_gitlab_round2_raw, f'Discarded.{tool_name}.json')
            errors_data.to_json(discarded_path, indent=4, orient='records')

In [15]:
# Github dependents general information mining
for _, tool_row in df.iterrows():
    # Progress indicator: one line per tool.
    print(tool_row["Tool"])
    repos_data, errors_data = github_miner.collect(
        tool_row['GitHub Dependents'],
        tool_release_date=tool_row['First Release Date'])

    # Mined repositories are stored with the most issues first.
    if repos_data is not None:
        ranked_repos = repos_data.sort_values(by='#Issues', ascending=False)
        ranked_path = os.path.join(
            path_github_round1_filtered, f'{tool_row["Tool"]}.json')
        ranked_repos.to_json(ranked_path, indent=4, orient='records')

    # Whatever the miner reported as errors is kept in a separate file.
    if errors_data is not None:
        discarded_path = os.path.join(
            path_github_round1_filtered, f'Discarded.{tool_row["Tool"]}.json')
        errors_data.to_json(discarded_path, indent=4, orient='records')

MLflow


In [10]:
def random_sample(lst, frac=0.1, lowerbound=50, upperbound=50):
    """Draw a random sample (without replacement) from `lst`.

    The sample size starts at ``int(len(lst) * frac)``, is raised to
    `lowerbound` if smaller, and is then capped at
    ``min(len(lst), upperbound)``.
    """
    population = len(lst)
    ceiling = min(population, upperbound)
    size = max(int(population * frac), lowerbound)
    if size > ceiling:
        size = ceiling
    return random.sample(lst, size)


# Fixed seed so that the sampling below is repeatable.
random.seed(0)


In [24]:
# Re-sort every JSON file in the directory in place, most issues first.
# NOTE(review): `github_round1_path` is not defined in any visible cell, so this
# fails with NameError on a fresh kernel -- confirm which `path_github_*`
# directory is meant.
for file_name in glob.glob(os.path.join(github_round1_path, '*.json')):
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    json_data.sort(key=lambda x: x['#Issues'], reverse=True)
    # Write back to the file that was read. Rebuilding the name from the part
    # before the first dot would send a file such as "Discarded.X.json" to
    # "Discarded.json" and leave the original unsorted.
    with open(file_name, 'w') as outfile:
        outfile.write(json.dumps(json_data, indent='\t'))

In [15]:
# Summarise a random sample of each tool's labelled repositories and save the
# table as "Sample#1.csv" on the user's Desktop.
# NOTE(review): `sample1_result_github_round2_path` is not defined in any
# visible cell, so this fails with NameError on a fresh kernel.
# NOTE(review): this rebinds `df`, which earlier cells use for the dependents table.
path_desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
summary_columns = ['Tool', 'Total', 'Sampled', 'Sampled as Percentage',
                   '# Projects', '# Average Project Issues', '# Average Asset Management Issues']
summary_rows = []
for file_name in glob.glob(os.path.join(sample1_result_github_round2_path, '*.json')):
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    sample_repos = random_sample(json_data)

    # Issue counts of the sampled repositories whose first label is a project category.
    project_issues = [
        repo['#Issues'] for repo in sample_repos
        if repo['taxonomy'][0]['taxonomy'][0][0] in {'Project', 'Research', 'Toolkit'}
    ]
    projects = len(project_issues)
    average_project_issues = sum(project_issues) / projects if projects else 0

    # The tool name is the file name up to its first dot.
    tool_name = os.path.split(file_name)[1].split('.')[0]
    total = len(json_data)
    sampled = len(sample_repos)
    # An empty input file would otherwise raise ZeroDivisionError.
    sampled_share = sampled / total * 100 if total else 0.0
    sampled_as_percentage = str(round(sampled_share, 2)) + '%'

    # The last column is a placeholder and is always 0 here.
    summary_rows.append([tool_name, total, sampled, sampled_as_percentage,
                         projects, average_project_issues, 0])

# Build the table once; object dtype keeps each value formatted as it was computed.
df = pd.DataFrame(summary_rows, columns=summary_columns, dtype=object)
df.to_csv(os.path.join(path_desktop, 'Sample#1.csv'), index=False)


In [None]:
# Summarise a random sample of each tool's labelled repositories and save the
# table as "Sample#1.csv" on the user's Desktop.
# NOTE(review): `github_round1_path` is not defined in any visible cell, so this
# fails with NameError on a fresh kernel.
# NOTE(review): this rebinds `df`, which earlier cells use for the dependents table.
path_desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
summary_columns = ['Tool', 'Total', 'Sampled', 'Sampled as Percentage',
                   '# Projects', '# Average Project Issues', '# Average Asset Management Issues']
summary_rows = []
for file_name in glob.glob(os.path.join(github_round1_path, '*.json')):
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    sample_repos = random_sample(json_data)

    # Issue counts of the sampled repositories whose first label is a project category.
    project_issues = [
        repo['#Issues'] for repo in sample_repos
        if repo['taxonomy'][0]['taxonomy'][0][0] in {'Project', 'Research', 'Toolkit'}
    ]
    projects = len(project_issues)
    average_project_issues = sum(project_issues) / projects if projects else 0

    # The tool name is the file name up to its first dot.
    tool_name = os.path.split(file_name)[1].split('.')[0]
    total = len(json_data)
    sampled = len(sample_repos)
    # An empty input file would otherwise raise ZeroDivisionError.
    sampled_share = sampled / total * 100 if total else 0.0
    sampled_as_percentage = str(round(sampled_share, 2)) + '%'

    # The last column is a placeholder and is always 0 here.
    summary_rows.append([tool_name, total, sampled, sampled_as_percentage,
                         projects, average_project_issues, 0])

# Build the table once; object dtype keeps each value formatted as it was computed.
df = pd.DataFrame(summary_rows, columns=summary_columns, dtype=object)
df.to_csv(os.path.join(path_desktop, 'Sample#1.csv'), index=False)


In [23]:
from github import Github
# SECURITY: the token is read from the environment rather than hardcoded. Any
# token that was ever committed in this cell must be treated as leaked and revoked.
github = Github(os.environ['GITHUB_TOKEN'])
repo = github.get_repo('polyaxon/polyaxon')
releases = repo.get_releases()

In [2]:
from gitlab import Gitlab
# SECURITY: the token is read from the environment rather than hardcoded. Any
# token that was ever committed in this cell must be treated as leaked and revoked.
gitlab = Gitlab(private_token=os.environ['GITLAB_TOKEN'])
repo = gitlab.projects.get('librespacefoundation/polaris/polaris')
commits = repo.commits.list(get_all=True)