In [1]:
import ast
import glob
import json
import os

import pandas as pd
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner

In [2]:
tool_repos = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets'
}

tools_creation_date = {
    'Databricks': '2021-05-27T00',
    'Amazon SageMaker': '2017-11-29T00',
    'Comet': '2017-01-01T00',
    'Neptune': '2018-01-01T00',
    'Spell': '2017-01-01T00',
    'Valohai': '2016-10-01T00',
    'Domino': '2013-04-01T00'
}

tools_link = {
    'Databricks': 'https://www.databricks.com',
    'Amazon SageMaker': 'https://aws.amazon.com/sagemaker',
    'Comet': 'http://comet.ml',
    'Neptune': 'https://neptune.ai',
    'Spell': 'https://spell.ml',
    'Valohai': 'https://valohai.com',
    'Domino': 'https://www.dominodatalab.com'
}


In [3]:
raw_path = '../Dataset/Raw'
tool_path = '../Dataset/Tool'

if not os.path.exists(raw_path):
    os.makedirs(raw_path)

if not os.path.exists(tool_path):
    os.makedirs(tool_path)


In [4]:
GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)


In [5]:
tools_data = []

for file_name in glob.glob(os.path.join(raw_path, '*.json')):
    github_repo_names = set()
    gitlab_repo_names = set()
    with open(file_name) as json_file:
        json_data = json.load(json_file)
        for repo_file in json_data['Results']:
            if 'github' in repo_file['name']:
                repo_name = repo_file['name'].removeprefix('github.com/')
                github_repo_names.add(repo_name)
            if 'gitlab' in repo_file['name']:
                repo_name = repo_file['name'].removeprefix('gitlab.com/')
                gitlab_repo_names.add(repo_name)

    tool_name = file_name.split('\\')[-1].split('.')[0]
    if tool_name in tool_repos:
        tool_data, error_data = github_miner.scrape(
            repo_name=tool_repos[tool_name], name=tool_name)
        tool_data.pop('Archived')
    else:
        tool_data = {
            'Name': tool_name,
            'Link': tools_link[tool_name],
            'Creation Date': pd.to_datetime(tools_creation_date[tool_name]).to_datetime64()
        }

    tool_data['GitHub Dependents'] = list(github_repo_names)
    tool_data['GitLab Dependents'] = list(gitlab_repo_names)
    tool_data = pd.DataFrame([tool_data])
    tools_data.append(tool_data)

tools_data = pd.concat(tools_data, ignore_index=True)
tools_data.to_csv(os.path.join(tool_path, 'Tools.csv'), index=False)
tools_data


Unnamed: 0,Repo,Link,Creation Date,First Activity Date,Last Activity Date,Topics,#Star,#Watch,#Fork,#Releases,...,Size,#Issues,#Pull Requests,#Branches,#Commits,#Comments,#Deployments,Name,GitHub Dependents,GitLab Dependents
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2019-05-31 18:25:07,2022-11-16 15:46:24,"[python, ai, data-science, data-visualization,...",2864.0,35.0,176.0,43.0,...,55962.0,2331.0,1614.0,74.0,1940.0,8.0,5.0,Aim,"[orsonteodoro/oiledmachine-overlay, VlachosGro...","[xdevs23/linux-nitrous, lu-ci/sigma/apex-sigma..."
1,,https://aws.amazon.com/sagemaker,2017-11-29 00:00:00,NaT,NaT,,,,,,...,,,,,,,,Amazon SageMaker,"[aws/sagemaker-scikit-learn-extension, aws-sam...","[juliensimon/huggingface-demos, juliensimon/am..."
2,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2019-06-10 08:18:33,2022-11-15 13:59:26,"[version-control, experiment-manager, version,...",3761.0,78.0,500.0,70.0,...,37351.0,813.0,157.0,3.0,1876.0,10.0,3.0,ClearML,"[okojoalg/raft-mlp, megvii-research/basecls, x...",[]
3,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2013-11-12 02:18:24,2022-11-16 19:39:27,[],133.0,20.0,78.0,112.0,...,27402.0,4308.0,2210.0,121.0,4522.0,23.0,1551.0,Codalab,"[fedir/ghstat, thunlp/thunlp.github.io, srz-zu...","[SlackBuilds.org/slackbuilds, illwieckz/i-love..."
4,,http://comet.ml,2017-01-01 00:00:00,NaT,NaT,,,,,,...,,,,,,,,Comet,[saeyslab/DeepLearning_for_ImagingFlowCytometr...,[]
5,d6t/d6tflow,https://github.com/d6t/d6tflow,2019-02-03 01:51:22,2019-02-02 21:19:03,2022-10-26 17:04:54,[],945.0,22.0,73.0,0.0,...,271.0,42.0,19.0,1.0,269.0,0.0,0.0,D6tflow,"[d6t/d6tflow-template, d6t/d6tflow, d6tdev/top...",[]
6,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:17:59,2019-08-09 06:18:49,2022-11-15 14:49:44,"[datasets, deep-learning, machine-learning, da...",5014.0,63.0,406.0,71.0,...,61015.0,1980.0,1586.0,90.0,6487.0,5.0,48.0,Deep Lake,"[activeloopai/deeplake, activeloopai/examples]",[]
7,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2020-04-07 16:29:08,2022-11-16 22:41:35,"[deep-learning, machine-learning, ml-platform,...",1922.0,57.0,257.0,73.0,...,100980.0,5442.0,5202.0,171.0,4563.0,46.0,29.0,Determined,"[mdaniel/virtualbox-org-svn-vbox-trunk, frankf...",[]
8,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2017-03-04 08:16:33,2022-11-17 08:31:26,"[data-science, machine-learning, reproducibili...",10668.0,138.0,995.0,400.0,...,16755.0,8339.0,4353.0,8.0,8149.0,45.0,0.0,DVC,"[cytomining/profiling-handbook, madebyollin/si...",[mlrep/mldev]
9,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 18:57:50,2017-09-27 19:55:55,2022-11-16 14:38:56,[],750.0,13.0,68.0,2.0,...,16500.0,456.0,66.0,60.0,5208.0,5.0,0.0,Guild AI,"[emisdumb/tau, bpierre/loot-rarity, yagomichal...","[lexikiq/semicolon, litecord/litecord, nekokat..."


In [7]:
df_summary = pd.DataFrame(columns=['Name', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in tools_data.iterrows():
    row_new = {
        'Name': row['Name'],
        '#GitHub Dependents': len(row["GitHub Dependents"]),
        '#GitLab Dependents': len(row["GitLab Dependents"])
    }
    df_summary = df_summary.append(row_new, ignore_index=True)
df_summary.to_csv(os.path.join(tool_path, '#Dependents.csv'), index=False)
df_summary


Unnamed: 0,Name,#GitHub Dependents,#GitLab Dependents
0,Aim,1554,19
1,Amazon SageMaker,808,3
2,ClearML,56,0
3,Codalab,161,2
4,Comet,430,0
5,D6tflow,4,0
6,Deep Lake,2,0
7,Determined,45,0
8,DVC,174,1
9,Guild AI,195,4


In [None]:
'''
if not os.path.exists(tool_path):
    os.makedirs(tool_path)

file_name = os.path.join(raw_path, 'Codalab.json')
tool_name = file_name.split('\\')[-1].split('.')[0]

github_repo_names = set()
gitlab_repo_names = set()

with open(file_name) as json_file:
    json_data = json.load(json_file)
    for repo_file in json_data['Results']:
        if 'github' in repo_file['name']:
            repo_name = repo_file['name'].removeprefix('github.com/')
            github_repo_names.add(repo_name)
        if 'gitlab' in repo_file['name']:
            repo_name = repo_file['name'].removeprefix('gitlab.com/')
            github_repo_names.add(repo_name)

if tool_name in tool_repos:
    tool_data, error_data = github_miner.scrape(
        repo_name=tool_repos[tool_name], name=tool_name, real_creation_date=False)
    del tool_data['Archived']
else:
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'Creation Date': pd.Timestamp(tools_creation_date[tool_name])
    }
    
tool_data['GitHub Dependents'] = list(github_repo_names)
tool_data['GitLab Dependents'] = list(gitlab_repo_names)
tool_data = pd.DataFrame([tool_data])
tool_data.to_csv(os.path.join(tool_path, f'{tool_name}.csv'), index=False)
print(len(github_repo_names))
print(len(gitlab_repo_names))
'''


In [19]:
'''
for index, row in tools_data.iterrows():
    df_github_dependents = pd.DataFrame(columns=['Link'])
    df_gitlab_dependents = pd.DataFrame(columns=['Link'])
    for repo_name in ast.literal_eval(row['GitHub Dependents']):
        df_repo = pd.DataFrame([{'Link': 'https://github.com/' + repo_name}])
        df_github_dependents = pd.concat([df_github_dependents, df_repo])
    for repo_name in ast.literal_eval(row['GitLab Dependents']):
        df_repo = pd.DataFrame([{'Link': 'https://gitlab.com/' + repo_name}])
        df_gitlab_dependents = pd.concat([df_gitlab_dependents, df_repo])
    df_github_dependents.to_csv(os.path.join(github_round1_path, f'Repo_{tool_name}.csv'), index=False)
    df_gitlab_dependents.to_csv(os.path.join(gitlab_round1_path, f'Repo_{row["Name"]}.csv'), index=False)
'''


In [6]:
'''
def date_parser(x):
    return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'), parse_dates=[
                 'Creation Date'], date_parser=date_parser)
'''


In [8]:
df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'))
df['Creation Date'] = pd.to_datetime(df['Creation Date'])


In [9]:
filter_path = '../Dataset/Filter'
    
github_path = os.path.join(filter_path, 'GitHub')
gitlab_path = os.path.join(filter_path, 'GitLab')

github_round1_path = os.path.join(github_path, 'Round#1')
gitlab_round1_path = os.path.join(gitlab_path, 'Round#1')

github_round1_error_path = os.path.join(github_round1_path, 'Error')
gitlab_round1_error_path = os.path.join(gitlab_round1_path, 'Error')

if not os.path.exists(filter_path):
    os.makedirs(filter_path)

if not os.path.exists(github_path):
    os.makedirs(github_path)

if not os.path.exists(github_round1_path):
    os.makedirs(github_round1_path)

if not os.path.exists(github_round1_error_path):
    os.makedirs(github_round1_error_path)

if not os.path.exists(gitlab_round1_error_path):
    os.makedirs(gitlab_round1_error_path)


In [10]:
gitlab_miner = GitLabMiner('glpat-6ejKEymaxpnDHVqv-yk7')
github_miner = GitHubMiner('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')

In [15]:
for index, row in df.iterrows():
    print(index)
    github_repos_tool, github_errors_tool = github_miner.collect(
        repo_names=ast.literal_eval(row['GitHub Dependents']), date=row['Creation Date'])

    if not github_repos_tool.empty:
        github_repos_tool.to_csv(os.path.join(
            github_round1_path, f'{row["Name"]}.csv'), index=False)

    if not github_errors_tool.empty:
        github_errors_tool.to_csv(os.path.join(
            github_round1_error_path, f'{row["Name"]}.csv'), index=False)


0


In [None]:
for file_name in glob.glob(os.path.join(gitlab_round1_path, '*.csv')):
    tool_name = file_name.split('\\')[-1].split('.')[0]
    repos = pd.read_csv(file_name)
    repos = repos[repos.Category != 'Examplar']
    repos['Repo'] = repos['Link'].apply(
        lambda x: x.removeprefix('https://gitlab.com/'))
    
    gitlab_repos, gitlab_errors = gitlab_miner.collect(
        repo_names=repos['Repo'].to_list())
    gitlab_repos = pd.concat([repos, gitlab_repos], axis=1)

    gitlab_repos.to_csv(os.path.join(gitlab_round1_path,
                        f'{tool_name}.csv'), index=False)

    if not gitlab_errors.empty:
        gitlab_errors.to_csv(os.path.join(
            gitlab_round1_error_path, f'{tool_name}.csv'), index=False)


In [None]:
'''
for file_name in glob.glob(os.path.join(github_round2_path, '*.csv')):
    tool_name = file_name.split('_')[-1].split('.')[0]

    df = pd.read_csv(file_name)
    df['Name'] = df['Link'].apply(
        lambda x: x.removeprefix('https://gitlab.com/'))

    df_tool = df[df['Category'] == 'Tool']
    df_project = df[df['Category'] == 'Project']
    df_research = df[df['Category'] == 'Research']

    repos_tool = df_tool['Name'].to_list()
    repos_project = df_project['Name'].to_list()
    repos_research = df_research['Name'].to_list()

    github_repos_tool, github_errors_tool = github_miner.collect(
        repo_names=repos_tool)
    github_repos_research, github_errors_research = github_miner.collect(
        repo_names=repos_research)
    github_repos_project, github_errors_project = github_miner.collect(
        repo_names=repos_project)

    github_repos_tool.to_csv(os.path.join(
        github_round1_path, f'Tool_{tool_name}.csv'), index=False)
    github_repos_research.to_csv(os.path.join(
        github_round1_path, f'Research_{tool_name}.csv'), index=False)
    github_repos_project.to_csv(os.path.join(
        github_round1_path, f'Project_{tool_name}.csv'), index=False)

    if not github_errors_tool.empty:
        github_errors_tool.to_csv(os.path.join(
            github_round1_error_path, f'Tool_{tool_name}.csv'), index=False)

    if not github_errors_research.empty:
        github_errors_research.to_csv(os.path.join(
            github_round1_error_path, f'Research_{tool_name}.csv'), index=False)

    if not github_errors_project.empty:
        github_errors_project.to_csv(os.path.join(
            github_round1_error_path, f'Project_{tool_name}.csv'), index=False)
'''


In [None]:
github_round2_path = os.path.join(github_path, 'Round#2')
gitlab_round2_path = os.path.join(gitlab_path, 'Round#2')

github_round3_path = os.path.join(github_path, 'Round#3')
gitlab_round3_path = os.path.join(gitlab_path, 'Round#3')

if not os.path.exists(github_round3_path):
    os.makedirs(github_round3_path)

if not os.path.exists(gitlab_round3_path):
    os.makedirs(gitlab_round3_path)

github_round3_error_path = os.path.join(github_round3_path, 'Error')
gitlab_round3_error_path = os.path.join(gitlab_round3_path, 'Error')

if not os.path.exists(github_round3_error_path):
    os.makedirs(github_round3_error_path)

if not os.path.exists(gitlab_round3_error_path):
    os.makedirs(gitlab_round3_error_path)