In [1]:
from GHMiner import GithubMiner
from GLMiner import GitlabMiner
import pandas as pd
import datetime
import glob
import json
import ast
import os


In [2]:
tool_repos = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets'
}

tools_creation_date = {
    'Databricks': '2021-05-27T00',
    'Amazon SageMaker': '2017-11-29T00',
    'Comet': '2017-01-01T00',
    'Neptune': '2018-01-01T00',
    'Spell': '2017-01-01T00',
    'Valohai': '2016-10-01T00',
    'Domino': '2013-04-01T00'
}

tools_link = {
    'Databricks': 'https://www.databricks.com',
    'Amazon SageMaker': 'https://aws.amazon.com/sagemaker',
    'Comet': 'http://comet.ml',
    'Neptune': 'https://neptune.ai',
    'Spell': 'https://spell.ml',
    'Valohai': 'https://valohai.com',
    'Domino': 'https://www.dominodatalab.com'
}


In [3]:
raw_path = '../Dataset/Raw'
tool_path = '../Dataset/Tool'

GITHUB_TOKEN = os.environ['GITHUB_TOKEN']
GITLAB_TOKEN = os.environ['GITLAB_TOKEN']

github_miner = GithubMiner(GITHUB_TOKEN)
gitlab_miner = GithubMiner(GITLAB_TOKEN)


In [None]:
tools_data = []

if not os.path.exists(tool_path):
    os.makedirs(tool_path)


In [11]:
for file_name in glob.glob(os.path.join(raw_path, '*.json')):
    github_repo_names = set()
    gitlab_repo_names = set()
    with open(file_name) as json_file:
        json_data = json.load(json_file)
        for repo_file in json_data['Results']:
            if 'github' in repo_file['name']: 
                repo_name = repo_file['name'].removeprefix('github.com/')
                github_repo_names.add(repo_name)
            if 'github' in repo_file['name']: 
                repo_name = repo_file['name'].removeprefix('gitlab.com/')
                gitlab_repo_names.add(repo_name)
    tool_name = file_name.split('\\')[-1].split('.')[0]
    if tool_name in tool_repos:
        tool_data, error_data = github_miner.scrape(
            repo_name=tool_repos[tool_name], name=tool_name, real_creation_date=False)
        del tool_data['Archived']
    else:
        tool_data = {
            'Name': tool_name,
            'Link': tools_link[tool_name],
            'Creation Date': pd.Timestamp(tools_creation_date[tool_name])
        }
    tool_data['GitHub Dependents'] = list(github_repo_names)
    tool_data['GitLab Dependents'] = list(gitlab_repo_names)
    tool_data = pd.DataFrame([tool_data])
    tools_data.append(tool_data)

tools_data = pd.concat(tools_data, ignore_index=True)
tools_data.to_csv(os.path.join(tool_path, 'Tools.csv'), index=False)
tools_data


Unnamed: 0,Repo,Link,Creation Date,Topics,#Star,#Watch,#Fork,#Releases,#Contributors,Languages,...,#Pull Requests,#Branches,#Tags,#Commits,#Comments,#Downloads,#Deployments,Name,GitHub Dependents,GitLab Dependents
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,"[python, ai, data-science, data-visualization,...",2840.0,35.0,174.0,42.0,44.0,TypeScript,...,21.0,69.0,147.0,1921.0,8.0,0.0,5.0,Aim,[realme-kernel-opensource/realmeC12_realmeC15_...,"[alfredchen/linux-prjc, xhang/gitlab, github.c..."
1,,https://aws.amazon.com/sagemaker,2017-11-29 00:00:00,,,,,,,,...,,,,,,,,Amazon SageMaker,"[aws/sagemaker-tensorflow-extensions, PacktPub...","[juliensimon/amazon-studio-demos, fluidattacks..."
2,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:33,"[version-control, experiment-manager, version,...",3716.0,77.0,496.0,69.0,53.0,Python,...,0.0,3.0,120.0,1847.0,10.0,0.0,3.0,ClearML,"[RizwanMunawar/yolov5-object-tracking, BrianG1...",[]
3,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2013-11-12 02:18:24,[],132.0,20.0,78.0,111.0,53.0,Python,...,24.0,122.0,125.0,4514.0,23.0,0.0,1543.0,Codalab,"[davidw/hecl, stanfordnlp/color-describer, web...","[illwieckz/i-love-compute, SlackBuilds.org/sla..."
4,,http://comet.ml,2017-01-01 00:00:00,,,,,,,,...,,,,,,,,Comet,"[UETAILab/uetai, twistedcubic/attention-rank-c...",[]
5,d6t/d6tflow,https://github.com/d6t/d6tflow,2019-02-02 21:19:03,[],943.0,22.0,73.0,0.0,11.0,Python,...,1.0,1.0,0.0,269.0,0.0,0.0,0.0,D6tflow,"[amesar/mlflow-export-import, antklen/sdsj2018...","[jan-teichmann/ml-flow-ds-project, juliensimon..."
6,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:18:49,"[datasets, deep-learning, machine-learning, da...",4954.0,63.0,403.0,67.0,89.0,Python,...,21.0,78.0,92.0,6434.0,5.0,0.0,48.0,Deep Lake,"[activeloopai/deeplake, activeloopai/examples]",[]
7,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:29:08,"[deep-learning, machine-learning, ml-platform,...",1903.0,58.0,258.0,72.0,66.0,TypeScript,...,105.0,170.0,341.0,4511.0,46.0,0.0,27.0,Determined,"[tekumara/notes, mlcommons/tiny_results_v0.7, ...",[]
8,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,"[data-science, machine-learning, reproducibili...",10603.0,136.0,991.0,396.0,252.0,Python,...,16.0,5.0,410.0,8093.0,45.0,0.0,0.0,DVC,"[Duankaiwen/PyCenterNet, iterative/example-dvc...",[mlrep/mldev]
9,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 19:55:55,[],744.0,13.0,68.0,2.0,19.0,Python,...,1.0,58.0,19.0,5203.0,5.0,0.0,0.0,Guild AI,"[nextml-code/template-nvae, KhoomeiK/MindMapRe...",[]


In [None]:
'''
if not os.path.exists(tool_path):
    os.makedirs(tool_path)

file_name = os.path.join(raw_path, 'Codalab.json')
tool_name = file_name.split('\\')[-1].split('.')[0]

github_repo_names = set()
gitlab_repo_names = set()

with open(file_name) as json_file:
    json_data = json.load(json_file)
    for repo_file in json_data['Results']:
        if 'github' in repo_file['name']:
            repo_name = repo_file['name'].removeprefix('github.com/')
            github_repo_names.add(repo_name)
        if 'gitlab' in repo_file['name']:
            repo_name = repo_file['name'].removeprefix('gitlab.com/')
            github_repo_names.add(repo_name)

if tool_name in tool_repos:
    tool_data, error_data = github_miner.scrape(
        repo_name=tool_repos[tool_name], name=tool_name, real_creation_date=False)
    del tool_data['Archived']
else:
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'Creation Date': pd.Timestamp(tools_creation_date[tool_name])
    }
    
tool_data['GitHub Dependents'] = list(github_repo_names)
tool_data['GitLab Dependents'] = list(gitlab_repo_names)
tool_data = pd.DataFrame([tool_data])
tool_data.to_csv(os.path.join(tool_path, f'{tool_name}.csv'), index=False)
print(len(github_repo_names))
print(len(gitlab_repo_names))
'''


In [20]:
df_summary = pd.DataFrame(columns=['Name', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in tools_data.iterrows():
    row_new = {
        'Name': row['Name'],
        '#GitHub Dependents': len(ast.literal_eval(row["GitHub Dependents"])),
        '#GitLab Dependents': len(ast.literal_eval(row["GitLab Dependents"]))
    }
    df_summary = df_summary.append(row_new, ignore_index=True)
df_summary.to_csv(os.path.join(tool_path, '#Dependents.csv'), index=False)
df_summary


Unnamed: 0,Name,#GitHub Dependents,#GitLab Dependents
0,Aim,1548,19
1,Amazon SageMaker,806,3
2,ClearML,55,0
3,Codalab,161,2
4,Comet,427,0
5,D6tflow,4,2
6,Deep Lake,2,0
7,Determined,45,0
8,DVC,174,1
9,Guild AI,195,4


In [4]:
filter_path = '../Dataset/Filter'

if not os.path.exists(filter_path):
    os.makedirs(filter_path)
    
github_path = os.path.join(filter_path, 'GitHub')

if not os.path.exists(github_path):
    os.makedirs(github_path)

github_round1_path = os.path.join(github_path, 'Round#1')

if not os.path.exists(github_round1_path):
    os.makedirs(github_round1_path)

gitlab_path = os.path.join(filter_path, 'GitLab')

if not os.path.exists(gitlab_path):
    os.makedirs(gitlab_path)

gitlab_round1_path = os.path.join(gitlab_path, 'Round#1')

if not os.path.exists(gitlab_round1_path):
    os.makedirs(gitlab_round1_path)


In [19]:
for index, row in tools_data.iterrows():
    df_github_dependents = pd.DataFrame(columns=['Link'])
    df_gitlab_dependents = pd.DataFrame(columns=['Link'])
    for repo_name in ast.literal_eval(row['GitHub Dependents']):
        df_repo = pd.DataFrame([{'Link': 'https://github.com/' + repo_name}])
        df_github_dependents = pd.concat([df_github_dependents, df_repo])
    for repo_name in ast.literal_eval(row['GitLab Dependents']):
        df_repo = pd.DataFrame([{'Link': 'https://gitlab.com/' + repo_name}])
        df_gitlab_dependents = pd.concat([df_gitlab_dependents, df_repo])
    df_github_dependents.to_csv(os.path.join(github_round1_path, f'Repo_{tool_name}.csv'), index=False)
    df_gitlab_dependents.to_csv(os.path.join(gitlab_round1_path, f'Repo_{row["Name"]}.csv'), index=False)

In [4]:
def date_parser(x):
    return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'), parse_dates=[
                 'Creation Date'], date_parser=date_parser)


In [9]:
github_round2_path = os.path.join(github_path, 'Round#2')
gitlab_round2_path = os.path.join(gitlab_path, 'Round#2')

github_round3_path = os.path.join(github_path, 'Round#3')
gitlab_round3_path = os.path.join(gitlab_path, 'Round#3')

if not os.path.exists(github_round3_path):
    os.makedirs(github_round3_path)

if not os.path.exists(gitlab_round3_path):
    os.makedirs(gitlab_round3_path)

github_round3_error_path = os.path.join(github_round3_path, 'Error')
gitlab_round3_error_path = os.path.join(gitlab_round3_path, 'Error')

if not os.path.exists(github_round3_error_path):
    os.makedirs(github_round3_error_path)

if not os.path.exists(gitlab_round3_error_path):
    os.makedirs(gitlab_round3_error_path)

In [23]:
for file_name in glob.glob(os.path.join(gitlab_round2_path, '*.csv')):
    tool_name = file_name.split('_')[-1].split('.')[0]

    df = pd.read_csv(file_name)
    df['Name'] = df['Link'].apply(
        lambda x: x.removeprefix('https://gitlab.com/'))

    df_tool = df[df['Category'] == 'Tool']
    df_project = df[df['Category'] == 'Project']
    df_research = df[df['Category'] == 'Research']

    repos_tool = df_tool['Name'].to_list()
    repos_project = df_project['Name'].to_list()
    repos_research = df_research['Name'].to_list()

    gitlab_repos_tool, gitlab_errors_tool = gitlab_miner.collect(
        repo_names=repos_tool)
    gitlab_repos_research, gitlab_errors_research = gitlab_miner.collect(
        repo_names=repos_research)
    gitlab_repos_project, gitlab_errors_project = gitlab_miner.collect(
        repo_names=repos_project)

    gitlab_repos_tool.to_csv(os.path.join(
        gitlab_round3_path, f'Tool_{tool_name}.csv'), index=False)
    gitlab_repos_research.to_csv(os.path.join(
        gitlab_round3_path, f'Research_{tool_name}.csv'), index=False)
    gitlab_repos_project.to_csv(os.path.join(
        gitlab_round3_path, f'Project_{tool_name}.csv'), index=False)

    if not gitlab_errors_tool.empty:
        gitlab_errors_tool.to_csv(os.path.join(
            gitlab_round3_error_path, f'Tool_{tool_name}.csv'), index=False)

    if not gitlab_errors_research.empty:
        gitlab_errors_research.to_csv(os.path.join(
            gitlab_round3_error_path, f'Research_{tool_name}.csv'), index=False)

    if not gitlab_errors_project.empty:
        gitlab_errors_project.to_csv(os.path.join(
            gitlab_round3_error_path, f'Project_{tool_name}.csv'), index=False)


In [None]:
for file_name in glob.glob(os.path.join(github_round2_path, '*.csv')):
    tool_name = file_name.split('_')[-1].split('.')[0]

    df = pd.read_csv(file_name)
    df['Name'] = df['Link'].apply(
        lambda x: x.removeprefix('https://gitlab.com/'))

    df_tool = df[df['Category'] == 'Tool']
    df_project = df[df['Category'] == 'Project']
    df_research = df[df['Category'] == 'Research']

    repos_tool = df_tool['Name'].to_list()
    repos_project = df_project['Name'].to_list()
    repos_research = df_research['Name'].to_list()

    github_repos_tool, github_errors_tool = github_miner.collect(
        repo_names=repos_tool)
    github_repos_research, github_errors_research = github_miner.collect(
        repo_names=repos_research)
    github_repos_project, github_errors_project = github_miner.collect(
        repo_names=repos_project)

    github_repos_tool.to_csv(os.path.join(
        github_round3_path, f'Tool_{tool_name}.csv'), index=False)
    github_repos_research.to_csv(os.path.join(
        github_round3_path, f'Research_{tool_name}.csv'), index=False)
    github_repos_project.to_csv(os.path.join(
        github_round3_path, f'Project_{tool_name}.csv'), index=False)

    if not github_errors_tool.empty:
        github_errors_tool.to_csv(os.path.join(
            github_round3_error_path, f'Tool_{tool_name}.csv'), index=False)

    if not github_errors_research.empty:
        github_errors_research.to_csv(os.path.join(
            github_round3_error_path, f'Research_{tool_name}.csv'), index=False)

    if not github_errors_project.empty:
        github_errors_project.to_csv(os.path.join(
            github_round3_error_path, f'Project_{tool_name}.csv'), index=False)


In [8]:
'''
import gitlab
import requests

url = 'https://gitlab.com/api/v4/projects/8926014/search?scope=issues&search=file'
private_token = gitlabToken

header = {
    'PRIVATE-TOKEN': 'glpat-TA5AUdJiykY59MeEdq14'
}
res = requests.get(url, headers=header)
print(res.json())
'''