In [None]:
%pip install PyGithub python-gitlab pandas


In [1]:
from github import Github, GithubException
from GHMiner import GHMiner
from MyToken import *
import pandas as pd
import time
import glob
import json
import ast
import os


In [18]:
tool_repos = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets'
}

tools_creation_date = {
    'Databricks': '2021-05-27T00',
    'Amazon SageMaker': '2017-11-29T00',
    'Comet': '2017-01-01T00',
    'Neptune': '2018-01-01T00',
    'Spell': '2017-01-01T00',
    'Valohai': '2016-10-01T00',
    'Domino': '2013-04-01T00'
}

tools_link = {
    'Databricks': 'https://www.databricks.com',
    'Amazon SageMaker': 'https://aws.amazon.com/sagemaker',
    'Comet': 'http://comet.ml',
    'Neptune': 'https://neptune.ai',
    'Spell': 'https://spell.ml',
    'Valohai': 'https://valohai.com',
    'Domino': 'https://www.dominodatalab.com'
}

raw_path = '../Dataset/Raw'
tool_path = '../Dataset/Tool'


In [19]:
github_miner = GHMiner(githubToken)


In [20]:
tools_data = []

if not os.path.exists(tool_path):
    os.makedirs(tool_path)

for file_name in glob.glob(os.path.join(raw_path, '*.json')):
    github_repo_names = set()
    gitlab_repo_names = set()
    with open(file_name) as json_file:
        json_data = json.load(json_file)
        for repo_file in json_data['Results']:
            if 'github.com' in repo_file['repository']['name']: 
                repo_name = repo_file['repository']['name'].replace('github.com/', '')
                github_repo_names.add(repo_name)
            if 'gitlab.com' in repo_file['repository']['name']: 
                repo_name = repo_file['repository']['name'].replace('gitlab.com/', '')
                gitlab_repo_names.add(repo_name)
    tool_name = file_name.split('\\')[-1].split('.')[0]
    if tool_name in tool_repos:
        tool_data, error_data = github_miner.scrape(
            repo_name=tool_repos[tool_name], name=tool_name, real_creation_date=False)
        del tool_data['Archived']
    else:
        tool_data = {
            'Name': tool_name,
            'Link': tools_link[tool_name],
            'Creation Date': pd.Timestamp(tools_creation_date[tool_name])
        }
    tool_data['GitHub Dependents'] = list(github_repo_names)
    tool_data['GitLab Dependents'] = list(gitlab_repo_names)
    tool_data = pd.DataFrame([tool_data])
    tools_data.append(tool_data)

tools_data = pd.concat(tools_data, ignore_index=True)
tools_data.to_csv(os.path.join(tool_path, 'Tools.csv'), index=False)
tools_data


Unnamed: 0,Repo,Link,Creation Date,Topics,#Star,#Watch,#Fork,#Releases,#Contributors,Languages,...,#Pull Requests,#Branches,#Tags,#Commits,#Comments,#Downloads,#Deployments,Name,GitHub Dependents,GitLab Dependents
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,"[python, ai, data-science, data-visualization,...",2832.0,35.0,174.0,42.0,44.0,TypeScript,...,19.0,67.0,147.0,1906.0,8.0,0.0,5.0,Aim,"[lizhipay/acg-faka, GrapheneOS/linux-hardened,...","[xdevs23/linux-nitrous, friendlyelec/rk3399-an..."
1,,https://aws.amazon.com/sagemaker,2017-11-29 00:00:00,,,,,,,,...,,,,,,,,Amazon SageMaker,"[awslabs/aws-customer-churn-pipeline, rshyamsu...","[juliensimon/amazon-studio-demos, juliensimon/..."
2,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:33,"[version-control, experiment-manager, version,...",3711.0,77.0,496.0,69.0,52.0,Python,...,2.0,3.0,120.0,1846.0,10.0,0.0,3.0,ClearML,"[pytorch/ignite, harrywang/tutorial-buffet, ze...",[]
3,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2013-11-12 02:18:24,[],132.0,20.0,77.0,110.0,53.0,Python,...,25.0,123.0,124.0,4512.0,23.0,0.0,1541.0,Codalab,"[bhadresh/dqp, rktamplayo/AutoSense, p-lambda/...","[illwieckz/i-love-compute, SlackBuilds.org/sla..."
4,,http://comet.ml,2017-01-01 00:00:00,,,,,,,,...,,,,,,,,Comet,[],[]
5,d6t/d6tflow,https://github.com/d6t/d6tflow,2019-02-02 21:19:03,[],943.0,22.0,73.0,0.0,11.0,Python,...,1.0,1.0,0.0,269.0,0.0,0.0,0.0,D6tflow,"[csebuetnlp/xl-sum, takeshi-teshima/few-shot-d...","[jan-teichmann/ml-flow-ds-project, juliensimon..."
6,activeloopai/deeplake,https://github.com/activeloopai/deeplake,2019-08-09 06:18:49,"[datasets, deep-learning, machine-learning, da...",4954.0,62.0,403.0,66.0,89.0,Python,...,19.0,76.0,91.0,6428.0,5.0,0.0,48.0,Deep Lake,"[activeloopai/examples, activeloopai/deeplake]",[]
7,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:29:08,"[deep-learning, machine-learning, ml-platform,...",1901.0,58.0,257.0,72.0,66.0,TypeScript,...,109.0,166.0,341.0,4485.0,46.0,0.0,26.0,Determined,"[mlcommons/tiny_results_v0.7, NVIDIA/data-scie...",[]
8,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,"[data-science, machine-learning, reproducibili...",10594.0,136.0,989.0,395.0,251.0,Python,...,19.0,13.0,409.0,8065.0,45.0,0.0,0.0,DVC,"[achao2013/deep3dmap, toandaominh1997/grow, U7...",[mlrep/mldev]
9,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 19:55:55,[],743.0,13.0,68.0,2.0,19.0,Python,...,1.0,58.0,19.0,5200.0,5.0,0.0,0.0,Guild AI,"[wrkzcoin/TipBot, ivanhawkes/Chrysalis, jackyk...",[]


In [5]:
from datetime import datetime
date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'), parse_dates=['Creation Date'], date_parser=date_parser)


In [6]:
filter_path = '../Dataset/Filter'

if not os.path.exists(filter_path):
    os.makedirs(filter_path)

round1_path = os.path.join(filter_path, 'Round#1')

if not os.path.exists(round1_path):
    os.makedirs(round1_path)

github_path = os.path.join(round1_path, 'GitHub')

if not os.path.exists(github_path):
    os.makedirs(github_path)
    
for index, row in df.iterrows():
    repos_data, errors_data = github_miner.collect(repo_names=ast.literal_eval(
        row['GitHub Dependents']), date=pd.to_datetime(row['Creation Date']))
    repos_data.to_csv(os.path.join(
        github_path, f'Repo_{row["Name"]}.csv'), index=False)
    errors_data.to_csv(os.path.join(
        github_path, f'Error_{row["Name"]}.csv'), index=False)


In [11]:
'''
creation_date = df.set_index('Name').to_dict()['Creation Date']
creation_date

redo_path = os.path.join(filter_path, 'Redo')
artifact_path = os.path.join(redo_path, 'Artifact')

for file_name in glob.glob(os.path.join(redo_path, '*.csv')):
    temp = pd.read_csv(file_name)
    tool_name = file_name.split('_')[-1].split('.')[0]
    repos_data, errors_data = github_miner.collect(
        repo_names=temp['Repo'].tolist(), date=creation_date[tool_name])
    repos_data.to_csv(os.path.join(
        artifact_path, f'Repo_{tool_name}.csv'), index=False)
    errors_data.to_csv(os.path.join(
        artifact_path, f'Error_{tool_name}.csv'), index=False)
'''