In [None]:
%pip install PyGithub pandas


In [1]:
from github import Github, GithubException
from GHMiner import GHMiner
from MyToken import accessToken
import pandas as pd
import time
import glob
import json
import ast
import os


In [2]:
tool_repos = {
    'MLflow': 'mlflow/mlflow',
    'DVC': 'iterative/dvc',
    'Aim': 'aimhubio/aim',
    'ClearML': 'allegroai/clearml',
    'Pachyderm': 'pachyderm/pachyderm',
    'LakeFS': 'treeverse/lakeFS',
    'Sacred': 'IDSIA/sacred',
    'Guild AI': 'guildai/guildai',
    'ModelDB': 'VertaAI/modeldb',
    'Polyaxon': 'polyaxon/polyaxon',
    'Quilt': 'quiltdata/quilt',
    'D6tflow': 'd6t/d6tflow',
    'Deep Lake': 'activeloopai/deeplake',
    'Keepsake': 'replicate/keepsake',
    'Determined': 'determined-ai/determined',
    'Weights & Biases': 'wandb/wandb',
    'Codalab': 'codalab/codalab-worksheets'
}

tools_creation_date = {
    'Databricks': '2021-05-27T00',
    'Amazon SageMaker': '2017-11-29T00',
    'Comet': '2017-01-01T00',
    'Neptune': '2018-01-01T00',
    'Spell': '2017-01-01T00',
    'Valohai': '2016-10-01T00',
    'Domino': '2013-04-01T00'
}

tools_link = {
    'Databricks': 'https://www.databricks.com',
    'Amazon SageMaker': 'https://aws.amazon.com/sagemaker',
    'Comet': 'http://comet.ml',
    'Neptune': 'https://neptune.ai',
    'Spell': 'https://spell.ml',
    'Valohai': 'https://valohai.com',
    'Domino': 'https://www.dominodatalab.com'
}

raw_path = '../Dataset/Raw'
tool_path = '../Dataset/Tool'


In [3]:
github_miner = GHMiner(accessToken)


In [4]:
tools_data = []

if not os.path.exists(tool_path):
    os.makedirs(tool_path)

for file_name in glob.glob(os.path.join(raw_path, '*.json')):
    json_file = open(file_name)
    json_data = json.load(json_file)
    repo_names = set()
    for repo_file in json_data['Results']:
        repo_name = repo_file['repository']['name'].replace('github.com/', '')
        repo_names.add(repo_name)
    json_file.close()
    tool_name = file_name.split('/')[-1].split('.')[0]
    if tool_name in tool_repos:
        tool_data, error_data = github_miner.scrape(
            repo_name=tool_repos[tool_name], name=tool_name, real_creation_date=False)
        del tool_data['Archived']
    else:
        tool_data = {
            'Name': tool_name,
            'Link': tools_link[tool_name],
            'Creation Date': pd.Timestamp(tools_creation_date[tool_name])
        }
    tool_data['Dependents'] = list(repo_names)
    tool_data = pd.DataFrame([tool_data])
    tools_data.append(tool_data)

tools_data = pd.concat(tools_data, ignore_index=True)
tools_data.to_csv(os.path.join(tool_path, 'Tools.csv'), index=False)
tools_data


Unnamed: 0,Name,Link,Creation Date,Dependents,Repo,Description,Topics,#Star,#Watch,#Fork,...,Languages,Size,#Open Issues,#Pull Requests,#Branches,#Tags,#Commits,#Comments,#Downloads,#Deployments
0,Amazon SageMaker,https://aws.amazon.com/sagemaker,2022-01-01 00:00:00,"[arunpa0206/mltrainingv1, aws-samples/amazon-s...",,,,,,,...,,,,,,,,,,
1,MLflow,https://github.com/mlflow/mlflow,2018-06-05 16:05:58,"[equinor/flownet, julilien/PLDepth, AllenInsti...",mlflow/mlflow,Open source platform for the machine learning ...,"[machine-learning, ai, ml, mlflow, apache-spar...",12889.0,284.0,3005.0,...,Python,106935.0,971.0,139.0,190.0,106.0,3362.0,38.0,0.0,0.0
2,DVC,https://github.com/iterative/dvc,2017-03-04 08:16:33,"[NMGRL/pychron, opendevstack/ods-project-quick...",iterative/dvc,ðŸ¦‰Data Version Control | Git for Data & Models ...,"[data-science, machine-learning, reproducibili...",10574.0,136.0,987.0,...,Python,16536.0,661.0,18.0,11.0,406.0,8058.0,45.0,0.0,0.0
3,Weights & Biases,https://github.com/wandb/wandb,2017-03-24 05:46:23,"[slyautomation/osrs_yolov5, facebookresearch/O...",wandb/wandb,ðŸ”¥ A tool for visualizing and tracking your mac...,"[machine-learning, experiment-track, deep-lear...",4898.0,36.0,376.0,...,Python,58487.0,675.0,157.0,463.0,117.0,4866.0,45.0,0.0,2281.0
4,ClearML,https://github.com/allegroai/clearml,2019-06-10 08:18:32,"[sigal-raab/MoDi, RizwanMunawar/yolov5-object-...",allegroai/clearml,ClearML - Auto-Magical CI/CD to streamline you...,"[version-control, experiment-manager, version,...",3703.0,77.0,495.0,...,Python,37136.0,292.0,4.0,3.0,119.0,1838.0,10.0,0.0,3.0
5,Neptune,https://neptune.ai,2018-01-01 00:00:00,"[skorch-dev/skorch, amundsen-io/amundsenmetada...",,,,,,,...,,,,,,,,,,
6,Comet,http://comet.ml,2017-01-01 00:00:00,[],,,,,,,...,,,,,,,,,,
7,Pachyderm,https://github.com/pachyderm/pachyderm,2014-09-04 07:50:02,"[pachyderm/examples, dwhitena/pach-go-regressi...",pachyderm/pachyderm,Data-Centric Pipelines and Data Versioning,"[go, pachyderm, docker, analytics, big-data, c...",5666.0,168.0,537.0,...,Go,304864.0,870.0,179.0,1095.0,1012.0,21406.0,31.0,0.0,1.0
8,LakeFS,https://github.com/treeverse/lakeFS,2019-09-12 11:46:28,"[treeverse/lakeFS-samples, treeverse/dais-chal...",treeverse/lakeFS,Git-like capabilities for your object storage,"[data-engineering, data-versioning, go, object...",2910.0,38.0,254.0,...,Go,81345.0,461.0,20.0,470.0,117.0,3576.0,10.0,0.0,2489.0
9,Spell,https://spell.ml,2017-01-01 00:00:00,"[natsheh/sensim, ihmeuw-msca/CurveFit, evandez...",,,,,,,...,,,,,,,,,,


In [4]:
filter_path = '../Dataset/Filter'

if not os.path.exists(filter_path):
    os.makedirs(filter_path)


In [5]:
from datetime import datetime
date_parser = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

df = pd.read_csv(os.path.join(tool_path, 'Tools.csv'), parse_dates=['Creation Date'], date_parser=date_parser)

round1_path = os.path.join(filter_path, 'Round#1')

if not os.path.exists(round1_path):
    os.makedirs(round1_path)


In [6]:
for index, row in df.iterrows():
    repos_data, errors_data = github_miner.collect(repo_names=ast.literal_eval(
        row['Dependents']), date=pd.to_datetime(row['Creation Date']))
    repos_data.to_csv(os.path.join(
        round1_path, f'Repo_{row["Name"]}.csv'), index=False)
    errors_data.to_csv(os.path.join(
        round1_path, f'Error_{row["Name"]}.csv'), index=False)


In [None]:
creation_date = df.set_index('Name').to_dict()['Creation Date']
creation_date

redo_path = os.path.join(filter_path, 'Redo')
artifact_path = os.path.join(redo_path, 'Artifact')

for file_name in glob.glob(os.path.join(redo_path, '*.csv')):
    temp = pd.read_csv(file_name)
    tool_name = file_name.split('_')[-1].split('.')[0]
    repos_data, errors_data = github_miner.collect(
        repo_names=temp['Repo'].tolist(), date=creation_date[tool_name])
    repos_data.to_csv(os.path.join(
        artifact_path, f'Repo_{tool_name}.csv'), index=False)
    errors_data.to_csv(os.path.join(
        artifact_path, f'Error_{tool_name}.csv'), index=False)
