In [1]:
import pandas as pd
import os

In [2]:
path_dataset = '../Dataset'

path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')

if not os.path.exists(path_github):
    os.makedirs(path_github)

if not os.path.exists(path_gitlab):
    os.makedirs(path_gitlab)

if not os.path.exists(path_labeling):
    os.makedirs(path_labeling)

if not os.path.exists(path_github_repo):
    os.makedirs(path_github_repo)

if not os.path.exists(path_gitlab_repo):
    os.makedirs(path_gitlab_repo)

if not os.path.exists(path_github_issue):
    os.makedirs(path_github_issue)

if not os.path.exists(path_gitlab_issue):
    os.makedirs(path_gitlab_issue)

In [3]:
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Optuna': 'optuna/optuna',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'cnvrg.io': '2020-03-31',
    'Comet': '2017-01-01',
    'Iterative Studio': '2021-05-12',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01'
}

tools_link = {
    'cnvrg.io': 'https://github.com/cnvrg',
    'Comet': 'https://github.com/comet-ml',
    'Iterative Studio': 'https://studio.iterative.ai',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai'
}

tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['sage maker', 'sagemaker'],
    'Azure Machine Learning': ['azure machine learning', 'azure ml', 'azureml'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai', 'guildai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['vertex ai', 'vertexai'],
    'Weights & Biases': ['weights and biases', 'wandb', 'weights & biases', 'weights&biases', 'w & b', 'w&b']
}

issue_labels = {
    'bug',
    'error',
    'invalid',
}


In [4]:
from Scrape.GHMiner import GitHubMiner
from Scrape.GLMiner import GitLabMiner

github_miner = GitHubMiner(private_token=os.getenv('GITHUB_TOKEN'))
gitlab_miner = GitLabMiner(private_token=os.getenv('GITLAB_TOKEN'))

In [99]:
tools_data = pd.DataFrame()

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    if tool_name in tools_release_date:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name, release_time=pd.to_datetime(tools_release_date[tool_name]))
    else:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name)

    if not tool_data.empty:
        tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

# add closed-source asset-management tools
for tool_name in tools_link.keys():
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }
    tool_data = pd.DataFrame([tool_data])
    tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')


In [100]:
import json

dependents = pd.DataFrame()

# collect dependents for tools with coding patterns
for tool_name in tools_keywords.keys():
    github_dependents = []
    gitlab_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # either search by sourcegraph
            if 'Results' in json_data:
                for repo_file in json_data['Results']:
                    # file name match pattern
                    if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('github'):
                        repo_name = repo_file['repository']['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
                    # code usage match pattern
                    elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('github'):
                        repo_name = repo_file['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
            # or search by dependent graph
            elif 'all_public_dependent_repos' in json_data:
                for repo_file in json_data['all_public_dependent_repos']:
                    github_dependents.append(repo_file['name'])

    # collect Gitlab dependents
    file_name = os.path.join(path_gitlab_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # search by sourcegraph exclusively
            for repo_file in json_data['Results']:
                # file name match pattern
                if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('gitlab'):
                    repo_name = repo_file['repository']['name'].removeprefix(
                        'gitlab.com/')
                    gitlab_dependents.append(repo_name)
                # code usage match pattern
                elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('gitlab'):
                    repo_name = repo_file['name'].removeprefix('gitlab.com/')
                    gitlab_dependents.append(repo_name)

    # remove tool repo from dependents if any
    if tool_name in tools_repo and tools_repo[tool_name] in github_dependents:
        github_dependents.remove(tools_repo[tool_name])

    # no need to add tools without dependents
    if not len(github_dependents) and not len(gitlab_dependents):
        continue

    dependent = {
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    }

    dependents = pd.concat(
        [dependents, pd.DataFrame([dependent])], ignore_index=True)

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')


In [101]:
github_repos = set()
gitlab_repos = set()

for index, row in dependents.iterrows():
    github_repos = github_repos.union(row['GitHub Dependents'])
    gitlab_repos = gitlab_repos.union(row['GitLab Dependents'])

print(len(github_repos))
print(len(gitlab_repos))

37786
10


In [104]:
dependents_summary = pd.DataFrame(
    columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in dependents.iterrows():
    dependent_data = {
        'Tool': row['Tool'],
        '#GitHub Dependents': len(row['GitHub Dependents']),
        '#GitLab Dependents': len(row['GitLab Dependents'])
    }
    dependent_data = pd.DataFrame([dependent_data])
    dependents_summary = pd.concat(
        [dependents_summary, dependent_data], ignore_index=True)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,Aim,189,1
1,Amazon SageMaker,1114,3
2,Azure Machine Learning,826,0
3,ClearML,498,0
4,Codalab,40,0
5,Comet,678,0
6,Determined,36,0
7,Domino,1,0
8,DVC,6099,0
9,Guild AI,67,4


In [6]:
df_dependents = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))
df_tool = pd.read_json(os.path.join(path_dataset, 'Tools.json'))
tools_release_date = pd.Series(pd.to_datetime(df_tool['First Release Date'].values), index=df_tool['Name']).to_dict()

In [None]:
# scrape issues of Gitlab dependents for each tool

df_issues = pd.DataFrame()

for index, row in df_dependents.iterrows():
    print(row['Tool'])
    for repo in row['GitLab Dependents']:
        print(repo)
        repo_data = gitlab_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > tools_release_date[row['Tool']]) and repo_data['#Issue'].values[0]:
            issues = gitlab_miner.scrape_issue(repo)
            issues['Tool'] = row['Tool']
            df_issues = pd.concat([df_issues, issues], ignore_index=True)
            df_issues.to_json(os.path.join(path_gitlab_issue, 'raw.json'), indent=4, orient='records')
    
df_issues.to_json(os.path.join(path_gitlab_issue, 'raw.json'), indent=4, orient='records')

In [None]:
# scrape issues of Github dependents for each tool

df_issues = pd.DataFrame()

for index, row in df_dependents.iterrows():
    print(row['Tool'])
    for repo in row['GitHub Dependents']:
        print(repo)
        repo_data = github_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > tools_release_date[row['Tool']]) and repo_data['#Issue'].values[0]:
            issues = github_miner.scrape_issue(repo)
            issues['Tool'] = row['Tool']
            df_issues = pd.concat([df_issues, issues], ignore_index=True)
            df_issues.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')
    
df_issues.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')

In [9]:
# scrape issues of Github dependents for each tool

from Scrape.GHMiner import GitHubMiner

github_miner = GitHubMiner(private_token='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')

df_issues = pd.read_json(os.path.join(path_github_issue, 'raw.json'))
mark = False
repo_done = ['Aim']

for index, row in df_dependents.iterrows():
    if row['Tool'] in repo_done:
        continue
    print(row['Tool'])
    for repo in row['GitHub Dependents']:
        if (not mark) and (repo != 'apache/airflow'):
            continue
        elif (not mark) and (repo == 'apache/airflow'):
            mark = True
        repo_data = github_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > tools_release_date[row['Tool']]) and repo_data['#Issue'].values[0]:
            print(repo)
            issues = github_miner.scrape_issue(repo)
            issues['Tool'] = row['Tool']
            df_issues = pd.concat([df_issues, issues], ignore_index=True)
            df_issues.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')
    
df_issues.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')

Amazon SageMaker
huggingface/transformers
apache/airflow


ReadTimeout: HTTPSConnectionPool(host='api.github.com', port=443): Read timed out. (read timeout=15)

In [None]:
# exclude issues that are not related to each tool

issues_gitlab = pd.read_json(os.path.join(path_gitlab_issue, 'raw.json'))
issues_github = pd.read_json(os.path.join(path_github_issue, 'raw.json'))

for index, row in issues_gitlab.iterrows():
    for keyword in tools_keywords[row['Tool']]:
        if keyword in row['Issue_title'].lower():
            break
    else:
        issues_gitlab.drop(index, inplace=True)

for index, row in issues_github.iterrows():
    for keyword in tools_keywords[row['Tool']]:
        if keyword in row['Issue_title'].lower():
            break
    else:
        issues_github.drop(index, inplace=True)

issues_github['Platform'] = 'Github'
issues_gitlab['Platform'] = 'Gitlab'

df_issues = pd.concat([issues_gitlab, issues_github], ignore_index=True)        
len(df_issues)

2719

In [None]:
final = set()
for _, row in df_issues['Issue_label'].map(set).items():
    final = final.union(row)
final

{'"experiments"',
 '0.4.6',
 '1.1',
 '1.4',
 '1.6',
 '1.7',
 '2.0',
 '3 - Quality of Life',
 '3rd party',
 '3rd party update',
 ':bridge_at_night:  Bridge',
 ':bug: bug',
 ':rotating_light:',
 '? - Needs Triage',
 'A: example-dvc-experiments',
 'A: example-get-started',
 'ADO',
 'AI\u202fFrameworks/ONNX',
 'AML Compute Instance',
 'API',
 'API & Doc',
 'Auto\u202fML',
 'BF',
 'Cloud',
 'Community',
 'Compute',
 'Core UI',
 'DRL',
 'Data Labeling',
 'Data4ML',
 'Data\u202fDrift',
 'Data\u202fPrep\u202fServices',
 'Documentation',
 'ERRATA_CANDIDATE',
 'Enhancement',
 'Environments',
 'Evaluation',
 'Experimentation UI',
 'FAQ',
 'Feature - Medium Priority',
 'HIGH',
 'HPO',
 'Hyperdrive',
 'Important',
 'In the roadmap',
 'Inf1',
 'Inference',
 'Ingestion',
 'Issue: Bug Report 🐞',
 'Issue: Feature Request',
 'L',
 'LOE: S',
 'Localized',
 'MLOps',
 'NLP',
 'NUM',
 'Needs Triage',
 'Not related to PyCaret',
 'Notebook',
 'Optional',
 'P0',
 'P1',
 'P2',
 'Pipelines',
 'Priority 1',
 'Rei

In [41]:
# filter out issues that are not related to challenges

for index, row in df_issues.iterrows():
    if not row['Issue_title'].isascii():
        df_issues.drop(index, inplace=True)
        continue
    
    if not row['Issue_label']:
        continue
    else:
        for label in row['Issue_label']:
            for issue_label in issue_labels:
                if issue_label in label.lower():
                    break
            else:
                continue
            break
        else:
            df_issues.drop(index, inplace=True)

len(df_issues)

1657

In [None]:
# create issue dataset

df_issues['Issue_score_count'] = df_issues['Issue_upvote_count'] - df_issues['Issue_downvote_count']
df_issues['Comment_score_count'] = df_issues['Comment_upvote_count'] - df_issues['Comment_downvote_count']
df_issues['Issue_body'] = df_issues['Issue_body'].fillna('')

del df_issues['Issue_upvote_count']
del df_issues['Issue_downvote_count']
del df_issues['Comment_upvote_count']
del df_issues['Comment_downvote_count']
del df_issues['Issue_label']

df_issues = df_issues.reindex(sorted(df_issues.columns), axis=1)
df_issues.to_json(os.path.join(path_labeling, 'issues.json'), indent=4, orient='records')