# Fetch all GitHub Repositories

This notebook fetches all GitHub repositories that have been launched on
mybinder.org since we started keeping records.

In [4]:
import pandas as pd
from tqdm.auto import tqdm

Read the [machine readable index](https://archive.analytics.mybinder.org/index.jsonl) to get
the list of event archive files. There is one archive file per day.

In [5]:
# The index lists one event archive file per day; each entry's `name` is
# relative to the archive root, so prefix it to get a fetchable URL.
index_url = 'https://archive.analytics.mybinder.org/index.jsonl'
archive_index = pd.read_json(index_url, lines=True)
archive_root = 'https://archive.analytics.mybinder.org/'
event_files = archive_root + archive_index['name']

Now we fetch all the event archive files & load them into a single pandas dataframe

In [6]:
# Read every daily archive into its own frame, then concatenate once.
# Calling pd.concat inside the loop copies the whole accumulated frame on
# each iteration, which makes the total cost quadratic in the number of rows.
daily_launches = [
    pd.read_json(event_file, lines=True)
    for event_file in tqdm(event_files)
]
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when the index listed no archive files.
all_launches = pd.concat(daily_launches) if daily_launches else pd.DataFrame()

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




Our dataframe now has *all* the launches on mybinder.org since Nov '18

In [7]:
# Total number of launch events loaded from the archive files
all_launches.shape[0]

547730

In [8]:
# Peek at the first few launch events
all_launches.head(n=5)

Unnamed: 0,provider,schema,spec,status,timestamp,version
0,GitHub,binderhub.jupyter.org/launch,Qiskit/qiskit-tutorial/master,success,2018-11-03 00:00:00,1
1,GitHub,binderhub.jupyter.org/launch,ipython/ipython-in-depth/master,success,2018-11-03 00:00:00,1
2,GitHub,binderhub.jupyter.org/launch,QISKit/qiskit-tutorial/master,success,2018-11-03 00:00:00,1
3,GitHub,binderhub.jupyter.org/launch,QISKit/qiskit-tutorial/master,success,2018-11-03 00:01:00,1
4,GitHub,binderhub.jupyter.org/launch,jupyterlab/jupyterlab-demo/master,success,2018-11-03 00:01:00,1


We select only launches from GitHub & split them into user, repo & ref columns

In [9]:
# Keep only GitHub launches and break each spec ("user/repo/ref") into its
# parts. n=2 limits the split to the first two slashes, so any further
# slashes stay inside the ref column.
is_github = all_launches['provider'] == 'GitHub'
spec_parts = all_launches.loc[is_github, 'spec'].str.split('/', n=2, expand=True)
gh_launches = spec_parts.rename(columns={0: 'user', 1: 'repo', 2: 'ref'})

In [10]:
# Number of launches that came from GitHub
gh_launches.shape[0]

545160

In [11]:
# Peek at the first few user / repo / ref rows
gh_launches.head(n=5)

Unnamed: 0,user,repo,ref
0,Qiskit,qiskit-tutorial,master
1,ipython,ipython-in-depth,master
2,QISKit,qiskit-tutorial,master
3,QISKit,qiskit-tutorial,master
4,jupyterlab,jupyterlab-demo,master


For this analysis, we want the unique list of repositories that have been
launched. However, I can't find a way to do this natively in pandas easily,
since we need this to be unique across two columns

In [12]:
# drop_duplicates on a two-column selection gives uniqueness across the
# (user, repo) pair natively, without iterating over every launch row.
unique_repos = gh_launches[['user', 'repo']].drop_duplicates()
# Keep `repos` as a set of plain (user, repo) tuples so it can be iterated
# and unpacked as `for user, repo in repos`.
repos = set(unique_repos.itertuples(index=False, name=None))
print(f'Found {len(repos)} repos')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Check out all repositories

We check out repositories that haven't already been checked out.
If a repository is no longer available, we ignore it.

Currently, we also ignore gists.

In [13]:
import subprocess
import os
import requests

class RepoDeletedException(Exception):
    """
    Signals that the repository we tried to check out has been deleted.

    Having a dedicated type lets callers tell this case apart from a
    git checkout that failed for some other reason.
    """

def checkout_repo(prefix, user, repo):
    """
    Checkout a github repo if it isn't already checked out

    Repos are created under {prefix}/{user}/{repo}. If that path
    already exists, nothing is done (the checkout isn't updated).

    Raises RepoDeletedException if the clone fails and GitHub answers
    404 for the repository URL (the repository does not exist / has
    been made private). Any other clone failure re-raises the
    subprocess.CalledProcessError from git.
    """
    checkout_path = os.path.join(prefix, user, repo)
    if os.path.exists(checkout_path):
        # Already checked out, so let's get out
        return

    repo_url = f'https://github.com/{user}/{repo}'
    os.makedirs(os.path.join(prefix, user), exist_ok=True)
    try:
        subprocess.check_call([
            'git', 'clone', repo_url, checkout_path
        ])
    except subprocess.CalledProcessError as e:
        # Ask GitHub whether the repo is gone. requests has no default
        # timeout, so set one: otherwise a stalled connection would hang
        # the whole checkout run.
        if requests.get(repo_url, timeout=30).status_code == 404:
            # Chain the git failure so the original error stays visible.
            raise RepoDeletedException(f'{user}/{repo}') from e
        raise

In [14]:
# Two progress bars: `repositer` tracks overall progress through the repos,
# `not_found_progress` counts repositories that turned out to be deleted.
repositer = tqdm(repos, dynamic_ncols=True, desc='Checking out')
deleted_repos = 0
with tqdm(dynamic_ncols=True) as not_found_progress:
    for user, repo in repositer:
        try:
            checkout_repo('repos', user, repo)
            repositer.set_description(f'{user}/{repo}')
        except RepoDeletedException:
            deleted_repos += 1
            # tqdm.update() takes an increment, not an absolute value, so
            # advance by one per deleted repo to keep the bar equal to
            # `deleted_repos`.
            not_found_progress.update(1)
            not_found_progress.set_description(f'{user}/{repo}')

HBox(children=(IntProgress(value=0, description='Checking out', layout=Layout(flex='2'), max=5405, style=Progr…

HBox(children=(IntProgress(value=1, bar_style='info', layout=Layout(flex='2'), max=1), HTML(value='')), layout…