# Prune repository contents

This notebook prunes the repositories we have checked out: it copies only
the files repo2docker uses to determine what to build into a parallel
`sparse/` tree, leaving everything else behind.

In [19]:
import os
from glob import glob
from tqdm.auto import tqdm
import shutil

In [20]:
def find_paths_to_keep(repos_dir, filenames):
    """
    Yield the paths to be kept from inside repos_dir, one at a time.

    Every entry matching `repos_dir/*/*` is treated as a repository.

    If a repository has a 'binder' directory, that directory is
    yielded (to be kept in full) and nothing else is yielded for
    that repository. Otherwise, each file in the repository root
    whose name is in the `filenames` iterable is yielded.
    """
    # `filenames` is walked once per repository; materialise it so a
    # one-shot iterator is not exhausted after the first repository.
    filenames = list(filenames)

    for repo in glob(os.path.join(repos_dir, '*', '*')):
        binder_dir = os.path.join(repo, 'binder')
        # Only a real directory counts: a stray regular file named
        # 'binder' must not hide the root-level configuration files.
        if os.path.isdir(binder_dir):
            yield binder_dir
            continue

        for filename in filenames:
            fullpath = os.path.join(repo, filename)
            if os.path.exists(fullpath):
                yield fullpath
    

In [21]:
# Root-level file names that are kept when a repository has no
# 'binder' directory.
filenames_to_keep = [
    'requirements.txt',
    'runtime.txt',
    'apt.txt',
    'Dockerfile',
    'postBuild',
    'install.R',
    'DESCRIPTION',
    'setup.py',
    'REQUIRE',
]

# Mirror every kept path under 'sparse/', preserving its path relative to
# the working directory (repos/<x>/<y>/apt.txt -> sparse/repos/<x>/<y>/apt.txt).
for kept_path in tqdm(find_paths_to_keep('repos', filenames_to_keep)):
    target_path = os.path.join('sparse', kept_path)
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    if os.path.isdir(kept_path):
        # copytree refuses to write into an existing directory, so drop any
        # copy left over from a previous run before copying again.
        if os.path.exists(target_path):
            shutil.rmtree(target_path)
        shutil.copytree(kept_path, target_path)
    else:
        # copy2 also preserves file metadata such as timestamps.
        shutil.copy2(kept_path, target_path)
    

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


