This notebook makes the .tar.gz files (compressed archives) with checkpoints that are in the Dryad "results" dataset that accompanies the paper.

In [1]:
import logging
from pathlib import Path
import shutil
import tarfile

import pandas as pd
import pyprojroot

In [2]:
# Top-level directory holding all results, located relative to the project root.
results_root = pyprojroot.here().joinpath('results')

Use error .csv files to figure out which results dirs we need to tar

In [3]:
# Find every error .csv below the results root (sorted for a stable order),
# keeping only those whose path mentions one of the two species directories.
err_csvs = [
    csv_path
    for csv_path in sorted(results_root.rglob('err*csv'))
    if any(species in str(csv_path) for species in ('Bengalese_Finches', 'Canaries'))
]

In [5]:
# Names of the experiment directories whose results get archived.
these_expt_dirs = ['learncurve', 'long_train']

In [12]:
# Keep only csvs that belong to one of the experiments of interest, and drop
# any copies that Jupyter stashed under a `.ipynb_checkpoints` directory.
err_csvs = [
    csv_path
    for csv_path in err_csvs
    if '.ipynb_checkpoints' not in str(csv_path)
    and any(expt_dir in str(csv_path) for expt_dir in these_expt_dirs)
]

In [26]:
# Build `results_dirs`: the results directories whose checkpoints will be
# archived, as '/'-separated strings relative to the project root.
project_root = pyprojroot.here()

results_dirs = []
for err_csv in err_csvs:
    df = pd.read_csv(err_csv)
    if 'results_dir' in df:
        # The csv itself records which results dir(s) its rows came from.
        results_dirs.extend(
            df['results_dir'].unique().tolist()
        )
    else:
        # Only long_train csvs are expected to lack a 'results_dir' column;
        # for those, the results dirs are discovered on disk next to the csv.
        assert 'long_train' in str(err_csv)
        # Deliberately NOT named `results_root`: reusing that name would clobber
        # the notebook-level `results_root` that earlier cells depend on.
        long_train_root = err_csv.parent
        # Sorted because `iterdir` yields entries in arbitrary order.
        subdirs = sorted(
            subdir
            for subdir in long_train_root.iterdir()
            if subdir.is_dir() and subdir.name.startswith('ll')
        )
        for subdir in subdirs:
            matches = sorted(subdir.glob('results_*'))
            assert len(matches) == 1, (
                f'expected exactly one results_* dir in {subdir}, found {len(matches)}'
            )
            # Make the path relative to the project root, wherever the repo
            # lives, instead of stripping one machine's hard-coded prefix.
            # `as_posix` keeps '/' separators, which later cells split on.
            results_dirs.append(
                matches[0].relative_to(project_root).as_posix()
            )

In [28]:
# Send log records to the notebook's output stream, one timestamped line each.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Attach the handler to the root logger.
root_logger = logging.getLogger()
root_logger.addHandler(handler)

In [33]:
# Make one .tar.gz archive of checkpoints per results directory.
#
# DRY_RUN: when True, only print what would be archived; nothing is written.
# SKIP_EXISTING_TAR: when True, an archive that already exists is left as-is.
DRY_RUN = False
SKIP_EXISTING_TAR = False

TAR_ROOT_PATH = pyprojroot.here() / 'results' / 'tars'

# Prefix recorded in some csvs' `results_dir` values; stripped below so that
# those values become relative to the project root like the others.
LEGACY_REPO_PREFIX = '/home/art/Documents/repos/coding/birdsong/tweetynet/'

# Obtain the logger via `getLogger` rather than instantiating `logging.Logger`
# directly: a directly-instantiated Logger is outside the logger hierarchy, so
# handlers attached to the root logger would never see its records.
logger = logging.getLogger('targz.logger')
logger.setLevel(logging.DEBUG)

if not DRY_RUN:
    # `tarfile.open` does not create missing parent directories.
    TAR_ROOT_PATH.mkdir(parents=True, exist_ok=True)

for results_dir in results_dirs:
    # need to fix path for window_size_352, it's different for some reason
    if not results_dir.startswith('results') and results_dir.startswith(LEGACY_REPO_PREFIX):
        results_dir = results_dir.replace(LEGACY_REPO_PREFIX, '')

    results_dir_path = pyprojroot.here() / results_dir

    if not results_dir_path.exists():
        print(
            f'does not exist: {results_dir_path}'
        )
        continue

    print(
        f'\nMaking tar from:\n{results_dir_path}'
    )

    if 'long_train' in results_dir:
        # only one checkpoint, but glob is used so that long_train and
        # learncurve both end up with a (possibly empty) list of paths
        checkpoints = sorted(
            results_dir_path.glob('TweetyNet/checkpoints/max-val-acc-checkpoint.pt')
        )
    else:
        # learncurve: archive only the replicates trained on the largest
        # training set; the duration is the trailing number in `train_dur_<N>s`
        largest_train_dur_dir = sorted(
            results_dir_path.glob('train_dur_*'),
            key=lambda x: int(x.name.split('_')[-1].replace('s', '')),
        )[-1]
        checkpoints = sorted(
            largest_train_dur_dir.glob(
                'replicate_*/TweetyNet/checkpoints/max-val-acc-checkpoint.pt'
            )
        )
    print(
        f'found {len(checkpoints)} checkpoint files for largest training set duration'
    )

    # Archive name: the path components between the leading 'results' and the
    # trailing 'results_<timestamp>' directory, joined with '-'.
    tar_name = '-'.join(results_dir.split('/')[1:-1])
    tar_path = TAR_ROOT_PATH / f'{tar_name}-checkpoints.tar.gz'
    print(
        f'will generate archive as: {tar_path}'
    )

    if SKIP_EXISTING_TAR and tar_path.exists():
        print (
            f'\tSKIP_EXISTING_TAR is true and tar exists:\n\t{tar_path}.\n\tSkipping.'
        )
        # Actually skip: without this the existing archive would be overwritten.
        continue

    if not DRY_RUN:
        # Context manager guarantees the archive is closed even if `add` raises.
        with tarfile.open(str(tar_path), "w|gz") as tar:
            for checkpoint in checkpoints:
                # NOTE(review): member names are the checkpoints' absolute paths,
                # so the archive layout embeds this machine's directory structure;
                # left unchanged to keep the layout of already-published archives.
                tar.add(checkpoint)


Making tar from:
/home/art/Documents/repos/coding/birdsong/tweetynet/article/results/Bengalese_Finches/learncurve/Bird0/results_210528_225043
found 10 checkpoint files for largest training set duration
will generate archive as: /home/art/Documents/repos/coding/birdsong/tweetynet/article/results/tars/Bengalese_Finches-learncurve-Bird0-checkpoints.tar.gz

Making tar from:
/home/art/Documents/repos/coding/birdsong/tweetynet/article/results/Bengalese_Finches/learncurve/Bird4/results_210529_031959
found 10 checkpoint files for largest training set duration
will generate archive as: /home/art/Documents/repos/coding/birdsong/tweetynet/article/results/tars/Bengalese_Finches-learncurve-Bird4-checkpoints.tar.gz

Making tar from:
/home/art/Documents/repos/coding/birdsong/tweetynet/article/results/Bengalese_Finches/learncurve/Bird7/results_210527_213421
found 10 checkpoint files for largest training set duration
will generate archive as: /home/art/Documents/repos/coding/birdsong/tweetynet/article