In [None]:
#|default_exp wandb

In [None]:
#|hide
# from __future__ import annotations
from fastcore.test import *
from fastcore.utils import run
from metagentorch.cnn_virus.utils import setup_nb
from nbdev.showdoc import *
from typing import List

import nbdev

ON_COLAB, p2dataroot, p2data = setup_nb(_dev=True)

if ON_COLAB:
    cmd = 'pip install -qqU wandb'
    run(cmd)

%load_ext autoreload
%autoreload 2

Running locally


In [None]:
#|export
# Imports all dependencies
import numpy as np
import pandas as pd
import os
# import tensorflow as tf
import wandb

from datetime import datetime
from pathlib import Path
from typing import Callable, Tuple

In [None]:
#|hide
# print(f"Tensorflow version: {tf.__version__} - Expected 2.8.2")
# print(f"WandB version: {wandb.__version__}")

# wandb

> Tracking projects experiments with **WandB**

Once setup, **WandB** tracks datasets, models, training runs, evaluation runs across several experiments. The original documentation is [here](https://docs.wandb.ai/).

Key concepts we use in this package: 

- `Run` (status, losses and other metadata logged during training or evaluation experiments).
- `Artifact` (datasets, code (incl. notebooks), models, ...).
    - Artifacts can refer to a single file or to a directory with multiple files

## Steps:

- Login to **`wandb`**: 
    - May require an API key, which is available at [https://wandb.ai/authorize](https://wandb.ai/authorize). You must be logged in to WandB to access the API key.
- Initialize a **`Run`** with desired parameters and metadata
- Perform operations to be tracked (e.g. train model, load dataset as artifact, ...)
- Finish the **`Run`**

In [None]:
#| export
def login_nb(
    nb_file: str|Path   # name of the notebook (str) or path to the notebook (Path)
    ):
    """First step to setup WandB from notebook. Logs in and logs passed notebook as source of code"""

    # Validate nb_file
    if nb_file is None:
        raise TypeError('the file name of the current nb is required to allow code tracking')
    if isinstance(nb_file, str):
        if nb_file[-6:] != '.ipynb': nb_file = f"{nb_file}.ipynb"
        nb_file = Path.cwd() / nb_file
    elif not isinstance(nb_file, Path):
        raise TypeError('nb_file must me a `str` or a `Path`')
    
    if not nb_file.is_file():
        raise ValueError(f"{nb_file.name} is not a file, please correct the file name")

    # Registers notebook as WandB code
    os.environ['WANDB_NOTEBOOK_NAME'] = str(nb_file.absolute())
    print(f"Logging in from notebook: {os.environ['WANDB_NOTEBOOK_NAME']}")

    wandb.login(relogin=False)    

To allow WandB to store the code used for the session, the **name or path of the notebook must be passed** as argument `nb_file`.

Example:

In [None]:
#|eval: false
login_nb('01_wandb')

Logging in from notebook: /home/vtec/projects/bio/metagentools/nbs-dev/01_wandb.ipynb


[34m[1mwandb[0m: Currently logged in as: [33mvtecftyw[0m. Use [1m`wandb login --relogin`[0m to force relogin


`login_nb` raises error in the following cases:

- If `nb_file` is not passed, the function raises a `TypeError`

In [None]:
#| hide
test_fail(login_nb, kwargs={'nb_file':None}, contains='the file name of the current nb')

- If `nb_file` is not a string or a Path, the function raises a `TypeError`

In [None]:
#| hide
test_fail(login_nb, kwargs={'nb_file':999}, contains='nb_file must me a `str` or a `Path`')

- There must exist a file `nb_file` or a `ValueError` is raised

In [None]:
#| hide
test_fail(login_nb, kwargs={'nb_file':'fake_file_name'}, contains='is not a file, please correct the file name')

In [None]:
#| export
class WandbRun():
    """Manages a WandB run and all logged actions performed while run is active. Close run with .finish()"""
    
    def __init__(
        self,
        entity: str='',             # user or organization under which the run will be logged. Default: `metagenomics_sh` 
        project: str='',            # name of the WandB project under which the run will be logged 
        run_name: str='',           # unique name for the run,
        job_type: str='',           # e.g.: `load_datasets`, `train_exp`, ... 
        notes: str='',              # any text description or additional information to store with the run 
        logs_dir: str|Path|None= None,   # default is project_root/wandb-logs if None, or uses the passed Path
        testing: bool=False         # (optional) If True, will not create a run on WandB. Use for local testing
        ) :
        """Validates metadata inputs and initialize the wandb run, unless testing is set to True"""
        
        # Validate inputs
        for k,v in [key_val for key_val in locals().items() if key_val[0] not in ['self', 'notes', 'testing', 'logs_dir']]:
            if v == '': raise ValueError(f"{k} may not be an empty string. Please provide a value")

        for k,v in [key_val for key_val in locals().items() if key_val[0] not in ['self', 'testing', 'logs_dir']]:
            if not isinstance(v, str): raise TypeError(f"{k} must be a string, not a {type(v)}")

        self.entity = entity
        self.project = project
        self.run_name = run_name
        self.job_type = job_type
        self.notes = notes
        
        if logs_dir is None:
            self.wandb_logs = self._get_wandb_logs_dir()
        elif isinstance(logs_dir, str):
            self.wandb_logs = Path(logs_dir)
        elif isinstance(logs_dir, Path):
            self.wandb_logs = logs_dir
        else:
            raise ValueError(f"logs_dir must be None, a str or a Path, not a {type(logs_dir)}")
        
        if not testing:
            self.run = wandb.init(
                entity=entity, 
                project=project, 
                name=run_name, 
                job_type=job_type, 
                notes=notes, 
                save_code=True,
                dir= self.wandb_logs
            )

    def finish(self):
        """End the run"""
        self.run.finish()
    
    @staticmethod
    def _get_wandb_logs_dir():
        cur_dir_parents = Path().absolute().parents
        wandb_logs_dir = [p for p in cur_dir_parents if 'nbs' not in p.name][0] / 'wandb-logs'
        if not wandb_logs_dir.is_dir():
            raise ValueError(f"Cannot find the wandb-logs directory. Please specify the correct path ")
        return wandb_logs_dir
        
    def upload_dataset(
        self, 
        ds_path: str,                 # path to the file or directory to load as dataset artifact 
        ds_name: str,                 # name for the dataset
        ds_type: str,                 # type of dataset: e.g. raw_data, processed_data, ...
        ds_descr: str,                # short description of the dataset
        ds_metadata: dict,            # keys/values for metadata on the dataset, eg. nb_samples, ...
        load_type:str = 'file',       # `file` to load a single file, `dir` to load all files in a directory
        wait_completion: bool = False # when True, wait completion of the logging before returning artifact
        ):
        """Load a dataset from a file as WandB artifact, with associated information and metadata"""
        
        # validate ds_path
        if load_type not in ['file', 'dir']:
            raise ValueError(f"load_type must be 'file' or 'dir'")
        if load_type == 'file' and not Path(ds_path).is_file():
            raise ValueError(f"No file found as {ds_path}. Please check path or load type")
        if load_type == 'dir' and not Path(ds_path).is_dir():
            raise ValueError(f"No directory found as {ds_path}. Please check path or load type")

        artifact = wandb.Artifact(name=ds_name, type=ds_type, description=ds_descr, metadata=ds_metadata)

        if load_type == 'file':
            artifact.add_file(ds_path, ds_name)
        if load_type == 'dir':
            artifact.add_dir(ds_path, ds_name)
        
        self.run.log_artifact(artifact)
        
        print(f"Dataset {ds_name} is being logged as artifact ...")
        
        if wait_completion:
            artifact.wait()
            print(f"Dataset {ds_name} logging completed")
            print(f"Artifact state: {artifact.state}")
        
        return artifact

#### Create a Run instance

`WandbRun` lets you define a set of metadata associated with the run, such as `entity`, `project`, `run_name`, `job_type` and additional `notes`.

Example:

- set the parameters

In [None]:
entity = 'metagenomics_sh'
project = 'coding-with-nbdev'
run_name = 'nbdev-test'
job_type = "code_testing"
notes = 'any other information of interest for the future'

- create a `WandbRun` instance called `wandb_run`

In [None]:
#|eval: false
wandb_run = WandbRun(
    entity=entity, 
    project=project, 
    run_name=run_name, 
    job_type=job_type, 
    notes=notes
    )

[34m[1mwandb[0m: Currently logged in as: [33mvtecftyw[0m ([33mmetagenomics_sh[0m). Use [1m`wandb login --relogin`[0m to force relogin


`WandbRun` instantiation raises an error in the following cases:

- If one of `entity`, `project`, `run_name` or `job_type` is not passed, the function raises a `ValueError`

In [None]:
#| hide
test_fail(WandbRun, kwargs={}, contains='entity may not be an empty string')
test_fail(WandbRun, kwargs={'entity':entity}, contains='project may not be an empty string')
test_fail(WandbRun, kwargs={'entity':entity, 'project':project}, contains='run_name may not be an empty string')
test_fail(WandbRun, kwargs={'entity':entity, 'project':project, 'run_name':run_name}, contains='job_type may not be an empty string')

- If one of `entity`, `project`, `run_name`, `job_type` or `notes` is not a string, the function raises a `TypeError`

In [None]:
#| hide
test_fail(
    WandbRun, 
    kwargs={'entity':9, 'project':project, 'run_name':run_name, 'job_type':job_type, 'notes':notes, 'testing':True}, 
    contains='entity must be a string'
    )
test_fail(
    WandbRun, 
    kwargs={'entity':entity, 'project':9, 'run_name':run_name, 'job_type':job_type, 'notes':notes, 'testing':True}, 
    contains='project must be a string'
    )
test_fail(
    WandbRun, 
    kwargs={'entity':entity, 'project':project, 'run_name':9, 'job_type':job_type, 'notes':notes, 'testing':True}, 
    contains='run_name must be a string'
    )
test_fail(
    WandbRun, 
    kwargs={'entity':entity, 'project':project, 'run_name':run_name, 'job_type':9, 'notes':notes, 'testing':True}, 
    contains='job_type must be a string'
    )
test_fail(
    WandbRun, 
    kwargs={'entity':entity, 'project':project, 'run_name':run_name, 'job_type':job_type, 'notes':9, 'testing':True}, 
    contains='notes must be a string'
    )

In [None]:
show_doc(WandbRun.upload_dataset)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/wandb.py#L106){target="_blank" style="float:right; font-size:smaller"}

### WandbRun.upload_dataset

>      WandbRun.upload_dataset (ds_path:str, ds_name:str, ds_type:str,
>                               ds_descr:str, ds_metadata:dict,
>                               load_type:str='file',
>                               wait_completion:bool=False)

*Load a dataset from a file as WandB artifact, with associated information and metadata*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| ds_path | str |  | path to the file or directory to load as dataset artifact |
| ds_name | str |  | name for the dataset |
| ds_type | str |  | type of dataset: e.g. raw_data, processed_data, ... |
| ds_descr | str |  | short description of the dataset |
| ds_metadata | dict |  | keys/values for metadata on the dataset, eg. nb_samples, ... |
| load_type | str | file | `file` to load a single file, `dir` to load all files in a directory |
| wait_completion | bool | False | when True, wait completion of the logging before returning artifact |

#### Load a dataset from a single file

In [None]:
p2ds = Path('data_dev/ncbi/refsequences/cov/cov_virus_sequence_one.fa')
assert p2ds.is_file()

ds_fname = str(p2ds.absolute())
ds_name = 'cov_one_sequence'
ds_type = 'cov_sequences'
ds_descr = 'one covid sequence fasta file'

ds_metadata = {
    'nb_sequences': 1,
    'file type': 'fasta',
}

In [None]:
#|eval: false
atx_one_file = wandb_run.upload_dataset(
    ds_path=ds_fname,
    ds_name=ds_name,
    ds_type=ds_type,
    ds_descr=ds_descr,
    ds_metadata=ds_metadata,
    load_type='file',
)

Dataset cov_one_sequence is being logged as artifact ...


#### Load a dataset with several files from a directory.

In [None]:
p2ds_dir = Path('data_dev/ncbi/refsequences/cov/single_1seq_150bp')
assert p2ds_dir.is_dir()

ds_dirname = str(p2ds_dir.absolute())
ds_name = 'cov_reads_single_1_sequence_150bp'
ds_type = 'sim_reads'
ds_descr = 'Simulated single reads of one cov sequence fq and aln files'

ds_metadata = {
    'nb_sequences': 1,
    'sim_type': 'single',
    'read_length': 150,
    'fold': 100,
}

In [None]:
#|eval: false
atx_multi_files = wandb_run.upload_dataset(
    ds_path=ds_dirname,
    ds_name=ds_name,
    ds_type=ds_type,
    ds_descr=ds_descr,
    ds_metadata=ds_metadata,
    load_type='dir',
)

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov/single_1seq_150bp)... Done. 0.2s


Dataset cov_reads_single_1_sequence_150bp is being logged as artifact ...


`WandbRun.upload_dataset` raises an error in the following cases:

- `ds_path` is a file and `load_type` is `dir`

In [None]:
#| hide
#|eval: false
# Passing a file but defining load_type as 'dir'
ds_path = str(Path('data_dev/cov_virus_sequence_one.fa').absolute())
ds_name = 'test_ds'
ds_type = 'to_delete'
ds_descr = 'ds used for testing'

ds_metadata = {
    'type': 'test'
}

test_fail(
    wandb_run.upload_dataset, 
    kwargs={'load_type':'dir', 'ds_path':ds_path, 'ds_name':ds_name, 'ds_type':ds_type, 'ds_descr':ds_descr, 'ds_metadata':ds_metadata}, 
    contains='No directory found as'
    )

- `ds_path` is a directory and `load_type` is `file`

In [None]:
#| hide
#|eval: false
# Passing a directory but defining load_type as 'file'
ds_path = str(Path('data_dev/single_one_150bp').absolute())
ds_name = 'test_ds'
ds_type = 'to_delete'
ds_descr = 'ds used for testing'

ds_metadata = {
    'type': 'test'
}

test_fail(
    wandb_run.upload_dataset, 
    kwargs={'load_type':'file', 'ds_path':ds_path, 'ds_name':ds_name, 'ds_type':ds_type, 'ds_descr':ds_descr, 'ds_metadata':ds_metadata}, 
    contains='No file found as'
    )

- `load_type` has a value other than `file` or `dir`

In [None]:
#| hide
#|eval: false
# load_type neither 'file' nor 'dir'
ds_path = str(Path('data_dev/reads_single_one_150bp').absolute())
ds_name = 'test_ds'
ds_type = 'to_delete'
ds_descr = 'ds used for testing'

ds_metadata = {
    'type': 'test'
}

test_fail(
    wandb_run.upload_dataset, 
    kwargs={'load_type':'wrong_type', 'ds_path':ds_path, 'ds_name':ds_name, 'ds_type':ds_type, 'ds_descr':ds_descr, 'ds_metadata':ds_metadata}, 
    contains="load_type must be 'file' or 'dir'"
    )

### Close a WandB run

In [None]:
#|eval: false
wandb_run.finish()

VBox(children=(Label(value='13.231 MB of 13.231 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
#| export
def entity_projects(
    entity: str # name of the entity from which the projects will be retrieved
    ) -> wandb.apis.public.Projects : # Projects iterator
    """Query the WandB public API for the projects under `entity` and return them as an iterable collection"""
    return wandb.Api().projects(entity=entity)

`entity_projects` queries WandB to retrieve all the projects under an entity, and returns them as an iterable object.

Each element in the iterator is a `wandb.Project` object. Each [`Project`](https://docs.wandb.ai/ref/python/public-api/project) object has the following attributes:

- `_attrs`: dict of attributes associated with the project (`id`, `name`, `entityName`, `createdAt`). These attributes can be called directly as `object.id`, ...
- `entity`
- `name`: project name
- `path`: as a list [entity, name]
- `url`: the url to the project workspace ('https://wandb.ai/entity/project/workspace')

In [None]:
#|eval: false

projs = entity_projects(entity='vtecftyw')

for p in projs:
    print(f"{p.name}:")
    print('  name:   ', p.name)
    print('  entity  ', p.entity)
    print('  path:   ', p.path)
    print()
    print('  url:    ', p.url)
    print('  id:     ', p.id)
    print('  created:', p.createdAt)
    print('  _attrs: ', p._attrs)
    print()

pytorch-intro:
  name:    pytorch-intro
  entity   vtecftyw
  path:    ['vtecftyw', 'pytorch-intro']

  url:     https://wandb.ai/vtecftyw/pytorch-intro/workspace
  id:      UHJvamVjdDp2MTpweXRvcmNoLWludHJvOnZ0ZWNmdHl3
  created: 2024-12-12T09:04:33Z
  _attrs:  {'id': 'UHJvamVjdDp2MTpweXRvcmNoLWludHJvOnZ0ZWNmdHl3', 'name': 'pytorch-intro', 'entityName': 'vtecftyw', 'createdAt': '2024-12-12T09:04:33Z', 'isBenchmark': False}

basic-intro:
  name:    basic-intro
  entity   vtecftyw
  path:    ['vtecftyw', 'basic-intro']

  url:     https://wandb.ai/vtecftyw/basic-intro/workspace
  id:      UHJvamVjdDp2MTpiYXNpYy1pbnRybzp2dGVjZnR5dw==
  created: 2024-12-12T08:54:34Z
  _attrs:  {'id': 'UHJvamVjdDp2MTpiYXNpYy1pbnRybzp2dGVjZnR5dw==', 'name': 'basic-intro', 'entityName': 'vtecftyw', 'createdAt': '2024-12-12T08:54:34Z', 'isBenchmark': False}

tut_artifacts:
  name:    tut_artifacts
  entity   vtecftyw
  path:    ['vtecftyw', 'tut_artifacts']

  url:     https://wandb.ai/vtecftyw/tut_artifacts/w

In [None]:
#| export
def get_project(
    entity: str,        # name of the entity from which the project will be retrieved 
    project_name:str,   # name of the project to retrieve
    ) -> wandb.apis.public.Project : # Project object
    """Look up `entity/project_name` through the WandB public API and return the resulting object"""
    return wandb.Api().from_path(f"{entity}/{project_name}")

In [None]:
#|eval: false

p = get_project('vtecftyw', 'tut_artifacts')

print(type(p))

print(p.entity,'\n', p.name,'\n', p.path,'\n', p.url)

<class 'wandb.apis.public.projects.Project'>
vtecftyw 
 tut_artifacts 
 ['vtecftyw', 'tut_artifacts'] 
 https://wandb.ai/vtecftyw/tut_artifacts/workspace


In [None]:
#| export
def print_entity_project_list(entity):
    """Print a header followed by one numbered line (name and url) per project under `entity`"""
    # Retrieve the projects before printing anything, so a failed lookup leaves no partial output
    project_iter = entity_projects(entity)
    header = f"List of projects under entity <{entity}>"
    print(header)
    numbered_lines = (f" {idx:2d}. {proj.name:30s} (url: {proj.url})" for idx, proj in enumerate(project_iter))
    for line in numbered_lines:
        print(line)

In [None]:
#|eval: false

print_entity_project_list('vtecftyw')

List of projects under entity <vtecftyw>
  0. pytorch-intro                  (url: https://wandb.ai/vtecftyw/pytorch-intro/workspace)
  1. basic-intro                    (url: https://wandb.ai/vtecftyw/basic-intro/workspace)
  2. tut_artifacts                  (url: https://wandb.ai/vtecftyw/tut_artifacts/workspace)
  3. metagenomics                   (url: https://wandb.ai/vtecftyw/metagenomics/workspace)
  4. wand-hello-world-fastai        (url: https://wandb.ai/vtecftyw/wand-hello-world-fastai/workspace)


In [None]:
#| export
def project_artifacts(
    entity: str,                     # name of the entity from which to retrieve the artifacts 
    project_name: str,               # name of the project from which to retrieve the artifacts 
    by_alias: str='latest',          # name of the alias to filter by (None: no filtering by alias)
    by_type: str=None,               # name of the artifact type to filter by (optional)
    by_version: str=None             # version to filter by (optional)
    ) -> Tuple[pd.DataFrame, list]:  # df w/ all artifacts and related info; list of artifact types in the project
    """Returns all artifacts in project, w/ key info, filtered by alias, types and version + list of artifact types

    Each artifact logged by a run of the project appears once in the DataFrame, even when it is
    linked to several runs. An artifact is kept by the alias filter when `by_alias` is one of
    its aliases; the `atx_aliases` column shows all aliases of the artifact joined with `, `.
    Rows are sorted by creation timestamp.

    Raises `ValueError` when `by_type` is not one of the artifact types of the project.
    """
    api = wandb.Api()
    project = api.from_path(f"{entity}/{project_name}")
    atx_type_names = [t.name for t in project.artifacts_types()]
    runs = api.runs(path=f"{entity}/{project_name}")

    # validate by_type parameter
    if by_type is not None and by_type not in atx_type_names:
        raise ValueError(f"{by_type} is not an artifact type in {entity}/{project_name}")

    # Collect one record per artifact id. The same artifact may be linked to more than one run:
    # only its first occurrence is kept. Aliases are kept as a list at this stage: letting the
    # DataFrame constructor broadcast the scalars against the alias list would create one row
    # per alias (and no row at all for an artifact without alias).
    cols = 'atx_name atx_type atx_id atx_state atx_version atx_aliases file_count created updated'.split(' ')
    records = {}
    for r in runs:
        for atx in r.logged_artifacts():
            if atx.id in records:
                continue
            metadata = [
                atx.name, atx.type, atx.id, atx.state, atx.version, list(atx.aliases or []),
                atx.file_count, atx.created_at, atx.updated_at
            ]
            records[atx.id] = dict(zip(cols, metadata))
    # Build the DataFrame once instead of concatenating row by row
    artifacts_df = pd.DataFrame(list(records.values()), columns=cols)

    # Each active filter narrows a boolean mask; a filter set to None leaves the mask untouched
    row_filter = np.ones(shape=(artifacts_df.shape[0],), dtype=bool)
    if by_alias is not None:
        row_filter &= np.array([by_alias in aliases for aliases in artifacts_df['atx_aliases']], dtype=bool)
    if by_type is not None:
        row_filter &= (artifacts_df['atx_type'] == by_type).to_numpy(dtype=bool)
    if by_version is not None:
        row_filter &= (artifacts_df['atx_version'] == by_version).to_numpy(dtype=bool)

    # Show aliases as text: an artifact with the single alias `latest` is displayed as `latest`
    artifacts_df['atx_aliases'] = [', '.join(map(str, aliases)) for aliases in artifacts_df['atx_aliases']]

    cols2show = 'atx_name atx_version atx_type atx_aliases file_count created updated atx_id'.split(' ')
    latest = artifacts_df.loc[row_filter, cols2show].sort_values(by='created').reset_index(drop=True)
    return latest, atx_type_names

`project_artifacts` returns:

- a DataFrame including all the artifacts available under the project (`entity/project_name`)
- a list of all artifact types in the projects

In [None]:
#|eval: false

atx_df, atx_type_list = project_artifacts(
    entity='metagenomics_sh', 
    project_name='coding-with-nbdev'
    )

atx_type_list

['code', 'cov_sequences', 'sim_reads', 'job']

In [None]:
#|eval: false
atx_df

Unnamed: 0,atx_name,atx_version,atx_type,atx_aliases,file_count,created,updated,atx_id
0,source-coding-with-nbdev-_home_vtec_projects_b...,v0,code,latest,1,2025-02-01T09:46:40Z,2025-02-01T09:46:43Z,QXJ0aWZhY3Q6MTQ4MjA0MzE4Ng==
1,cov_one_sequence:v0,v0,cov_sequences,latest,1,2025-02-01T09:50:13Z,2025-02-01T09:50:15Z,QXJ0aWZhY3Q6MTQ4MjA0NjkyNQ==
2,cov_reads_single_1_sequence_150bp:v0,v0,sim_reads,latest,2,2025-02-01T09:52:45Z,2025-02-01T10:02:12Z,QXJ0aWZhY3Q6MTQ4MjA0OTY3MA==


The list of artifacts can be filtered, for instance, by artifact type

In [None]:
#|eval: false

atx_df, atx_type_list = project_artifacts(
    entity='metagenomics_sh', 
    project_name='coding-with-nbdev',
    by_type='cov_sequences'
    )

atx_df

Unnamed: 0,atx_name,atx_version,atx_type,atx_aliases,file_count,created,updated,atx_id
0,cov_one_sequence:v0,v0,cov_sequences,latest,1,2025-02-01T09:50:13Z,2025-02-01T09:50:15Z,QXJ0aWZhY3Q6MTQ4MjA0NjkyNQ==


In [None]:
#| export
def run_name_exists(
    run_name: str,      # name of the run to check 
    entity: str,        # name of the entity in which to look for the run 
    project_name: str,  # name of the project in which to look for the run 
    ) -> bool:          # True if a run exists with the name run_name, False otherwise
    """Check whether a run with name `run_name` already exists in `entity/project_name`"""
    api = wandb.Api()
    runs = api.runs(path=f"{entity}/{project_name}")
    # Generator expression: `any` stops at the first match instead of walking through all runs
    return any(run_name == r.name for r in runs)

In [None]:
#|eval: false

run_name_exists(
    run_name='nbdev-test', 
    entity='metagenomics_sh', 
    project_name='coding-with-nbdev'
    )

True

In [None]:
#|eval: false

run_name_exists(
    run_name='train_1M', 
    entity='metagenomics_sh', 
    project_name='coding-with-nbdev'
    )

False

In [None]:
#| export
def unique_run_name(
    name_seed:str     # Run name to which a timestamp will be added
    ):
    """Return `name_seed` followed by `-` and the current local time formatted as `yymmdd-HHMM`"""
    return f"{name_seed}-{datetime.now():%y%m%d-%H%M}"

In [None]:
unique_run_name('this_is_a_run_name')

'this_is_a_run_name-250201-1816'

In [None]:
#| hide
def validate_config(
    config  # dict with the run configuration, or None to get the default configuration
    ):
    """Returns a config dictionary where missing required keys are filled in with default values

    The passed dictionary is left untouched: a new dictionary is returned, with the keys of
    `config` first (values unchanged), followed by the required keys that were missing.
    """
    default_config = {
        'architecture': 'cnn-virus-original',
        'dataset': 'Dataset.map(string_to_tensor) v2',
        'batch_size': 1024,
        'learning_rate': 1e-3,
        'epochs': 5,
        'ds_cache': False,
        'ds_prefetch': True,
    }

    # Work on a copy so that the caller's dictionary is not modified as a side effect
    validated = {} if config is None else dict(config)

    for k, default_value in default_config.items():
        validated.setdefault(k, default_value)
    
    return validated

In [None]:
#| hide

# def train_with_wandb(
#     entity: str, project_name: str, run_name_seed: str, 
#     train_ds_at_name: str, val_ds_at_name: str, 
#     model_at_name: str = None, build_model: Callable = None,
#     config: dict = None 
#     ):
#     """Starts a new wandb run and performs a training sequence using datasets and (optional) saved model.
    
#     The function perform each of these steps:
#         1. validate the config
#         2. starts a new wandb run using the run_name_seed and the config dictionary
#         3. downloads train and val raw text datasets and transforms them with the transform function
#         4. load the selected saved model or creates new model, and compile it
#         5. train the model, using wandb to track and save intermediate models
#         6. finish the run

#     Parameters
#     ----------
#     entity : str
#         name of the WandB user or organization to use to create the new run
#     project_name : str
#         name of the project to use to create the new run. 
#     run_name_seed : str
#         name to give to the new run
#         the run display name will be this string followed by a timestamp
#     train_ds_at_name : str
#         name of the WandB Artifact with the train dataset.
#         the name should not include any `:vn` version number
#     val_ds_at_name : str
#         name of the WandB Artifact with the validation dataset.
#         the name should not include any `:vn` version number
#     config : dict
#         dictionary where key-value pairs represent all the metadata to store with the run
#         the key-value pairs below are required and will be set as the default values if not present:
#             'architecture' (default: 'cnn-virus-original')
#             'dataset': (default: 'Dataset.map(string_to_tensor) v2')
#             'n_train_samples': (default: 0) (retrieved from artifact metadata if available)
#             'n_val_samples': (default: 0)   (retrieved from artifact metadata if available)
#             'batch_size': (default: 1024)
#             'learning_rate': (default: 1e-3)
#             'epochs': (default: 5)
#             'ds_cache': (default: False)
#             'ds_prefetch': (default: True)
#     model_at_name : str, default=None
#         name of the WandB Artifact with the saved moded to use.
#         the name should not include any `:vn` version number
#         when `None`, a new model is created
#     build_model : Callable,
#         function to build an empty architecture

#     """
#     run_name = unique_run_name(run_name_seed)

#     # 1. validate configuration
#     config = validate_config(config)

#     # Retrieve n_samples from dataset artifacts metadata and save in config
#     train_ds_at_path = f"{entity}/{project_name}/{train_ds_at_name}:latest"
#     val_ds_at_path =   f"{entity}/{project_name}/{val_ds_at_name}:latest"
#     api = wandb.Api()
#     train_at = api.artifact(train_ds_at_path)
#     val_at = api.artifact(val_ds_at_path)
#     config['n_train_samples'] = train_at.metadata.get('n_samples', 0)
#     config['n_val_samples'] = val_at.metadata.get('n_samples', 0)

#     # 2. start a new run
#     run = wandb.init(
#         entity=entity, 
#         project=project_name, 
#         name=run_name, job_type="train-exp", 
#         config=config, 
#         save_code=True
#         )
#     cfg = wandb.config

#     # 3a. download train and val raw data files

#     train_ds_at = run.use_artifact(train_ds_at_path, type='raw_data')
#     train_ds_dir = train_ds_at.download()
#     train_ds_file = list(Path(train_ds_dir).iterdir())[0]

#     val_ds_at = run.use_artifact(val_ds_at_path, type='raw_data')
#     val_ds_dir = val_ds_at.download()
#     val_ds_file = list(Path(val_ds_dir).iterdir())[0]

#     print(f"Build Datasets from files {train_ds_file.name} and {val_ds_file.name}")

#     # 3.b create Datasets for train and val
#     text_train_ds = tf.data.TextLineDataset(
#         train_ds_file,
#         compression_type='',
#         name='text_train_ds'
#     ).batch(cfg['batch_size'])

#     text_val_ds = tf.data.TextLineDataset(
#         val_ds_file,
#         compression_type='',
#         name='text_val_ds'
#     ).batch(cfg['batch_size'])

#     if config['ds_cache'] and config['ds_prefetch']:
#         train_ds = text_train_ds.map(strings_to_tensors).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
#         val_ds = text_val_ds.map(strings_to_tensors).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

#     elif not config['ds_cache'] and config['ds_prefetch']:
#         train_ds = text_train_ds.map(strings_to_tensors).prefetch(buffer_size=tf.data.AUTOTUNE)
#         val_ds = text_val_ds.map(strings_to_tensors).prefetch(buffer_size=tf.data.AUTOTUNE)

#     else:
#         train_ds = text_train_ds.map(strings_to_tensors)
#         val_ds = text_val_ds.map(strings_to_tensors)

#     print(f"dataset built with cache:{config['ds_cache']}, prefetch:{config['ds_prefetch']}.")

#     # create model using passed build function or loaded artifact, and compile it
#     if model_at_name is None:
#         if build_model is not None and callable(build_model):
#             print('Creating a new model')
#             model = build_cnn_virus_original()
#         else:
#             raise ValueError(f"Require 'build_model' to be a callable to create a new model")
#     else:
#         print(f"Downloading and using latest version of model {model_at_name}")
#         model_at_path = f"{project_name}/{model_at_name}:latest"
#         model_at = run.use_artifact(model_at_path, type='model')
#         model_at_dir = model_at.download()
#         model = tf.keras.models.load_model(Path(model_at_dir).resolve())

#     optim = Adam(learning_rate=wandb.config.learning_rate)
#     model.compile(
#         optimizer=optim,
#         loss=[CategoricalCrossentropy(name='l1'), CategoricalCrossentropy(name='l2')],
#         metrics=['acc']
#     )
    
#     # train model
#     wb = WandbCallback(
#         monitor=cfg['metric_to_monitor'], 
#         save_model=True, 
#         log_weigths=True
#         )

#     res = model.fit(
#         train_ds,
#         epochs=wandb.config.epochs,
#         validation_data=val_ds,
#         verbose=1,
#         callbacks=[wb]
#         )

#     run.finish()

## Technical Notes for development with `nbdev`

**Resolve problem with `nbdev_export()` for this notebook**

When using `nbdev.nbdev_export()` in this notebook, the exported code seems to be old code. In particular, the cell importing the dependencies is exported as:
```python
# %% ../nbs-dev/wandb/run-20221123_121523-2z5ycjrb/tmp/code/01_wandb.ipynb 2
# Imports all dependencies

import configparser
import numpy as np
import psutil
import os
```

The hint is in the first line: 

**`# %% ../nbs-dev/wandb/run-20221123_121523-2z5ycjrb/tmp/code/01_wandb.ipynb 2`**

It shows that the notebook used for exporting is not **`/nbs-dev/01_wandb.ipynb`** as it should be. This is because the WandB package creates a local directory `/nbs-dev/wandb/` where it keeps local logs and artifacts.

The solution is to move the directory where WandB stores local logs outside `nbs-dev`, which can be done with the `dir` argument of `wandb.init()`.

Illustrating by reproducing the functions from nbdev and a few dependencies

In [None]:
from nbdev.config import get_config
from fastcore.xtras import globtastic
from fastcore.meta import delegates

In [None]:
# from nbdev.doclinks.py

# line 105
@delegates(globtastic)
def nbglob(
    path=None, skip_folder_re = '^[_.]', file_glob='*.ipynb', skip_file_re='^[_.]',
    key='nbs_path', as_path=False, **kwargs
    ):
    "Find the files matching `file_glob` under `path`, or under the config entry `key` when `path` is not given."
    cfg = get_config()
    # An explicit path wins over the directory configured under `key`
    root = Path(path) if path else Path(cfg[key])
    found = globtastic(
        root,
        file_glob=file_glob,
        skip_folder_re=skip_folder_re,
        skip_file_re=skip_file_re,
        recursive=cfg.recursive,
        **kwargs
    )
    if as_path:
        return found.map(Path)
    return found

In [None]:
# line 131 MODIFIED
def modified_nbdev_export(
    path:str=None, # Path or filename
    **kwargs):
    "Print the notebooks in `path` that `nbdev_export` would export to Python modules"
    in_test = os.environ.get('IN_TEST',0)
    if in_test:
        return
    # Steps of the original nbdev function that are disabled in this modified version:
    #     for f in files: nb_export(f)
    #     add_init(get_config().lib_path)
    #     _build_modidx()
    notebooks = nbglob(path=path, as_path=True, **kwargs).sorted('name')
    for nb_path in notebooks:
        print(nb_path)

Before the change: 
```python
modified_nbdev_export()
```

```ascii
/home/vtec/projects/bio/metagentools/nbs-dev/00_core.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/01_wandb.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/wandb/run-20221122_182641-1eafsab9/tmp/code/01_wandb.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/wandb/run-20221122_180513-1vgzoryt/tmp/code/01_wandb.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/wandb/run-20221123_121523-2z5ycjrb/tmp/code/01_wandb.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/index.ipynb
```

After the change

In [None]:
modified_nbdev_export()

/home/vtec/projects/bio/metagentools/nbs-dev/00_core.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/01_wandb.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/02_art.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/03_bio.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/03_cnn_virus_architecture.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/03_cnn_virus_data.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/03_cnn_virus_utils.ipynb
/home/vtec/projects/bio/metagentools/nbs-dev/index.ipynb


In [None]:
#| hide
nbdev.nbdev_export()