<a href="https://colab.research.google.com/github/vtecftwy/metagenomics/blob/refactor_cnn_virus/nbs/2_03_EC_datasets_to_wandb__colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load data files as WandB Artifacts (Metagenomics CNN-Virus)

To be run on Google Colab.

- Data files to load:
    - `50mer_training`
    - `50mer_validating`
    - `weight_of_classes`


# Imports and setup environment

### Install and import packages

In [1]:
import configparser
import numpy as np
import psutil
import os
import shutil
import sys
import tensorflow as tf

from pathlib import Path
from tensorflow.python.client import device_lib
print(f"Tensorflow version: {tf.__version__}\n")

%load_ext autoreload
%autoreload 2

# List the devices reported by TensorFlow, one line per device.
devices = device_lib.list_local_devices()
print('\nDevices:')
for device in devices:
    device_type = device.device_type
    description = device.physical_device_desc
    # The description is treated as ', '-separated 'key: value' fields;
    # fields without a ':' are ignored.
    fields = (field.split(':', 1) for field in description.split(', '))
    attributes = {pair[0]: pair[1] for pair in fields if len(pair) == 2}
    # Fall back to a blank when the description carries no 'name' field.
    model = attributes.get('name', ' ')
    print(f"  - {device_type}  {device.name} {model:25s}")

Tensorflow version: 2.8.2


Devices:
  - CPU  /device:CPU:0                          


In [2]:
try:
    import wandb
    print(f'wandb version {wandb.__version__} already installed')
except ModuleNotFoundError:
    !pip install -qqU wandb
    import wandb
    print(f'wandb version {wandb.__version__} installed')

from wandb.keras import WandbCallback

[K     |████████████████████████████████| 1.8 MB 27.2 MB/s 
[K     |████████████████████████████████| 181 kB 62.3 MB/s 
[K     |████████████████████████████████| 158 kB 63.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 157 kB 60.4 MB/s 
[K     |████████████████████████████████| 157 kB 58.3 MB/s 
[K     |████████████████████████████████| 157 kB 76.3 MB/s 
[K     |████████████████████████████████| 157 kB 78.9 MB/s 
[K     |████████████████████████████████| 157 kB 63.0 MB/s 
[K     |████████████████████████████████| 157 kB 67.3 MB/s 
[K     |████████████████████████████████| 157 kB 60.5 MB/s 
[K     |████████████████████████████████| 156 kB 59.8 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
wandb version 0.13.3 installed


## Install and import custom code, mount gdrive

In [3]:
try:
    from google.colab import drive
    ON_COLAB = True
    print('Running on colab')
    print('Installing custom project code')   
    !pip install -U git+https://github.com/vtecftwy/metagenomics.git@refactor_cnn_virus
    drive.mount('/content/gdrive')

except ModuleNotFoundError:
    ON_COLAB = False
    print('Running locally')
    print('Make sure you have installed the custom project code in your environment')
    pdata = Path('data/cnn_virus')

Running on colab
Installing custom project code
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/vtecftwy/metagenomics.git@refactor_cnn_virus
  Cloning https://github.com/vtecftwy/metagenomics.git (to revision refactor_cnn_virus) to /tmp/pip-req-build-w5zhgro3
  Running command git clone -q https://github.com/vtecftwy/metagenomics.git /tmp/pip-req-build-w5zhgro3
  Running command git checkout -b refactor_cnn_virus --track origin/refactor_cnn_virus
  Switched to a new branch 'refactor_cnn_virus'
  Branch 'refactor_cnn_virus' set up to track remote branch 'refactor_cnn_virus' from 'origin'.
Building wheels for collected packages: src
  Building wheel for src (setup.py) ... [?25l[?25hdone
  Created wheel for src: filename=src-1.0.2-py3-none-any.whl size=14773 sha256=4657ca96bae8865946ac2cb59d53bc7ab00b34b9441773ac236ef9f658366c97
  Stored in directory: /tmp/pip-ephem-wheel-cache-qslwj8lk/wheels/10/e4/2b/

#  Setup paths

This assumes that the shared gdrive directory is accessible through a shortcut called `Metagenomics` under the root of gdrive.

In [4]:
# Root of the shared project directory on the mounted Google Drive.
# NOTE(review): the folder is spelled `Metagenonics` here, while the markdown
# cell above names the shortcut `Metagenomics` -- confirm which spelling matches
# the actual shortcut on the drive.
p2drive = Path('/content/gdrive/MyDrive/Metagenonics')
assert p2drive.is_dir()

# Sub-directory in which the data files used below are looked up.
p2data =  p2drive / 'CNN_Virus_data'
assert p2data.is_dir()

In [5]:
# Paths to the three source files under `p2data`. Each one is checked right
# after it is defined, and the assertion message names the missing file.

# path for the training file
filepath_train_full = p2data / "50mer_training"
assert filepath_train_full.is_file(), f"Training file not found: {filepath_train_full}"

# path for the validating file
filepath_val_full = p2data / "50mer_validating"
assert filepath_val_full.is_file(), f"Validating file not found: {filepath_val_full}"

# path for the learning weights file
filepath_weights = p2data / "weight_of_classes"
assert filepath_weights.is_file(), f"Class weights file not found: {filepath_weights}"

# Create data files of different lengths for training and validation

- `50mer_training_100k`, `50mer_training_1M`, `50mer_training_15M`, `50mer_training_30M`, `50mer_training_50M`
- `50mer_validation_20k`, `50mer_validation_200k`, `50mer_validation_1M`

In [6]:
def shorten_file(p2long, max_lines=100_000,):
    """Create one or several truncated copies of a long text file.

    The source file is read once, line by line. The first output file receives
    the first `max_length` lines of its entry; every following output file starts
    as a copy of the previous one and is then extended up to its own limit.
    Output files are written to the current working directory and are named
    `50mer_validation_<suffix>` when the source file stem contains 'validating',
    `50mer_training_<suffix>` otherwise (the source file extension is kept).

    p2long:     path to the long source file (Path or str)
    max_lines:  int to create a single short file (suffix 'short')
                list/tuple of tuples ('suffix', max_length) to create several files,
                meant to be given in increasing order of max_length
                ex: max_lines=[('100k', 100_000), ('1M', 1_000_000), ('15M', 15_000_000)]

    Returns None. Reading stops at the end of the source file, so files whose
    threshold is never reached are either shorter than requested or not created.

    Raises ValueError when `max_lines` is neither an int nor a list/tuple, or is empty.
    """

    if isinstance(max_lines, int):
        # A single limit must still be a sequence of (suffix, max_length) pairs
        max_lines = [('short', max_lines)]
    elif not isinstance(max_lines, (list, tuple)):
        raise ValueError("max_lines must be an int, a list of tuple or a tuple of tuple")
    if len(max_lines) == 0:
        raise ValueError("max_lines must contain at least one ('suffix', max_length) tuple")

    p2long = Path(p2long)
    max_lines_iterator = iter(max_lines)
    n_line = 0

    def create_new_file(file, suffix, ref_file=p2long):
        """create new file as a copy of `file`, with name based on `ref_file` and `suffix`"""
        if 'validating' in ref_file.stem:
            fname_seed = '50mer_validation'
        else:
            fname_seed = '50mer_training'
        new_file = Path(f"{fname_seed}_{suffix}{ref_file.suffix}")

        if file is None:
            # First file of the series: start from an empty file
            if new_file.is_file():
                os.remove(new_file)
                print(f"Deleted old {new_file.name}")
            new_file.touch()
            print(f"Created empty {new_file.name}")
        else:
            shutil.copy(file, new_file)
            print(f"Copied {file.name} into {new_file.name}")
        return new_file

    with open(p2long, 'r') as fp_src:
        suffix, max_l = next(max_lines_iterator)
        current_file = create_new_file(None, suffix, p2long)

        fp_tgt = open(current_file, 'a')
        try:
            print(f"Opened {fp_tgt.name} and adding lines up to {max_l:,d}")
            while True:
                n_line += 1
                if n_line > max_l:
                    try:
                        suffix, max_l = next(max_lines_iterator)
                    except StopIteration:
                        # Last requested file is complete
                        break
                    # Close (and thereby flush) the finished file BEFORE copying it,
                    # otherwise buffered lines would be missing from the copy.
                    fp_tgt.close()
                    current_file = create_new_file(current_file, suffix, p2long)
                    fp_tgt = open(current_file, 'a')
                    print(f"Opened {fp_tgt.name} and adding lines up to {max_l:,d}")
                line = fp_src.readline()
                if line == '':
                    # End of the source file
                    break
                fp_tgt.write(line)
        finally:
            # Closing an already closed file is a no-op
            fp_tgt.close()
        print("Done")

# ('suffix', max number of lines) pairs for the shortened training files.
# NOTE(review): the '50M' entry is capped at 52_000_000 lines, not 50_000_000 --
# confirm this is intentional (e.g. to take in the whole source file).
training_maxlines = [('100k', 100_000), ('1M', 1_000_000), ('15M', 15_000_000), ('30M', 30_000_000), ('50M', 52_000_000)]
shorten_file(filepath_train_full, training_maxlines)

# ('suffix', max number of lines) pairs for the shortened validation files.
validation_maxlines = [('20k', 20_000), ('200k', 200_000), ('1M', 1_000_000)]
shorten_file(filepath_val_full, validation_maxlines)

Created empty 50mer_training_100k
Opened 50mer_training_100k and adding lines up to 100,000
Copied 50mer_training_100k into 50mer_training_1M
Opened 50mer_training_1M and adding lines up to 1,000,000
Copied 50mer_training_1M into 50mer_training_15M
Opened 50mer_training_15M and adding lines up to 15,000,000
Copied 50mer_training_15M into 50mer_training_30M
Opened 50mer_training_30M and adding lines up to 30,000,000
Copied 50mer_training_30M into 50mer_training_50M
Opened 50mer_training_50M and adding lines up to 52,000,000
Done
Created empty 50mer_validation_20k
Opened 50mer_validation_20k and adding lines up to 20,000
Copied 50mer_validation_20k into 50mer_validation_200k
Opened 50mer_validation_200k and adding lines up to 200,000
Copied 50mer_validation_200k into 50mer_validation_1M
Opened 50mer_validation_1M and adding lines up to 1,000,000
Done


In [10]:
# Sanity check: names of the `50mer_*` files present in the current working directory
sorted([f.name for f in Path('.').glob('50mer_*')])

['50mer_training_100k',
 '50mer_training_15M',
 '50mer_training_1M',
 '50mer_training_30M',
 '50mer_training_50M',
 '50mer_validation_1M',
 '50mer_validation_200k',
 '50mer_validation_20k']

## Setup WandB
Documentation [here](https://docs.wandb.ai/)

Get authentication key for WandB API and login

In [None]:
# Read the private configuration file (presumably holding API keys) from the mounted drive.
# NOTE(review): `cfg` is not used by any later cell shown here -- the key lookup
# below is commented out and `wandb.login()` is called without a key. Confirm
# whether this cell is still needed.
p2cfg = Path('/content/gdrive/MyDrive/private-across-accounts/config-api-keys.cfg')
cfg = configparser.ConfigParser()
cfg.read(p2cfg);
# cfg['wandb']['metagenomics']

In [11]:
# Tell WandB which notebook file this session corresponds to.
# NOTE(review): the file name here ends in `_wandb_-colab.ipynb`, while the
# "Open in Colab" badge at the top links to `..._wandb__colab.ipynb` -- confirm
# the name of the copy stored on the drive.
os.environ['WANDB_NOTEBOOK_NAME'] = str(p2drive/ 'nbs/2_03_EC_datasets_to_wandb_-colab.ipynb')

In [12]:
# Log in to WandB; no key is passed here, so authentication relies on wandb's own login flow.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

Define WandB configuration and other parameters for `run`

In [14]:
# Initialize a WandB run dedicated to uploading the datasets as artifacts.
# The values below are passed to `wandb.init` as entity, project, run name,
# job type and notes respectively.
user = 'metagenomics_sh'
project = 'reproduce_cnn_virus'
run_name = 'load-data-2022-09-20'
job_type = "load_datasets"
notes = 'Reload the dataset with optimized jupyter nb'

# `run` is reused by the upload cells below and closed with `run.finish()`.
run = wandb.init(entity=user, project=project, name=run_name, job_type=job_type, notes=notes, save_code=True)

[34m[1mwandb[0m: Currently logged in as: [33mvtecftyw[0m ([33mmetagenomics_sh[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
def upload_dataset(run, suffix, n_samples, original_fname, dataset_type, data_dir='/content'):
    """Log one shortened data file to WandB as an artifact of type `raw_data`.

    run:            object providing `log_artifact` (the value returned by `wandb.init`)
    suffix:         suffix of the shortened file, e.g. '100k'; also used in the artifact name
    n_samples:      number recorded as `n_samples` in the artifact metadata
    original_fname: name of the full-length source file, recorded in description and metadata
    dataset_type:   'training' or 'validation'; part of both the file name and artifact name
    data_dir:       directory holding the file `50mer_{dataset_type}_{suffix}`;
                    defaults to '/content'

    The artifact is named `origin_{dataset_type}_{suffix}`. Returns None.
    """

    dataset_name = f"origin_{dataset_type}_{suffix}"
    descr = f"**Raw Data**: shortened version of `{original_fname}`, limited to {suffix} lines."
    metadata = {
        'n_samples': n_samples,
        'dataset_type': dataset_type,
        'original_file_name': original_fname,
    }
    # Build the file location from `data_dir` instead of a hard-coded directory
    p2file = Path(data_dir) / f"50mer_{dataset_type}_{suffix}"

    my_data = wandb.Artifact(name=dataset_name, type="raw_data", description=descr, metadata=metadata)
    my_data.add_file(str(p2file))
    run.log_artifact(my_data)

In [16]:
# Upload every shortened training file; the line limit is recorded as `n_samples`
for suffix, n_samples in training_maxlines:
    upload_dataset(run, suffix, n_samples, '50mer_training', 'training')

In [17]:
# Upload every shortened validation file; the line limit is recorded as `n_samples`
for suffix, n_samples in validation_maxlines:
    upload_dataset(run, suffix, n_samples, '50mer_validating', 'validation')

In [18]:
# Close the run opened with `wandb.init` once all artifacts have been logged
run.finish()

VBox(children=(Label(value='0.059 MB of 0.059 MB uploaded (0.020 MB deduped)\r'), FloatProgress(value=1.0, max…

In [13]:
# wandb.init?

In [None]:
# wandb.Artifact?