In [1]:
#default_exp utils

In [2]:
#hide
%load_ext autoreload
%autoreload 2

# Utils

> Utilities used in the rest of the notebooks

In [3]:
#export
from dvats.imports import *
from fastcore.all import *
import wandb
import pickle
import pandas as pd
import numpy as np
#import tensorflow as tf
import torch.nn as nn
from fastai.basics import *

### Generate random time series dataframe

In [4]:
#export
def generate_TS_df(rows, cols):
    "Generates a dataframe containing a multivariate time series, where each column \
    represents a variable and each row a time point (sample). The timestamp is in the \
    index of the dataframe, and it is created with a even space of 1 second between samples"
    index = np.arange(pd.Timestamp.now(),
                      pd.Timestamp.now() + pd.Timedelta(rows-1, 'seconds'),
                      pd.Timedelta(1, 'seconds'))
    data = np.random.randn(len(index), cols)
    return pd.DataFrame(data, index=index)

In [5]:
df = generate_TS_df(3, 5)

In [6]:
test_eq(df.shape, (3, 5))

##  pandas Dataframe utilities

### Normalize columns

In [7]:
#export
def normalize_columns(df:pd.DataFrame):
    "Normalize columns from `df` to have 0 mean and 1 standard deviation"
    mean = df.mean()
    std = df.std() + 1e-7
    return (df-mean)/std

In [8]:
foo = generate_TS_df(3, 3)
foo.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-0.626541,0.077224,0.306504
std,0.873396,0.5551,1.177539
min,-1.615517,-0.435009,-0.916717
25%,-0.959318,-0.217679,-0.256398
50%,-0.303119,-0.000348,0.403921
75%,-0.132052,0.333341,0.918115
max,0.039014,0.667031,1.432309


In [9]:
bar = normalize_columns(foo)
bar.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-5.5511150000000004e-17,-7.401487e-17,3.700743e-17
std,0.9999999,0.9999998,0.9999999
min,-1.132335,-0.9227767,-1.038794
25%,-0.3810157,-0.5312611,-0.4780328
50%,0.3703038,-0.1397456,0.0827288
75%,0.5661676,0.4613884,0.5193972
max,0.7620314,1.062522,0.9560657


In [10]:
test_close(bar.describe().loc['mean'].values, np.repeat(0.0, len(bar.columns)))

In [11]:
test_close(bar.describe().loc['std'].values, np.repeat(1.0, len(bar.columns)))

### Remove constant columns

In [12]:
#export
def remove_constant_columns(df:pd.DataFrame):
    return df.loc[:, (df != df.iloc[0]).any()]

In [13]:
foo = generate_TS_df(3, 3)
foo['constant'] = [0.0]*len(foo)
foo

Unnamed: 0,0,1,2,constant
2021-11-18 11:05:45.661172,-1.065256,0.911272,0.434231,0.0
2021-11-18 11:05:46.661172,-0.69948,0.21744,1.220064,0.0
2021-11-18 11:05:47.661172,-1.243924,0.206862,-0.966509,0.0


In [14]:
bar = remove_constant_columns(foo)
bar

Unnamed: 0,0,1,2
2021-11-18 11:05:45.661172,-1.065256,0.911272,0.434231
2021-11-18 11:05:46.661172,-0.69948,0.21744,1.220064
2021-11-18 11:05:47.661172,-1.243924,0.206862,-0.966509


In [15]:
column_diff = set(foo.columns) - set(bar.columns)
test_eq_type(column_diff, set(['constant']))

## Create wandb artifact containing just the reference to an object pass as argument

In [16]:
#export
class ReferenceArtifact(wandb.Artifact):
    default_storage_path = Path('data/wandb_artifacts/') # * this path is relative to Path.home()
    "This class is meant to create an artifact with a single reference to an object \
    passed as argument in the contructor. The object will be pickled, hashed and stored \
    in a specified folder."
    @delegates(wandb.Artifact.__init__)
    def __init__(self, obj, name, type='object', folder=None, **kwargs):
        super().__init__(type=type, name=name, **kwargs)
        # pickle dumps the object and then hash it
        hash_code = str(hash(pickle.dumps(obj)))
        folder = Path(ifnone(folder, Path.home()/self.default_storage_path))
        with open(f'{folder}/{hash_code}', 'wb') as f:
            pickle.dump(obj, f)
        self.add_reference(f'file://{folder}/{hash_code}')
        if self.metadata is None:
            self.metadata = dict()
        self.metadata['ref'] = dict()
        self.metadata['ref']['hash'] = hash_code
        self.metadata['ref']['type'] = str(obj.__class__)

In [17]:
foo = np.arange(10)
bar = ReferenceArtifact(obj=foo, name='foo', folder='.')
bar_path = Path(f'./{bar.metadata["ref"]["hash"]}')
test_eq(bar_path.exists(), True)
test_eq(bar.metadata['ref']['type'], "<class 'numpy.ndarray'>")

When a reference artifact is used by one wandb run, we should have a method to get the original object from it

In [18]:
#export
@patch
def to_obj(self:wandb.apis.public.Artifact):
    """Download the files of a saved ReferenceArtifact and get the referenced object. The artifact must \
    come from a call to `run.use_artifact` with a proper wandb run."""
    if self.metadata.get('ref') is None:
        print(f'ERROR:{self} does not come from a saved ReferenceArtifact')
        return None
    original_path = ReferenceArtifact.default_storage_path/self.metadata['ref']['hash']
    path = original_path if original_path.exists() else Path(self.download()).ls()[0]
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

Test with Reference artifact from a df

In [19]:
foo = generate_TS_df(3, 3)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:///home/victor/data/wandb_artifacts/-1976673005002984900/-1976673005002984900>])

In [20]:
test_eq(bar.name, 'test_reference_artifact')

In [21]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

TODO: Test method `to_obj`

ReferenceArtifact with a numpy array

In [22]:
foo = np.random.randn(5)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:///home/victor/data/wandb_artifacts/-4822060124692261084/-4822060124692261084>])

In [23]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

In [26]:
#export
import torch.nn as nn
class PrintLayer(nn.Module):
    def __init__(self):
        super(PrintLayer, self).__init__()
    
    def forward(self, x):
        # Do your print / debug stuff here
        print(x.shape)
        return x

In [27]:
#export
@patch
def export_and_get(self:Learner, keep_exported_file=False):
    """
        Export the learner into an auxiliary file, load it and return it back. 
    """
    aux_path = Path('aux.pkl')
    self.export(fname='aux.pkl')
    aux_learn = load_learner('aux.pkl')
    if not keep_exported_file: aux_path.unlink()
    return aux_learn

### get_wandb_artifacts

In [28]:
#export
def get_wandb_artifacts(project_path, type=None, name=None, last_version=True):
    """
        Get the artifacts logged in a wandb project.
        Input:
        - `project_path` (str): entity/project_name
        - `type` (str): whether to return only one type of artifacts
        - `name` (str): Leave none to have all artifact names
        - `last_version`: whether to return only the last version of each artifact or not
        
        Output: List of artifacts
    """
    public_api = wandb.Api()
    if type is not None:
        types = [public_api.artifact_type(type, project_path)]
    else:
        types = public_api.artifact_types(project_path)

    res = L()
    for kind in types:
        for collection in kind.collections():
            if name is None or name == collection.name:
                versions = public_api.artifact_versions(
                    kind.type,
                    "/".join([kind.entity, kind.project, collection.name]),
                    per_page=1,
                )
                if last_version: res += next(versions)
                else: res += L(versions)
    return list(res)

In [29]:
foo = get_wandb_artifacts('wandb/artifacts-example', type='model')
test_eq(len(foo), 2)
foo = get_wandb_artifacts('wandb/artifacts-example', type='model', name='convnet')
test_eq(len(foo), 1)
foo = get_wandb_artifacts('wandb/artifacts-example', type='model', name='convnet', last_version=False)
test_eq(len(foo), 2)

## Export -

In [32]:
#hide
from nbdev.export import notebook2script
notebook2script()
beep(1)

Converted 01_dataset_artifact.ipynb.
Converted 02a_encoder_DCAE-torch.ipynb.
Converted 04_dimensionality_reduction.ipynb.
Converted encoder.ipynb.
Converted index.ipynb.
Converted load.ipynb.
Converted utils.ipynb.
Converted visualization.ipynb.
