In [6]:
# default_exp utils

# Utils

> Utilities used in the rest of the notebooks

In [7]:
#export
from fastcore.all import *
import wandb
import pickle
import pandas as pd
import numpy as np

### Generate random time series dataframe

In [8]:
# export
def generate_TS_df(rows, cols):
    "Generates a dataframe containing a multivariate time series, where each column \
    represents a variable and each row a time point (sample). The timestamp is in the \
    index of the dataframe, and it is created with a even space of 1 second between samples"
    index = np.arange(pd.Timestamp.now(),
                      pd.Timestamp.now() + pd.Timedelta(rows-1, 'seconds'),
                      pd.Timedelta(1, 'seconds'))
    data = np.random.randn(len(index), cols)
    return pd.DataFrame(data, index=index)

In [9]:
df = generate_TS_df(3, 5)

In [10]:
test_eq(df.shape, (3, 5))

##  pandas Dataframe utilities

### Normalize columns

In [11]:
# export
def normalize_columns(df:pd.DataFrame):
    "Normalize columns from `df` to have 0 mean and 1 standard deviation"
    mean = df.mean()
    std = df.std() + 1e-7
    return (df-mean)/std

In [12]:
foo = generate_TS_df(3, 3)
foo.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,0.588805,0.417043,0.849817
std,0.994155,0.575128,1.078598
min,-0.465598,-0.219903,-0.315615
25%,0.128659,0.176416,0.368262
50%,0.722916,0.572736,1.052139
75%,1.116006,0.735516,1.432533
max,1.509096,0.898295,1.812927


In [13]:
bar = normalize_columns(foo)
bar.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,3.700743e-17,-3.700743e-17,-8.326673e-17
std,0.9999999,0.9999998,0.9999999
min,-1.060602,-1.107485,-1.080506
25%,-0.4628512,-0.4183873,-0.4464638
50%,0.1348998,0.2707105,0.1875787
75%,0.5303011,0.5537426,0.5402532
max,0.9257023,0.8367746,0.8929277


In [16]:
test_close(bar.describe().loc['mean'].values, np.repeat(0.0, len(bar.columns)))

In [17]:
test_close(bar.describe().loc['std'].values, np.repeat(1.0, len(bar.columns)))

### Remove constant columns

In [88]:
# export
def remove_constant_columns(df:pd.DataFrame):
    return df.loc[:, (df != df.iloc[0]).any()]

In [74]:
foo = generate_TS_df(3, 3)
foo['constant'] = [0.0]*len(foo)
foo

Unnamed: 0,0,1,2,constant
2020-06-23 15:32:27.376772,-0.971047,-0.539548,1.073372,0.0
2020-06-23 15:32:28.376772,-0.322303,-0.48831,1.661297,0.0
2020-06-23 15:32:29.376772,-0.014216,-0.31023,0.504043,0.0


In [80]:
bar = remove_constant_columns(foo)
bar

Unnamed: 0,0,1,2
2020-06-23 15:32:27.376772,-0.971047,-0.539548,1.073372
2020-06-23 15:32:28.376772,-0.322303,-0.48831,1.661297
2020-06-23 15:32:29.376772,-0.014216,-0.31023,0.504043


In [86]:
column_diff = set(foo.columns) - set(bar.columns)
test_eq_type(column_diff, set(['constant']))

## Create wandb artifact containing just the reference to an object pass as argument

In [18]:
# export
class ReferenceArtifact(wandb.Artifact):
    default_storage_path = Path('/data/PACMEL-2019/wandb_artifacts/') # *
    "This class is meant to create an artifact with a single reference to an object \
    passed as argument in the contructor. The object will be pickled, hashed and stored \
    in a specified folder."
    @delegates(wandb.Artifact.__init__)
    def __init__(self, obj, name, folder=None, **kwargs):
        super().__init__(type='object', name=name, **kwargs)
        # pickle dumps the object and then hash it
        hash_code = str(hash(pickle.dumps(obj)))
        folder = Path(ifnone(folder, self.default_storage_path))
        with open(f'{folder}/{hash_code}', 'wb') as f:
            pickle.dump(obj, f)
        self.add_reference(f'file://{folder}/{hash_code}')
        if self.metadata is None:
            self.metadata = dict()
        self.metadata['ref'] = dict()
        self.metadata['ref']['hash'] = hash_code
        self.metadata['ref']['type'] = str(type(obj))

When a reference artifact is used by one wandb run, we should have a method to get the original object from it

In [4]:
# export
@patch
def to_obj(self:wandb.apis.public.Artifact):
    "Download the files of a saved ReferenceArtifact and get the referenced object. The artifact must \
    come from a call to `run.use_artifact` with a proper wandb run."
    if self.metadata.get('ref') is None:
        print(f'ERROR:{self} does not come from a saved ReferenceArtifact')
        return None
    path = Path(self.download()).ls()[0]
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

Test with Reference artifact from a df

In [19]:
foo = generate_TS_df(3, 3)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact', folder='/')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:////6388870144283712062/6388870144283712062>])

In [10]:
test_eq(bar.name, 'test_reference_artifact')

In [11]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

TODO: Test method `to_obj`

ReferenceArtifact with a numpy array

In [12]:
foo = np.random.randn(5)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact', folder='/')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:////-7284171405855839870/-7284171405855839870>])

In [38]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))