In [30]:
#default_exp utils

In [31]:
#hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Utils

> Utilities used in the rest of the notebooks

In [32]:
#export
from timecluster_hub.imports import *
from fastcore.all import *
import wandb
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import torch.nn as nn

### Generate random time series dataframe

In [33]:
#export
def generate_TS_df(rows, cols):
    "Generates a dataframe containing a multivariate time series, where each column \
    represents a variable and each row a time point (sample). The timestamp is in the \
    index of the dataframe, and it is created with a even space of 1 second between samples"
    index = np.arange(pd.Timestamp.now(),
                      pd.Timestamp.now() + pd.Timedelta(rows-1, 'seconds'),
                      pd.Timedelta(1, 'seconds'))
    data = np.random.randn(len(index), cols)
    return pd.DataFrame(data, index=index)

In [34]:
df = generate_TS_df(3, 5)

In [35]:
test_eq(df.shape, (3, 5))

##  pandas Dataframe utilities

### Normalize columns

In [36]:
#export
def normalize_columns(df:pd.DataFrame):
    "Normalize columns from `df` to have 0 mean and 1 standard deviation"
    mean = df.mean()
    std = df.std() + 1e-7
    return (df-mean)/std

In [37]:
foo = generate_TS_df(3, 3)
foo.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-0.458059,0.398024,0.095583
std,0.618333,0.747088,1.002144
min,-0.913371,-0.103518,-0.845386
25%,-0.810033,-0.031286,-0.431303
50%,-0.706696,0.040945,-0.017221
75%,-0.230403,0.648795,0.566067
max,0.245889,1.256645,1.149356


In [38]:
bar = normalize_columns(foo)
bar.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-3.700743e-17,-7.401487e-17,1.850372e-17
std,0.9999998,0.9999999,0.9999999
min,-0.7363533,-0.6713293,-0.9389558
25%,-0.5692304,-0.5746454,-0.5257593
50%,-0.4021074,-0.4779614,-0.1125628
75%,0.3681766,0.3356647,0.4694779
max,1.138461,1.149291,1.051519


In [39]:
test_close(bar.describe().loc['mean'].values, np.repeat(0.0, len(bar.columns)))

In [40]:
test_close(bar.describe().loc['std'].values, np.repeat(1.0, len(bar.columns)))

### Remove constant columns

In [41]:
#export
def remove_constant_columns(df:pd.DataFrame):
    return df.loc[:, (df != df.iloc[0]).any()]

In [42]:
foo = generate_TS_df(3, 3)
foo['constant'] = [0.0]*len(foo)
foo

Unnamed: 0,0,1,2,constant
2021-09-28 16:06:04.683660,-1.34413,1.367564,-0.449192,0.0
2021-09-28 16:06:05.683660,0.370796,-0.437145,-0.879332,0.0
2021-09-28 16:06:06.683660,-0.471223,-0.79475,0.469549,0.0


In [43]:
bar = remove_constant_columns(foo)
bar

Unnamed: 0,0,1,2
2021-09-28 16:06:04.683660,-1.34413,1.367564,-0.449192
2021-09-28 16:06:05.683660,0.370796,-0.437145,-0.879332
2021-09-28 16:06:06.683660,-0.471223,-0.79475,0.469549


In [44]:
column_diff = set(foo.columns) - set(bar.columns)
test_eq_type(column_diff, set(['constant']))

## Create wandb artifact containing just the reference to an object pass as argument

In [45]:
#export
class ReferenceArtifact(wandb.Artifact):
    default_storage_path = Path('data/wandb_artifacts/') # * this path is relative to Path.home()
    "This class is meant to create an artifact with a single reference to an object \
    passed as argument in the contructor. The object will be pickled, hashed and stored \
    in a specified folder."
    @delegates(wandb.Artifact.__init__)
    def __init__(self, obj, name, folder=None, **kwargs):
        super().__init__(type='object', name=name, **kwargs)
        # pickle dumps the object and then hash it
        hash_code = str(hash(pickle.dumps(obj)))
        folder = Path(ifnone(folder, Path.home()/self.default_storage_path))
        with open(f'{folder}/{hash_code}', 'wb') as f:
            pickle.dump(obj, f)
        self.add_reference(f'file://{folder}/{hash_code}')
        if self.metadata is None:
            self.metadata = dict()
        self.metadata['ref'] = dict()
        self.metadata['ref']['hash'] = hash_code
        self.metadata['ref']['type'] = str(type(obj))

When a reference artifact is used by one wandb run, we should have a method to get the original object from it

In [46]:
#export
@patch
def to_obj(self:wandb.apis.public.Artifact):
    "Download the files of a saved ReferenceArtifact and get the referenced object. The artifact must \
    come from a call to `run.use_artifact` with a proper wandb run."
    if self.metadata.get('ref') is None:
        print(f'ERROR:{self} does not come from a saved ReferenceArtifact')
        return None
    path = Path(self.download()).ls()[0]
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

Test with Reference artifact from a df

In [47]:
foo = generate_TS_df(3, 3)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:///home/victor/data/wandb_artifacts/-833660379957662567/-833660379957662567>])

In [48]:
test_eq(bar.name, 'test_reference_artifact')

In [49]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

TODO: Test method `to_obj`

ReferenceArtifact with a numpy array

In [50]:
foo = np.random.randn(5)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:///home/victor/data/wandb_artifacts/2682589598282141722/2682589598282141722>])

In [51]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

# Keras add plot_top_losses functionality

In [52]:
#export
from timecluster_hub.visualization import *

In [53]:
#export
@patch
def plot_top_losses(self:tf.keras.Sequential, validation_data, k, largest=True, return_fig=True, title_pos=0.99, **kwargs):
    "Take the validation data of model self, compute the model losses for every item there, sort, and plot the results.\
    If `largest` is True, the validation losses will be sorted from larger to lower. Once they are sorted, take the\
    k first items based on this order and plot the predictions.\
    If 'return_fig' is true, a Figure-set of plots is returned. If not, just showed on screen"
    # Get a prediction with the validation_data
    pred_validation_data = self.predict(validation_data)
    # Calculate the MSE with respect to original_data
    mse_values = np.mean(np.square(validation_data - pred_validation_data), axis=(1,2))
    
    # Order the numpy array and take the top k.
    if largest:
        id_loss_values = mse_values.argsort()[-k:]
        txt_var = "Largest MSE of the model for validation dataset"
    else:
        id_loss_values = mse_values.argsort()[:k]
        txt_var = "Smallest MSE of the model for validation dataset"
        
    # Plot figures
    list_figs = [None] * k
    for i in range(k):
        title = txt_var + " windoes_num: " + str(id_loss_values[i])
        list_figs[i] = plot_validation_ts_ae(validation_data, 
                                             pred_validation_data,
                                             title_str = title, 
                                             title_pos = title_pos,
                                             window_num = id_loss_values[i],
                                             return_fig = True,
                                             **kwargs)
    
    # Returns
    if return_fig:
        return list_figs
    else:
        list_figs 
        return None

## Pytorch debug print layer

In [54]:
#export
import torch.nn as nn
class PrintLayer(nn.Module):
    def __init__(self):
        super(PrintLayer, self).__init__()
    
    def forward(self, x):
        # Do your print / debug stuff here
        print(x.shape)
        return x

## Export -

In [55]:
#hide
from nbdev.export import notebook2script
notebook2script()
beep(1)

Converted 01_dataset_artifact.ipynb.
Converted 02a_encoder_DCAE-torch.ipynb.
Converted 02a_encoder_DCAE.ipynb.
Converted 02b_encoder_MVP.ipynb.
Converted 03_dimensionality_reduction.ipynb.
Converted 04_baseline_models.ipynb.
Converted index.ipynb.
Converted load.ipynb.
Converted utils.ipynb.
Converted visualization.ipynb.
