In [None]:
from nbdev import *

In [None]:
#default_exp utils

# Utils

> Utilities used in the rest of the notebooks

In [None]:
#export
import hashlib
import pickle

from fastcore.all import *
import numpy as np
import pandas as pd
import tensorflow as tf
import wandb

### Generate random time series dataframe

In [None]:
#export
def generate_TS_df(rows, cols):
    """Generate a dataframe containing a random multivariate time series.

    Each column represents a variable and each row a time point (sample). The
    timestamps are stored in the index of the dataframe: `rows` instants evenly
    spaced by 1 second, starting at the current time. The values are drawn with
    `np.random.randn`.

    `rows` is the number of samples and `cols` the number of variables, so the
    returned dataframe has shape `(rows, cols)`.
    """
    # `periods=rows` guarantees exactly `rows` timestamps. Building the index from
    # two separate clock reads and an exclusive stop value only produced `rows`
    # samples when the second read happened to be later than the first.
    index = pd.date_range(start=pd.Timestamp.now(),
                          periods=rows,
                          freq=pd.Timedelta(seconds=1))
    data = np.random.randn(len(index), cols)
    return pd.DataFrame(data, index=index)

In [None]:
df = generate_TS_df(3, 5)

In [None]:
test_eq(df.shape, (3, 5))

## pandas DataFrame utilities

### Normalize columns

In [None]:
#export
def normalize_columns(df:pd.DataFrame):
    """Standardize every column of `df` to have 0 mean and 1 standard deviation.

    A small constant (1e-7) is added to each column's standard deviation before
    dividing, so a constant column does not trigger a division by zero.
    """
    scale = df.std() + 1e-7
    centered = df - df.mean()
    return centered / scale

In [None]:
foo = generate_TS_df(3, 3)
foo.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-0.202306,-0.459356,0.304291
std,0.445546,0.448938,0.320722
min,-0.616619,-0.944742,0.110391
25%,-0.437954,-0.659514,0.119192
50%,-0.259288,-0.374287,0.127993
75%,0.004851,-0.216663,0.401241
max,0.26899,-0.059039,0.674489


In [None]:
bar = normalize_columns(foo)
bar.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-1.850372e-17,1.110223e-16,-1.850372e-16
std,0.9999998,0.9999998,0.9999997
min,-0.9299002,-1.081188,-0.6045728
25%,-0.5288969,-0.4458489,-0.5771327
50%,-0.1278937,0.1894903,-0.5496926
75%,0.4649501,0.540594,0.3022864
max,1.057794,0.8916978,1.154265


In [None]:
test_close(bar.describe().loc['mean'].values, np.repeat(0.0, len(bar.columns)))

In [None]:
test_close(bar.describe().loc['std'].values, np.repeat(1.0, len(bar.columns)))

### Remove constant columns

In [None]:
#export
def remove_constant_columns(df:pd.DataFrame):
    """Return a copy of `df` without the columns whose value never changes.

    A column is constant when every row equals the first row. Missing values are
    treated as equal to each other, so a column made only of NaN is also removed.
    An empty dataframe (no rows) is returned unchanged, as a copy.
    """
    # With no rows there is no first row to compare against.
    if len(df) == 0:
        return df.copy()
    first_row = df.iloc[0]
    # `NaN != NaN` evaluates to True, so positions where both the cell and the
    # first row are missing must not count as a change.
    both_missing = df.isna() & first_row.isna()
    changed = (df != first_row) & ~both_missing
    return df.loc[:, changed.any()]

In [None]:
foo = generate_TS_df(3, 3)
foo['constant'] = [0.0]*len(foo)
foo

Unnamed: 0,0,1,2,constant
2021-01-25 10:56:47.066244,-0.578937,1.838584,0.720022,0.0
2021-01-25 10:56:48.066244,2.658628,0.063197,0.922973,0.0
2021-01-25 10:56:49.066244,0.114976,-1.445801,0.350152,0.0


In [None]:
bar = remove_constant_columns(foo)
bar

Unnamed: 0,0,1,2
2021-01-25 10:56:47.066244,-0.578937,1.838584,0.720022
2021-01-25 10:56:48.066244,2.658628,0.063197,0.922973
2021-01-25 10:56:49.066244,0.114976,-1.445801,0.350152


In [None]:
column_diff = set(foo.columns) - set(bar.columns)
test_eq_type(column_diff, set(['constant']))

## Create a wandb artifact containing just a reference to an object passed as argument

In [None]:
#export
class ReferenceArtifact(wandb.Artifact):
    """Artifact with a single reference to an object passed as argument in the constructor.

    The object is pickled, the pickled bytes are hashed, and the pickle is stored in a
    specified folder under a file named after that hash. The artifact then holds a
    `file://` reference to that file, and records the hash and the type of the object
    in its metadata under the key `ref`.
    """
    default_storage_path = Path('data/PACMEL-2019/wandb_artifacts/') # * this path is relative to Path.home()

    @delegates(wandb.Artifact.__init__)
    def __init__(self, obj, name, folder=None, **kwargs):
        """Pickle `obj` into `folder` and reference the resulting file from this artifact.

        `name` is the artifact name. `folder` is the directory where the pickle is
        written; when it is None, `Path.home()/default_storage_path` is used. The
        folder is created if it does not exist. Extra `kwargs` are forwarded to
        `wandb.Artifact.__init__`.
        """
        super().__init__(type='object', name=name, **kwargs)
        # Serialize once and reuse the bytes for both the hash and the stored file.
        payload = pickle.dumps(obj)
        # The built-in hash() of bytes is salted per interpreter process, so it would
        # name the same object differently on every run; a digest stays stable.
        hash_code = hashlib.sha256(payload).hexdigest()
        folder = Path(ifnone(folder, Path.home()/self.default_storage_path))
        folder.mkdir(parents=True, exist_ok=True)
        obj_path = folder/hash_code
        obj_path.write_bytes(payload)
        self.add_reference(f'file://{obj_path}')
        if self.metadata is None:
            self.metadata = dict()
        self.metadata['ref'] = dict()
        self.metadata['ref']['hash'] = hash_code
        self.metadata['ref']['type'] = str(type(obj))

When a reference artifact is used by one wandb run, we should have a method to get the original object from it

In [None]:
#export
@patch
def to_obj(self:wandb.apis.public.Artifact):
    """Download the files of a saved ReferenceArtifact and return the referenced object.

    The artifact must come from a call to `run.use_artifact` with a proper wandb run.
    If the artifact metadata has no `ref` entry, an error message is printed and
    None is returned.
    """
    if self.metadata.get('ref') is not None:
        # The first file found in the download folder is taken as the pickled object.
        downloaded_files = Path(self.download()).ls()
        # NOTE(review): pickle.load can execute arbitrary code; only use this on
        # artifacts from a trusted source.
        with open(downloaded_files[0], 'rb') as pickled:
            return pickle.load(pickled)
    print(f'ERROR:{self} does not come from a saved ReferenceArtifact')
    return None

Test with Reference artifact from a df

In [None]:
import tempfile

foo = generate_TS_df(3, 3)
# Store the pickle in a throwaway directory: the filesystem root ('/') is not
# writable for a regular user, which made this cell fail with PermissionError.
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact', folder=tempfile.mkdtemp())
bar.manifest.entries.values()

PermissionError: [Errno 13] Permission denied: '//2111032565604952391'

In [None]:
test_eq(bar.name, 'test_reference_artifact')

In [None]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

TODO: Test method `to_obj`

ReferenceArtifact with a numpy array

In [None]:
import tempfile

foo = np.random.randn(5)
# Store the pickle in a throwaway directory instead of the filesystem root ('/'),
# which a regular user cannot write to.
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact', folder=tempfile.mkdtemp())
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:////-7284171405855839870/-7284171405855839870>])

In [None]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

# Keras: add `plot_top_losses` functionality

In [None]:
#export
from timecluster_extension.visualization import *

ModuleNotFoundError: No module named 'timecluster_extension'

In [None]:
#export
@patch
def plot_top_losses(self:tf.keras.Sequential, validation_data, k, largest=True, return_fig=True, title_pos=0.99, **kwargs):
    """Plot the `k` items of `validation_data` with the most extreme losses.

    Take the validation data of model self, compute the model loss (MSE between the
    data and the model prediction) for every item there, sort, and plot the results.
    If `largest` is True, the losses are sorted from larger to lower; otherwise from
    lower to larger. Once they are sorted, the `k` first items in that order are
    plotted with `plot_validation_ts_ae`, in that same order. If `k` is greater than
    the number of items, all of them are plotted.

    `title_pos` and any extra `kwargs` are forwarded to `plot_validation_ts_ae`.

    If `return_fig` is True, the list of figures (one per plotted item) is returned.
    If not, None is returned.
    """
    # Get a prediction with the validation_data
    pred_validation_data = self.predict(validation_data)
    # One MSE value per item, averaging over axes 1 and 2
    # (assumes 3-dimensional data, e.g. items x time x variables -- TODO confirm)
    mse_values = np.mean(np.square(validation_data - pred_validation_data), axis=(1,2))

    # Indices of the items sorted by increasing loss
    sorted_ids = mse_values.argsort()
    if largest:
        # Reverse so the largest loss comes first, as documented
        sorted_ids = sorted_ids[::-1]
        txt_var = "Largest MSE of the model for validation dataset"
    else:
        txt_var = "Smallest MSE of the model for validation dataset"
    # Slicing from the front is safe for k == 0 and for k larger than the data
    id_loss_values = sorted_ids[:k]

    # Plot figures: every figure is requested from the helper (return_fig=True)
    # regardless of this method's own `return_fig` flag.
    list_figs = [plot_validation_ts_ae(validation_data,
                                       pred_validation_data,
                                       title_str = txt_var + " window_num: " + str(window_id),
                                       title_pos = title_pos,
                                       window_num = window_id,
                                       return_fig = True,
                                       **kwargs)
                 for window_id in id_loss_values]

    # Returns
    return list_figs if return_fig else None