In [None]:
from nbdev import *

In [None]:
#hide
import sys
sys.path.append("..")
%load_ext autoreload
%autoreload 2

# Baseline Models

> This notebook generates baseline models to evaluate whether the DCAE models perform well or not


In [None]:
#hide
from fastcore import test
import pandas as pd
import numpy as np

In [None]:
#hide
from timecluster_extension.load import *
from timecluster_extension.dr import *
from timecluster_extension.visualization import *
from timecluster_extension.utils import *
from tensorflow.keras.optimizers import Adam
import wandb
from wandb.keras import WandbCallback
from yaml import load, FullLoader
from fastcore.utils import Path
from datetime import datetime
import pickle

Create a run to save the models (job_type = "baseline_models")

In [None]:
#hide
# Start the W&B run for the baseline models. The returned handle is reused
# below to fetch the training artifact, log the metrics and finish the run.
# allow_val_change=True because later cells overwrite config values with
# config.update(..., allow_val_change=True).
run_baseline = wandb.init(entity = "pacmel",
                      project="timecluster-extension",
                      job_type='baseline_models',
                      allow_val_change=True,
                      resume=False)
config = wandb.config  # Object for storing hyperparameters

## Load the datasets

In [None]:
artifact_name_and_version = 'JNK:train_10days'

In [None]:
ds_train_artifact = run_baseline.use_artifact(artifact_name_and_version)

In [None]:
# Record in the run config which training artifact was used
# (its type, name and content digest).
config.update(
    {
          'ds_train_artifact_type': ds_train_artifact.type,
          'ds_train_artifact_name': ds_train_artifact.name,
          'ds_train_artifact_digest': ds_train_artifact.digest,
    }, 
    allow_val_change=True)
# Display the same three values as the cell output
ds_train_artifact.type, ds_train_artifact.name, ds_train_artifact.digest

('dataset', 'JNK:v6', '59d36b625c02415285418566f626f154')

In [None]:
df_train = ds_train_artifact.to_df()

In [None]:
df_train.head(1)

Unnamed: 0_level_0,RCD_AverageThree-phaseCurrent,LCD_AverageThree-phaseCurrent,LP_AverageThree-phaseCurrent,LHD_LeftHaulageDrive(tractor)Temperature(gearbox),RHD_RightHaulageDrive(tractor)Temperature(gearbox),LA_LeftArmTemperature,RA_RightArmTemperature,SM_DailyRouteOfTheShearer,SM_TotalRoute,LHD_EngineCurrent,RHD_EngineCurrent,RCD_BearingTemperature,SM_ShearerSpeed,SM_ShearerLocation,SM_ShearerMoveInLeft,SM_ShearerMoveInRight
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-06-01,-0.97812,-1.026472,-0.668515,-2.136844,-0.408555,-3.033168,-2.727174,-0.982279,-11.529689,-0.572725,-0.572323,-3.462614,-0.38534,0.653245,-0.267602,-0.341987


## Train

### Sliding window features

In [None]:
# Sliding-window hyperparameters: keep any value already present in the run
# config, otherwise fall back to the defaults given here.
config.w = ifnone(config.get('w'), 48)  # passed to df_slicer as `w`
config.stride = ifnone(config.get('stride'), 1)  # passed to df_slicer as `s`
config.t = ifnone(config.get('t'), 0)  # not passed to df_slicer in this notebook

In [None]:
# Write the resolved sliding-window parameters back to the run config so that
# the values actually used are recorded with the run.
config.update({
    'w': config.w,
    'stride': config.stride,
    't': config.t  # TODO: Not supported yet
    }, allow_val_change=True)

In [None]:
# `test.equals` only returns a boolean, so an invalid window size would go
# unnoticed; `test_eq` raises when the window size is not a multiple of 12.
test.test_eq(config.w % 12, 0)

True

In [None]:
input_data = df_slicer(df_train, w=config.w, s=config.stride)

In [None]:
# Evaluate the baseline on every window. To try it on only the first
# 10 windows, use input_data[0:10,:,:] instead.
data_test = input_data

In [None]:
data_test.shape

(172753, 48, 16)

In [None]:
#export utils
def baseline_model_predictor(input_array, operation = "mean"):
    """Build a constant baseline prediction for every window of a sliced dataset.

    For each window and each variable, the mean (or median) over the window
    axis (axis 1) is computed and repeated at every position of that axis, so
    the result can be compared element-wise with the input.

    Args:
        input_array: three-dimensional array-like, e.g. the output of
            `df_slicer`, laid out as (windows, window length, variables).
        operation: statistic used as the prediction, "mean" or "median".

    Returns:
        A numpy array with the same shape as `input_array` in which every
        window holds its per-variable statistic at each time step.

    Raises:
        ValueError: if `operation` is neither "mean" nor "median".
    """
    input_array = np.asarray(input_array)
    # keepdims=True keeps the reduced axis with length 1 so that the result
    # broadcasts back over the window axis.
    if operation == "mean":
        prediction = np.mean(input_array, axis=1, keepdims=True)
    elif operation == "median":
        prediction = np.median(input_array, axis=1, keepdims=True)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported operation {operation!r}; expected 'mean' or 'median'")
    # Broadcast the per-window statistic over an array of ones shaped like the input.
    return np.ones(input_array.shape) * prediction

In [None]:
baseline_type = "median"

In [None]:
y_pred = baseline_model_predictor(data_test, baseline_type)

In [None]:
y_pred.shape

(172753, 48, 16)

In [None]:
config.update({
        'baseline_type': baseline_type
    })


In [None]:
# Input and output have the same shape
assert y_pred.shape == data_test.shape

In [None]:
#export utils
import tensorflow as tf
def get_windows_mse(predictions, original_data):
    """Compute the MSE of every variable inside every window.

    `predictions` and `original_data` must have the same three-dimensional
    shape (windows, window length, variables). The squared error is averaged
    over the window axis, giving one value per window and variable.
    """
    # Both arrays must describe exactly the same windows.
    assert predictions.shape == original_data.shape
    # The Keras loss averages over the last axis, so move the window axis
    # there by swapping axes 1 and 2.
    y_true = np.swapaxes(original_data, 1, 2)
    y_pred = np.swapaxes(predictions, 1, 2)
    # Reduction.NONE keeps the individual losses instead of collapsing them
    # into a single scalar.
    per_window_loss = tf.keras.losses.MeanSquaredError(
        reduction=tf.keras.losses.Reduction.NONE)
    return per_window_loss(y_true, y_pred).numpy()

In [None]:
%%time
windows_mse = get_windows_mse(y_pred,data_test)

CPU times: user 3.41 s, sys: 6.98 s, total: 10.4 s
Wall time: 4.62 s


In [None]:
windows_mse.shape

(172753, 16)

MSE per time series

In [None]:
windows_mse.mean(axis=0)

array([0.25210897, 0.19519041, 0.29240048, 0.02972069, 0.02183987,
       0.07060798, 0.03001558, 0.01942385, 0.45632429, 0.28657532,
       0.2848851 , 0.04863259, 0.37448899, 0.0024526 , 0.43402708,
       0.46250715])

Total MSE:

In [None]:
windows_mse.mean()

0.2038250601232007

Log results on wandb:

In [None]:
# Log the baseline error at three levels of detail: a single overall value,
# one value per variable, and the full per-window/per-variable matrix.
run_baseline.log({
    'mse_overall': windows_mse.mean(),
    'mse_signal': windows_mse.mean(axis=0), # MSE of each variable
    'mse_raw': windows_mse # Raw mse. It has the mse for each window and signal
    # NOTE(review): the run summary printed by finish() only lists mse_overall;
    # confirm how wandb stores the array-valued entries (mse_signal, mse_raw).
})



Plot figure to visualize MSE per variable:

In [None]:
import matplotlib.pyplot as plt

# One bar per variable showing its MSE averaged over all windows.
variable_names = list(df_train.columns)
fig, ax = plt.subplots()
ax.bar(variable_names, windows_mse.mean(axis=0), align='center', alpha=0.5)
ax.tick_params(axis='x', labelrotation=90)  # variable names are long; draw them vertically
# 'variable' describes the x axis, so it is the x label rather than the title.
ax.set_xlabel('variable')
ax.set_ylabel('MSE')
ax.set_title('Baseline MSE per variable')

# Log through the run handle, like the metrics logged earlier in the notebook.
run_baseline.log({"mse_signal_plot": fig})

In [None]:
run_baseline.finish()

VBox(children=(Label(value=' 0.00MB of 0.01MB uploaded (0.00MB deduped)\n'), FloatProgress(value=0.08224882873…

0,1
mse_overall,0.20383
_step,1.0
_runtime,35.0
_timestamp,1604404101.0


0,1
mse_overall,▁
_step,▁█
_runtime,▁▁
_timestamp,▁▁


The following cells check that `get_windows_mse()` gives the same results as scikit-learn's `mean_squared_error`. The check is needed because `get_windows_mse()` swaps the axes of its inputs before computing the MSE, so it is worth confirming that the results are the same regardless of the method used.


In [None]:
%%time
from sklearn.metrics import mean_squared_error
# Reference values: MSE of each variable (last axis) computed with scikit-learn.
rme_sklearn = []
for i in range(y_pred.shape[2]):
    # All windows and time steps of variable i
    test_sel = data_test[:, :, i]
    y_pred_sel = y_pred[:, :, i]
    rme_sklearn.append(mean_squared_error(test_sel, y_pred_sel))

CPU times: user 2.98 s, sys: 634 ms, total: 3.61 s
Wall time: 3.61 s


In [None]:
rme_sklearn_round = np.around(rme_sklearn, decimals=3)
rme_keras_round = np.around(windows_mse.mean(axis=0),decimals=3)

In [None]:
# `test.all_equal` only returns a boolean; `test_eq` raises when the rounded
# sklearn and keras results differ, so a mismatch fails the notebook.
test.test_eq(rme_sklearn_round, rme_keras_round)

True