# core

> Fill in a module description here

In [17]:
# | default_exp ad_complete


In [18]:
# | hide
from nbdev.showdoc import *


In [19]:
# | export
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from ts_vae_lstm.concepts import get_window
from scipy import signal
import os
import math
import torch


In [20]:
# | export

from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score

In [21]:
# | export
from ts_vae_lstm.vae import VAE, Encoder, Decoder, StochasticSampler
from fastcore.xtras import noop

In [22]:
num_workers = os.cpu_count()
device = "cuda" if torch.cuda.is_available() else "cpu"
num_workers, device


(4, 'cuda')

Train a VAE with no anomalies in the time series.

```--> Anomalies occur at:
  timestamp #0: 2014-11-01 19:00:00
  timestamp #1: 2014-11-27 15:30:00
  timestamp #2: 2014-12-25 15:00:00
  timestamp #3: 2015-01-01 01:00:00
  timestamp #4: 2015-01-27 00:00:00

Original csv file contains (10320,) timestamps.
Processed time series contain (10320,) readings.
Anomaly indices are [5943, 7184, 8527, 8835, 10081]

Training set mean is 14855.115757575757
Training set std is 6556.134705703313
Anomaly indices in the test set are [2643 3884 5227 5535 6781]```

In [23]:
data = np.load("../sample_data/nyc_taxi.npz")
for k in data.keys():
    print(k)


t
t_unit
readings
idx_anomaly
idx_split
training
test
train_m
train_std
t_train
t_test
idx_anomaly_test


In [24]:
data["test"].shape


(4820,)

In [25]:
# | export
df = pd.DataFrame(data["test"], index=data["t_test"], columns=["value"])
df.head(2)


Unnamed: 0,value
1,0.532643
2,0.551521


In [26]:
p = 48  # so that one window is one day

data = [
    {
        "subset": get_window(
            df.values,
            window_size=p,
            end_step=t,
            indices=list(df.index),
            return_indices=False,
        ),
        "end_step": t,
        "start_step": t - p,
    }
    for t in range(
        p, len(df), p
    )  # take steps every 48 so that we end up with non-overlapping sequences
]

In [27]:
vae_model_pth = f"../models/vae_500_z24.pth"
vae_model = torch.load(vae_model_pth, map_location=device)
vae_model

VAE(
  (encoder): Encoder(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (conv1): Conv2d(1, 32, kernel_size=(3, 1), stride=(2, 1))
    (conv2): Conv2d(32, 512, kernel_size=(3, 1), stride=(2, 1))
    (linear): Linear(in_features=5632, out_features=512, bias=False)
    (linear_mean): Linear(in_features=512, out_features=24, bias=False)
    (linear_var): Linear(in_features=512, out_features=24, bias=False)
  )
  (decoder): Decoder(
    (linear): Linear(in_features=24, out_features=512, bias=False)
    (dconv1): ConvTranspose2d(512, 128, kernel_size=(4, 1), stride=(1, 1))
    (dconv2): ConvTranspose2d(128, 64, kernel_size=(3, 1), stride=(1, 1))
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear_out): Linear(in_features=384, out_features=48, bias=False)
  )
  (latent_sampler): StochasticSampler()
)

In [36]:
@torch.no_grad()
def get_embeddings(x, n_windows=1, latent_dim=32, seq_len=1, sampler_repeat=200):
    """
    _summary_

    Parameters
    ----------
    x : _type_
        _description_
    n_windows : int, optional
        _description_, by default 1
    latent_dim : int, optional
        _description_, by default 32
    seq_len : int, optional
        _description_, by default 1
    sampler_repeat : int, optional
        Number of times to repeatedly sample from the sampler to ensure we have enough variablity in the embedding, by default 10

    Returns
    -------
    _type_
        _description_
    """
    # actual_shape = x.shape[0]
    x = (
        torch.from_numpy(x.astype(np.float32)).view(n_windows, -1, seq_len).to(device)
    )  # p, seq -> n_windows, p, seq
    embedded_x = torch.zeros(n_windows, latent_dim, seq_len).to(device)
    for idx in range(n_windows):
        batched_x_window = x[idx].unsqueeze(0)
        # print(batched_x_window.shape)
        z_mean, z_log_var = vae_model.encoder(batched_x_window)
        for _ in range(sampler_repeat):
            # explore multiple potential future embeddings by sampling from the latent space multiple times (Monte Carlo sampling).
            embedded_x[idx] += (
                vae_model.latent_sampler(z_mean, z_log_var).permute(1, 0)
                / sampler_repeat
            )

        # reshape
    embedded_x = embedded_x.reshape(latent_dim * n_windows, -1)
    return embedded_x

In [37]:
get_embeddings(data[0]["subset"], latent_dim=24)


tensor([[ 0.1013],
        [-0.1030],
        [-0.0095],
        [ 0.1046],
        [ 0.0419],
        [ 0.0247],
        [-0.3435],
        [ 0.1282],
        [ 0.0182],
        [-0.0849],
        [-0.0338],
        [ 1.7626],
        [-0.0362],
        [-0.0158],
        [ 0.0415],
        [-0.2683],
        [ 0.4677],
        [-0.0319],
        [-0.0100],
        [-0.0680],
        [-0.0270],
        [-0.0291],
        [ 0.4637],
        [ 0.0055]], device='cuda:0')

In [38]:
data_embeddings = [
    {
        "subset": get_embeddings(
            data[i]["subset"], latent_dim=24, n_windows=1, seq_len=1, sampler_repeat=200
        ),
        "end_step": data[i]["end_step"],
        "start_step": data[i]["start_step"],
    }
    for i in range(len(data))
]

We have now the testing embeddings. Can have already one benchmark with VAE reconstruction. But we now pass this to the LSTM model to predict expected embedding for next window.

In [41]:
from ts_vae_lstm.lstm import LSTMModel

In [42]:
lstm_model_pth = "../models/lstm_200.pth"
lstm_model = torch.load(lstm_model_pth, map_location=device)
lstm_model


LSTMModel(
  (lstm_input): LSTM(24, 128, batch_first=True)
  (lstm_hidden): LSTM(128, 128, batch_first=True, bidirectional=True)
  (lstm_output): LSTM(256, 24, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [44]:
data[0]["subset"].shape

(48, 1)

In [48]:
lstm_model(torch.from_numpy(data[0]["subset"].astype(np.float32)).unsqueeze(0))

RuntimeError: input.size(-1) must be equal to input_size. Expected 24, got 48

In [46]:
predicted_embeddings = [
    {
        "subset": data[i]["subset"],
        "next_subset": lstm_model(
            torch.from_numpy(data[i]["subset"].astype(np.float32)).unsqueeze(0)
        )
        .view(-1, 1)
        .cpu()
        .numpy(),
    }
    for i in range(len(data_embeddings))
]


RuntimeError: input.size(-1) must be equal to input_size. Expected 24, got 48

In [8]:
data["training"].shape
with torch.no_grad():
    xs_val, ys_val = next(iter(valid_dataloader))
    pred_ys_val = model(xs_val.to(device))

    for idx in range(batch_size):
        if idx >= 8:
            break
        ax = plt.subplot(4, 2, idx + 1)
        # idx_feature = 0
        pred_current_example, current_points, prev_points = (
            pred_ys_val[idx].detach().cpu().squeeze(),
            ys_val[idx].detach().cpu().squeeze(),
            xs_val[idx].detach().cpu().squeeze(),
        )
        pred_ts_prev = vae_model.decoder(prev_points.unsqueeze(0)).squeeze()
        pred_ts_values = vae_model.decoder(pred_current_example.unsqueeze(0)).squeeze()
        pred_ts_values_true = vae_model.decoder(current_points.unsqueeze(0)).squeeze()
        sns.lineplot(
            np.concatenate([pred_ts_prev.numpy(), pred_ts_values_true.numpy()], axis=0),
            alpha=0.5,
            linestyle="--",
        )  # , label="next true")
        padded_pred = np.concatenate(
            [[None] * len(pred_ts_values), pred_ts_values.numpy()], axis=0
        )
        sns.lineplot(padded_pred)
        mse_val = F.mse_loss(pred_ts_values, pred_ts_values_true)

        plt.ylim(-2, 2)
        plt.title(f"MSE = {mse_val:.4f}")

    plt.tight_layout()

(5500,)

In [9]:
df = pd.DataFrame(data["training"], index=data["t_train"], columns=["value"])
df


Unnamed: 0,value
1,-0.647416
2,-1.048144
3,-1.330880
4,-1.560078
5,-1.683378
...,...
5496,0.650634
5497,0.769805
5498,0.614499
5499,0.474090


In [481]:
# | hide
import nbdev

nbdev.nbdev_export()
