In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
RUN_NAME = "skew-init"  # | input("enter name for run")

In [None]:
import os
from datetime import datetime

# enable JIT compilation - must be done before loading torch!
os.environ["PYTORCH_JIT"] = "1"

In [None]:
from pathlib import Path
from time import time
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas
import torch
import torchinfo
from linodenet.models import LinODE, LinODECell, LinODEnet
from linodenet.projections.functional import skew_symmetric, symmetric
from pandas import DataFrame, Index, Series, Timedelta, Timestamp
from torch import Tensor, jit, tensor
from torch.optim import SGD, Adam, AdamW
from torch.utils.data import BatchSampler, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm, trange

import tsdm
from tsdm.datasets import DATASETS
from tsdm.encoders.functional import time2float
from tsdm.logutils import (
    log_kernel_information,
    log_metrics,
    log_model_state,
    log_optimizer_state,
)
from tsdm.metrics import LOSSES
from tsdm.tasks import KIWI_RUNS_TASK
from tsdm.utils import grad_norm, multi_norm

# Initialize Task

In [None]:
from tsdm.datasets import KIWI_RUNS

# Instantiate the raw KIWI_RUNS dataset for inspection in the cells below.
ds = KIWI_RUNS()

In [None]:
# Display the dataset's `batchloader` attribute.
ds.batchloader

In [None]:
# Display the dataset's `rawdata_path` attribute.
ds.rawdata_path

In [None]:
# Display the dataset's `rawdata_paths` attribute.
ds.rawdata_paths

In [None]:
# Keep a handle on the dataset's time series table.
# NOTE: `ts` is rebound to TASK.timeseries in the task-setup cell further down.
ts = ds.timeseries

In [None]:
# Look up one hard-coded entry of the time series index.
# NOTE(review): (355, 11722) are magic numbers — presumably two levels of a MultiIndex; confirm.
ts.loc[355, 11722]

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE = torch.float32
# Scalar NaN living on DEVICE; used below to blank out the forecasting window of the inputs.
NAN = tensor(float("nan"), dtype=DTYPE, device=DEVICE)
BATCH_SIZE = 128
# Number of time steps to predict / to observe; the full window is their sum.
PRD_HORIZON = 30
OBS_HORIZON = 90
HORIZON = SEQLEN = OBS_HORIZON + PRD_HORIZON

In [None]:
# Build the forecasting task. The rest of the notebook refers to it as `TASK`;
# previously only the lowercase `task` was bound, so the lines below raised a
# NameError on a fresh kernel. The lowercase alias is kept because a later
# cell still calls `task.get_dataloader(...)`.
TASK = task = KIWI_RUNS_TASK(
    forecasting_horizon=PRD_HORIZON,
    observation_horizon=OBS_HORIZON,
    train_batch_size=BATCH_SIZE,
    eval_batch_size=2048,
)

# Convenience handles on the task's dataset, time series table and metadata.
DATASET = TASK.dataset
ts = TASK.timeseries
md = TASK.metadata
# Number of rows and number of columns (channels) of the time series table.
NUM_PTS, NUM_DIM = ts.shape

## Initialize Loss

In [None]:
# Smoke test: request the dataloader for the key (0, "train") and display it.
task.get_dataloader((0, "train"))

In [None]:
# Use the task's test metric as the loss (it is also logged as "WRMSE" below), moved to DEVICE.
LOSS = TASK.test_metric.to(device=DEVICE)

# Display the task's loss weights.
TASK.loss_weights

## Initialize DataLoaders

In [None]:
# TRAINLOADER is indexed with 0 in the cells below (TRAINLOADER[0]);
# EVALLOADERS is indexed with keys such as (0, "train") and (0, "test").
TRAINLOADER = TASK.batchloader
EVALLOADERS = TASK.dataloaders

## Hyperparameters

In [None]:
def join_dicts(d: dict[str, Any]) -> dict[str, Any]:
    """Flatten a nested dict into a single level, joining keys with '/'.

    Parameters
    ----------
    d
        Possibly nested mapping. Values that are themselves dicts are
        expanded recursively; every other value is copied as-is.

    Returns
    -------
    dict
        A new flat dict, e.g. ``{"a": {"b": 1}}`` becomes ``{"a/b": 1}``.
        Insertion order of the input is preserved.
    """
    flat: dict[str, Any] = {}
    for outer, value in d.items():
        if not isinstance(value, dict):
            flat[outer] = value
            continue
        # Push the parent key into the children, then flatten what remains.
        nested = {f"{outer}/{inner}": entry for inner, entry in value.items()}
        flat.update(join_dicts(nested))
    return flat


def add_prefix(d: dict[str, Any], /, prefix: str) -> dict[str, Any]:
    """Return a copy of ``d`` in which every key is rewritten to ``"<prefix>/<key>"``."""
    prefixed = {}
    for name, value in d.items():
        prefixed[f"{prefix}/{name}"] = value
    return prefixed


# Alternative optimizer configuration (plain SGD), currently disabled.
# OPTIMIZER_CONIFG = {
#     "__name__": "SGD",
#     "lr": 0.001,
#     "momentum": 0,
#     "dampening": 0,
#     "weight_decay": 0,
#     "nesterov": False,
# }

# Optimizer settings; "__name__" names the optimizer to construct.
# NOTE(review): "CONIFG" is a typo for "CONFIG". The name is used in later
# cells as well, so it has to be renamed everywhere at once.
OPTIMIZER_CONIFG = {
    "__name__": "Adam",
    "lr": 0.01,
    "betas": (0.9, 0.999),
    "eps": 1e-08,
    "weight_decay": 0,
    "amsgrad": False,
}

# Keyword arguments passed to the model constructor below (MODEL(**MODEL_CONFIG)).
MODEL_CONFIG = {
    "__name__": "LinODEnet",
    "input_size": NUM_DIM,
    "hidden_size": 128,
    "embedding_type": "concat",
    "Encoder_cfg": {"nblocks": 10},
    "Decoder_cfg": {"nblocks": 10},
    "System_cfg": {
        "kernel_initialization": "gaussian",
        "kernel_parametrization": "skew_symmetric",
        "scale": 0.01,
    },
}

# Flat "Section/key" view of all hyperparameters (see join_dicts above).
HPARAMS = join_dicts(
    {
        "Optimizer": OPTIMIZER_CONIFG,
        "Model": MODEL_CONFIG,
    }
)

## Initialize Model

In [None]:
# Instantiate the model from MODEL_CONFIG, move it to DEVICE/DTYPE and show a layer summary.
MODEL = LinODEnet
model = MODEL(**MODEL_CONFIG)
model.to(device=DEVICE, dtype=DTYPE)
torchinfo.summary(model)

In [None]:
# Matrix exponential of the freshly initialized kernel.
expA = torch.matrix_exp(model.kernel)

In [None]:
# Largest real part among the eigenvalues of exp(kernel).
torch.linalg.eig(expA).eigenvalues.real.max()

In [None]:
# Largest real part among the eigenvalues of the kernel itself.
torch.linalg.eig(model.kernel).eigenvalues.real.max()

In [None]:
# Print several matrix norms of the kernel (val) and of its matrix exponential (val2).
# `np.inf` replaces the `np.infty` alias, which was removed in NumPy 2.0.
for o in (-np.inf, -2, -1, 1, 2, np.inf, "fro", "nuc"):
    val = torch.linalg.matrix_norm(model.kernel, ord=o).item()
    val2 = torch.linalg.matrix_norm(expA, ord=o).item()
    # Convert to str so the `:6s` format spec works for numeric orders too.
    o = str(o)
    print(f"{o=:6s}\t {val=:10.6f} \t {val2=:10.6f}")

## Initialize Optimizer

In [None]:
from tsdm.optimizers import OPTIMIZERS
from tsdm.utils import initialize_from

In [None]:
# Build the optimizer from the config plus the model parameters.
# The parameters are merged into a temporary dict rather than written into
# OPTIMIZER_CONIFG: `model.parameters()` is a one-shot generator, and storing
# it in the shared config would leak it into anything derived from that config
# later (e.g. if HPARAMS is rebuilt).
optimizer = initialize_from(
    OPTIMIZERS, **(OPTIMIZER_CONIFG | {"params": model.parameters()})
)

## Utility functions

In [None]:
# Scratch cell: pull one training batch and build inputs/targets by hand.
# The same steps are wrapped in `prep_batch` below.
batch = next(iter(TRAINLOADER[0]))
T, X = batch
# Targets: the target channels over the forecasting window (everything after OBS_HORIZON).
targets = X[..., OBS_HORIZON:, TASK.targets.index].clone()
# assert targets.shape == (BATCH_SIZE, PRD_HORIZON, len(TASK.targets))

# Inputs: a copy of X in which targets and observables are hidden (NaN) over the forecasting window.
inputs = X.clone()
inputs[:, OBS_HORIZON:, TASK.targets.index] = NAN
inputs[:, OBS_HORIZON:, TASK.observables.index] = NAN
# assert inputs.shape == (BATCH_SIZE, HORIZON, NUM_DIM)

In [None]:
# Recompute the targets of the scratch batch and display their shape.
targets = X[..., OBS_HORIZON:, TASK.targets.index].clone()
targets.shape

In [None]:
def prep_batch(batch: tuple[Tensor, Tensor]):
    """Turn a raw ``(T, X)`` batch into ``(times, inputs, targets)``.

    ``targets`` is a copy of the target channels of ``X`` from index
    ``OBS_HORIZON`` onwards along the second-to-last axis. ``inputs`` is a
    copy of ``X`` in which both the target and the observable channels are
    overwritten with NaN over that same range, so the model cannot see them.
    ``T`` is passed through untouched.
    """
    times, values = batch
    forecast_window = slice(OBS_HORIZON, None)
    target_cols = TASK.targets.index
    observable_cols = TASK.observables.index

    targets = values[..., forecast_window, target_cols].clone()

    inputs = values.clone()
    for hidden_cols in (target_cols, observable_cols):
        inputs[:, forecast_window, hidden_cols] = NAN
    return times, inputs, targets


def get_all_preds(model, dataloader, max_batches=6):
    """Collect targets and predictions over the first batches of ``dataloader``.

    Parameters
    ----------
    model
        Callable invoked as ``model(times, inputs)``.
    dataloader
        Iterable of raw batches accepted by ``prep_batch``.
    max_batches
        Stop after this many batches; ``None`` consumes the whole loader.
        The default of 6 matches the intent of the previous ``pbar.n == 5``
        check. That check read tqdm's internal counter, which tqdm only
        synchronizes when it refreshes the display, so the number of batches
        actually evaluated depended on timing.

    Returns
    -------
    (targets, predics)
        Concatenated along dim 0. Positions where the target is NaN are set
        to 0 in both tensors, so they contribute no error.
    """
    Y, Ŷ = [], []
    # Pure inference: no gradients are created, so nothing needs zeroing.
    with torch.no_grad():
        for count, batch in enumerate(tqdm(dataloader, leave=False), start=1):
            times, inputs, targets = prep_batch(batch)
            outputs = model(times, inputs)
            Y.append(targets)
            # Keep only the target channels over the forecasting window.
            Ŷ.append(outputs[:, OBS_HORIZON:, TASK.targets.index])
            if max_batches is not None and count >= max_batches:
                break

    targets, predics = torch.cat(Y, dim=0), torch.cat(Ŷ, dim=0)
    mask = torch.isnan(targets)
    targets[mask] = 0.0
    predics[mask] = 0.0
    # scale = 1/torch.mean(mask.to(dtype=torch.float32))
    # targets *= scale
    # predics *= scale
    return targets, predics

## Logging Utilities

In [None]:
from tsdm.logutils import compute_metrics


def log_all(i, model, writer, optimizer):
    """Log the system kernel and the optimizer state, with histograms, at step ``i``."""
    kernel_snapshot = model.system.kernel.clone().detach().cpu()
    for log_fn, subject in (
        (log_kernel_information, kernel_snapshot),
        (log_optimizer_state, optimizer),
    ):
        log_fn(i, writer, subject, histograms=True)


def log_hparams(i, writer, *, metric_dict, hparam_dict):
    """Write hyperparameters and their metrics to ``writer`` for epoch ``i``.

    Parameters
    ----------
    i
        Epoch number, recorded under the ``"epoch"`` hyperparameter.
    writer
        Object exposing ``add_hparams(hparam_dict=..., metric_dict=...)``.
    metric_dict
        Metric values; keys are prefixed with ``"hparam/"`` before logging.
    hparam_dict
        Hyperparameters to log. This dict is not modified.
    """
    # Merge into a new dict: the previous in-place `|=` wrote "epoch" into the
    # caller's dict (e.g. the shared HPARAMS) on every call.
    hparam_dict = hparam_dict | {"epoch": i}
    metric_dict = add_prefix(metric_dict, "hparam")
    writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)


# Metrics evaluated during logging: one instance of each listed loss, plus the
# task's own test metric under the name "WRMSE". (An earlier assignment that
# stored the loss classes was overwritten immediately and has been dropped.)
# assert any(isinstance(TASK.test_metric, LOSSES[key]) for key in ("ND", "NRMSE", "MSE", "MAE"))
metrics = {key: LOSSES[key]() for key in ("ND", "NRMSE", "MSE", "MAE")} | {
    "WRMSE": LOSS
}

# Warm-up: one forward and backward pass on random data, then discard the
# resulting gradients so training starts clean.
# NOTE(review): presumably this triggers JIT compilation ahead of the timed loop — confirm.
print("WARMUP")
t = torch.randn(NUM_DIM).to(DEVICE)
x = torch.randn(1, NUM_DIM).to(device=DEVICE)
y = model(t, x)
torch.linalg.norm(y).backward()
model.zero_grad()

In [None]:
# Timestamp that identifies this run; checkpoints and TensorBoard logs share the
# same <model>/<dataset>/<run name>/<timestamp> sub-path.
RUN_START = tsdm.utils.now()
CHECKPOINTDIR = Path(
    f"checkpoints/{MODEL.__name__}/{DATASET.__name__}/{RUN_NAME}/{RUN_START}"
)
CHECKPOINTDIR.mkdir(parents=True, exist_ok=True)
LOGGING_DIR = f"runs/{MODEL.__name__}/{DATASET.__name__}/{RUN_NAME}/{RUN_START}"
writer = SummaryWriter(LOGGING_DIR)

### Training Start

In [None]:
# `i` counts batches globally (incremented before use, so the first batch is step 0);
# `epoch` is the logging step for per-epoch quantities.
i = -1
epoch = 1

# Baseline: log the kernel and the train/test metrics before any optimizer step.
with torch.no_grad():
    # log optimizer state first !!!
    # log_optimizer_state(epoch, writer, optimizer, histograms=True)
    log_kernel_information(epoch, writer, model.system.kernel, histograms=True)

    for key in ((0, "train"), (0, "test")):
        dataloader = EVALLOADERS[key]
        y, ŷ = get_all_preds(model, dataloader)
        # get_all_preds replaces NaN targets by 0, so the targets must be finite here.
        assert torch.isfinite(y).all()
        log_metrics(
            epoch, writer, metrics=metrics, targets=y, predics=ŷ, prefix=key[1]
        )


for _ in (epochs := trange(100)):
    # NOTE(review): this unconditional `break` leaves the loop before its first
    # iteration does anything, so no training happens and everything below is
    # unreachable. Remove it to actually train.
    break
    epoch += 1
    for batch in (batches := tqdm(TRAINLOADER[0])):
        i += 1
        # Optimization step
        model.zero_grad()
        times, inputs, targets = prep_batch(batch)

        # Wall-clock time of the forward pass.
        forward_time = time()
        outputs = model(times, inputs)
        forward_time = time() - forward_time

        # Predictions: the target channels over the forecasting window.
        predics = outputs[:, OBS_HORIZON:, TASK.targets.index]

        # get rid of nan-values in the targets.
        # Zeroing both tensors at the same positions makes missing targets contribute no error.
        mask = torch.isnan(targets)
        targets[mask] = torch.tensor(0.0)
        predics[mask] = torch.tensor(0.0)

        # # compensate NaN-Value with upscaling
        # scale = 1/torch.mean(mask.to(dtype=torch.float32))
        # targets *= scale
        # predics *= scale

        loss = LOSS(targets, predics)

        # Wall-clock time of the backward pass.
        backward_time = time()
        loss.backward()
        backward_time = time() - backward_time

        optimizer.step()

        # batch logging
        logging_time = time()
        with torch.no_grad():
            log_metrics(
                i,
                writer,
                metrics=metrics,
                targets=targets,
                predics=predics,
                prefix="batch",
            )
            log_optimizer_state(i, writer, optimizer, prefix="batch")

            lval = loss.clone().detach().cpu().numpy()
            gval = grad_norm(list(model.parameters())).clone().detach().cpu().numpy()
            # NOTE(review): this check runs after optimizer.step(), so a NaN loss
            # has already been applied to the weights when the error is raised.
            if torch.any(torch.isnan(loss)):
                raise RuntimeError("NaN-value encountered!!")
        logging_time = time() - logging_time

        # Show loss, gradient norm and timings on the progress bar.
        batches.set_postfix(
            loss=f"{lval:.2e}",
            gnorm=f"{gval:.2e}",
            Δt_forward=f"{forward_time:.1f}",
            Δt_backward=f"{backward_time:.1f}",
            Δt_logging=f"{logging_time:.1f}",
        )

    # End of epoch: log optimizer/kernel state, evaluate, and checkpoint.
    with torch.no_grad():
        # log optimizer state first !!!
        log_optimizer_state(epoch, writer, optimizer, histograms=True)
        log_kernel_information(epoch, writer, model.system.kernel, histograms=True)

        for key in ((0, "train"), (0, "test")):
            dataloader = EVALLOADERS[key]
            y, ŷ = get_all_preds(model, dataloader)
            metric_values = compute_metrics(metrics, targets=y, predics=ŷ)
            log_metrics(
                epoch, writer, metrics=metrics, values=metric_values, prefix=key[1]
            )
            # log_hparams(epoch, writer, metric_dict=metric_values, hparam_dict=HPARAMS)

        # Model Checkpoint
        # NOTE(review): file names use the progress bar counter `epochs.n`, not `epoch`.
        torch.jit.save(model, CHECKPOINTDIR.joinpath(f"{MODEL.__name__}-{epochs.n}"))
        # NOTE(review): this pickles the optimizer object itself; saving
        # optimizer.state_dict() is the usual, more portable choice.
        torch.save(
            {
                "optimizer": optimizer,
                "epoch": epoch,
                "batch": i,
            },
            CHECKPOINTDIR.joinpath(f"{optimizer.__class__.__name__}-{epochs.n}"),
        )

In [None]:
# Snapshot of the model's registered buffers, keyed by name.
buffers = dict(model.named_buffers())

In [None]:
# Copy the intermediate quantities stored on the model to the CPU for analysis.
# NOTE(review): presumably these hold the values of the most recent forward pass — confirm.
timedeltas = model.timedeltas.detach().cpu()
xhat_pre = model.xhat_pre.detach().cpu()
xhat_post = model.xhat_post.detach().cpu()
zhat_pre = model.zhat_pre.detach().cpu()
zhat_post = model.zhat_post.detach().cpu()
xhat_pre.shape, xhat_post.shape, zhat_pre.shape, zhat_post.shape

## Relative size change xhat_pre ⟶ xhat_post

In [None]:
%matplotlib inline
plt.style.use("bmh")

# xhat_pre is unpacked as (batch, sequence length, feature dimension).
BATCH_DIM, LEN, DIM = tuple(xhat_pre.shape)
# n: model input size, m: model hidden size; used below to normalize norms.
n, m = model.input_size, model.hidden_size


def gmean(x, dim=()):
    """Geometric mean of ``x``: the exponential of the mean of ``log(x)``.

    ``dim`` is forwarded unchanged to the mean reduction.
    """
    return x.log().mean(dim=dim).exp()


predata = xhat_pre
postdata = xhat_post


def _step_ratio(states: Tensor) -> Tensor:
    """Batch-averaged ratio ‖s[t+1]‖ / ‖s[t]‖ between consecutive time steps.

    ``states`` is indexed (batch, time, feature), matching the
    ``BATCH_DIM, LEN, DIM`` unpacking of ``xhat_pre.shape``. The result has one
    entry per transition, i.e. ``LEN - 1`` entries.
    """
    norms = torch.linalg.norm(states, dim=-1)
    return torch.mean(norms[:, 1:] / norms[:, :-1], dim=0)


# Step-to-step growth of each state sequence. The slices run along the time
# axis (dim 1); slicing dim 0 would compare neighbouring batch samples instead
# of consecutive time steps.
xpremag = _step_ratio(xhat_pre)
xpstmag = _step_ratio(xhat_post)
zpremag = _step_ratio(zhat_pre)
zpstmag = _step_ratio(zhat_post)

# System: latent state after the update at step t -> latent state before the update at t+1.
system_mag = torch.linalg.norm(zhat_pre[:, 1:], dim=-1) / torch.linalg.norm(
    zhat_post[:, :-1], dim=-1
)
# Pad with ones so the curve has one entry per time step.
system_mag = torch.cat([torch.ones(BATCH_DIM, 1), system_mag], dim=-1)
# Combine: latent state before -> after the update within one step.
combine_mag = torch.linalg.norm(zhat_post, dim=-1) / torch.linalg.norm(zhat_pre, dim=-1)
# Decoder / encoder: norms are divided by the respective dimension (n or m)
# before taking the ratio, since the two spaces have different sizes.
decoder_mag = (torch.linalg.norm(xhat_pre, dim=-1) / n) / (
    torch.linalg.norm(zhat_pre, dim=-1) / m
)
# Filter: observation estimate before -> after the update (computed, currently not plotted).
filter_mag = torch.linalg.norm(xhat_post, dim=-1) / torch.linalg.norm(xhat_pre, dim=-1)
encoder_mag = (torch.linalg.norm(zhat_post, dim=-1) / m) / (
    torch.linalg.norm(xhat_post, dim=-1) / n
)

# Average every per-sample curve over the batch.
filter_mag = torch.mean(filter_mag, dim=0)
system_mag = torch.mean(system_mag, dim=0)
combine_mag = torch.mean(combine_mag, dim=0)
decoder_mag = torch.mean(decoder_mag, dim=0)
encoder_mag = torch.mean(encoder_mag, dim=0)

fig, ax = plt.subplots(ncols=4, nrows=2, figsize=(15, 8), sharey="row")
ax[0, 0].semilogy(xpremag)
ax[0, 0].set_title(
    r"Relative Magnitude change $\hat{x}_t  \rightarrow \hat{x}_{t+1}  $"
)
ax[0, 1].semilogy(xpstmag)
ax[0, 1].set_title(
    r"Relative Magnitude change $\hat{x}_t' \rightarrow \hat{x}_{t+1}' $"
)
ax[0, 2].semilogy(zpremag)
ax[0, 2].set_title(
    r"Relative Magnitude change $\hat{z}_t  \rightarrow \hat{z}_{t+1}  $"
)
ax[0, 3].semilogy(zpstmag)
ax[0, 3].set_title(
    r"Relative Magnitude change $\hat{z}_t' \rightarrow \hat{z}_{t+1}' $"
)
ax[1, 0].semilogy(decoder_mag)
ax[1, 0].set_title(r"Relative magnitude change $\hat{z}_t  \rightarrow \hat{x}_t$")
# ax[1, 1].semilogy(filter_mag)
# ax[1, 1].set_title(r"Relative magnitude change $\hat{x}_t  \rightarrow \hat{x}_t'$")
ax[1, 1].semilogy(encoder_mag)
ax[1, 1].set_title(r"Relative magnitude change $\hat{x}_t' \rightarrow \hat{z}_t'$")
ax[1, 2].semilogy(system_mag)
# This panel previously carried the encoder panel's title.
ax[1, 2].set_title(r"Relative magnitude change $\hat{z}_t' \rightarrow \hat{z}_{t+1}$")
ax[1, 3].semilogy(combine_mag)
ax[1, 3].set_title(r"Relative magnitude change $\hat{z}_t \rightarrow \hat{z}_{t}'$")
ax[1, 0].set_yscale("log")
fig.savefig("Encoder is bad.pdf")

In [None]:
# Probe how the encoder changes the norm of random Gaussian inputs.
# The samples are created on DEVICE (where the model lives) instead of a
# hard-coded "cuda", so the cell also runs on the CPU fallback.
dummy = torch.randn(10_000, m, device=DEVICE)
dummy2 = model.encoder(dummy)
dummy1 = torch.linalg.norm(dummy, dim=-1) / m
dummy2 = torch.linalg.norm(dummy2, dim=-1) / m
# Per-sample ratio of output norm to input norm.
chg = (dummy2 / dummy1).clone().detach().cpu().numpy()
plt.hist(chg, bins="auto")

In [None]:
# Display the decoder sub-module.
model.decoder

In [None]:
# Display the encoder sub-module.
model.encoder

In [None]:
# Averages of the two magnitude curves computed in the plotting cell.
torch.mean(xpremag), torch.mean(system_mag)

In [None]:
# Display the batch-averaged decoder magnitude ratio.
decoder_mag

In [None]:
# Largest real part among the eigenvalues of the kernel.
torch.linalg.eig(model.kernel).eigenvalues.real.max()

In [None]:
# Shape of the time deltas stored on the model.
model.timedeltas.shape

In [None]:
# Matrix norm of exp(kernel), using matrix_norm's default order.
torch.linalg.matrix_norm(torch.matrix_exp(model.kernel))

In [None]:
# Recompute exp(kernel) and print several matrix norms of the kernel (val)
# and of its matrix exponential (val2).
expA = torch.matrix_exp(model.kernel)

# `np.inf` replaces the `np.infty` alias, which was removed in NumPy 2.0.
for o in (-np.inf, -2, -1, 1, 2, np.inf, "fro", "nuc"):
    val = torch.linalg.matrix_norm(model.kernel, ord=o).item()
    val2 = torch.linalg.matrix_norm(expA, ord=o).item()
    # Convert to str so the `:6s` format spec works for numeric orders too.
    o = str(o)
    print(f"{o=:6s}\t {val=:10.6f} \t {val2=:10.6f}")

In [None]:
from matplotlib import cm

# Detached CPU copy of the kernel, used by the check in the next cell.
mat = model.kernel.clone().detach().cpu()
# Disabled: render the kernel as an image.
# NOTE(review): the second disabled line references `kernel`, which is not defined at this level.
# mat = 0.5 + (mat - mat.mean()) / (6 * mat.std())
# mat = kernel.clip(0, 1)
# colormap = cm.get_cmap("seismic")
# mat = colormap(mat)
# plt.imshow(mat)

In [None]:
# Norm of mat - matᵀ, which is zero exactly when the kernel is symmetric.
# NOTE(review): MODEL_CONFIG requests a skew-symmetric parametrization, for which
# `mat + mat.t()` is the quantity that vanishes — confirm which check is intended.
torch.linalg.norm(mat - mat.t())

In [None]:
# Render exp(kernel) as an image: centre on the mean, scale so that ±3 standard
# deviations span [0, 1], clip the rest, and map through the "seismic" colormap.
mat = expA.clone().detach().cpu()
mat = 0.5 + (mat - mat.mean()) / (6 * mat.std())
# Clip the rescaled matrix itself; `kernel` is not defined at this level.
mat = mat.clip(0, 1)
# `plt.get_cmap` replaces `cm.get_cmap`, which was removed in Matplotlib 3.9.
colormap = plt.get_cmap("seismic")
RGBA = colormap(mat.numpy())
plt.imshow(RGBA)

In [None]:
?torch.linalg.matrix_norm

In [None]:
# torch.exp only accepts tensors; a bare Python number raises a TypeError.
torch.exp(torch.tensor(1.0))

In [None]:
# Look up PyTorch's recommended initialization gain for the leaky ReLU nonlinearity.
torch.nn.init.calculate_gain("leaky_relu")