In [None]:
import torch
from torch import nn
from torch.nn import GRUCell
import numpy as np
from opt_einsum import contract

# RITS Model (BRITS Paper)

Combining the observation estimate with the input:

- $\hat x_t = \operatorname{Linear}(h_t)$
- $\tilde x_t = \begin{cases} x_t & \text{if observed} \\ \hat x_t & \text{otherwise} \end{cases}$
- $\hat h_t = \operatorname{ODEsolve}(f, t, (t-1, h_{t-1}))$
- $h_t = \operatorname{GRUCell}(\tilde x_t, \hat h_t)$



Loss functions:

- TODO: list the loss functions (see also the Latent-ODE model under `~/.tsdm/models/Latent-ODE`).

In [None]:
class LinODE(nn.Module):
    """
    Linear ODE module: propagates states under linear dynamics.

    The underlying model family is the linear system

    x' = Ax + Bu + w
     y = Cx + Du + v

    Only the autonomous part x' = Ax is implemented here; the matrix A is the
    learnable parameter ``kernel``.
    """

    def __init__(self, input_size, initialization=None):
        """
        input: input_size: int
            Dimension K of the state vector; ``kernel`` has shape (K, K).
        input: initialization:
            Reserved for selecting an initialization scheme. Currently unused:
            the kernel is always drawn from a standard normal distribution.
        """
        super().__init__()
        self.kernel = nn.Parameter(torch.randn(input_size, input_size))

    def forward(self, t, z):
        """
        One-step-ahead prediction using the matrix exponential.

        input: t: tensor shape (N,)
            Time points.
        input: z: tensor shape (N, K)
            States at the time points ``t``. Not modified.
        output: zhat: tensor shape (N, K)
            ``zhat[0] = z[0]`` and, for n >= 1,
            ``zhat[n] = exp(A * (t[n] - t[n-1])) @ z[n-1]``.

        # TODO: optimize if clauses away by changing definition in constructor.
        """
        # Cast the step sizes so that e.g. float64 time stamps work with a float32 kernel.
        dt = torch.diff(t).to(self.kernel.dtype)
        # One scaled copy of the kernel per time step: shape (N-1, K, K).
        A_dt = torch.einsum("kl,n->nkl", self.kernel, dt)
        exp_A_dt = torch.matrix_exp(A_dt)
        # Propagate every state except the last one across its following interval.
        propagated = torch.einsum("nkl,nl->nk", exp_A_dt, z[:-1])
        # Concatenate instead of writing into a copy: no in-place op, input stays untouched.
        return torch.cat([z[:1], propagated], dim=0)

In [None]:
class AttrDict(dict):
    """
    Dictionary whose entries are also reachable as attributes.

    The instance uses itself as its attribute namespace, so ``d.key`` and
    ``d["key"]`` refer to the same entry.

    Recipe from https://stackoverflow.com/a/14620633/9318372
    """

    def __init__(self, *args, **kwargs):
        # Populate the mapping exactly as a plain dict would.
        super().__init__(*args, **kwargs)
        # Alias the attribute namespace to the mapping itself.
        self.__dict__ = self

In [None]:
# Small example instance with two entries, "k" and "l".
d = AttrDict(k=1, l=2)

In [None]:
class LinODERNN(nn.Module):
    """
    ODE-RNN with linear latent dynamics and a GRU observation filter.

    Implements the recurrence sketched in the notebook's markdown:

    - propagate the hidden state across the time gap with the linear ODE,
    - decode an estimate xhat_t of the observation from the propagated state,
    - replace missing (NaN) entries of x_t by the estimate,
    - update the hidden state with a GRU cell.

    NOTE(review): the markdown writes xhat_t = Linear(h_t); because h_t itself
    depends on xhat_t through the filled-in input, the estimate is decoded from
    the propagated (pre-update) state here — confirm this is the intent.
    """

    # default hyperparameters
    HP = {
        'GRUCell' : {'bias' : True, 'hidden_size' : None},
        'LinODE' :  {'hidden_size': None, 'initialization': None}
    }

    def __set_HP(self, input_size, HP=None):
        """
        Merge user hyperparameters into the class defaults and resolve ``hidden_size``.

        The result is stored on the instance as ``self.HP``; the class-level
        defaults are copied first and never mutated.

        raises: KeyError if ``HP`` names a component without defaults.
        raises: ValueError if the components request different hidden sizes.
        """
        merged = {name: dict(options) for name, options in type(self).HP.items()}
        for name, overrides in (HP or {}).items():
            if name not in merged:
                raise KeyError(f"unknown hyperparameter group {name!r}")
            merged[name].update(overrides)

        # Both components act on the same hidden state, so their sizes must agree.
        requested = {options['hidden_size'] for options in merged.values()} - {None}
        if len(requested) > 1:
            raise ValueError(f"conflicting hidden sizes: {sorted(requested)}")
        # Fallback when nothing is specified: hidden state as large as the input.
        hidden_size = requested.pop() if requested else input_size
        for options in merged.values():
            options['hidden_size'] = hidden_size

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.HP = merged

    def __init__(self, input_size, HP: dict = None):
        """
        input: input_size: int
            Number M of observed channels.
        input: HP: dict, optional
            Overrides for the default hyperparameters, keyed by component name
            ('GRUCell', 'LinODE').
        """
        # Must run before any sub-module is assigned.
        super().__init__()
        self.__set_HP(input_size, HP)
        # LinODE takes the state dimension positionally (its parameter is named input_size).
        self.dynamics = LinODE(
            self.hidden_size, initialization=self.HP['LinODE']['initialization']
        )
        # TODO: initial-state encoder; until one exists the hidden state starts at zero.
        self.encoder = None
        self.decoder = nn.Linear(self.hidden_size, input_size)
        self.filter = nn.GRUCell(
            input_size, self.hidden_size, bias=self.HP['GRUCell']['bias']
        )

    def forward(self, t, x):
        """
        input: t: tensor shape (..., N,)
            Observation timepoints corresponding to the observed values
        input: x: tensor shape (..., N, M) dtype: float.
            Observed data, NaN indicates Missing values
        output: xhat: tensor shape (..., N, M)
            Predicted values. The values may differ from x for non-NaN entries, since the model assumes that observational data is noisy.
            Q: Does this make any sense for categorical data? Not really..., but one can use sigmoid for example.
        """
        *batch_shape, N, M = x.shape
        # Flatten all leading batch dimensions: GRUCell only accepts 1-D or 2-D input.
        x = x.reshape(-1, N, M)
        t = t.expand(*batch_shape, N).reshape(-1, N)

        kernel = self.dynamics.kernel
        missing = torch.isnan(x)
        h = x.new_zeros(x.shape[0], self.hidden_size)

        estimates = []
        for n in range(N):
            if n > 0:
                # The kernel is used directly because LinODE.forward works on whole
                # sequences, whereas a single batched step is needed here.
                dt = (t[:, n] - t[:, n - 1]).to(kernel.dtype)
                propagator = torch.matrix_exp(dt[:, None, None] * kernel)
                h = torch.einsum("bkl,bl->bk", propagator, h)
            xhat_n = self.decoder(h)
            # Keep observed entries, impute the missing ones with the estimate.
            xtilde_n = torch.where(missing[:, n], xhat_n, x[:, n])
            h = self.filter(xtilde_n, h)
            estimates.append(xhat_n)

        xhat = torch.stack(estimates, dim=1)
        return xhat.reshape(*batch_shape, N, M)

    def predict(self, t, x):
        """Return the model estimate for ``x`` at times ``t``; currently identical to the forward pass."""
        xhat = self(t, x)

        # TODO: treat categorical features.

        return xhat

In [None]:
# Random boolean mask; entries where it is True keep their value below.
mask = np.random.choice([True, False], size=(5, 6))
# np.where needs both branch values when either is given; masked-out entries
# become NaN, the missing-value marker used by the model docstrings.
x_masked = np.where(mask, np.random.randn(5, 6), np.nan)
x_masked

In [None]:
# Pitfall demo: AttrDict uses itself as its attribute namespace, so an entry
# named "items" takes precedence over the inherited dict.items method.
d = AttrDict()
d["items"] = ["jacket", "necktie", "trousers"]
d.items

How should the input be handled? There are several options:

1. Input $t_\text{obs}$, $x_\text{obs}$, and $t_\text{predict}$; return $x_\text{predict}$.
    - Similar to the regular ODESolve input, but with many time observations instead of a single initial condition.
2. Input $t_\text{obs+predict}$ and $x_\text{obs}$; fill $x$ with NaN values at the prediction points (this reduces the problem to an imputation task).
3. Input $t$, $x$, $u$. The controls $u$ may also be given at future time points (pre-scheduled controls).


### Question: How should the initial hidden state and the initial state estimate be handled in the RNN?

1. Initialize with zero or randomly (kinda dumb, but has to do for now)
2. Initialize through an initializer network, e.g.
    - small deepset / Time series set function network
    - ODE-RNN encoder like in Latent-ODE encoder


In [None]:
class LinODERNN(nn.Module):
    """
    ODE-RNN with linear latent dynamics and a GRU observation filter.

    Per time step: propagate the hidden state with the linear ODE, decode an
    estimate of the observation, fill missing (NaN) inputs with that estimate,
    then update the hidden state with the GRU cell.

    NOTE(review): an earlier cell defines a class with the same name; when the
    notebook runs top to bottom this definition replaces it.
    NOTE(review): the estimate is decoded from the propagated (pre-update)
    state, since the updated state already depends on the filled-in input —
    confirm this matches the intended model.
    """

    # default hyperparameters
    HP = {
        'GRUCell' : {'bias' : True, 'hidden_size' : None},
        'LinODE' : {'hidden_size': None, 'initialization': None}
    }

    def __set_HP(self, **hyperparameters):
        """
        Store the class defaults merged with per-component overrides as ``self.HP``.

        The class-level defaults are copied first and never mutated.

        raises: KeyError if an override names a component without defaults.
        """
        merged = {name: dict(options) for name, options in type(self).HP.items()}
        for name, overrides in hyperparameters.items():
            if name not in merged:
                raise KeyError(f"unknown hyperparameter group {name!r}")
            merged[name].update(overrides)
        self.HP = merged

    def __init__(self, input_size, **hyperparameters):
        """
        input: input_size: int
            Number M of observed channels.
        input: hyperparameters: dict per component, optional
            Overrides keyed by component name, e.g. ``GRUCell={'hidden_size': 16}``.

        raises: ValueError if the components request different hidden sizes.
        """
        # Must run before any sub-module is assigned.
        super().__init__()
        self.__set_HP(**hyperparameters)

        # Both components act on the same hidden state, so their sizes must agree.
        requested = {options['hidden_size'] for options in self.HP.values()} - {None}
        if len(requested) > 1:
            raise ValueError(f"conflicting hidden sizes: {sorted(requested)}")
        # Fallback when nothing is specified: hidden state as large as the input.
        hidden_size = requested.pop() if requested else input_size
        for options in self.HP.values():
            options['hidden_size'] = hidden_size

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.GRUCell = nn.GRUCell(input_size, hidden_size, bias=self.HP['GRUCell']['bias'])
        # LinODE takes the state dimension positionally (its parameter is named input_size).
        self.LinODE = LinODE(hidden_size, initialization=self.HP['LinODE']['initialization'])
        # Linear read-out producing the observation estimate from the hidden state.
        self.decoder = nn.Linear(hidden_size, input_size)

    def forward(self, t, x):
        """
        input: t: tensor shape (..., N,)
            Observation timepoints corresponding to the observed values
        input: x: tensor shape (..., N, M) dtype: float.
            Observed data, NaN indicates Missing values
        output: xhat: tensor shape (..., N, M)
            Predicted values. The values may differ from x for non-NaN entries, since the model assumes that observational data is noisy.
            Q: Does this make any sense for categorical data? Not really..., but one can use sigmoid for example.
        """
        *batch_shape, N, M = x.shape
        # Flatten all leading batch dimensions: GRUCell only accepts 1-D or 2-D input.
        x = x.reshape(-1, N, M)
        t = t.expand(*batch_shape, N).reshape(-1, N)

        kernel = self.LinODE.kernel
        missing = torch.isnan(x)
        # Hidden state starts at zero (no initial-state encoder yet).
        h = x.new_zeros(x.shape[0], self.hidden_size)

        estimates = []
        for n in range(N):
            if n > 0:
                # The kernel is used directly because LinODE.forward works on whole
                # sequences, whereas a single batched step is needed here.
                dt = (t[:, n] - t[:, n - 1]).to(kernel.dtype)
                propagator = torch.matrix_exp(dt[:, None, None] * kernel)
                h = torch.einsum("bkl,bl->bk", propagator, h)
            xhat_n = self.decoder(h)
            # Keep observed entries, impute the missing ones with the estimate.
            xtilde_n = torch.where(missing[:, n], xhat_n, x[:, n])
            h = self.GRUCell(xtilde_n, h)
            estimates.append(xhat_n)

        xhat = torch.stack(estimates, dim=1)
        return xhat.reshape(*batch_shape, N, M)

    def predict(self, t, x):
        """Return the model estimate for ``x`` at times ``t``; currently identical to the forward pass."""
        xhat = self(t, x)

        # TODO: treat categorical features.

        return xhat

In [None]:
import numpy as np
import matplotlib.pyplot as plt

N = 100_000  # number of random matrices sampled
n = 20  # size of each (n x n) matrix

# Seeded generator so the condition-number statistics are reproducible.
rng = np.random.default_rng(0)

A = rng.standard_normal((N, n, n))
# Batched transpose as a view (swaps the two matrix axes without copying).
A_T = np.swapaxes(A, -1, -2)
symA = (A + A_T) / 2  # symmetric part
skewA = (A - A_T) / 2  # skew-symmetric part

In [None]:
# Condition numbers of the raw, symmetric and skew-symmetric samples, in that order.
conds, symconds, skewconds = (
    np.linalg.cond(matrices) for matrices in (A, symA, skewA)
)

In [None]:
from scipy import stats


def visualize_distribution(x, bins=100, log=True, ax=None):
    """
    Plot a density histogram of ``x`` and print summary statistics.

    x: array-like of values to visualize.
    bins: number of histogram bins.
    log: if True, use logarithmic axes with logarithmically spaced bins; the
        printed statistics then refer to log10(x).
    ax: matplotlib axes to draw on; a new figure is created when omitted.

    Raises ValueError if ``log`` is True and ``x`` contains non-positive values.
    """
    x = np.asarray(x)
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), tight_layout=True)

    summary = x
    if log:
        if np.any(x <= 0):
            raise ValueError("log=True requires strictly positive values")
        summary = np.log10(x)
        ax.set_xscale("log")
        ax.set_yscale("log")
        # Bin edges are in the original units, spanning whole decades around the data.
        bins = np.logspace(
            np.floor(np.min(summary)), np.ceil(np.max(summary)), num=bins, base=10
        )
    # Plot the argument itself (not a notebook global), in original units to match the bin edges.
    ax.hist(x, bins=bins, density=True)

    # stats.mode returns a scalar or a length-1 array depending on the SciPy version.
    mode = float(np.ravel(stats.mode(summary)[0])[0])
    print(
        f"median: {float(np.median(summary)):.2}   mode:{mode:.2}   mean: {float(np.mean(summary)):.2}  stdev:{float(np.std(summary)):.2}"
    )

In [None]:
# One panel per matrix family (raw, symmetric, skew-symmetric), with shared axes.
fig, ax = plt.subplots(
    ncols=3, figsize=(12, 4), tight_layout=True, sharex=True, sharey=True
)
for panel, cond_numbers in zip(ax, (conds, symconds, skewconds)):
    visualize_distribution(cond_numbers, ax=panel)

In [None]:
# Scratch check: fill an uninitialized 10x10 tensor with Kaiming-normal values and display it.
nn.init.kaiming_normal_(tensor=torch.empty((10, 10)))

In [None]:
def random_matrix(input_size, kind=None):
    """
    Sample a random square matrix of shape (input_size, input_size).

    kind options:
    None: unconstrained Kaiming-normal matrix,
    "symmetric": symmetric part (A + A^T) / 2,
    "skew-symmetric" (also "skew symmetric"): skew part (A - A^T) / 2,
    "orthogonal": orthogonal matrix from ``nn.init.orthogonal_``,
    "normal": not implemented yet.

    Raises NotImplementedError for "normal" and ValueError for unknown kinds.
    """

    A = nn.init.kaiming_normal_(torch.empty(input_size, input_size))

    if kind is None:
        return A
    if kind == "symmetric":
        return (A + A.T) / 2
    # Accept both spellings: the original docstring and code disagreed on the hyphen.
    if kind in ("skew-symmetric", "skew symmetric"):
        return (A - A.T) / 2
    if kind == "orthogonal":
        # Overwrites A in place with an orthogonal matrix.
        return nn.init.orthogonal_(A)
    if kind == "normal":
        raise NotImplementedError("kind='normal' is not implemented yet")
    # Fail loudly instead of silently returning None for a misspelled kind.
    raise ValueError(f"unknown kind: {kind!r}")

In [None]:
# NOTE(review): IPython-only help lookup (not valid plain Python); leftover from
# interactive exploration — consider removing before sharing the notebook.
?GRUCell