# Deep Time Series Interpretability 

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
import math

## Attention and Causal Convolution Layers

In [2]:
def scaled_dot_product(q, k, v, mask=None):
    """Apply scaled dot-product attention.

    The last two axes of ``q`` and ``k`` are treated as (position, feature);
    any leading axes are handled by ``torch.matmul``.

    Args:
        q: Query tensor; the size of its last axis sets the scaling factor.
        k: Key tensor, multiplied against ``q`` with its last two axes swapped.
        v: Value tensor that is combined using the attention weights.
        mask: Optional tensor. Wherever it equals 0, the matching logit is
            overwritten with -9e15 before the softmax.

    Returns:
        Tuple ``(values, attention)``: the attention-weighted values and the
        softmax weights themselves.
    """
    scale = math.sqrt(q.shape[-1])
    scores = torch.matmul(q, k.transpose(-2, -1)) / scale
    if mask is not None:
        # A huge negative logit drives the softmax weight to numerical zero.
        scores = scores.masked_fill(mask == 0, -9e15)
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, v), weights

In [3]:
class CausalConv1d(torch.nn.Conv1d):
    """Single-input-channel 1-D convolution padded on the left only.

    ``kernel_size - 1`` zeros are prepended to the time axis, so with stride 1
    the output has as many time steps as the input and each output step only
    sees the current and earlier input steps. A ``tanh`` is applied to the
    convolution output.

    Args:
        out_dim: Number of output channels.
        kernel_size: Width of the convolution kernel.
        bias: Whether the convolution has a learnable bias.
    """

    def __init__(self, out_dim, kernel_size, bias=True):
        super().__init__(
            1,
            out_dim,
            kernel_size,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=bias,
        )
        # Padding is applied manually in forward() so it lands on the left only.
        self._left_pad = kernel_size - 1

    def forward(self, input):
        """Embed a batch of series.

        Args:
            input: Tensor that gets a channel axis inserted at position 1;
                the notebook feeds it as [batch, seqlen].

        Returns:
            Tensor with the last two axes of the convolution output swapped,
            i.e. [batch, seqlen, out_dim] for a [batch, seqlen] input.
        """
        with_channel = input.unsqueeze(1)
        padded = F.pad(with_channel, (self._left_pad, 0))
        activated = torch.tanh(super().forward(padded))
        return activated.permute(0, 2, 1)

In [4]:
# Smoke test: 5 series of 12 steps through a causal conv with 2 output
# channels and kernel size 5; the printed shape is [batch, seqlen, channels].
ins = torch.rand(5, 12)
layer = CausalConv1d(out_dim=2, kernel_size=5)
print(layer(ins).shape)

torch.Size([5, 12, 2])


In [5]:
class ConvMultiHeadAttention(torch.nn.Module):
    """Multi-head self-attention over a causal-convolution embedding of a series.

    The input series is embedded with ``CausalConv1d``, a learned positional
    embedding is added, and the result is projected to queries, keys and
    values that go through ``scaled_dot_product`` once per head.

    Args:
        d_in: Number of rows in the positional-embedding table; input
            sequences must not be longer than this.
        d_model: Embedding width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Accepted for interface compatibility; no dropout is applied
            inside this module.
        kernel_size: Kernel size of the causal convolution. Defaults to 9,
            the value that used to be hard-coded.
    """

    def __init__(self, d_in, d_model, num_heads, dropout=0.1, kernel_size=9):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_in = d_in
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.positional_embedding = torch.nn.Embedding(d_in, d_model)

        self.conv = CausalConv1d(d_model, kernel_size)

        # One linear layer produces queries, keys and values together.
        self.qkv_proj = torch.nn.Linear(d_model, 3 * d_model)

    def forward(self, x, mask):
        """Compute the attention output for a batch of series.

        Args:
            x: Tensor of shape [batch, seqlen].
            mask: Tensor passed to ``scaled_dot_product`` (or None); entries
                equal to 0 are blocked. The notebook uses the shape
                [batch, heads, seqlen, seqlen].

        Returns:
            Tensor of shape [batch, seqlen, d_model].
        """
        batch_size, seq_length = x.size()

        # Build the indices on the input's device so the embedding lookup
        # also works when the module does not live on the CPU.
        positions = torch.arange(seq_length, device=x.device)
        pos = self.positional_embedding(positions)  # [seqlen, dims]

        x_conv = self.conv(x)  # [batch, seqlen, dims]

        input_embed = x_conv + pos  # [batch, seqlen, dims]

        qkv = self.qkv_proj(input_embed)
        qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)  # [batch, head, seqlen, 3 * head_dim]
        q, k, v = qkv.chunk(3, dim=-1)

        values, _ = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3)  # [batch, seqlen, head, head_dim]
        values = values.reshape(batch_size, seq_length, self.d_model)

        return values

    def greedy_decode(self, x, t0):
        """Run attention step by step, revealing one more time step per pass.

        At step ``i`` (starting at ``t0``) key positions ``0 .. i`` are
        visible and all later positions are masked out.

        NOTE(review): the previous body was an unfinished stub that did not
        parse (unbalanced bracket, hard-coded 8 heads, ``forward()`` called
        without arguments). This module has no prediction head, so ``x`` is
        not updated between steps - TODO confirm the intended decoding scheme.

        Args:
            x: Tensor of shape [batch, seqlen].
            t0: First step of the decoding loop.

        Returns:
            List with one ``forward`` output per step; empty when there is
            no step to run.
        """
        batch_size, seq_length = x.size()
        # Never step past the end of the sequence actually provided.
        last_step = min(self.d_in, seq_length) - 1
        key_positions = torch.arange(seq_length, device=x.device)

        outputs = []
        for i in range(t0, last_step):
            visible = key_positions <= i
            mask = visible.expand(
                batch_size, self.num_heads, seq_length, seq_length
            )
            outputs.append(self.forward(x, mask))
        return outputs
        

In [6]:
# Smoke test: 3 series of 6 steps, 4 heads, nothing masked out.
ins = torch.rand(3, 6)
mask = torch.ones(3, 4, 6, 6)
layer = ConvMultiHeadAttention(d_in=6, d_model=64, num_heads=4)
print(layer(ins, mask).shape)

torch.Size([3, 6, 64])


## The Transformer

In [9]:
class TimeFormer(torch.nn.Module):
    """Attention block, residual MLP and a linear head giving one value per series.

    Args:
        d_in: Forwarded to ``ConvMultiHeadAttention``.
        d_model: Hidden width used by the attention block, the MLP and both
            layer norms.
        num_heads: Forwarded to ``ConvMultiHeadAttention``.
        dropout: Dropout probability of the two MLP dropout layers; also
            forwarded to ``ConvMultiHeadAttention``.
    """

    def __init__(self, d_in, d_model, num_heads, dropout=0.1):
        super().__init__()

        self.attention = ConvMultiHeadAttention(d_in, d_model, num_heads, dropout)
        self.norm1 = torch.nn.LayerNorm(d_model)

        feed_forward = [
            torch.nn.Linear(d_model, d_model),
            torch.nn.Dropout(dropout),
            torch.nn.GELU(),
            torch.nn.Linear(d_model, d_model),
            torch.nn.Dropout(dropout),
        ]
        self.mlp = torch.nn.Sequential(*feed_forward)

        self.norm2 = torch.nn.LayerNorm(d_model)
        self.last_layer = torch.nn.Linear(d_model, 1)

    def forward(self, x, mask):
        """Map a batch of series to one output value each.

        Args:
            x: Input handed to the attention block together with ``mask``.
            mask: Attention mask handed to the attention block.

        Returns:
            Output of the final linear layer, whose last axis has size 1.
        """
        # Collapse axis 1 of the attention output by summation.
        pooled = self.attention(x, mask).sum(dim=1)
        normed = self.norm1(pooled)

        # Residual connection around the MLP, then GELU and the second norm.
        residual = normed + self.mlp(normed)
        hidden_state = self.norm2(F.gelu(residual))

        return self.last_layer(hidden_state)

In [10]:
# Smoke test: 3 series of 6 steps through the full model, nothing masked out.
ins = torch.rand(3, 6)
mask = torch.ones(3, 4, 6, 6)
layer = TimeFormer(d_in=6, d_model=64, num_heads=4)
layer(ins, mask)

tensor([[-0.0723],
        [-0.1504],
        [-0.2453]], grad_fn=<AddmmBackward0>)