In [1]:
import torch

def create_patch(y, patch_size, stride):
    """Split every series of a batch into overlapping patches.

    The final time step is repeated ``stride`` times on the right before
    slicing, so the trailing patch is filled up with copies of the last value.

    Args:
        y: Tensor laid out as ``[bs x seq_len]``.
        patch_size: Number of time steps in each patch.
        stride: Step between the starts of consecutive patches; also the
            number of padding columns appended.

    Returns:
        Tensor ``[bs x num_patch x patch_len]``. It is a strided view of the
        padded copy, so ``y`` itself is never modified.
    """
    # Repeat the last column `stride` times. `torch.cat` already allocates a
    # fresh tensor on y's device, so no explicit clone or device move is needed.
    tail = y[:, -1].unsqueeze(1).repeat(1, stride)
    padded = torch.cat([y, tail], dim=1)
    # Sliding window over the time axis.
    return padded.unfold(1, patch_size, stride)


def find_num_patches(window, patch_size, stride):
    """Return the patch count for a window of the given length.

    Computed as ``(window - patch_size) // stride + 2``; the notebook asserts
    that this equals the patch axis produced by ``create_patch``.
    """
    whole_hops = (window - patch_size) // stride
    return whole_hops + 2


# Toy configuration: B series of length L, 4-step patches taken every 2 steps.
B, L = 2, 10
patch_len, stride = 4, 2

# Random batch and its patched view.
y = torch.rand(B, L)
y_next = create_patch(y, patch_len, stride)

# The closed-form count must agree with the patch axis produced above.
assert find_num_patches(L, patch_len, stride) == y_next.shape[1]

In [2]:
import torch
import torch.nn  as nn

# Lookup table with 10 rows, each a 3-dimensional vector.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
# Two rows of four indices each.
input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
# One 3-vector per index.
embedding(input).shape

torch.Size([2, 4, 3])

In [6]:
# Display the entry at index 0 of `y_next` (relies on `y_next` from an earlier cell).
y_next[0]

tensor([[0.8909, 0.6685, 0.3515, 0.4457],
        [0.3515, 0.4457, 0.6368, 0.9790],
        [0.6368, 0.9790, 0.4218, 0.4305],
        [0.4218, 0.4305, 0.2632, 0.2384],
        [0.2632, 0.2384, 0.2384, 0.2384]])

In [39]:
import torch
import numpy as np
from icecream import ic
from torch import nn
from torch.nn import TransformerEncoderLayer, Linear
class MTST_layer(nn.Module):
    """Multi-branch patch layer: one encoder per (patch size, stride) pair.

    The input is patched once per branch, each patched view goes through its
    own encoder built by ``make_model``, the encoder outputs are flattened and
    concatenated along the feature axis, and a final linear layer maps the
    result to ``window_size`` values.

    Args:
        patch_sizes: Patch length per branch (list or 1-D array of ints).
        num_patches: Number of patches each branch produces for the inputs
            this layer will receive; aligned with ``patch_sizes``.
        strides: Patch stride per branch; aligned with ``patch_sizes``.
        window_size: Output width of the final linear layer.
        n_head: Number of attention heads passed to every branch encoder.
    """

    def __init__(self, patch_sizes, num_patches, strides, window_size, n_head):
        super().__init__()
        # nn.ModuleList instead of a plain list: otherwise the encoders'
        # parameters are not registered (missing from .parameters() and
        # state_dict(), and not moved by .to()).
        self.trans_layers = nn.ModuleList(
            [
                make_model(seq_len=int(seq_len), d_model=int(patch_size), h=n_head)
                for seq_len, patch_size in zip(num_patches, patch_sizes)
            ]
        )
        # Total flattened width over all branches. Plain-int arithmetic works
        # for lists as well as numpy arrays.
        flatten_size = sum(
            int(patch_size) * int(seq_len)
            for patch_size, seq_len in zip(patch_sizes, num_patches)
        )
        self.ff = Linear(flatten_size, window_size)
        self.patch_sizes = patch_sizes
        self.window_size = window_size
        self.num_patches = num_patches
        self.strides = strides

    def forward(self, y):
        """Map ``y`` (``[bs x seq_len]``) to ``[bs x window_size]``."""
        outputs = []
        for patch_size, stride, encoder in zip(
            self.patch_sizes, self.strides, self.trans_layers
        ):
            # [bs x num_patch x patch_len]
            y_i = create_patch(y, int(patch_size), int(stride))
            y_i = encoder(y_i)
            # flatten the dims except first
            outputs.append(y_i.flatten(start_dim=1))
        # Join branches along the feature axis so each sample keeps its own row
        # and the width equals the input size of `self.ff`.
        outputs = torch.cat(outputs, dim=1)
        return self.ff(outputs)

def create_patch(y, patch_size, stride):
    """Split every series of a batch into overlapping patches.

    The final time step is repeated ``stride`` times on the right before
    slicing, so the trailing patch is filled up with copies of the last value.

    Args:
        y: Tensor laid out as ``[bs x seq_len]``.
        patch_size: Number of time steps in each patch.
        stride: Step between the starts of consecutive patches; also the
            number of padding columns appended.

    Returns:
        Tensor ``[bs x num_patch x patch_len]``. It is a strided view of the
        padded copy, so ``y`` itself is never modified.
    """
    # Repeat the last column `stride` times. `torch.cat` already allocates a
    # fresh tensor on y's device, so no explicit clone or device move is needed.
    tail = y[:, -1].unsqueeze(1).repeat(1, stride)
    padded = torch.cat([y, tail], dim=1)
    # Sliding window over the time axis.
    return padded.unfold(1, patch_size, stride)

def find_num_patches(window, patch_size, stride):
    """Return the patch count for a window of the given length.

    Computed as ``(window - patch_size) // stride + 2``.
    """
    whole_hops = (window - patch_size) // stride
    return whole_hops + 2

# Single-branch smoke test: 100-step series, 16-step patches, stride 1.
bs, seq_len = 2, 100
patch_size, stride, n_head = 16, 1, 2
window_size = 4

# Per-branch settings as arrays (one entry per branch).
patch_sizes = np.array([patch_size])
strides = np.array([stride])
num_patches = np.array(
    [
        find_num_patches(seq_len, size, step)
        for size, step in zip(patch_sizes.tolist(), strides.tolist())
    ]
)

y = torch.rand(bs, seq_len)
y_next = create_patch(y, patch_size, stride)
y_next.shape  # not the last expression of the cell, so nothing is displayed
layer = MTST_layer(patch_sizes, num_patches, strides, window_size, n_head)
y = layer(y)  # rebinds `y` to the layer output
y.shape

torch.Size([2, 4])

tensor([[[0.2561, 0.9682, 0.6184, 0.6864, 0.6104, 0.4142, 0.8504, 0.8215,
          0.5809, 0.2869],
         [0.2658, 0.6479, 0.9247, 0.6005, 0.6061, 0.8402, 0.0150, 0.3218,
          0.7634, 0.7869],
         [0.1375, 0.6987, 0.8741, 0.3301, 0.6945, 0.2388, 0.9418, 0.1316,
          0.6855, 0.1273]],

        [[0.6349, 0.6530, 0.5069, 0.8983, 0.4652, 0.8932, 0.4598, 0.2305,
          0.6794, 0.4683],
         [0.0771, 0.7707, 0.2292, 0.7859, 0.4219, 0.0658, 0.1208, 0.0491,
          0.4933, 0.9733],
         [0.1662, 0.0629, 0.5811, 0.7097, 0.3592, 0.4266, 0.8215, 0.6264,
          0.5168, 0.2538]]])

In [1]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
# import altair as alt
# from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
# from torchtext.vocab import build_vocab_from_iterator
# import torchtext.datasets as datasets
# import spacy
# import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Install an "ignore" filter so Python warnings are not shown.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [2]:
# Some convenience helper functions used throughout the notebook


def is_interactive_notebook():
    """Return True when the defining module is running as ``__main__``."""
    module_name = __name__
    return module_name == "__main__"


def show_example(fn, args=[]):
    """Call ``fn(*args)`` and return its result when examples are enabled.

    Examples are enabled only when the module runs as ``__main__`` and
    ``RUN_EXAMPLES`` is truthy; otherwise None is returned and ``fn`` is not
    called.
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return None
    return fn(*args)


def execute_example(fn, args=[]):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    Same gating as ``__name__ == "__main__" and RUN_EXAMPLES``; the result of
    ``fn`` is discarded and None is always returned.
    """
    enabled = __name__ == "__main__" and RUN_EXAMPLES
    if enabled:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    The base-class initializer is not called; the only state set is
    ``param_groups``, a single group with learning rate 0.
    """

    def __init__(self):
        self.param_groups = [{"lr": 0}]

    def step(self):
        """No-op; returns None."""
        return None

    def zero_grad(self, set_to_none=False):
        """No-op; ``set_to_none`` is accepted and ignored."""
        return None


class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """No-op; returns None."""
        return None

In [32]:
import copy
import math
from icecream import ic
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``."""
    stack = nn.ModuleList()
    for _ in range(N):
        stack.append(copy.deepcopy(module))
    return stack

class Encoder(nn.Module):
    """Stack of N copies of one encoder layer, followed by a final LayerNorm."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x):
        """Run ``x`` through every layer in order, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable gain and bias.

    ``a_2`` (initialised to ones) scales and ``b_2`` (initialised to zeros)
    shifts the normalised values; ``eps`` is added to the standard deviation.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis using ``x.mean`` and ``x.std``."""
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Same evaluation order as `a_2 * (x - mu) / (sigma + eps) + b_2`.
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2


class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer.
    The input is normalised first, passed through the sublayer and dropout,
    and the result is added back onto the un-normalised input.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then feed-forward.

    Each of the two sublayers is applied through its own SublayerConnection.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x):
        """Apply the attention sublayer, then the feed-forward sublayer."""
        attn_wrap, ff_wrap = self.sublayer
        attended = attn_wrap(x, self.self_attn)
        return ff_wrap(attended, self.feed_forward)
    
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are ``query @ key^T`` divided by ``sqrt`` of the query's last
    dimension. Positions where ``mask == 0`` are set to -1e9 before the
    softmax, and ``dropout`` (if given) is applied to the attention weights.

    Returns:
        Tuple ``(weights @ value, weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    context = torch.matmul(weights, value)
    return context, weights

class MultiHeadedAttention(nn.Module):
    """Attention core followed by a ``d_model -> d_model`` linear projection.

    The core is built by calling ``attn(d_model, h, seq_len, dropout)``.
    """

    def __init__(self, h, d_model, attn, seq_len, dropout=0.1):
        """Take in the head count, the model size and the attention factory."""
        super().__init__()
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        # Creation order (linear, attention core, dropout) is kept as-is.
        self.linear = nn.Linear(d_model, d_model)
        self.attn = attn(d_model, h, seq_len, dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Run the attention core on ``x`` and project its output."""
        attended = self.attn(x)
        return self.linear(attended)
    
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward net: ``w_2(dropout(relu(w_1(x))))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
    
def make_model(seq_len, attn=None,
    N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """Helper: Construct an encoder stack from hyperparameters.

    Args:
        seq_len: Passed to the attention factory as its length argument.
        attn: Attention factory called as ``attn(d_model, h, seq_len, dropout)``.
            ``None`` (the default) selects ``RelativeGlobalAttention``.
        N: Number of encoder layers.
        d_model: Feature size of the encoder.
        d_ff: Hidden size of the position-wise feed-forward net.
        h: Number of attention heads.
        dropout: Dropout rate used throughout.

    Returns:
        An ``Encoder`` whose parameters with more than one dimension are
        Xavier-uniform initialised.
    """
    if attn is None:
        # Resolved at call time rather than in the signature: a default of
        # `RelativeGlobalAttention` is evaluated when this `def` runs and
        # raises NameError if that class has not been defined yet.
        attn = RelativeGlobalAttention
    self_attn = MultiHeadedAttention(h, d_model, attn, seq_len, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    # No extra deep copies are needed here: Encoder clones the layer N times.
    model = Encoder(EncoderLayer(d_model, self_attn, ff, dropout), N)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

# Smoke test: encoder with default depth on a random batch.
batch_size, seq_len, d_model = 10, 20, 8
model = make_model(seq_len, d_model=d_model)

x = torch.rand(batch_size, seq_len, d_model)
src_mask = torch.ones(1, 1, seq_len)  # not passed to the model below
y = model(x)
y.shape

torch.Size([10, 20, 8])

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class RelativeGlobalAttention(nn.Module):
    """Multi-head self-attention with a learned relative-position term.

    Scores are ``(Q K^T + Srel) / sqrt(d_head)``, where ``Srel`` is obtained
    by skewing ``Q Er^T`` and ``Er`` is a learned ``(max_len, d_head)`` table.
    The registered ``mask`` buffer is all ones, so the ``masked_fill`` in
    ``forward`` currently leaves every score untouched.

    NOTE(review): the pad-and-reshape skew is commonly paired with a causal
    mask; confirm that leaving the upper-triangle entries unmasked is intended.
    """

    def __init__(self, d_model, num_heads, max_len=1024, dropout=0.1):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                "incompatible `d_model` and `num_heads`"
            )
        head_dim = d_model // num_heads
        self.max_len = max_len
        self.d_model = d_model
        self.num_heads = num_heads
        # Creation order (key, value, query, Er) is kept as-is.
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.query = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.Er = nn.Parameter(torch.randn(max_len, head_dim))
        # Buffer of shape (1, 1, max_len, max_len), filled with ones.
        self.register_buffer("mask", torch.ones(1, 1, max_len, max_len))

    def forward(self, x):
        """Attend over ``x`` of shape (batch_size, seq_len, d_model).

        Returns a tensor of the same shape. Raises ValueError when seq_len
        exceeds ``max_len``.
        """
        bsz, length, _ = x.shape
        if length > self.max_len:
            raise ValueError(
                "sequence length exceeds model capacity"
            )

        def split_heads(projected):
            # (bsz, length, d_model) -> (bsz, num_heads, length, d_head)
            return projected.reshape(bsz, length, self.num_heads, -1).transpose(1, 2)

        # Keys are additionally transposed to (bsz, num_heads, d_head, length).
        keys_t = split_heads(self.key(x)).transpose(-2, -1)
        values = split_heads(self.value(x))
        queries = split_heads(self.query(x))

        # Use the last `length` rows of the table: (d_head, length).
        rel_t = self.Er[self.max_len - length:, :].transpose(0, 1)
        rel_scores = self.skew(torch.matmul(queries, rel_t))
        content_scores = torch.matmul(queries, keys_t)
        # Both score tensors are (bsz, num_heads, length, length).
        scores = (content_scores + rel_scores) / math.sqrt(queries.size(-1))

        visible = self.mask[:, :, :length, :length]
        scores = scores.masked_fill(visible == 0, float("-inf"))
        weights = F.softmax(scores, dim=-1)

        # (bsz, num_heads, length, d_head) -> (bsz, length, d_model)
        merged = torch.matmul(weights, values).transpose(1, 2).reshape(bsz, length, -1)
        return self.dropout(merged)

    def skew(self, QEr):
        """Pad one column on the left, reinterpret the memory, drop the first row.

        Input and output are both (batch_size, num_heads, seq_len, seq_len).
        """
        padded = F.pad(QEr, (1, 0))
        bsz, heads, rows, cols = padded.shape
        # Same elements viewed as (1 + seq_len, seq_len) per head.
        reinterpreted = padded.reshape(bsz, heads, cols, rows)
        return reinterpreted[:, :, 1:, :]

# Shape check on a random batch with the default max_len.
batch_size, seq_len = 10, 100
d_model, num_heads = 768, 12

test_in = torch.randn(batch_size, seq_len, d_model)
l = RelativeGlobalAttention(d_model, num_heads)
l(test_in).shape

torch.Size([10, 100, 768])

In [43]:
from tsl.datasets import AirQuality, MetrLA, PemsBay
from tsl.ops.imputation import add_missing_values

def get_dataset(dataset_name: str):
    """Build the dataset selected by ``dataset_name``.

    Names starting with ``"air"`` return ``AirQuality(impute_nans=True)``,
    with ``small=True`` only when the rest of the name is ``"36"``. Any other
    name must end in ``"_point"`` or ``"_block"``, which selects the
    ``p_fault`` / ``p_noise`` pair passed to ``add_missing_values``; the part
    before the suffix must be ``"la"`` (MetrLA) or ``"bay"`` (PemsBay).

    Raises:
        ValueError: If the suffix or the base name is not recognised.
    """
    if dataset_name.startswith("air"):
        return AirQuality(impute_nans=True, small=dataset_name[3:] == "36")

    # Missing-data settings per suffix: (p_fault, p_noise).
    noise_profiles = {
        "_point": (0.0, 0.25),
        "_block": (0.0015, 0.05),
    }
    suffix = dataset_name[-6:]
    if suffix not in noise_profiles:
        raise ValueError(f"Invalid dataset name: {dataset_name}.")
    p_fault, p_noise = noise_profiles[suffix]
    base_name = dataset_name[:-6]

    # Each base dataset has its own fixed seed.
    if base_name == "la":
        base_dataset, seed = MetrLA(), 9101112
    elif base_name == "bay":
        base_dataset, seed = PemsBay(), 56789
    else:
        raise ValueError(f"Invalid dataset name: {base_name}.")

    return add_missing_values(
        base_dataset,
        p_fault=p_fault,
        p_noise=p_noise,
        min_seq=12,
        max_seq=12 * 4,
        seed=seed,
    )

# Build the "bay" dataset with the "_point" missing-data settings.
dataset_name = "bay_point"
dataset = get_dataset(dataset_name)
# Values of the dataset's dataframe.
target = dataset.dataframe().values
# NOTE(review): `mask` is not assigned anywhere in the visible notebook, so
# this line presumably depends on leftover kernel state -- confirm which mask
# was meant and define it explicitly.
mask 

2024-01-03 12:06:20,202 [INFO]: Generating mask with base p=0.0


In [1]:

import torch
import torch.nn as nn
# an Embedding module containing 10 vectors of size 1
embedding = nn.Embedding(10, 1)
# a single index (0) to look up
input = torch.LongTensor([0])
# first (and only) looked-up vector
embedding(input)[0]


tensor([0.4747], grad_fn=<SelectBackward0>)