In [305]:
import pandas as pd
import numpy as np
import torch
from torch import nn, einsum
from torch.nn.modules.linear import Linear
import torch.optim as optim
import torch.nn.functional as F
import math
from functools import partial

# Model

## Pre-Layer

In [306]:
class PreLayer(nn.Module):
    """Input projection: a single linear map from `in_dim` features to `d_model`.

    Args:
        d_model: Output feature size of the projection.
        in_dim: Input feature size (defaults to 1).
    """

    def __init__(self, d_model, in_dim=1):
        super().__init__()
        self.linear = nn.Linear(in_features=in_dim, out_features=d_model)

    def forward(self, x):
        """Apply the linear projection to the last dimension of `x`."""
        return self.linear(x)

## Post-Layer

In [307]:
class PostLayer(nn.Module):
    """Output head: a single linear map from `dim` features to `vocab_num`.

    Args:
        dim: Input feature size.
        vocab_num: Output feature size.
    """

    def __init__(self, dim, vocab_num):
        super().__init__()
        self.linear = nn.Linear(in_features=dim, out_features=vocab_num)

    def forward(self, x):
        """Apply the linear head to the last dimension of `x`."""
        return self.linear(x)

## Positional Encoding

In [308]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to the input.

    A `(max_len, d_model)` table is precomputed and stored as the buffer `pe`:
    even columns hold `sin(position * div_term)` and odd columns hold
    `cos(position * div_term)`.

    Args:
        d_model: Feature size of the tensors the encoding is added to.
            Both even and odd sizes are supported.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, max_len: int = 100):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles; for even d_model the slice is the full tensor.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the first `x.size(1)` rows of the table to `x`.

        The rows are broadcast over the leading dimension of `x`.
        """
        x = x + self.pe[:x.size(1)]
        return x

## Full Attention

In [309]:
def full_attention(query, key, value, causal=False, dropout=0.0):
    """Scaled dot-product attention over 4-D `(batch, head, length, dim)` inputs.

    Args:
        query: Tensor unpacked as `(B, h, n_q, d)`.
        key: Tensor unpacked as `(B, h, n_k, d)`; `d` sets the 1/sqrt(d) scaling.
        value: Tensor contracted with the attention weights over `n_k`.
        causal: When true, scores above the main diagonal are filled with
            -1e9 before the softmax.
        dropout: Probability handed to `F.dropout` on the attention weights;
            pass `None` to skip the call.

    Returns:
        The attention-weighted values, one row per query position.

    Intermediate tensors are printed so the computation can be traced.
    """
    batch, heads, n_k, d_k = key.shape
    _, _, n_q, _ = query.shape  # the unpack also requires a 4-D query

    scores = einsum("bhqd,bhkd->bhqk", query, key) / math.sqrt(d_k)
    print("scale",scores)

    if causal:
        # Lower-triangular ones mark the positions each query may attend to.
        visible = torch.tril(torch.ones(batch, heads, n_q, n_k).to(key.device))
        scores = scores.masked_fill(visible.logical_not(), -1e9)
        print("decoder masking",scores)

    weights = F.softmax(scores, dim=-1)
    print("attention",weights)

    if dropout is not None:
        weights = F.dropout(weights, p=dropout)

    attended = einsum("bhqk,bhkd->bhqd", weights, value)
    print("out",attended)
    return attended

In [310]:
def to_eachhead(x, head_num, split_num=3):
    """Split a packed projection into per-head tensors.

    `x` is unpacked as `(B, n, pre_d)`. Its last dimension is cut into
    `split_num` equal chunks and each chunk is reshaped to
    `(B, head_num, n, chunk_dim // head_num)`.

    Args:
        x: 3-D tensor whose last dimension packs `split_num` projections.
        head_num: Number of attention heads.
        split_num: Number of packed projections (3 by default).

    Returns:
        A list with `split_num` tensors, in chunk order.

    Raises:
        AssertionError: If the last dimension is not a multiple of `split_num`,
            or a chunk is not divisible by `head_num`.
    """
    batch, seq_len, packed_dim = x.shape
    chunk_dim = packed_dim // split_num
    assert packed_dim % split_num == 0, f"have to be multiple of {split_num}"
    assert chunk_dim % head_num == 0, "dim must be divided by head_num"

    head_dim = chunk_dim // head_num
    return [
        chunk.reshape(batch, seq_len, head_num, head_dim).transpose(1, 2)
        for chunk in torch.chunk(x, split_num, dim=2)
    ]

In [311]:
def concat_head(x):
    """Merge the head axis back into the feature axis.

    `x` is unpacked as `(B, h, n, d)`; the result has shape `(B, n, h * d)`
    with each position's head features laid out head after head.
    """
    batch, heads, seq_len, head_dim = x.shape
    merged_dim = heads * head_dim
    # Move the head axis next to the feature axis, then flatten the two.
    return x.permute(0, 2, 1, 3).reshape(batch, seq_len, merged_dim)

## Multi-head Attention

### MHSA

In [312]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention without masking.

    One linear layer produces a packed projection of width `3 * dim`, which
    `to_eachhead` splits into three per-head tensors. They are fed to
    `full_attention` and the heads are merged again with `concat_head`.

    Args:
        dim: Feature size of the input (and of the output).
        head_num: Number of attention heads.
    """

    def __init__(self, dim, head_num):
        super().__init__()
        self.to_qvk = nn.Linear(dim, 3 * dim)
        self.make_head = partial(to_eachhead, head_num=head_num, split_num=3)
        self.mhsa = full_attention

    def forward(self, x):
        """Attend from `x` to itself and return the merged head outputs."""
        # The packed projection is consumed in query, value, key chunk order.
        query, value, key = self.make_head(self.to_qvk(x))
        per_head = self.mhsa(query, key, value)
        return concat_head(per_head)

### Masked (Causal) MHSA

In [313]:
class MultiHeadCausalAttention(nn.Module):
    """Multi-head self-attention with the causal mask enabled.

    Same pipeline as the unmasked variant, except that `full_attention` is
    bound with `causal=True`.

    Args:
        dim: Feature size of the input (and of the output).
        head_num: Number of attention heads.
    """

    def __init__(self, dim, head_num):
        super().__init__()
        self.to_qvk = nn.Linear(dim, 3 * dim)
        self.make_head = partial(to_eachhead, head_num=head_num, split_num=3)
        self.mhca = partial(full_attention, causal=True)

    def forward(self, x):
        """Attend causally from `x` to itself and return the merged heads."""
        # The packed projection is consumed in query, value, key chunk order.
        query, value, key = self.make_head(self.to_qvk(x))
        per_head = self.mhca(query, key, value)
        return concat_head(per_head)

### Encoder-Decoder (Source-Target) Attention

In [314]:
class MultiHeadSourceAttention(nn.Module):
    """Multi-head attention from a target sequence onto a separate memory.

    Queries are projected from `x`; keys and values are projected together
    from `memory` by one linear layer of width `2 * dim`.

    Args:
        dim: Feature size of `x`, `memory` and the output.
        head_num: Number of attention heads.
    """

    def __init__(self, dim, head_num):
        super().__init__()
        self.to_kv = nn.Linear(dim, 2 * dim)
        self.to_q = nn.Linear(dim, dim)
        self.make_head_kv = partial(to_eachhead, head_num=head_num, split_num=2)
        self.make_head_q = partial(to_eachhead, head_num=head_num, split_num=1)
        self.mhsa = full_attention

    def forward(self, x, memory):
        """Attend from `x` to `memory`, printing the K, V and Q tensors."""
        packed_kv = self.to_kv(memory)
        projected_q = self.to_q(x)

        key, value = self.make_head_kv(packed_kv)
        (query,) = self.make_head_q(projected_q)  # split_num=1 yields one chunk
        print("K matrix",key)
        print("v matrix",value)
        print("Q matrix",query)

        return concat_head(self.mhsa(query, key, value))

## Feed Forward

In [315]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: linear -> ReLU -> linear.

    Args:
        dim: Input and output feature size.
        hid_dim: Feature size between the two linear layers.
    """

    def __init__(self, dim, hid_dim):
        super().__init__()
        self.linear1 = nn.Linear(in_features=dim, out_features=hid_dim, bias=True)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=hid_dim, out_features=dim, bias=True)

    def forward(self, x):
        """Expand to `hid_dim`, apply ReLU, and project back to `dim`."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

## Encoder

In [316]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each with a
    residual connection and layer normalisation.

    Args:
        dim: Feature size of the input and output.
        head_num: Number of attention heads.
        ff_hidnum: Hidden size of the feed-forward sub-layer.
        dropout_ratio: Dropout probability applied to each sub-layer output
            before the residual addition.
        norm_first: If true, normalise before each sub-layer (pre-norm);
            otherwise normalise after the residual addition (post-norm).
    """

    def __init__(self, dim, head_num, ff_hidnum, dropout_ratio, norm_first=False):
        super().__init__()
        self.dor = dropout_ratio
        self.mhsa = MultiHeadSelfAttention(dim, head_num)
        self.ln1 = nn.LayerNorm(dim)
        self.ff = FeedForward(dim, ff_hidnum)
        self.ln2 = nn.LayerNorm(dim)
        self.norm_first = norm_first

    def _drop(self, tensor):
        """Apply dropout only while the module is in training mode."""
        # F.dropout defaults to training=True; without forwarding
        # self.training it would keep dropping activations after .eval().
        return F.dropout(tensor, p=self.dor, training=self.training)

    def forward(self, x):
        """Run the block on `x`; the post-norm path prints its intermediates."""
        # No sub-layer modifies its input in place, so the residual branch can
        # reuse the tensor directly instead of cloning it.
        if self.norm_first:
            out = self._drop(self.mhsa(self.ln1(x))) + x
            out = self._drop(self.ff(self.ln2(out))) + out
        else:
            out = self._drop(self.mhsa(x)) + x
            print("Encoder residual 1",out)
            out = self.ln1(out)
            print("LN1 ENC",out)

            out = self._drop(self.ff(out)) + out
            print("Encoder residual 2",out)
            out = self.ln2(out)

        return out

In [317]:
class Encoder(nn.Module):
    """Stack of `depth` encoder layers applied one after another.

    Args:
        depth: Number of layers in the stack.
        dim: Feature size handed to every layer.
        head_num: Number of attention heads per layer.
        ff_hidnum: Feed-forward hidden size per layer.
        dropout_ratio: Dropout probability per layer.
        norm_first: Forwarded to every layer.
    """

    def __init__(self, depth, dim, head_num, ff_hidnum=2048, dropout_ratio=0.0, norm_first=False):
        super().__init__()
        stack = []
        for _ in range(depth):
            stack.append(EncoderLayer(dim, head_num, ff_hidnum, dropout_ratio, norm_first))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Feed `x` through every layer in order and return the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

## Decoder

In [318]:
class DecoderLayer(nn.Module):
    """One decoder block: causal self-attention, attention over the encoder
    memory, and feed-forward, each with a residual connection and layer
    normalisation.

    Args:
        dim: Feature size of the input, memory and output.
        head_num: Number of attention heads.
        ff_hidnum: Hidden size of the feed-forward sub-layer.
        dropout_ratio: Dropout probability applied to each sub-layer output
            before the residual addition.
        norm_first: If true, normalise before each sub-layer (pre-norm);
            otherwise normalise after the residual addition (post-norm).
    """

    def __init__(self, dim, head_num, ff_hidnum, dropout_ratio, norm_first=False):
        super().__init__()
        self.dor = dropout_ratio
        self.mhca = MultiHeadCausalAttention(dim, head_num)
        self.ln1 = nn.LayerNorm(dim)
        self.mhsa = MultiHeadSourceAttention(dim, head_num)
        self.ln2 = nn.LayerNorm(dim)
        self.ff = FeedForward(dim, ff_hidnum)
        self.ln3 = nn.LayerNorm(dim)
        self.norm_first = norm_first

    def _drop(self, tensor):
        """Apply dropout only while the module is in training mode."""
        # F.dropout defaults to training=True; without forwarding
        # self.training it would keep dropping activations after .eval().
        return F.dropout(tensor, p=self.dor, training=self.training)

    def forward(self, x, memory):
        """Run the block on `x` using `memory` for the second attention.

        The post-norm path prints its intermediate tensors.
        """
        # No sub-layer modifies its input in place, so the residual branch can
        # reuse the tensor directly instead of cloning it.
        if self.norm_first:
            out = self._drop(self.mhca(self.ln1(x))) + x
            out = self._drop(self.mhsa(self.ln2(out), memory)) + out
            out = self._drop(self.ff(self.ln3(out))) + out
        else:
            out = self._drop(self.mhca(x)) + x
            print("Residual Connection Dec 1",out)
            out = self.ln1(out)

            attended = self.mhsa(out, memory)
            print("MHSA enc-dec",attended)
            out = self._drop(attended) + out
            print("Residual Connection Dec 2",out)
            out = self.ln2(out)

            out = self._drop(self.ff(out)) + out
            print("Residual Connection Dec 3",out)
            out = self.ln3(out)

        return out

In [319]:
class Decoder(nn.Module):
    """Stack of `depth` decoder layers that all see the same memory.

    Args:
        depth: Number of layers in the stack.
        dim: Feature size handed to every layer.
        head_num: Number of attention heads per layer.
        ff_hidnum: Feed-forward hidden size per layer.
        dropout_ratio: Dropout probability per layer.
        norm_first: Forwarded to every layer.
    """

    def __init__(self, depth, dim, head_num, ff_hidnum, dropout_ratio=0.0, norm_first=False):
        super().__init__()
        stack = []
        for _ in range(depth):
            stack.append(DecoderLayer(dim, head_num, ff_hidnum, dropout_ratio, norm_first))
        self.layers = nn.ModuleList(stack)

    def forward(self, x, memory):
        """Feed `x` through every layer in order, passing `memory` to each."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory)
        return hidden

## Deep Transformer

In [320]:
class Transformer(nn.Module):
    """Encoder-decoder model with linear input projections and a 1-unit head.

    Args:
        device: Stored on the instance as `self.device`.
        d_model: Model feature size.
        in_dim: Feature size of the raw source and target inputs.
        N_enc: Number of encoder layers.
        N_dec: Number of decoder layers.
        h_enc: Attention heads per encoder layer.
        h_dec: Attention heads per decoder layer.
        ff_hidnum: Feed-forward hidden size.
        dropout_model: Dropout probability passed to encoder and decoder.
        norm_first: Passed to encoder and decoder.
    """

    def __init__(self, device, d_model, in_dim, N_enc, N_dec, h_enc, h_dec, ff_hidnum,dropout_model=0, norm_first=False):
        super().__init__()
        self.device = device
        self.x_pre = PreLayer(d_model, in_dim)
        self.y_pre = PreLayer(d_model, in_dim)
        self.pos = PositionalEncoding(d_model)
        self.enc = Encoder(N_enc, d_model, h_enc, ff_hidnum, dropout_model, norm_first)
        self.dec = Decoder(N_dec, d_model, h_dec, ff_hidnum, dropout_model, norm_first)
        self.post = PostLayer(d_model, 1)

    def forward(self, x, y):
        """Encode `x`, decode `y` against it, and drop the trailing unit axis."""
        # Both inputs are projected before either receives its positional
        # encoding; the sub-module call order is kept as is.
        src = self.x_pre(x)
        tgt = self.y_pre(y)
        src = self.pos(src)
        tgt = self.pos(tgt)

        memory = self.enc(src)
        decoded = self.dec(tgt, memory)
        return self.post(decoded).squeeze(-1)

    def generate(self, x, forcast_step, y_start):
        """Autoregressively extend `y_start` for `forcast_step` steps.

        Each step decodes the whole current target sequence and appends the
        output at the last position to it. The return value is the decoder
        output of the final step with its trailing axis squeezed.
        """
        _batch, _length, _features = x.shape  # requires a 3-D source tensor
        memory = self.enc(self.pos(self.x_pre(x)))

        y = y_start
        for _ in range(forcast_step):
            y_pred = self.post(self.dec(self.pos(self.y_pre(y)), memory))
            # The newest prediction becomes the next decoder input step.
            y = torch.cat([y, y_pred[:, [-1], :]], dim=1)
        return y_pred.squeeze(-1)

## Data

In [321]:
# Load the series from CSV and convert it to a float32 NumPy array.
data = pd.read_csv("data_manual.csv").to_numpy().astype("float32")
data

array([[ 1.        ],
       [-0.9436108 ],
       [-0.51129353],
       [-0.1228345 ],
       [-1.        ],
       [-0.9945804 ],
       [-0.8878732 ]], dtype=float32)

In [322]:
# Slice the series into encoder input, decoder input and target windows.
# Each slice is transposed and given a trailing axis of length 1.
x = data[:5].T[..., np.newaxis]
y = data[4:6].T[..., np.newaxis]
tgt = data[5:].T[..., np.newaxis]
print(x.shape)

(1, 5, 1)


In [323]:
from torch.utils.data import TensorDataset, DataLoader
# Rebuild the three windows (same slicing as the previous cell) so this cell
# can be run on its own once `data` is loaded.
x = data[:5].T[..., np.newaxis]
y = data[4:6].T[..., np.newaxis]
tgt = data[5:].T[..., np.newaxis]

# Convert every window to a float32 tensor.
x_tensor, y_tensor, tgt_tensor = (
    torch.tensor(window, dtype=torch.float32) for window in (x, y, tgt)
)

# Bundle source, decoder input and target into one dataset.
dataset = TensorDataset(x_tensor, y_tensor, tgt_tensor)

# Loader settings: batches of one sample, served in their original order.
batch_size = 1
shuffle = False

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [324]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [325]:
import random

def fix_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy's global generator, and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to deterministic mode.
    """
    seeders = (
        random.seed,                 # Python standard library
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch
        torch.cuda.manual_seed_all,  # PyTorch, every CUDA device
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
fix_seed(22)

In [326]:
# A deliberately tiny model: one encoder layer, one decoder layer, one head.
net = Transformer(
    device,
    d_model=4,
    in_dim=1,
    N_enc=1,
    N_dec=1,
    h_enc=1,
    h_dec=1,
    ff_hidnum=3,
).to(device)

# Plain SGD over every model parameter.
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [327]:
def print_forward_result(name):
    """Build a forward hook that prints a labelled copy of a module's output.

    Args:
        name: Label inserted into the header line.

    Returns:
        A callable taking `(module, input, output)` that prints the header
        and then `output`.
    """
    def hook(_module, _inputs, result):
        print(f"Forward result of {name}:", result, sep="\n")
    return hook
def print_backward_result(name):
    """Build a backward hook that prints a labelled copy of its third argument.

    Args:
        name: Label inserted into the header line.

    Returns:
        A callable taking three positional arguments that prints the header
        and then the third argument.
    """
    def hook(_module, _first, result):
        print(f"Backward result of {name}:", result, sep="\n")
    return hook


In [328]:
def register_hooks(module, name=""):
    """Attach a printing forward hook to every leaf sub-module of `module`.

    Sub-modules that have children of their own are descended into; modules
    without children receive a hook from `print_forward_result`.

    Args:
        module: Root module to walk.
        name: Dotted prefix of `module` inside the overall model. It is
            prepended to each child's name so that leaves with the same local
            name (for example two `linear` layers) stay distinguishable in
            the printed output.
    """
    for child_name, child in module.named_children():
        # Use a separate loop variable so the `name` prefix is not lost.
        qualified = f"{name}.{child_name}" if name else child_name
        if list(child.children()):
            register_hooks(child, qualified)
        else:
            child.register_forward_hook(print_forward_result(qualified))
register_hooks(net)

def register_hooks_decoder(module, name=""):
    """Attach a printing full-backward hook to every module under `module`.

    Every entry yielded by `module.named_modules()` gets a hook labelled with
    its name. The `name` argument is accepted but not used.
    """
    for qualified_name, submodule in module.named_modules():
        if not isinstance(submodule, nn.Module):
            continue
        submodule.register_full_backward_hook(print_backward_result(qualified_name))

register_hooks_decoder(net)

In [329]:
# state_dict() returns references to the live parameter tensors, which the
# optimizer later updates in place. Clone each tensor so this snapshot keeps
# the values from before the training step.
before_backprop = net.state_dict()
before_backprop = type(before_backprop)(
    (key, tensor.detach().clone()) for key, tensor in before_backprop.items()
)
print(before_backprop)

OrderedDict([('x_pre.linear.weight', tensor([[-0.2682],
        [ 0.4050],
        [-0.3793],
        [-0.9806]], device='cuda:0')), ('x_pre.linear.bias', tensor([ 0.3155, -0.6107,  0.9012,  0.3775], device='cuda:0')), ('y_pre.linear.weight', tensor([[0.6347],
        [0.5150],
        [0.4983],
        [0.3748]], device='cuda:0')), ('y_pre.linear.bias', tensor([-0.4872, -0.8655,  0.8132,  0.8351], device='cuda:0')), ('pos.pe', tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996],
        [-0.7568, -0.6536,  0.0400,  0.9992],
        [-0.9589,  0.2837,  0.0500,  0.9988],
        [-0.2794,  0.9602,  0.0600,  0.9982],
        [ 0.6570,  0.7539,  0.0699,  0.9976],
        [ 0.9894, -0.1455,  0.0799,  0.9968],
        [ 0.4121, -0.9111,  0.0899,  0.9960],
        [-0.5440, -0.8391,  0.0998,  0.9950],
        [-1.0000,  0.0044,  0.1098,  0.9940],
        [-0.5366

In [330]:
# Per-iteration loss history and the loss function.
criterion = nn.MSELoss()
train_loss_log = []

for epoch in range(1):
    # Placeholder logs, recreated every epoch; the loop does not fill them.
    tgt_log_train = np.zeros((1, 1))
    pred_log_train = np.zeros((1, 1))

    for iter, batch in enumerate(dataloader):
        # Move the batch to the model's device and drop the target's last axis.
        x, y, tgt = (tensor.to(device) for tensor in batch)
        tgt = tgt[:, :, 0]
        net.train()

        # Forward pass and loss.
        out = net(x, y)
        loss = criterion(out, tgt)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        mse = loss.item()
        train_loss_log.append(mse)

        print("epoch : {}, itr :{}, MSE loss : {}, RMSE loss : {}".format(epoch, iter, mse, torch.sqrt(loss).item()))
          

Forward result of linear:
tensor([[[ 0.0473, -0.2056,  0.5219, -0.6031],
         [ 0.5686, -0.9929,  1.2591,  1.3028],
         [ 0.4526, -0.8178,  1.0951,  0.8789],
         [ 0.3484, -0.6604,  0.9478,  0.4980],
         [ 0.5837, -1.0157,  1.2805,  1.3581]]], device='cuda:0',
       grad_fn=<ViewBackward0>)
Forward result of linear:
tensor([[[-1.1219, -1.3805,  0.3148,  0.4602],
         [-1.1185, -1.3777,  0.3175,  0.4623]]], device='cuda:0',
       grad_fn=<ViewBackward0>)
Forward result of pos:
tensor([[[ 0.0473,  0.7944,  0.5219,  0.3969],
         [ 1.4100, -0.4526,  1.2691,  2.3028],
         [ 1.3619, -1.2339,  1.1151,  1.8787],
         [ 0.4895, -1.6504,  0.9778,  1.4975],
         [-0.1731, -1.6694,  1.3205,  2.3573]]], device='cuda:0',
       grad_fn=<AddBackward0>)
Forward result of pos:
tensor([[[-1.1219, -0.3805,  0.3148,  1.4602],
         [-0.2770, -0.8374,  0.3275,  1.4622]]], device='cuda:0',
       grad_fn=<AddBackward0>)
Forward result of to_qvk:
tensor([[[ 0.310

In [331]:
# state_dict() returns references to the live parameter tensors. Clone each
# tensor so this snapshot stays fixed if training continues afterwards.
after_backprop = net.state_dict()
after_backprop = type(after_backprop)(
    (key, tensor.detach().clone()) for key, tensor in after_backprop.items()
)
print(after_backprop)

OrderedDict([('x_pre.linear.weight', tensor([[-0.2682],
        [ 0.4051],
        [-0.3793],
        [-0.9806]], device='cuda:0')), ('x_pre.linear.bias', tensor([ 0.3156, -0.6108,  0.9014,  0.3772], device='cuda:0')), ('y_pre.linear.weight', tensor([[0.6352],
        [0.5144],
        [0.4985],
        [0.3749]], device='cuda:0')), ('y_pre.linear.bias', tensor([-0.4876, -0.8650,  0.8131,  0.8349], device='cuda:0')), ('pos.pe', tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996],
        [-0.7568, -0.6536,  0.0400,  0.9992],
        [-0.9589,  0.2837,  0.0500,  0.9988],
        [-0.2794,  0.9602,  0.0600,  0.9982],
        [ 0.6570,  0.7539,  0.0699,  0.9976],
        [ 0.9894, -0.1455,  0.0799,  0.9968],
        [ 0.4121, -0.9111,  0.0899,  0.9960],
        [-0.5440, -0.8391,  0.0998,  0.9950],
        [-1.0000,  0.0044,  0.1098,  0.9940],
        [-0.5366

In [332]:
# Print the gradient of every parameter that has one.
for name, param in net.named_parameters():
    if param.grad is None:
        continue
    print(f'Gradient name: {name}')
    print(f'Gradient value:\n{param.grad}\n')

Gradient name: x_pre.linear.weight
Gradient value:
tensor([[ 0.0001],
        [-0.0038],
        [-0.0029],
        [ 0.0027]], device='cuda:0')

Gradient name: x_pre.linear.bias
Gradient value:
tensor([-0.0094,  0.0090, -0.0236,  0.0324], device='cuda:0')

Gradient name: y_pre.linear.weight
Gradient value:
tensor([[-0.0458],
        [ 0.0560],
        [-0.0115],
        [-0.0116]], device='cuda:0')

Gradient name: y_pre.linear.bias
Gradient value:
tensor([ 0.0458, -0.0561,  0.0116,  0.0115], device='cuda:0')

Gradient name: enc.layers.0.mhsa.to_qvk.weight
Gradient value:
tensor([[ 2.6829e-05, -4.7889e-04, -1.1183e-04,  2.5020e-05],
        [-2.6730e-04,  3.2727e-04, -5.0506e-04, -7.6907e-04],
        [ 2.9824e-05,  2.8572e-04,  1.9906e-04,  1.5080e-04],
        [ 4.4530e-05, -2.8407e-04,  4.4842e-06,  1.0558e-04],
        [ 2.1313e-03, -2.9665e-03,  3.0108e-03,  5.1463e-03],
        [ 2.6034e-03, -3.6856e-03,  3.9202e-03,  6.5806e-03],
        [-2.2524e-02,  3.1925e-02, -3.4498e-02, -