In [71]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [144]:
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7f70785aab70>

In [159]:
class MyTransformer:
    pass


class MyEncoderLayer:
    def __init__(
        self,
        d_model: int,
        num_heads: int,
        max_seq: int,
        hidden: int,
        drop_prob: float,
        eps: float = 1e-5
    ) -> None:
        # self.positional_encoding = MyPositionalEncoding(d_model, max_seq)
        self.attention_layer = MyMultiheadAttention(d_model, num_heads)
        self.layer_norm = MyLayerNorm(d_model, eps=1e-5)
        self.ffn = MyFNN(d_model, hidden, drop_prob, eps)

    def forward(self, x) -> torch.tensor:
        x = self.


class MyEncoder:
    def __init__(self) -> None:
        pass


class MyDecoderLayer:
    pass



class MyDecoder:
    pass


class MySentenceEmbedding:
    """
    1. Convert word into indices
        - start token
        - end token
        - padding token
    2. create embeddings
    3. 
    """
    def __init__(self) -> None:
        pass
    
    

class MySingleHeadAttention(nn.Module):
    def __init__(self, d_model: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.qkv_layer = nn.Linear(self.d_model, self.d_model * 3)
        self.out_layer = nn.Linear(self.d_model, self.d_model)

    def forward(self, x):
        """
        attention = softmax(Q @ K.T/√dim) @ V
        """
        print(f"x.size: {x.size()}")
        batch, max_seq = x.size()[:2]
        qkv = self.qkv_layer(x)
        print(f"qkv.size: {qkv.size()}")
        q, k, v = torch.chunk(qkv, 3, dim=-1)
        scaled_dot = q @ k.transpose(-1, -2) / math.sqrt(q.size()[-1])
        attention = F.softmax(scaled_dot @ v, dim=-1)
        print(f"attention size: {attention.size()}")
        out = self.out_layer(attention)
        print(f"out size: {out.size()}")
        return out


class MyMultiheadAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int) -> None:
        super().__init__()
        self.d_model = d_model
        assert d_model % num_heads == 0, "Must be divisible"
        self.head_dim = d_model // num_heads
        self.num_heads = num_heads
        self.qkv_layer = nn.Linear(d_model, d_model * 3)
        self.out_layer = nn.Linear(d_model, d_model)

    def forward(self, x) -> torch.tensor:
        """
        attention = softmax(Q @ K.T/√dim) @ V
        """
        batch, seq, _ = x.size()
        print(f"x.size: {x.size()}")
        qkv = self.qkv_layer(x)
        print(f"qkv.size: {qkv.size()}")
        qkv = qkv.reshape(batch, seq, self.num_heads, self.head_dim * 3)
        print(f"qkv.size: {qkv.size()}")
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.size: {qkv.size()}")
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q.size: {qkv.size()}")
        attention = F.softmax(
            q @ k.transpose(-1, -2) / math.sqrt(q.size()[-1]) @ v, dim=-1
        )
        print(f"attention.size: {attention.size()}")
        attention = attention.reshape(batch, seq, self.num_heads * self.head_dim)
        print(f"attention.size: {attention.size()}")
        out = self.out_layer(attention)
        print(f"out.size: {out.size()}")
        return out


class MyPositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_seq: int) -> None:
        self.d_model = d_model
        self.max_seq = max_seq

    def forward(self) -> torch.tensor:
        even_i = torch.arange(0, self.max_seq, 2)
        odd_i = torch.arange(1, self.max_seq, 2)
        position_half = torch.arange(
            start=0, end=self.max_seq // 2, step=1, dtype=torch.float32
        )
        position = torch.repeat_interleave(position_half, 2).reshape(self.max_seq, 1)
        print(f"position: {position}, shape: {position.shape}")
        pe_even = torch.sin(position / torch.pow(10000, (2 * even_i / self.d_model)))
        pe_odd = torch.sin(position / torch.pow(10000, (2 * odd_i / self.d_model)))
        pe = torch.stack((pe_even, pe_odd), dim=-1).flatten(start_dim=1)
        print(f"pe: {pe}, shape: {pe.shape}")
        return pe


class MyLayerNorm(nn.Module):
    def __init__(self, ebed_dim: int, eps: int = 1e-5) -> None:
        self.ebed_dim = ebed_dim
        self.gamma = nn.Parameter(nn.ones(ebed_dim, dtype=torch.float32))
        self.beta = nn.Parameter(nn.zeros(ebed_dim, dtype=torch.float32))
        self.eps = eps

    def forward(self, x) -> torch.tensor:
        mean = torch.mean(x, dim=(-1, -2), keepdim=True)
        var = torch.var(x, dim=(-1, -2), keepdim=True)
        std = (var + self.eps) ** 0.5
        out = (x - mean) / std * self.gamma + self.beta
        return out


class MyFNN:
    def __init__(self, ebed_dim: int, hidden: int, drop_prob: float = 0.2) -> None:
        self.ebed_dim = ebed_dim
        self.linear1 = nn.Linear(ebed_dim, hidden)
        self.linear2 = nn.Linear(hidden, ebed_dim)
        self.dropout = nn.Dropout(p=drop_prob)
        self.relu = nn.ReLU()

    def forward(self, x) -> torch.tensor:
        x = self.linear1(x)
        x = self.dropout(x)
        x = self.relu
        out = self.linear2(x)
        return out


class MyEncoderLayer:
    def __init__(
        self,
        d_model: int,
        num_heads: int,
        hidden: int,
        drop_prob: float,
        eps: float = 1e-5
    ) -> None:
        # self.positional_encoding = MyPositionalEncoding(d_model, max_seq)
        self.attention_layer = MyMultiheadAttention(d_model, num_heads)
        self.layer_norm1 = MyLayerNorm(d_model, eps=1e-5)
        self.layer_norm2 = MyLayerNorm(d_model, eps=1e-5)
        self.ffn = MyFNN(d_model, hidden, drop_prob, eps)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x) -> torch.tensor:
        residual = x.clone()
        x = self.attention_layer(x)
        x = self.dropout1(x)
        x = self.layer_norm1(x + residual)
        residual = x.clone()
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.layer_norm2(x + residual)
        return x


class MyEncoder:
    def __init__(self) -> None:
        pass


class MyDecoderLayer:
    pass



class MyDecoder:
    pass


class SingleHeadAttention(nn.Module):
    def __init__(self, d_model: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.qkv_layer = nn.Linear(self.d_model, self.d_model * 3)
        self.out_layer = nn.Linear(self.d_model, self.d_model)

    def forward(self, x):
        """
        attention = softmax(Q @ K.T/√dim) @ V
        """
        print(f"x.size: {x.size()}")
        batch, max_seq = x.size()[:2]
        qkv = self.qkv_layer(x)
        print(f"qkv.size: {qkv.size()}")
        q, k, v = torch.chunk(qkv, 3, dim=-1)
        scaled_dot = q @ k.transpose(-1, -2) / math.sqrt(q.size()[-1])
        attention = F.softmax(scaled_dot @ v, dim=-1)
        print(f"attention size: {attention.size()}")
        out = self.out_layer(attention)
        print(f"out size: {out.size()}")
        return out


class MyMultiheadAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int) -> None:
        super().__init__()
        self.d_model = d_model
        assert d_model % num_heads == 0, "Must be divisible"
        self.head_dim = d_model // num_heads
        self.num_heads = num_heads
        self.qkv_layer = nn.Linear(d_model, d_model * 3)
        self.out_layer = nn.Linear(d_model, d_model)

    def forward(self, x) -> torch.tensor:
        """
        attention = softmax(Q @ K.T/√dim) @ V
        """
        batch, seq, _ = x.size()
        print(f"x.size: {x.size()}")
        qkv = self.qkv_layer(x)
        print(f"qkv.size: {qkv.size()}")
        qkv = qkv.reshape(batch, seq, self.num_heads, self.head_dim * 3)
        print(f"qkv.size: {qkv.size()}")
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.size: {qkv.size()}")
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q.size: {qkv.size()}")
        attention = F.softmax(
            q @ k.transpose(-1, -2) / math.sqrt(q.size()[-1]) @ v, dim=-1
        )
        print(f"attention.size: {attention.size()}")
        attention = attention.reshape(batch, seq, self.num_heads * self.head_dim)
        print(f"attention.size: {attention.size()}")
        out = self.out_layer(attention)
        print(f"out.size: {out.size()}")
        return out


class MyPositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_seq: int) -> None:
        self.d_model = d_model
        self.max_seq = max_seq

    def forward(self) -> torch.tensor:
        even_i = torch.arange(0, self.max_seq, 2)
        odd_i = torch.arange(1, self.max_seq, 2)
        position_half = torch.arange(
            start=0, end=self.max_seq // 2, step=1, dtype=torch.float32
        )
        position = torch.repeat_interleave(position_half, 2).reshape(self.max_seq, 1)
        print(f"position: {position}, shape: {position.shape}")
        pe_even = torch.sin(position / torch.pow(10000, (2 * even_i / self.d_model)))
        pe_odd = torch.sin(position / torch.pow(10000, (2 * odd_i / self.d_model)))
        pe = torch.stack((pe_even, pe_odd), dim=-1).flatten(start_dim=1)
        print(f"pe: {pe}, shape: {pe.shape}")
        return pe


class MyLayerNorm(nn.Module):
    def __init__(self, ebed_dim: int, eps: int = 1e-5) -> None:
        self.ebed_dim = ebed_dim
        self.gamma = nn.Parameter(nn.ones(ebed_dim, dtype=torch.float32))
        self.beta = nn.Parameter(nn.zeros(ebed_dim, dtype=torch.float32))
        self.eps = eps

    def forward(self, x) -> torch.tensor:
        mean = torch.mean(x, dim=(-1, -2), keepdim=True)
        var = torch.var(x, dim=(-1, -2), keepdim=True)
        std = (var + self.eps) ** 0.5
        out = (x - mean) / std * self.gamma + self.beta
        return out


class MyFNN:
    def __init__(self, ebed_dim: int, hidden: int, drop_prob: float = 0.2) -> None:
        self.ebed_dim = ebed_dim
        self.linear1 = nn.Linear(ebed_dim, hidden)
        self.linear2 = nn.Linear(hidden, ebed_dim)
        self.dropout = nn.Dropout(p=drop_prob)
        self.relu = nn.ReLU()

    def forward(self, x) -> torch.tensor:
        x = self.linear1(x)
        x = self.dropout(x)
        x = self.relu
        out = self.linear2(x)
        return out

SyntaxError: invalid syntax (1690674131.py, line 21)

In [152]:
x.size()[-2:]

torch.Size([8, 16])

In [158]:
x.mean(dim=(-1, -2), keepdim=True)

tensor([[[ 0.0524]],

        [[-0.0010]]])

In [146]:
max_seq = 8
batch = 2
d_model = 16
x = torch.randn(batch, max_seq, d_model, generator=g)
print(x.size())
x

torch.Size([2, 8, 16])


tensor([[[-1.1258e+00, -1.1524e+00, -2.5058e-01, -4.3388e-01,  8.4871e-01,
           6.9201e-01, -3.1601e-01, -2.1152e+00,  3.2227e-01, -1.2633e+00,
           3.4998e-01,  3.0813e-01,  1.1984e-01,  1.2377e+00,  1.1168e+00,
          -2.4728e-01],
         [-1.3527e+00, -1.6959e+00,  5.6665e-01,  7.9351e-01,  5.9884e-01,
          -1.5551e+00, -3.4136e-01,  1.8530e+00,  7.5019e-01, -5.8550e-01,
          -1.7340e-01,  1.8348e-01,  1.3894e+00,  1.5863e+00,  9.4630e-01,
          -8.4368e-01],
         [-6.1358e-01,  3.1593e-02, -4.9268e-01,  2.4841e-01,  4.3970e-01,
           1.1241e-01,  6.4079e-01,  4.4116e-01, -1.0231e-01,  7.9244e-01,
          -2.8967e-01,  5.2507e-02,  5.2286e-01,  2.3022e+00, -1.4689e+00,
          -1.5867e+00],
         [-6.7309e-01,  8.7283e-01,  1.0554e+00,  1.7784e-01, -2.3034e-01,
          -3.9175e-01,  5.4329e-01, -3.9516e-01, -4.4622e-01,  7.4402e-01,
           1.5210e+00,  3.4105e+00, -1.5312e+00, -1.2341e+00,  1.8197e+00,
          -5.5153e-01],
    

In [147]:
mht = MyMultiheadAttention(d_model=d_model, num_heads=4)
mht.forward(x)

x.size: torch.Size([2, 8, 16])
qkv.size: torch.Size([2, 8, 48])
qkv.size: torch.Size([2, 8, 4, 12])
qkv.size: torch.Size([2, 4, 8, 12])
q.size: torch.Size([2, 4, 8, 12])
attention.size: torch.Size([2, 4, 8, 4])
attention.size: torch.Size([2, 8, 16])
out.size: torch.Size([2, 8, 16])


tensor([[[-1.6080e-01,  3.8284e-01, -4.4325e-03, -1.3599e-01, -1.8348e-02,
          -2.1855e-01,  5.3361e-02,  1.6159e-01, -1.6878e-01, -1.8494e-01,
          -3.4242e-01, -9.3192e-02,  2.2041e-01, -1.2789e-01,  1.6106e-01,
           3.4265e-02],
         [-1.9514e-01,  4.4145e-01,  1.0494e-01, -6.8306e-02,  6.4535e-02,
          -2.2076e-02,  7.5763e-02,  2.1320e-01, -2.5232e-02, -3.3109e-01,
          -3.2964e-01, -1.4295e-01,  1.5792e-01, -2.8889e-01,  1.0140e-02,
          -1.8596e-01],
         [-3.9832e-02,  5.5733e-01,  3.1923e-01,  3.6236e-02, -1.0058e-01,
          -4.2197e-02,  1.3679e-01,  2.3495e-01,  1.7310e-02, -9.9014e-02,
          -1.8496e-01, -3.4619e-01,  2.3857e-01, -2.3925e-01,  1.2257e-01,
          -8.6098e-02],
         [-2.0804e-01,  3.8881e-01,  4.2467e-02, -7.2547e-02, -1.2706e-02,
          -1.7483e-01,  5.8801e-02,  1.4549e-01, -1.3260e-01, -1.9473e-01,
          -3.2912e-01, -5.0149e-02,  2.1737e-01, -1.9278e-01,  1.2767e-01,
          -2.5657e-02],
    

In [148]:
mype = MyPositionalEncoding(d_model=d_model, max_seq=max_seq)
mype.forward()

position: tensor([[0.],
        [0.],
        [1.],
        [1.],
        [2.],
        [2.],
        [3.],
        [3.]]), shape: torch.Size([8, 1])
pe: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [8.4147e-01, 3.1098e-01, 9.9833e-02, 3.1618e-02, 9.9998e-03, 3.1623e-03,
         1.0000e-03, 3.1623e-04],
        [8.4147e-01, 3.1098e-01, 9.9833e-02, 3.1618e-02, 9.9998e-03, 3.1623e-03,
         1.0000e-03, 3.1623e-04],
        [9.0930e-01, 5.9113e-01, 1.9867e-01, 6.3203e-02, 1.9999e-02, 6.3245e-03,
         2.0000e-03, 6.3246e-04],
        [9.0930e-01, 5.9113e-01, 1.9867e-01, 6.3203e-02, 1.9999e-02, 6.3245e-03,
         2.0000e-03, 6.3246e-04],
        [1.4112e-01, 8.1265e-01, 2.9552e-01, 9.4726e-02, 2.9995e-02, 9.4867e-03,
         3.0000e-03, 9.4868e-04],
        [1.4112e-01, 8.1265e-01, 2.9552e-

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [8.4147e-01, 3.1098e-01, 9.9833e-02, 3.1618e-02, 9.9998e-03, 3.1623e-03,
         1.0000e-03, 3.1623e-04],
        [8.4147e-01, 3.1098e-01, 9.9833e-02, 3.1618e-02, 9.9998e-03, 3.1623e-03,
         1.0000e-03, 3.1623e-04],
        [9.0930e-01, 5.9113e-01, 1.9867e-01, 6.3203e-02, 1.9999e-02, 6.3245e-03,
         2.0000e-03, 6.3246e-04],
        [9.0930e-01, 5.9113e-01, 1.9867e-01, 6.3203e-02, 1.9999e-02, 6.3245e-03,
         2.0000e-03, 6.3246e-04],
        [1.4112e-01, 8.1265e-01, 2.9552e-01, 9.4726e-02, 2.9995e-02, 9.4867e-03,
         3.0000e-03, 9.4868e-04],
        [1.4112e-01, 8.1265e-01, 2.9552e-01, 9.4726e-02, 2.9995e-02, 9.4867e-03,
         3.0000e-03, 9.4868e-04]])

In [162]:
embedding = nn.Embedding(100, 3, padding_idx=0)
input_x = torch.tensor([[0,0,0,1,0,0,1,1,1,2]])
embedding(input_x)

tensor([[[ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.8835, -0.5635, -0.4923],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.8835, -0.5635, -0.4923],
         [ 0.8835, -0.5635, -0.4923],
         [ 0.8835, -0.5635, -0.4923],
         [-0.4962, -0.2699,  1.3900]]], grad_fn=<EmbeddingBackward0>)