## Embedding Output Shape?

: (Batch, Sample, Token(=Time Steps), d_model(=emb_dim))

In [26]:
import torch

# Two integer id grids of shape (3, 4): 3 samples, 4 token ids each.
a = torch.tensor([
    [1, 17, 18, 19],
    [2, 22, 23, 24],
    [3, 4, 5, 6],
])
b = torch.tensor([
    [1, 13, 15, 17],
    [2, 21, 28, 24],
    [4, 5, 13, 15],
])

# Stacking adds a new leading axis, giving (B, S, T) = (2, 3, 4).
c = torch.stack((a, b), dim=0)
print('(B, S, T) :', c.shape)
c

(B, S, T) : torch.Size([2, 3, 4])


tensor([[[ 1, 17, 18, 19],
         [ 2, 22, 23, 24],
         [ 3,  4,  5,  6]],

        [[ 1, 13, 15, 17],
         [ 2, 21, 28, 24],
         [ 4,  5, 13, 15]]])

In [28]:
import torch.nn as nn

# nn.Embedding(num_embeddings, embedding_dim): a lookup table with 29 rows
# of 7-dimensional vectors (7 plays the role of d_model here).
# Note: num_embeddings has to be at least (largest input id + 1), otherwise
# the lookup fails; the largest id in `c` is 28, hence 29.
emb = nn.Embedding(num_embeddings=29, embedding_dim=7)
embedded = emb(c)
print('(Batch, Sample, Token, d_model) : ', embedded.shape)
embedded

(Batch, Sample, Token, d_model) :  torch.Size([2, 3, 4, 7])


tensor([[[[ 0.2584,  1.8426,  1.0916,  0.9989, -0.5735, -0.3674, -0.9559],
          [ 0.2777,  0.2957,  1.1920, -0.5935, -0.4059, -1.0595, -2.1664],
          [-0.8182, -1.4669,  0.7383, -0.7970,  0.2455, -1.2437, -2.1387],
          [ 0.1167, -0.7758, -0.3058, -0.7731,  0.6431,  1.0676, -0.7772]],

         [[-0.1114, -0.0111, -0.0706,  1.0348, -1.7673,  0.7506,  0.4907],
          [-0.3289, -0.1181, -0.3083,  0.5114,  1.3850,  1.2151, -1.9012],
          [-0.4969,  1.4924, -1.2457, -0.6173, -0.3539, -0.1250,  1.2120],
          [-1.1135, -0.9453,  0.7297,  1.8926,  0.3260, -1.3908,  0.1047]],

         [[ 0.8959,  0.7753, -0.4108, -0.5333,  0.5469,  1.4377, -0.0402],
          [ 1.5575, -1.5602,  0.1295,  0.0712,  0.8427,  1.0777, -0.0539],
          [-0.4557, -0.5587, -0.4264, -2.0967, -0.0101,  1.4386, -0.7226],
          [ 0.3589, -1.3021,  0.1996, -1.5563, -1.3508,  0.6137,  1.9654]]],


        [[[ 0.2584,  1.8426,  1.0916,  0.9989, -0.5735, -0.3674, -0.9559],
          [ 0.554

## Scaled Embedding

깃헙 코드 가져옴

In [29]:
class ScaledEmbedding(nn.Module):
    """
    Embedding whose stored table is `1 / scale` of its effective value.

    The wrapped `nn.Embedding` weights are divided by `scale` at construction
    and every lookup is multiplied by `scale` again, so a given change of the
    stored weights moves the output `scale` times as much (a learning-rate
    boost for the embedding).

    With `smooth=True` the initial table is replaced by its cumulative sum
    along the index axis, so consecutive indices start with similar vectors.
    """
    def __init__(self, num_embeddings: int, embedding_dim: int,
                 scale: float = 10., smooth=False):
        super().__init__()
        self.scale = scale
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        # `.data` shares storage with the parameter, so the in-place ops
        # below rewrite the table without being tracked by autograd.
        table = self.embedding.weight.data
        if smooth:
            running = table.cumsum(dim=0)
            # A sum of n gaussian rows grows like sqrt(n); divide row i
            # (1-based) by sqrt(i) to normalize that growth away.
            row_count = torch.arange(1, num_embeddings + 1).to(running)
            table.copy_(running / row_count.sqrt().unsqueeze(1))
        table.div_(scale)

    @property
    def weight(self):
        """Effective embedding table: the stored weights times `scale`."""
        return self.embedding.weight * self.scale

    def forward(self, x):
        """Look up the indices in `x` and rescale the result by `scale`."""
        return self.embedding(x) * self.scale

In [31]:
# Settings for the frequency embedding.
freqs = 4096 // 2   # number of rows in the embedding table (2048)
chin_z = 2          # embedding dimension
emb_smooth = True   # start from a smoothed (cumulative-sum) table
emb_scale = 10      # scale factor handed to ScaledEmbedding
freq_embs = 0.2

# Only frequency indices up to 2048 are used, which is why the table
# ("vocabulary") size is 2048.
freq_emb = ScaledEmbedding(
    num_embeddings=freqs,
    embedding_dim=chin_z,
    scale=emb_scale,
    smooth=emb_smooth,
)
freq_emb_scale = freq_embs

In [32]:
freq_emb

ScaledEmbedding(
  (embedding): Embedding(2048, 2)
)

In [51]:
# Dummy input laid out as (B, C, F, T).
x = torch.randn((3, 2, 150, 200))

if freq_emb is not None:
    # One index per frequency bin; the frequency axis is x.shape[-2].
    frs = torch.arange(x.size(-2), device=x.device)

    # Lookup gives (F, C); transpose to (C, F), add singleton batch and time
    # axes -> (1, C, F, 1), then broadcast (copy) across batch and time steps
    # so the result has the same shape as x.
    per_freq = freq_emb(frs).transpose(0, 1)
    emb = per_freq.unsqueeze(0).unsqueeze(-1).expand_as(x)
    x = x + freq_emb_scale * emb
##############################################################
##############################################################

In [99]:
x.shape

torch.Size([3, 2, 150, 200])

### 핵심 : 참조논문 그대로 코드를 구현하지 않았다 (=참조 논문 수식과 다름. cos 존재 x)
실험 결과, smoothing을 통해  
**인접한 freq.들이 처음에 유사한 값**을 가지도록 만든 뒤,  
학습을 통해 **pos emb들을 학습**하도록 코드를 구현했다  
> 주파수 축에 Conv.을 사용하는 것은 픽셀에 Conv.을 사용하는 것과 그 의미가 다르다  
전문적인 용어로 말하자면 이동에 대한 국소적 불변성(invariant to translation)이 성립하지 않기 때문이고,  
직관적으로 설명하자면 이미지 픽셀과 다르게 주파수는 **범위의 높고 낮음에 따라 그 의미**가 달라지기 때문이다  
예컨대, 주파수 범위에 따라 굵은 남자 목소리인지, 높은 여성 목소리인지가 달라지듯 말이다  
따라서 주파수에 순서 정보를 추가해야 한다


in Hybrid Demucs Github  
"add frequency embedding **to allow for non equivariant convolutions** over the frequency axis."

---
1. smoothing 실험  
2. expand_as 실험 (차원 조정)
---

in Hybrid Demucs Paper...  
"To account for that, Isik et al. (2020) suggest injecting an embedding of the frequency before
applying the convolution. We use the same approach, with the addition that we smooth the
initial embedding so that close frequencies have similar embeddings."  
  
https://arxiv.org/pdf/2008.04470.pdf


In [82]:
# Smoothing demo:
# make neighbouring frequency indices start out with similar embedding values.
tmp = nn.Embedding(5, 2)
running_sum = tmp.weight.data.cumsum(dim=0)
print(tmp.weight)
print(running_sum)

# Row i (1-based) of the running sum is divided by sqrt(i).
sqrt_idx = torch.arange(1, 5 + 1).to(running_sum).sqrt().unsqueeze(1)
weight = running_sum / sqrt_idx
print('sqrt idx : ', sqrt_idx)
weight

Parameter containing:
tensor([[-0.2759,  0.8243],
        [ 1.3810, -1.8629],
        [ 0.0308, -1.1921],
        [ 0.1048, -0.3961],
        [ 0.5584,  0.1597]], requires_grad=True)
tensor([[-0.2759,  0.8243],
        [ 1.1051, -1.0385],
        [ 1.1359, -2.2307],
        [ 1.2408, -2.6267],
        [ 1.7992, -2.4671]])
sqrt idx :  tensor([[1.0000],
        [1.4142],
        [1.7321],
        [2.0000],
        [2.2361]])


tensor([[-0.2759,  0.8243],
        [ 0.7815, -0.7344],
        [ 0.6558, -1.2879],
        [ 0.6204, -1.3134],
        [ 0.8046, -1.1033]])

In [98]:
# expand_as demo: load the smoothed table into the embedding, then broadcast it.
tmp.weight = nn.Parameter(weight)

# Lookup of indices 0..4 gives (5, 2); transpose to (2, 5) and add singleton
# leading/trailing axes -> (1, 2, 5, 1).
looked_up = tmp(torch.arange(5)).transpose(0, 1)
res = looked_up.unsqueeze(0).unsqueeze(-1)

ex = torch.randn((1, 2, 5, 3))
res = res.expand_as(ex)
print(res.shape)
res  # B, C, F, T
# Each row is one index (position) of the table; the columns are the same
# value copied across the last (time) axis.

torch.Size([1, 2, 5, 3])


tensor([[[[-0.2759, -0.2759, -0.2759],
          [ 0.7815,  0.7815,  0.7815],
          [ 0.6558,  0.6558,  0.6558],
          [ 0.6204,  0.6204,  0.6204],
          [ 0.8046,  0.8046,  0.8046]],

         [[ 0.8243,  0.8243,  0.8243],
          [-0.7344, -0.7344, -0.7344],
          [-1.2879, -1.2879, -1.2879],
          [-1.3134, -1.3134, -1.3134],
          [-1.1033, -1.1033, -1.1033]]]], grad_fn=<ExpandBackward0>)