### RevNet 연산 검증

In [1]:
import torch
import numpy as np
import time
from torch import nn

In [2]:
# Two random 3x3 inputs standing in for the two halves (x1, x2) of a
# reversible block's input; values are drawn uniformly from [0, 1).
x1, x2 = torch.rand(3, 3), torch.rand(3, 3)

In [3]:
# Echo both input tensors so they can be compared with the reconstruction later.
x1, x2

(tensor([[0.0436, 0.2922, 0.3363],
         [0.9921, 0.0649, 0.4308],
         [0.4252, 0.5185, 0.7432]]),
 tensor([[0.0520, 0.4300, 0.3192],
         [0.1293, 0.6289, 0.0639],
         [0.4274, 0.7762, 0.7109]]))

In [4]:
# For simplicity, both residual functions f and g are plain Linear layers
# mapping 3 features to 3 features.
f, g = (
    nn.Linear(in_features=3, out_features=3),
    nn.Linear(in_features=3, out_features=3),
)

In [5]:
# Forward pass of the reversible block:
#   y1 = x1 + f(x2)
#   y2 = x2 + g(y1)
# y2 is computed from the freshly produced y1, not from x1.
y1 = torch.add(x1, f(x2))
y2 = torch.add(x2, g(y1))

In [6]:
# Echo the forward outputs of the reversible block.
y1, y2

(tensor([[ 0.2103,  0.6087, -0.1216],
         [ 1.2446,  0.2857,  0.1223],
         [ 0.4025,  0.8605,  0.4191]], grad_fn=<AddBackward0>),
 tensor([[-0.1389,  0.3948,  0.4288],
         [ 0.4352,  0.9487, -0.2886],
         [ 0.5939,  0.4911,  0.9132]], grad_fn=<AddBackward0>))

In [7]:
# Inverse pass: undo the forward coupling using only y1, y2 and the same
# f and g. r2 must be recovered first because r1 depends on it.
#   r2 = y2 - g(y1)
#   r1 = y1 - f(r2)
r2 = torch.sub(y2, g(y1))
r1 = torch.sub(y1, f(r2))

In [8]:
# Echo the reconstructed tensors; they should match x1 and x2 shown earlier.
r1, r2

(tensor([[0.0436, 0.2922, 0.3363],
         [0.9921, 0.0649, 0.4308],
         [0.4252, 0.5185, 0.7432]], grad_fn=<SubBackward0>),
 tensor([[0.0520, 0.4300, 0.3192],
         [0.1293, 0.6289, 0.0639],
         [0.4274, 0.7762, 0.7109]], grad_fn=<SubBackward0>))

y1과 y2(그리고 순전파에 사용한 것과 동일한 f, g)만을 가지고 r1과 r2를 만들었는데,
그 값이 x1, x2와 일치한다.

### 리포머 모델 실행 속도 측정

In [13]:
from transformers import ReformerModelWithLMHead
from transformers import ReformerModel

In [14]:
# Load the pretrained "google/reformer-enwik8" checkpoint into the base
# ReformerModel. The checkpoint's lm_head.* weights are not used by this
# class, which is what the warning printed below reports.
model = ReformerModel.from_pretrained("google/reformer-enwik8")

Downloading config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/569M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/reformer-enwik8 were not used when initializing ReformerModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing ReformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ReformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def make_random_inputs(batch_size, sequence_length, vocab_size=258):
    """Return a random batch of token ids as an int64 tensor.

    Args:
        batch_size: Number of sequences in the batch.
        sequence_length: Number of token ids per sequence.
        vocab_size: Exclusive upper bound for the sampled ids. Defaults to
            258, the value this notebook originally hard-coded.

    Returns:
        A ``torch.int64`` tensor of shape ``(batch_size, sequence_length)``
        with values drawn uniformly from ``[0, vocab_size)``.
    """
    # Request int64 explicitly: NumPy's default integer type is
    # platform-dependent, and index tensors are expected to be int64 (Long).
    token_ids = np.random.randint(
        0, vocab_size, size=(batch_size, sequence_length), dtype=np.int64
    )
    return torch.from_numpy(token_ids)

In [12]:
# Starting configuration for the timing experiment: 32 sequences of 64 tokens
# (32 * 64 = 2048 tokens per batch).
batch_size, sequence_length = 32, 64

In [15]:
# Time one forward pass for each of six configurations. After every run the
# batch size is halved and the sequence length doubled, so the number of
# tokens per batch (batch_size * sequence_length) stays constant while the
# sequence gets longer.
for _ in range(6):
    inputs = make_random_inputs(batch_size, sequence_length)

    # Inference-only benchmark: disable autograd so graph bookkeeping does not
    # inflate the measurement, and use the monotonic high-resolution timer.
    with torch.no_grad():
        start = time.perf_counter()
        o = model(inputs)
        end = time.perf_counter()

    print(f'{end-start:.2f} seconds for input size of({batch_size},{sequence_length})')

    # NOTE: this mutates the notebook-level variables, so re-running the cell
    # without resetting them continues from the last values.
    batch_size = batch_size // 2
    sequence_length = sequence_length * 2

2.11 seconds for input size of(32,64)
2.24 seconds for input size of(16,128)
2.85 seconds for input size of(8,256)
4.70 seconds for input size of(4,512)
4.45 seconds for input size of(2,1024)
4.42 seconds for input size of(1,2048)


기존의 O(L^2) 시간 복잡도를 가진 어텐션이라면
배치 사이즈가 1/2로 줄어들더라도 시퀀스 길이가 2배가 될 때마다 어텐션 연산량(B·L^2)은 약 2배씩 늘어야 한다.
하지만 리포머에서는 시퀀스 길이가 64에서 2048로 32배 늘어나는 동안 실행 시간은 2.11초에서 4.42초로 약 2배만 증가하여, 시퀀스 길이가 증가해도 실행 시간이 상대적으로 조금씩만 증가한다는 것을 알 수 있다.