셀프 어텐션 추가

문자들 간에 정보를 주고받는 방식 (평균 방식)

In [1]:
# 배치가 2 이고, 시퀀스 길이가 4, 임베딩 차원이 6 인 데이터를 생성

import torch
torch.manual_seed(1441)
num_batches, sequence_length, embedding_dim = 2, 4, 6
embeddings_tensor = torch.randn(num_batches,
                                sequence_length,
                                embedding_dim)
embeddings_tensor.shape

torch.Size([2, 4, 6])

In [2]:
# 이 코드의 목표는 더 나은 예측을 위해 시퀀스들이 서로 어떻게 정보를 주고받을 수 있는지? 를 알아보는 것
# 여기서 주목할 점은 4개의 시퀀스가 순차적으로 입력된다는 점

# 이전 임베딩의 평균을 저장할 텐서 초기화
averaged_embeddings = torch.zeros((num_batches, sequence_length, embedding_dim))

# 각 배치에 대해 반복
for batch_index in range(num_batches):
    # 각 시퀀스 위치에 대해 반복
    for sequence_position in range(sequence_length):
        # 현재 시퀀스 위치까지의 이전 임베딩을 슬라이스
        previous_embeddings = embeddings_tensor[batch_index, :sequence_position + 1]
        # 현재 위치까지의 임베딩의 평균을 계산
        averaged_embeddings[batch_index, sequence_position] = torch.mean(
            previous_embeddings,
            dim=0
        )

In [3]:
print(embeddings_tensor[0])
print(averaged_embeddings[0])

tensor([[-1.1437, -1.2611, -0.1634, -0.5255, -1.0879,  0.3712],
        [ 2.2335,  0.3099, -1.3975,  1.1141, -0.3373,  0.6924],
        [ 0.2644,  1.1567, -0.5040, -0.7986,  2.6778,  1.4161],
        [ 1.3159, -0.5231,  1.2933, -0.8819,  0.7118,  0.4209]])
tensor([[-1.1437, -1.2611, -0.1634, -0.5255, -1.0879,  0.3712],
        [ 0.5449, -0.4756, -0.7804,  0.2943, -0.7126,  0.5318],
        [ 0.4514,  0.0685, -0.6883, -0.0700,  0.4175,  0.8266],
        [ 0.6675, -0.0794, -0.1929, -0.2730,  0.4911,  0.7252]])


In [4]:
print(embeddings_tensor[0][0])
print(averaged_embeddings[0][0])

tensor([-1.1437, -1.2611, -0.1634, -0.5255, -1.0879,  0.3712])
tensor([-1.1437, -1.2611, -0.1634, -0.5255, -1.0879,  0.3712])


In [5]:
print(embeddings_tensor[0][1])
print(averaged_embeddings[0][1])

tensor([ 2.2335,  0.3099, -1.3975,  1.1141, -0.3373,  0.6924])
tensor([ 0.5449, -0.4756, -0.7804,  0.2943, -0.7126,  0.5318])


In [6]:
(embeddings_tensor[0][0][0] + averaged_embeddings[0][1][0]) / 2

tensor(-0.2994)

행렬 곱

In [7]:
# 행렬곱 연산 예시

A = torch.ones(3,3)
B = torch.randint(0, 10, (3,2)).float()
AB = A @ B

print(" A 행렬 ")
print(A)
print("==============")
print("==============")
print(" B 행렬 ")
print(B)
print("==============")
print("==============")
print(" AB 행렬 ")
print(AB)

 A 행렬 
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
 B 행렬 
tensor([[7., 2.],
        [0., 5.],
        [2., 2.]])
 AB 행렬 
tensor([[9., 9.],
        [9., 9.],
        [9., 9.]])


tril 이라는 함수를 이용해서 구현

In [8]:
weight = torch.tril(torch.ones(sequence_length, sequence_length))
print(weight)
weight = weight / weight.sum(1, keepdim=True)
print(weight)

tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])


In [10]:
# 행렬을 이용해 앞서 for 문으로 만든 행렬 평균과 마스크를 씌워서, 앞서 계산한 값과 같은 값이 나오는지 확인해 보자.
matrix_averaged_embeddings = weight @ embeddings_tensor
torch.allclose(averaged_embeddings, matrix_averaged_embeddings)

True

In [12]:
import torch.nn.functional as F

# torch 에서 제공하는 masked_fill 함수를 이용해보자.
weight = torch.tril(torch.ones(sequence_length, sequence_length))
weight = weight.masked_fill(weight == 0, float('-inf')) # 0이라는 숫자에는 -inf를 쓰우겠다는 코드이다.
print(weight)
weight = F.softmax(weight, dim=-1)
print(weight)

tensor([[1., -inf, -inf, -inf],
        [1., 1., -inf, -inf],
        [1., 1., 1., -inf],
        [1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])


In [13]:
weight_tril_embeddings = weight @ embeddings_tensor
torch.allclose(averaged_embeddings, weight_tril_embeddings)

True

## 셀프 어텐션이란?

입력 시퀀스(문장)를 쿼리(Q), 키(K), 밸류(V) 세 개로 복사하고 계산

결과적으로 각 단어의 새로운 표현은 시퀀스 내 모든 단어와의 관계를 반영 ---> 이러한 과정이 셀프 어텐션 메커니즘의 핵심 작동 원리

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 고정된 난수 시드 설정
torch.manual_seed(1111)

# 배치 크기, 시퀀스 길이, 채널 수 설정
batch_size, seq_length, num_channels = 2, 4, 4
input_tensor = torch.randn(batch_size, seq_length, num_channels)

# 각 헤드의 크기
head_size = 16

# Key, Query, Value 변환을 위한 선형 레이어
key_transform = nn.Linear(num_channels, head_size, bias=False)
query_transform = nn.Linear(num_channels, head_size, bias=False)
value_transform = nn.Linear(num_channels, head_size, bias=False)

# Key, Query, Value 변환 수행
keys = key_transform(input_tensor)
queries = query_transform(input_tensor)
values = value_transform(input_tensor)

# Attention 스코어 계산
attention_scores = queries @ keys.transpose(-2, -1)

# 하삼각행렬 생성 및 마스킹
mask_lower_triangle = torch.tril(torch.ones(seq_length, seq_length))
attention_scores = attention_scores.masked_fill(mask_lower_triangle == 0, float('-inf'))

# 소프트맥스 함수를 사용하여 확률 정규화
normalized_scores = F.softmax(attention_scores, dim=-1)

# 최종 출력 계산
output_tensor = normalized_scores @ values

output_tensor

tensor([[[-0.4755, -0.5409, -0.1864,  0.2951, -1.0717, -0.6172, -0.0176,
           0.1793, -0.1113,  0.6589, -0.4507, -0.1181, -0.9728, -0.8870,
           0.2349, -0.0431],
         [-0.4675, -0.5344, -0.1847,  0.2859, -1.0581, -0.6044, -0.0154,
           0.1778, -0.1141,  0.6524, -0.4473, -0.1211, -0.9561, -0.8733,
           0.2352, -0.0451],
         [-0.0760, -0.1545, -0.0268, -0.0634, -0.2490, -0.0492,  0.0418,
           0.0039, -0.1387,  0.1754, -0.1870, -0.1300, -0.1049, -0.1437,
           0.0797, -0.0811],
         [ 1.0050,  0.6488,  0.1280, -1.3952,  1.4225,  1.7320,  0.3957,
          -0.0998, -0.6179, -0.5368,  0.1755, -0.6712,  2.0809,  1.6208,
           0.2876, -0.4129]],

        [[-0.1629, -0.3577,  0.2200, -0.0743, -0.4798, -0.1531,  0.1460,
          -0.3159, -0.3507,  0.2564, -0.4777,  0.0395, -0.2861, -0.3503,
          -0.0974, -0.1463],
         [-0.1699, -0.3586,  0.1711, -0.0815, -0.4939, -0.1562,  0.1316,
          -0.2638, -0.3395,  0.2754, -0.4681, -0.0

스케일링 이라고 불리는 과정이 필요하다.
계산된 attention_scores 를 특정 값(root dk)으로 나누는 것

왜 나눠야 하는가?

바로 소프트맥스 함수 때문이다.

구체적으로 소프트맥스 함수는 모델이 고려 중인 모든 가능한 다음 단어들에 대해 확률을 계산하는데,
이 과정에서 어텐션 점수가 극단적으로 커지거나 작아질 수 있다.
따라서 특정 값으로 나누는 스케일링을 통해서, 분산 값을 감소시켜서, 소프트맥스 함수가 여러 위치의 정보를 골로루 반영할 수 있게 된다.

In [15]:
# dk로 왜 나누어주는지 코드로 설명하는 부분
k = torch.randn(batch_size, sequence_length, embedding_dim)
q = torch.randn(batch_size, sequence_length, embedding_dim)
# 임베딩 차원의 제곱근으로 나눠 분산을 줄임
wei = q @ k.transpose(-2, -1) * (embedding_dim ** -0.5)
wei.var()

tensor(0.8900)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 고정된 난수 시드 설정
torch.manual_seed(1111)

# 배치 크기, 시퀀스 길이, 채널 수 설정
batch_size, sequence_length, embedding_dim = 2, 4, 4
input_tensor = torch.randn(batch_size, sequence_length, embedding_dim)

# 헤드 사이즈 설정
head_dimension = 16

# Key, Query, Value 변환을 위한 선형 레이어
key_layer = nn.Linear(embedding_dim, head_dimension, bias=False)
query_layer = nn.Linear(embedding_dim, head_dimension, bias=False)
value_layer = nn.Linear(embedding_dim, head_dimension, bias=False)

# Key, Query, Value 변환 수행
key_matrix = key_layer(input_tensor)
query_matrix = query_layer(input_tensor)

# 스케일링 계수를 적용한 Attention 스코어 계산
scaling_factor = embedding_dim ** -0.5
attention_scores = query_matrix @ key_matrix.transpose(-2, -1) * scaling_factor

# 하삼각 행렬로 마스킹, 무한대로 채움
mask = torch.tril(torch.ones(sequence_length, sequence_length))
attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))

# 소프트맥스를 적용하여 Attention 확률 정규화
normalized_attention = F.softmax(attention_scores, dim=-1)

# Value 변환 적용
value_matrix = value_layer(input_tensor)

# 최종 출력 계산
output_tensor = normalized_attention @ value_matrix

output_tensor

tensor([[[-4.7553e-01, -5.4087e-01, -1.8645e-01,  2.9508e-01, -1.0717e+00,
          -6.1721e-01, -1.7619e-02,  1.7932e-01, -1.1134e-01,  6.5890e-01,
          -4.5073e-01, -1.1805e-01, -9.7278e-01, -8.8699e-01,  2.3494e-01,
          -4.3051e-02],
         [-3.7282e-01, -4.5845e-01, -1.6476e-01,  1.7766e-01, -8.9889e-01,
          -4.5412e-01,  1.1151e-02,  1.6013e-01, -1.4667e-01,  5.7623e-01,
          -4.0744e-01, -1.5664e-01, -7.6102e-01, -7.1314e-01,  2.3889e-01,
          -6.8812e-02],
         [ 3.3135e-02, -3.0254e-02,  3.8257e-02, -1.3334e-01,  1.8626e-02,
           8.7150e-02,  4.3044e-02, -7.2718e-02, -1.1493e-01, -2.8212e-03,
          -8.7858e-02, -9.4005e-02,  1.4480e-01,  7.8447e-02, -1.1284e-02,
          -7.3810e-02],
         [ 8.0965e-01,  5.1643e-01,  1.1648e-01, -1.1408e+00,  1.1586e+00,
           1.3968e+00,  3.1847e-01, -1.0840e-01, -5.1064e-01, -4.4907e-01,
           1.2734e-01, -5.5556e-01,  1.7125e+00,  1.3270e+00,  2.0701e-01,
          -3.4455e-01]],

  

## 셀프 어텐션 적용하기

In [20]:
# 데이터 준비
from datasets import load_dataset
import torch

dataset = load_dataset("daekeun-ml/naver-news-summarization-ko")

data = dataset
ko_text = "".join(data["train"]["document"])
ko_chars = sorted(list(set((ko_text))))
ko_vocab_size = len(ko_chars)
print("총 글자 수 :", ko_vocab_size)

character_to_ids = {char:i for i, char in enumerate(ko_chars)}
ids_to_character = {i:char for i, char in enumerate(ko_chars)}
token_encode = lambda s:[character_to_ids[c] for c in s]
token_decode = lambda l: "".join([ids_to_character[i] for i in l])

tokenized_data = torch.tensor(token_encode(ko_text), dtype=torch.long)

n = int(0.9 * len(tokenized_data))
train_dataset = tokenized_data[:n]
test_dataset = tokenized_data[n:]

torch.manual_seed(1234)

  from .autonotebook import tqdm as notebook_tqdm


총 글자 수 : 2701


<torch._C.Generator at 0x7ff64d146390>

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size = 32
block_size = 8
max_iteration = 50000
eval_interval = 300
learning_rate = 1e-2
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iteration = 200

In [18]:
n_embed = 32
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, inputs):
        batch_size, sequence_length, embedding_dim = inputs.shape
        keys = self.key(inputs)
        queries = self.query(inputs)
        weights = queries @ keys.transpose(-2, -1) * (embedding_dim ** -0.5)
        weights = weights.masked_fill(
            self.tril[:sequence_length, :sequence_length] == 0, float("-inf")
            )
        weights = F.softmax(weights, dim=-1)
        values = self.value(inputs)
        output = weights @ values
        return output

In [21]:
def batch_function(mode):
    dataset = train_dataset if mode == "train" else test_dataset
    idx = torch.randint(len(dataset) - block_size, (batch_size,))
    x = torch.stack([dataset[index:index+block_size] for index in idx])
    y = torch.stack([dataset[index+1:index+block_size+1] for index in idx])
    x, y = x.to(device), y.to(device) # .to 를 추가
    return x, y

@torch.no_grad()
def compute_loss_metrics():
    out = {}
    model.eval()
    for mode in ["train", "eval"]:
        losses = torch.zeros(eval_iteration)
        for k in range(eval_iteration):
            inputs, targets = batch_function(mode)
            logits, loss = model(inputs, targets)
            losses[k] = loss.item()
        out[mode] = losses.mean()
    model.train()
    return out

class semiGPT(nn.Module):
    def __init__(self, vocab_length):
        super().__init__()
        ##### self.embedding_token_table = nn.Embedding(vocab_length, vocab_length)
        #####
        self.token_embedding_table = nn.Embedding(vocab_length, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.attention_head = Head(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_length)
        #####

    def forward(self, inputs, targets=None):
        ##### logits = self.embedding_token_table(inputs)
        #####
        batch, sequence = inputs.shape

        token_embed = self.token_embedding_table(inputs)
        pos_embed = self.position_embedding_table(
            torch.arange(sequence, device=device)
            )
        x = token_embed + pos_embed
        x = self.attention_head(x)
        logits = self.lm_head(x)
        #####
        if targets is None:
            loss = None
        else:
            batch, seq_length, vocab_length = logits.shape
            logits = logits.view(batch * seq_length, vocab_length)
            targets = targets.view(batch*seq_length)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        for _ in range(max_new_tokens):
            ##### logits, loss = self.forward(inputs)
            #####
            inputs_cond = inputs[:, -block_size:]
            logits, loss = self.forward(inputs_cond)
            #####            
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_inputs = torch.multinomial(probs, num_samples=1)
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs

model = semiGPT(ko_vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iteration):
    if step % eval_interval == 0 :
        losses = compute_loss_metrics()
        print(f'step : {step}, train loss : {losses["train"]:.4f}, val loss : {losses["eval"]:.4f}')

    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

inputs = torch.zeros((1,1), dtype=torch.long, device=device)
print(token_decode(model.generate(inputs, max_new_tokens=100)[0].tolist()))

step : 0, train loss : 7.9283, val loss : 7.9312
step : 300, train loss : 4.1318, val loss : 4.1247
step : 600, train loss : 3.8816, val loss : 3.8712
step : 900, train loss : 3.7493, val loss : 3.7830
step : 1200, train loss : 3.7262, val loss : 3.7230
step : 1500, train loss : 3.6835, val loss : 3.6935
step : 1800, train loss : 3.6461, val loss : 3.6392
step : 2100, train loss : 3.6338, val loss : 3.6267
step : 2400, train loss : 3.6196, val loss : 3.6018
step : 2700, train loss : 3.5939, val loss : 3.5671
step : 3000, train loss : 3.5839, val loss : 3.5897
step : 3300, train loss : 3.5493, val loss : 3.5488
step : 3600, train loss : 3.5535, val loss : 3.5423
step : 3900, train loss : 3.5455, val loss : 3.5386
step : 4200, train loss : 3.5253, val loss : 3.5398
step : 4500, train loss : 3.5146, val loss : 3.5544
step : 4800, train loss : 3.5234, val loss : 3.5316
step : 5100, train loss : 3.5334, val loss : 3.5185
step : 5400, train loss : 3.5377, val loss : 3.5217
step : 5700, train

# 멀티헤드 어텐션과 피드포워드

## 멀티헤드 어텐션 만들기

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size = 32
block_size = 8
max_iteration = 50000
eval_interval = 300
learning_rate = 1e-3
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iteration = 200
n_embed = 32


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, inputs):
        batch_size, sequence_length, embedding_dim = inputs.shape
        keys = self.key(inputs)
        queries = self.query(inputs)
        weights = queries @ keys.transpose(-2, -1) * (embedding_dim ** -0.5)
        weights = weights.masked_fill(self.tril[:sequence_length, :sequence_length] == 0, float("-inf"))
        weights = F.softmax(weights, dim=-1)
        values = self.value(inputs)
        output = weights @ values
        return output



def batch_function(mode):
    dataset = train_dataset if mode == "train" else test_dataset
    idx = torch.randint(len(dataset) - block_size, (batch_size,))
    x = torch.stack([dataset[index:index+block_size] for index in idx])
    y = torch.stack([dataset[index+1:index+block_size+1] for index in idx])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def compute_loss_metrics():
    out = {}
    model.eval()
    for mode in ["train", "eval"]:
        losses = torch.zeros(eval_iteration)
        for k in range(eval_iteration):
            inputs, targets = batch_function(mode)
            logits, loss = model(inputs, targets)
            losses[k] = loss.item()
        out[mode] = losses.mean()
    model.train()
    return out

#####
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self,inputs):
        return torch.cat([head(inputs) for head in self.heads], dim=-1)
#####


class semiGPT(nn.Module):
    def __init__(self, vocab_length):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_length, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        ##### self.attention_head = Head(n_embed)
        self.attention_head = MultiHeadAttention(4, n_embed//4)
        self.lm_head = nn.Linear(n_embed, vocab_length)

    def forward(self, inputs, targets=None):
        batch, sequence = inputs.shape

        token_embed = self.token_embedding_table(inputs)
        pos_embed = self.position_embedding_table(torch.arange(sequence, device=device))
        x = token_embed + pos_embed
        x = self.attention_head(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batch, sequence, embed_size = logits.shape
            logits = logits.view(batch * sequence, embed_size)
            targets = targets.view(batch * sequence)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        for _ in range(max_new_tokens):
            inputs_cond = inputs[:, -block_size:]
            logits, loss = self(inputs_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_inputs = torch.multinomial(probs, num_samples=1)
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs


model = semiGPT(ko_vocab_size).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


for step in range(max_iteration):
    if step % eval_interval == 0 :
        losses = compute_loss_metrics()
        print(f'step : {step}, train loss : {losses["train"]:.4f}, val loss : {losses["eval"]:.4f}')

    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

inputs = torch.zeros((1,1), dtype=torch.long, device=device)
print("-----------------------------------------------")
print(token_decode(model.generate(inputs, max_new_tokens=100)[0].tolist()))

step : 0, train loss : 7.9658, val loss : 7.9669
step : 300, train loss : 4.7464, val loss : 4.7608
step : 600, train loss : 4.5965, val loss : 4.5786
step : 900, train loss : 4.4728, val loss : 4.4871
step : 1200, train loss : 4.3662, val loss : 4.3601
step : 1500, train loss : 4.2662, val loss : 4.2693
step : 1800, train loss : 4.1669, val loss : 4.1954
step : 2100, train loss : 4.1074, val loss : 4.1166
step : 2400, train loss : 4.0435, val loss : 4.0385
step : 2700, train loss : 3.9849, val loss : 3.9822
step : 3000, train loss : 3.9027, val loss : 3.9230
step : 3300, train loss : 3.8783, val loss : 3.8630
step : 3600, train loss : 3.8299, val loss : 3.8285
step : 3900, train loss : 3.7853, val loss : 3.8289
step : 4200, train loss : 3.7798, val loss : 3.7827
step : 4500, train loss : 3.7618, val loss : 3.7438
step : 4800, train loss : 3.7202, val loss : 3.7268
step : 5100, train loss : 3.7047, val loss : 3.6923
step : 5400, train loss : 3.6969, val loss : 3.6672
step : 5700, train

## FeedForward

어텐션 메커니즘은 입력 시퀀스의 각 요소와 전체 시퀀스 간의 관계를 계산한다.  
이 과정은 주로 입력 데이터의 전체적인 맥락을 파악하는 데 중점을 둔다.  
하지만 어텐션 메커니즘만으로는 데이터의 복잡한 패턴이나 비선형적 관계를 충분히 학습하기는 어렵다.

이 때 FeedForward 네트워크가 중요한 역할을 한다.

각 어텐션 블록 뒤에 FeedForward 네트워크를 배치하여  
각 시퀀스 위치마다 독립적으로 적용되며, 비선형 활성화 함수를 포함해 모델의 표현력을 높여준다.

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size = 32
block_size = 8
max_iteration = 50000
eval_interval = 300
learning_rate = 1e-2
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iteration = 200
n_embed = 32


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, inputs):
        batch_size, sequence_length, embedding_dim = inputs.shape
        keys = self.key(inputs)
        queries = self.query(inputs)
        weights = queries @ keys.transpose(-2, -1) * (embedding_dim ** -0.5)
        weights = weights.masked_fill(self.tril[:sequence_length, :sequence_length] == 0, float("-inf"))
        weights = F.softmax(weights, dim=-1)
        values = self.value(inputs)
        output = weights @ values
        return output



def batch_function(mode):
    dataset = train_dataset if mode == "train" else test_dataset
    idx = torch.randint(len(dataset) - block_size, (batch_size,))
    x = torch.stack([dataset[index:index+block_size] for index in idx])
    y = torch.stack([dataset[index+1:index+block_size+1] for index in idx])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def compute_loss_metrics():
    out = {}
    model.eval()
    for mode in ["train", "eval"]:
        losses = torch.zeros(eval_iteration)
        for k in range(eval_iteration):
            inputs, targets = batch_function(mode)
            logits, loss = model(inputs, targets)
            losses[k] = loss.item()
        out[mode] = losses.mean()
    model.train()
    return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self,inputs):
        return torch.cat([head(inputs) for head in self.heads], dim=-1)


#####
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed)
        )

    def forward(self, input_tensor):
        return self.layer(input_tensor)
#####

class semiGPT(nn.Module):
    def __init__(self, vocab_length):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_length, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.attention_head = MultiHeadAttention(4, n_embed//4)
        self.feed_forward = FeedForward(n_embed)
        # Block이 들어갈 위치
        self.lm_head = nn.Linear(n_embed, vocab_length)

    def forward(self, inputs, targets=None):
        batch, sequence = inputs.shape

        token_embed = self.token_embedding_table(inputs)
        pos_embed = self.position_embedding_table(torch.arange(sequence, device=device))
        x = token_embed + pos_embed
        x = self.attention_head(x)
        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batch, sequence, embed_size = logits.shape
            logits = logits.view(batch * sequence, embed_size)
            targets = targets.view(batch * sequence)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        for _ in range(max_new_tokens):

            inputs_cond = inputs[:, -block_size:]

            logits, loss = self(inputs_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_inputs = torch.multinomial(probs, num_samples=1)
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs


model = semiGPT(ko_vocab_size).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


for step in range(max_iteration):
    if step % eval_interval == 0 :
        losses = compute_loss_metrics()
        print(f'step : {step}, train loss : {losses["train"]:.4f}, val loss : {losses["eval"]:.4f}')

    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

inputs = torch.zeros((1,1), dtype=torch.long, device=device)
print("-----------------------------------------------")
print(token_decode(model.generate(inputs, max_new_tokens=100)[0].tolist()))

step : 0, train loss : 7.9134, val loss : 7.9140
step : 300, train loss : 4.2537, val loss : 4.2542
step : 600, train loss : 3.9467, val loss : 3.9158
step : 900, train loss : 3.8114, val loss : 3.8251
step : 1200, train loss : 3.7188, val loss : 3.7574
step : 1500, train loss : 3.6790, val loss : 3.7008
step : 1800, train loss : 3.6621, val loss : 3.6497
step : 2100, train loss : 3.6264, val loss : 3.6413
step : 2400, train loss : 3.6409, val loss : 3.6387
step : 2700, train loss : 3.5764, val loss : 3.5724
step : 3000, train loss : 3.5750, val loss : 3.5602
step : 3300, train loss : 3.5557, val loss : 3.5321
step : 3600, train loss : 3.5314, val loss : 3.5302
step : 3900, train loss : 3.5377, val loss : 3.5498
step : 4200, train loss : 3.5265, val loss : 3.5095
step : 4500, train loss : 3.5345, val loss : 3.5185
step : 4800, train loss : 3.4759, val loss : 3.5043
step : 5100, train loss : 3.4933, val loss : 3.4989
step : 5400, train loss : 3.4805, val loss : 3.4875
step : 5700, train

## Blocks 만들기

GPT와 같은 복잡한 신경망 모델에서 블록(Block)은 모델의 설계와 구현에 중요한 구조적 단위이다.

블록 구조는 모델 내 다양한 계층과 구성 요소를 하나로 묶어 모듈화, 재사용성, 확장성을 크게 향샹시킨다.

In [None]:
# Block 클래스의 코드
class Block(nn.Module):
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.attention = MultiHeadAttention(n_heads, head_size)
        self.feed_forward = FeedForward(n_embed)
        self.layer_norm1 = nn.LayerNorm(n_embed)
        self.layer_norm2 = nn.LayerNorm(n_embed)

    def forward(self, input_tensor):
        input_tensor = input_tensor + self.attention(self.layer_norm1(input_tensor))
        input_tensor = input_tensor + self.feed_forward(self.layer_norm2(input_tensor))
        return input_tensor

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size = 32
block_size = 8
max_iteration = 50000
eval_interval = 300
learning_rate = 1e-2
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iteration = 200
n_embed = 32
n_head = 4
n_layer = 4
dropout = 0.1

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, inputs):
        batch_size, sequence_length, embedding_dim = inputs.shape
        keys = self.key(inputs)
        queries = self.query(inputs)
        weights = queries @ keys.transpose(-2, -1) * (embedding_dim ** -0.5)
        weights = weights.masked_fill(self.tril[:sequence_length, :sequence_length] == 0, float("-inf"))
        weights = F.softmax(weights, dim=-1)
        values = self.value(inputs)
        output = weights @ values
        return output


def batch_function(mode):
    dataset = train_dataset if mode == "train" else test_dataset
    idx = torch.randint(len(dataset) - block_size, (batch_size,))
    x = torch.stack([dataset[index:index+block_size] for index in idx])
    y = torch.stack([dataset[index+1:index+block_size+1] for index in idx])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def compute_loss_metrics():
    out = {}
    model.eval()
    for mode in ["train", "eval"]:
        losses = torch.zeros(eval_iteration)
        for k in range(eval_iteration):
            inputs, targets = batch_function(mode)
            logits, loss = model(inputs, targets)
            losses[k] = loss.item()
        out[mode] = losses.mean()
    model.train()
    return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self,inputs):
        return torch.cat([head(inputs) for head in self.heads], dim=-1)


class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, input_tensor):
        return self.layer(input_tensor)


class Block(nn.Module):
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.attention = MultiHeadAttention(n_heads, head_size)
        self.feed_forward = FeedForward(n_embed)
        self.layer_norm1 = nn.LayerNorm(n_embed)
        self.layer_norm2 = nn.LayerNorm(n_embed)

    def forward(self, input_tensor):
        input_tensor = input_tensor + self.attention(self.layer_norm1(input_tensor))
        input_tensor = input_tensor + self.feed_forward(self.layer_norm2(input_tensor))
        return input_tensor


class semiGPT(nn.Module):
    def __init__(self, vocab_length):
        super().__init__()
        self.embedding_token_table = nn.Embedding(vocab_length, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, 4) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_length)

    def forward(self, inputs, targets=None):
        batch, sequence = inputs.shape

        token_embed = self.embedding_token_table(inputs) # (B, T, C)
        pos_embed = self.position_embedding_table(torch.arange(sequence, device=device)) # (T, C)
        x = token_embed + pos_embed
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batch, sequence, embed_size = logits.shape
            logits = logits.view(batch * sequence, embed_size)
            targets = targets.view(batch * sequence)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        for _ in range(max_new_tokens):
            inputs_cond = inputs[:, -block_size:]

            logits, loss = self(inputs_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_inputs = torch.multinomial(probs, num_samples=1)
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs


model = semiGPT(ko_vocab_size).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


for step in range(max_iteration):
    if step % eval_interval == 0 :
        losses = compute_loss_metrics()
        print(f'step : {step}, train loss : {losses["train"]:.4f}, val loss : {losses["eval"]:.4f}')

    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

inputs = torch.zeros((1,1), dtype=torch.long, device=device)
print("-----------------------------------------------")
print(token_decode(model.generate(inputs, max_new_tokens=100)[0].tolist()))

step : 0, train loss : 8.1663, val loss : 8.1647
step : 300, train loss : 4.1377, val loss : 4.1575
step : 600, train loss : 3.8563, val loss : 3.8552
step : 900, train loss : 3.7203, val loss : 3.7374
step : 1200, train loss : 3.6500, val loss : 3.6534
step : 1500, train loss : 3.5745, val loss : 3.5688
step : 1800, train loss : 3.5277, val loss : 3.5341
step : 2100, train loss : 3.5020, val loss : 3.4801
step : 2400, train loss : 3.4688, val loss : 3.4600
step : 2700, train loss : 3.4423, val loss : 3.4285
step : 3000, train loss : 3.4408, val loss : 3.4653
step : 3300, train loss : 3.4356, val loss : 3.4213
step : 3600, train loss : 3.4103, val loss : 3.3991
step : 3900, train loss : 3.3984, val loss : 3.3733
step : 4200, train loss : 3.3921, val loss : 3.3700
step : 4500, train loss : 3.3866, val loss : 3.3745
step : 4800, train loss : 3.3862, val loss : 3.3390
step : 5100, train loss : 3.3544, val loss : 3.3493
step : 5400, train loss : 3.3564, val loss : 3.3382
step : 5700, train

In [25]:
# 테스트
input_word = "의사"
input_ids = [character_to_ids[char] for char in input_word if char in character_to_ids]

# 입력 텐서 생성
inputs = torch.tensor([input_ids], dtype=torch.long).to(device)

# 모델을 사용하여 텍스트 생성
outputs = model.generate(inputs, 100)

# 생성된 결과 디코딩
generated_text = "".join([ids_to_character.get(i, '') for i in outputs[0].tolist()])

print("-----------------------------------------------")
print("Generated Text: ", generated_text)

-----------------------------------------------
Generated Text:  의사를 사무가 후보내다. 놀지 중국 사위주는 일대를 수표로 지금 10대 3주년 9의 연 결제를 거나3눴다. 나전기엔 카카오 것으로 인상멧은 “이부 금융가 보인다. 3만 MAMZ 서션 


# 토크나이저

효과적인 토크나이저는 텍스트의 의미를 잘 보존하면서도 데이터를 효율적으로 처리할 수 있게 한다.

In [27]:
# 토크나이저 만들기
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast

# 저장할 디렉토리 경로 설정
SAVE_DIR = "content/"

# 디렉토리가 없으면 생성
os.makedirs(SAVE_DIR, exist_ok=True)

# 원하는 어휘 크기 설정
VOCAB_SIZE = 10000

# 토크나이저 초기화
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

# 트레이너 준비 (vocab_size 지정)
trainer = BpeTrainer(
    special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
    vocab_size=VOCAB_SIZE
)

# 토크나이저 학습
def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset["train"]), batch_size):
        yield dataset["train"][i : i + batch_size]["document"]

tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

# 토크나이저를 JSON 파일로 저장
tokenizer_path = os.path.join(SAVE_DIR, "tokenizer.json")
tokenizer.save(tokenizer_path)

# 토크나이저를 Hugging Face 형식으로 변환
huggingface_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>"
)

# Hugging Face 형식의 토크나이저 저장
huggingface_path = os.path.join(SAVE_DIR, "huggingface_tokenizer")
huggingface_tokenizer.save_pretrained(huggingface_path)

# Hugging Face 형식의 토크나이저 로드
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(huggingface_path)

# 어휘 크기 확인
print(f"Vocabulary size: {len(tokenizer.get_vocab())}")

# 테스트
test_texts = ["안녕하세요", "자연어 처리는 매우 흥미로운 분야입니다", "인공지능과 기계학습의 발전이 놀랍습니다"]
for text in test_texts:
    encoded = tokenizer.encode(text)
    print(f"Original: {text}")
    print(f"Encoded: {encoded}")
    print(f"Decoded: {tokenizer.decode(encoded)}")
    print(f"Tokens: {tokenizer.convert_ids_to_tokens(encoded)}")
    print()




Vocabulary size: 10000
Original: 안녕하세요
Encoded: [1912, 1172, 2549, 9020]
Decoded: 안 녕 하 세요
Tokens: ['안', '녕', '하', '세요']

Original: 자연어 처리는 매우 흥미로운 분야입니다
Encoded: [4466, 1945, 2242, 2982, 4637, 2648, 1580, 3063, 2931, 2949]
Decoded: 자연 어 처 리는 매우 흥 미 로운 분야 입니다
Tokens: ['자연', '어', '처', '리는', '매우', '흥', '미', '로운', '분야', '입니다']

Original: 인공지능과 기계학습의 발전이 놀랍습니다
Encoded: [3765, 982, 5093, 5017, 2063, 3100, 2065, 1177, 1394, 2727]
Decoded: 인공지능 과 기계 학습 의 발전 이 놀 랍 습니다
Tokens: ['인공지능', '과', '기계', '학습', '의', '발전', '이', '놀', '랍', '습니다']



In [28]:
tokenizer

PreTrainedTokenizerFast(name_or_path='content/huggingface_tokenizer', vocab_size=10000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# 최종 코드

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# 하이퍼파라미터
batch_size = 32
block_size = 8
max_iteration = 100
eval_interval = 10
learning_rate = 1e-2
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iteration = 10
n_embed = 32
n_head = 4
n_layer = 4
dropout = 0.1


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, inputs):
        batch_size, sequence_length, embedding_dim = inputs.shape
        keys = self.key(inputs)
        queries = self.query(inputs)
        weights = queries @ keys.transpose(-2, -1) * (embedding_dim ** -0.5)
        weights = weights.masked_fill(self.tril[:sequence_length, :sequence_length] == 0, float("-inf"))
        weights = F.softmax(weights, dim=-1)
        values = self.value(inputs)
        output = weights @ values
        return output


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self,inputs):
        return torch.cat([head(inputs) for head in self.heads], dim=-1)


class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, input_tensor):
        return self.layer(input_tensor)


class Block(nn.Module):
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.attention = MultiHeadAttention(n_heads, head_size)
        self.feed_forward = FeedForward(n_embed)
        self.layer_norm1 = nn.LayerNorm(n_embed)
        self.layer_norm2 = nn.LayerNorm(n_embed)

    def forward(self, input_tensor):
        input_tensor = input_tensor + self.attention(self.layer_norm1(input_tensor))
        input_tensor = input_tensor + self.feed_forward(self.layer_norm2(input_tensor))
        return input_tensor


# 데이터셋 전처리
def preprocess_dataset(dataset, tokenizer):
    encoded_data = [tokenizer.encode(text, add_special_tokens=False) for text in dataset]
    tensor_data = [torch.tensor(seq, dtype=torch.long) for seq in encoded_data if len(seq) >= block_size + 1]
    return tensor_data

def create_dataloader(tensor_data, batch_size, block_size):
    dataset = TensorDataset(
        torch.stack([seq[:block_size] for seq in tensor_data]).to(device),
        torch.stack([seq[1:block_size+1] for seq in tensor_data]).to(device)
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

class semiGPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position_embedding = nn.Embedding(block_size, n_embed)
        self.blocks = nn.ModuleList([Block(n_embed, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=device))
        x = token_emb + pos_emb
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# 데이터 전처리
n = int(0.9 * len(dataset["train"]["document"]))
train_data = preprocess_dataset(dataset["train"]["document"][:n], tokenizer)
test_data = preprocess_dataset(dataset["train"]["document"][n:], tokenizer)

# 데이터 로더 생성
train_loader = create_dataloader(train_data, batch_size, block_size)
test_loader = create_dataloader(test_data, batch_size, block_size)

# 모델 초기화
vocab_size = len(tokenizer.get_vocab())
model = semiGPT(vocab_size).to(device)
print(f"모델의 파라미터 수: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# 평가 함수
@torch.no_grad()
def evaluate(data_loader):
    model.eval()
    total_loss = 0
    for batch in data_loader:
        x, y = batch
        x, y = x.to(device), y.to(device)
        logits, loss = model(x, y)
        total_loss += loss.item()
    return total_loss / len(data_loader)

# 학습 루프
from tqdm.auto import tqdm
for step in tqdm(range(max_iteration)):
    if step % eval_interval == 0:
        train_loss = evaluate(train_loader)
        val_loss = evaluate(test_loader)
        print(f'step : {step}, train loss : {train_loss:.4f}, val loss : {val_loss:.4f}')

    model.train()
    for batch in train_loader:
        x, y = batch
        x, y = x.to(device), y.to(device)
        logits, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# 텍스트 생성
context = "의사"
context_encoded = tokenizer.encode(context, return_tensors='pt').to(device)
generated_ids = model.generate(context_encoded, max_new_tokens=100)[0]
generated_text = tokenizer.decode(generated_ids)
print("Generated Text:", generated_text)

모델의 파라미터 수: 0.70M


  0%|          | 0/100 [00:00<?, ?it/s]

step : 0, train loss : 9.3706, val loss : 9.3801


 10%|█         | 10/100 [02:02<17:50, 11.89s/it]

step : 10, train loss : 3.6829, val loss : 5.8988


 20%|██        | 20/100 [04:04<15:53, 11.92s/it]

step : 20, train loss : 3.3648, val loss : 6.0488


 30%|███       | 30/100 [06:02<13:09, 11.28s/it]

step : 30, train loss : 3.2323, val loss : 6.2776


 40%|████      | 40/100 [07:58<11:16, 11.28s/it]

step : 40, train loss : 3.1974, val loss : 6.2955


 50%|█████     | 50/100 [09:53<09:22, 11.24s/it]

step : 50, train loss : 3.1459, val loss : 6.2845


 60%|██████    | 60/100 [11:49<07:30, 11.26s/it]

step : 60, train loss : 3.0931, val loss : 6.4014


 70%|███████   | 70/100 [13:46<05:51, 11.71s/it]

step : 70, train loss : 3.0674, val loss : 6.4840


 80%|████████  | 80/100 [15:45<03:47, 11.36s/it]

step : 80, train loss : 3.0808, val loss : 6.4226


 90%|█████████ | 90/100 [17:47<01:59, 11.97s/it]

step : 90, train loss : 3.0595, val loss : 6.4553


100%|██████████| 100/100 [19:50<00:00, 11.91s/it]


Generated Text: 의사 상품 게임 은 재건축 · 공원 · 3개월 만에 4 개 지정 세 노조 제안 으로 존 분석 총 개선 둔화 하면서 패션 사람들이 엄 신용 위원회 기간 3개 도 진단 사 치 업 권 영수 부회장이 글로벌 세계 가상인간 상반기 제출한 청 법인 날 식 … 이다 지급 참여 예고 스에서 6 10일 2022 한덕수 총리 임 병 내 국방 럴 4차 산업 산업 연구소 에서 설립 허준이 홍 배 총 고등과학원 AS 농 민 선 정부 첫 뒤 시즌 와 쿨 지수 공개 입법 보니 개선 … 사법 세를 된다 마감 ... 6개 시 성구 나 점 프로세
