언어 모델 만들기

데이터 준비

In [2]:
from datasets import load_dataset
import torch

dataset = load_dataset("daekeun-ml/naver-news-summarization-ko")

data = dataset
# Join every training document into one string; tokenization below is
# character-level, so the vocabulary is the set of distinct characters.
ko_text = "".join(data["train"]["document"])
ko_chars = sorted(set(ko_text))
ko_vocab_size = len(ko_chars)
print("총 글자 수 :", ko_vocab_size)

# Lookup tables between characters and integer ids (ids follow sorted order).
character_to_ids = {char: i for i, char in enumerate(ko_chars)}
ids_to_character = dict(enumerate(ko_chars))


def token_encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [character_to_ids[c] for c in s]


def token_decode(l):
    """Return the string spelled by the sequence of integer ids ``l``.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return "".join(ids_to_character[i] for i in l)


tokenized_data = torch.tensor(token_encode(ko_text), dtype=torch.long)

# First 90% of the token stream is used for training, the rest is held out.
n = int(0.9 * len(tokenized_data))
train_dataset = tokenized_data[:n]
test_dataset = tokenized_data[n:]

torch.manual_seed(1234)

batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of characters the model processes at once

def batch_function(mode):
    """Sample a random batch of (input, target) windows.

    ``mode == "train"`` draws from ``train_dataset``; any other value draws
    from ``test_dataset``. Returns two tensors with ``batch_size`` rows of
    ``block_size`` tokens each, where each target row is the matching input
    row shifted one position to the right in the token stream.
    """
    source = test_dataset if mode != "train" else train_dataset
    # Upper bound leaves room for the one-step-ahead target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    input_rows = []
    target_rows = []
    for start in starts:
        input_rows.append(source[start:start + block_size])
        target_rows.append(source[start + 1:start + block_size + 1])
    return torch.stack(input_rows), torch.stack(target_rows)

# Draw one training batch to use as example inputs/targets in later cells.
example_x, example_y = batch_function("train")

총 글자 수 : 2701


In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class semiGPT(nn.Module):
    """Embedding-only model: each token id is looked up as a score vector."""

    def __init__(self, vocab_length):
        super().__init__()
        # nn.Embedding(num_embeddings, embedding_dim):
        #   num_embeddings - total number of tokens that can be embedded
        #   embedding_dim  - size of the vector that represents each token
        # Both are vocab_length here, so each looked-up row has one value
        # per vocabulary entry.
        self.embedding_token_table = nn.Embedding(
            num_embeddings=vocab_length,
            embedding_dim=vocab_length,
        )

    def forward(self, inputs, targets):
        """Return the embedding rows for ``inputs``.

        ``targets`` is accepted but not used by this version.
        """
        return self.embedding_token_table(inputs)

# Build the model and run one forward pass on the example batch.
model = semiGPT(ko_vocab_size)
output = model(example_x, example_y)
print(output.shape)

# torch.Size([4, 8, 2701])
# i.e. batch_size is 4, the sequence length is 8, and the vocabulary size is 2701

torch.Size([4, 8, 2701])


In [4]:
# Deliberately failing code ---> IndexError: index out of range in self
# The table has only 4 rows (valid ids 0-3), so looking up id 10 fails.
embedding = nn.Embedding(4, 4)
embedding(torch.tensor([[0, 1, 2, 10]]))

IndexError: index out of range in self

In [5]:
#에러가 발생되도록 세팅된 코드
import torch
import torch.nn as nn
from torch.nn import functional as F

class semiGPT(nn.Module):
    """Embedding-only model whose forward pass deliberately fails at the loss.

    Kept as a demonstration of the shape mismatch raised by cross entropy
    when logits and targets are passed without reshaping.
    """

    def __init__(self, vocab_length):
        super().__init__()
        self.embedding_token_table = nn.Embedding(vocab_length, vocab_length)

    def forward(self, inputs, targets):
        logits = self.embedding_token_table(inputs)

        # Use cross entropy as the loss function.
        # This fails: a target size of [4, 2701] was expected, but the targets actually received are [4, 8].

        # The shapes have to be adjusted:
        # logits is [4, 8, 2701], so it must be reshaped to [32, 2701],
        # and targets must be changed from [4, 8] to [32].
        loss = F.cross_entropy(logits, targets)
        return logits, loss

# The forward call below is expected to raise, so the print is never reached.
model = semiGPT(ko_vocab_size)
output, loss = model(example_x, example_y)  # -> RuntimeError: Expected target size [4, 2701], got [4, 8]
print(output)

RuntimeError: Expected target size [4, 2701], got [4, 8]

In [6]:
# shape 변환된 코드
import torch
import torch.nn as nn
from torch.nn import functional as F

class semiGPT(nn.Module):
    """Embedding-only model that flattens logits and targets for the loss."""

    def __init__(self, vocab_length):
        super().__init__()
        self.embedding_token_table = nn.Embedding(
            num_embeddings=vocab_length,
            embedding_dim=vocab_length,
        )

    def forward(self, inputs, targets):
        """Return ``(logits, loss)`` with logits flattened to two dimensions.

        The batch and sequence axes are merged so that cross entropy receives
        one row of scores per position and one target id per position.
        """
        scores = self.embedding_token_table(inputs)

        n_batch, n_steps, n_classes = scores.shape
        n_positions = n_batch * n_steps
        logits = scores.view(n_positions, n_classes)
        targets = targets.view(n_positions)

        loss = F.cross_entropy(logits, targets)

        # Report the shapes actually handed to the loss.
        print("logits의 shape는 : ", logits.shape, "입니다.")
        print("targets의 shape는 : ", targets.shape, "입니다.")

        return logits, loss

# Run the example batch through the reshaping model and show the loss.
model = semiGPT(ko_vocab_size)
logits, loss = model(example_x, example_y)
print(loss)

logits의 shape는 :  torch.Size([32, 2701]) 입니다.
targets의 shape는 :  torch.Size([32]) 입니다.
tensor(8.2693, grad_fn=<NllLossBackward0>)


In [7]:
# Inspect the shapes of the example inputs and targets.
example_x.shape, example_y.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

generate 메서드 추가

학습한 모델이 예측한 글자를 생성하기 위한 함수

In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class semiGPT(nn.Module):
    """Embedding-only language model with optional loss and sampling."""

    def __init__(self, vocab_length):
        super().__init__()
        self.embedding_token_table = nn.Embedding(
            num_embeddings=vocab_length,
            embedding_dim=vocab_length,
        )

    def forward(self, inputs, targets=None):
        """Return ``(logits, loss)``.

        Without ``targets`` the logits keep their three-dimensional shape and
        the loss is ``None``. With ``targets`` the batch and sequence axes are
        merged before cross entropy is computed, and the flattened logits are
        returned.
        """
        logits = self.embedding_token_table(inputs)
        if targets is None:
            return logits, None

        n_batch, n_steps, n_classes = logits.shape
        n_positions = n_batch * n_steps
        logits = logits.view(n_positions, n_classes)
        loss = F.cross_entropy(logits, targets.view(n_positions))
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        """Append ``max_new_tokens`` sampled token ids to ``inputs``.

        Each step scores the whole sequence so far, keeps only the scores at
        the last position, and samples one id per row from their softmax.
        """
        sequence = inputs
        for _step in range(max_new_tokens):
            scores, _unused_loss = self.forward(sequence)
            last_scores = scores[:, -1, :]
            print(last_scores.shape)
            distribution = F.softmax(last_scores, dim=-1)
            sampled = torch.multinomial(distribution, num_samples=1)
            sequence = torch.cat((sequence, sampled), dim=1)
        return sequence

# Untrained model: report the loss on the example batch.
model = semiGPT(ko_vocab_size)
logits, loss = model(example_x, example_y)
print(loss)

# Start from a single token with id 0, sample 10 more ids, and decode them.
token_decode(model.generate(torch.zeros((1,1),
                                        dtype=torch.long),
                            max_new_tokens=10)[0].tolist())

tensor(8.4701, grad_fn=<NllLossBackward0>)
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])


' 鋪좇異e₊굿比끄족公'

In [9]:
import torch

# One batch entry holding three positions of four scores each.
logits = torch.tensor([[
    [0.1, 0.2, 0.3, 0.4],
    [0.2, 0.3, 0.4, 0.1],
    [0.3, 0.4, 0.1, 0.2],
]])

# Index -1 on the middle axis keeps only the final position of every batch
# entry, which drops that axis from the result.
result = logits[:, -1, :]
print("선택되는 값        : ", result)
print("결과에 대한 size 값 : ", result.size())

선택되는 값        :  tensor([[0.3000, 0.4000, 0.1000, 0.2000]])
결과에 대한 size 값 :  torch.Size([1, 4])


GPU를 사용하도록 수정

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Using device: {device}")

class semiGPT(nn.Module):
    """Embedding-only language model that moves its inputs to its own device.

    The embedding table is created on the module-level ``device``. Inputs and
    targets are then moved to whatever device the table currently lives on, so
    the model keeps working if it is later moved with ``.to(...)``.
    """

    def __init__(self, vocab_length):
        super().__init__()
        # Created on the module-level device so the model is usable without
        # an explicit .to(device) call by the caller.
        self.embedding_token_table = nn.Embedding(vocab_length, vocab_length).to(device)

    def forward(self, inputs, targets=None):
        """Return ``(logits, loss)``.

        Without ``targets`` the logits keep their three-dimensional shape and
        the loss is ``None``. With ``targets`` the batch and sequence axes are
        merged before cross entropy is computed, and the flattened logits are
        returned.
        """
        # Follow the parameters rather than the global, so a model moved
        # after construction does not receive tensors from another device.
        model_device = self.embedding_token_table.weight.device
        inputs = inputs.to(model_device)

        logits = self.embedding_token_table(inputs)
        if targets is None:
            return logits, None

        targets = targets.to(model_device)
        batch, seq_length, vocab_length = logits.shape
        logits = logits.view(batch * seq_length, vocab_length)
        targets = targets.view(batch * seq_length)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip graph building
    def generate(self, inputs, max_new_tokens):
        """Append ``max_new_tokens`` sampled token ids to ``inputs``.

        Each step scores the whole sequence so far, keeps only the scores at
        the last position, and samples one id per row from their softmax.
        The returned tensor lives on the model's device.
        """
        inputs = inputs.to(self.embedding_token_table.weight.device)
        for _ in range(max_new_tokens):
            logits, _loss = self.forward(inputs)
            logits = logits[:, -1, :]
            print(logits.shape)
            probs = F.softmax(logits, dim=-1)
            # The sample is produced on the same device as probs, so no
            # further device move is needed before concatenating.
            next_inputs = torch.multinomial(probs, num_samples=1)
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs

# Untrained model: report the loss on the example batch.
model = semiGPT(ko_vocab_size)
logits, loss = model(example_x, example_y)
print(loss)

# Start from a single token with id 0, sample 10 more ids, and decode them.
token_decode(model.generate(torch.zeros((1,1),
                                        dtype=torch.long),
                            max_new_tokens=10)[0].tolist())

[INFO] Using device: cuda
tensor(8.3893, device='cuda:0', grad_fn=<NllLossBackward0>)
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])


' 뗍둔異프숩뷔튬꺼乎혐'

optimizer 추가하기

모델 훈련 시 손실 함수를 이용해 모델의 예측 값과 실제 정답 데이터 사이의 차이(손실)를 계산하고  
손실을 최소화하기 위해 모델의 매개변수를 적절히 조정한다.

옵티마이저는 이 매개변수 조정 과정을 담당

In [19]:
learning_rate = 1e-2
# Fresh, untrained model moved to the selected device.
model = semiGPT(ko_vocab_size).to(device)
# AdamW will update every model parameter using the learning rate above.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [20]:
from tqdm.auto import tqdm

# batch_function reads this module-level value, so batches now have 32 rows.
batch_size = 32
for steps in tqdm(range(10000)):
    # Sample a fresh random training batch and move it to the model's device.
    example_x, example_y = batch_function("train")
    example_x = example_x.to(device)
    example_y = example_y.to(device)
    logits, loss = model(example_x, example_y)
    # Reset the gradients held by the optimizer's parameters
    optimizer.zero_grad(set_to_none=True)
    # Backpropagate to compute gradients
    loss.backward()
    # Update the weights
    optimizer.step()

# Loss of the final training step only.
print(loss.item())

100%|██████████| 10000/10000 [00:13<00:00, 752.63it/s]

3.1945948600769043





In [21]:
# Sample 10 characters from the trained model, starting from token id 0.
print(token_decode(model.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=10)[0].tolist()))

torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
torch.Size([1, 2701])
 퍼니다. 등급증스에


Loss 함수 만들기

In [22]:
# torch.no_grad() 데코레이터에 의해 해당 함수 내에서 이뤄지는 모든 연산에 대해 그레디언트 계산을 자동으로 비활성화한다.
# 중간중간 평가하는 함수용
# 모델을 평가하는 단계에서는 그레디언트 계산과 가중치 업데이트가 필요 없다.

@torch.no_grad()
def compute_loss_metrics():
    """Return the mean loss per mode over ``eval_iteration`` random batches.

    The result maps ``"train"`` and ``"eval"`` to a scalar tensor. The model
    is switched to evaluation mode for the measurement and put back into
    training mode before returning.
    """
    model.eval()
    results = {}
    for split in ("train", "eval"):
        per_batch = torch.zeros(eval_iteration)
        for step in range(eval_iteration):
            batch_inputs, batch_targets = batch_function(split)
            _logits, batch_loss = model(batch_inputs, batch_targets)
            per_batch[step] = batch_loss.item()
        results[split] = per_batch.mean()
    model.train()
    return results