# IMDB 데이터셋으로 학습한 영화 리뷰 GPT 모델 구현하기

## Tokenizer 준비

In [1]:
import math
import time

import numpy as np

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [2]:
# Hugging Face datasets 라이브러리에서 IMDB 데이터셋을 로드
ds = load_dataset("stanfordnlp/imdb")

# 새로운 WordPiece 토크나이저 초기화
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# BERT와 유사한 정규화 설정 (소문자 변환 및 기타 텍스트 조정)
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

# BERT와 유사한 사전 토크나이징 설정 (텍스트를 단어로 분리)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()


def get_training_corpus():
    # 데이터셋에서 텍스트 데이터의 청크를 생성하는 제너레이터 함수
    for i in range(0, len(ds["train"]), 1000):
        yield ds["train"][i : i + 1000]["text"]


# 토크나이저를 위한 특수 토큰 정의
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]"]

# 지정된 어휘 크기와 특수 토큰으로 토크나이저를 훈련시키기 위한 WordPieceTrainer 초기화
trainer = trainers.WordPieceTrainer(vocab_size=10000, special_tokens=special_tokens)

# 훈련된 토크나이저를 사용하여 텍스트 데이터로 훈련
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# Hugging Face Transformers 라이브러리와 호환되는 형식으로 훈련된 토크나이저로 변환
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)








In [3]:
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
NUM_EPOCHS = 10
MAX_TOKEN_LEN = 400

In [4]:
def collate_imdb(batch):
    texts, labels = [], []

    for row in batch:
        tokenized = tokenizer(
            row["text"], truncation=True, max_length=MAX_TOKEN_LEN
        ).input_ids

        # [1:]은 [CLS] 토큰을 제거하기 위함
        labels.append(torch.LongTensor(tokenized[1:]))

        # [:-1]은 [SEP] 토큰을 제거하기 위함
        texts.append(torch.LongTensor(tokenized[:-1]))

    # 배치 내 모든 텍스트와 label 시퀀스를 패딩하여 같은 길이로 맞춤
    text_inputs = pad_sequence(
        texts, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    label_inputs = pad_sequence(
        labels, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    return text_inputs, label_inputs

In [5]:
train_data_loader = DataLoader(
    ds["train"], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_imdb
)
test_data_loader = DataLoader(
    ds["test"], batch_size=BATCH_SIZE, collate_fn=collate_imdb
)

In [6]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10_000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, None], np.arange(d_model)[None, :], d_model
    )
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, input_dim, d_model, n_heads):
        super().__init__()

        self.input_dim = input_dim
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.wq = nn.Linear(input_dim, d_model)
        self.wk = nn.Linear(input_dim, d_model)
        self.wv = nn.Linear(input_dim, d_model)
        self.wo = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        batch_size, seq_length, _ = x.size()

        # 1. Q, K, V 생성
        q, k, v = self.wq(x), self.wk(x), self.wv(x)

        # 1.1 Reshape Q, K, V
        q = q.view(batch_size, seq_length, self.n_heads, self.d_k).transpose(1, 2)
        k = k.view(batch_size, seq_length, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(batch_size, seq_length, self.n_heads, self.d_k).transpose(1, 2)

        # 2. Attention 점수 계산
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

        # 3. Mask 적용 (필요한 경우)
        if mask is not None:
            scores = scores + (mask[:, None] * -1e9)

        # 4. Softmax 적용 및 Value 와 곱셈
        attention_weights = self.softmax(scores)
        output = torch.matmul(attention_weights, v)

        # 4.1 Transpose 및 Reshape
        output = (
            output.transpose(1, 2)
            .contiguous()
            .view(batch_size, seq_length, self.d_model)
        )

        # 5. 최종 선형 변환
        output = self.wo(output)

        return output

In [8]:
class TransformerLayer(nn.Module):
    def __init__(self, input_dim, d_model, n_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.multi_head_attention = MultiHeadAttention(input_dim, d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model),
        )

        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout()
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        x1 = self.multi_head_attention(x, mask)
        x1 = self.dropout1(x1)
        x1 = self.layer_norm1(x1 + x)

        x2 = self.ffn(x1)
        x2 = self.dropout2(x2)
        return self.layer_norm2(x2 + x1)

In [9]:
class GPT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, dff, max_len):
        super().__init__()

        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dff = dff
        self.max_len = max_len

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Parameter(
            positional_encoding(max_len, d_model), requires_grad=False
        )
        self.layers = nn.ModuleList(
            [TransformerLayer(d_model, d_model, n_heads, dff) for _ in range(n_layers)]
        )
        self.classification = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        seq_len = x.shape[1]
        mask1 = (x == tokenizer.pad_token_id)[..., None]  # (batch_size, seq_len, 1)
        mask2 = (
            torch.tril(torch.ones(seq_len, seq_len))
            .type(torch.ByteTensor)
            .to(x.device)[None]
        )
        mask = mask1 & mask2

        x = self.embedding(x)
        x = x * math.sqrt(self.d_model)
        x = x + self.pos_encoding[:, :seq_len]

        for layer in self.layers:
            x = layer(x, mask)

        return self.classification(x)

In [10]:
if torch.backends.mps.is_available():
    my_device = torch.device("mps")
elif torch.cuda.is_available():
    my_device = torch.device("cuda")
else:
    my_device = torch.device("cpu")

In [11]:
gpt_model = GPT(
    vocab_size=len(tokenizer),
    d_model=32,
    n_heads=4,
    n_layers=5,
    dff=32,
    max_len=MAX_TOKEN_LEN,
).to(my_device)
gpt_model

GPT(
  (embedding): Embedding(10001, 32)
  (layers): ModuleList(
    (0-4): 5 x TransformerLayer(
      (multi_head_attention): MultiHeadAttention(
        (wq): Linear(in_features=32, out_features=32, bias=True)
        (wk): Linear(in_features=32, out_features=32, bias=True)
        (wv): Linear(in_features=32, out_features=32, bias=True)
        (wo): Linear(in_features=32, out_features=32, bias=True)
        (softmax): Softmax(dim=-1)
      )
      (ffn): Sequential(
        (0): Linear(in_features=32, out_features=32, bias=True)
        (1): ReLU()
        (2): Linear(in_features=32, out_features=32, bias=True)
      )
      (layer_norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (layer_norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.5, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (classification): Linear(in_features=32, out_features=10001, bias=True)
)

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(gpt_model.parameters(), lr=LEARNING_RATE)

In [13]:
for epoch in range(NUM_EPOCHS):
    gpt_model.train()

    total_loss = 0.0
    start_time = time.time()
    for t, l in train_data_loader:
        optimizer.zero_grad()

        inputs, labels = t.to(my_device), l.to(my_device)

        outputs = gpt_model(inputs)

        predictions = outputs.reshape(-1, len(tokenizer))
        labels = labels.reshape(-1)
        mask = (inputs == tokenizer.pad_token_id).reshape(-1)

        loss = criterion(predictions, labels)
        loss = (loss * ~mask).sum() / (~mask).sum()
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    print(
        f"Epoch {epoch+1:3d} | {time.time() - start_time:.2f}s | Total Loss: {total_loss:.4f}"
    )

Epoch   1 | 82.28s | Total Loss: 1838.3804


In [17]:
input_text = "I am "
tokens_org = tokenizer(input_text).input_ids
tokens = torch.LongTensor(tokens_org)[None].to(my_device)

last_token_pred = gpt_model(tokens)[0, -1].argmax()
tokenizer.decode(tokens_org + [last_token_pred.item()])

'i am the'

In [18]:
def generate_text(device, model, start_text, max_length=50):
    model.eval()
    tokens = torch.LongTensor(tokenizer.encode(start_text))[None].to(device)
    generated_tokens = tokens[0].tolist()

    with torch.no_grad():  # 그래디언트 계산 비활성화
        for _ in range(max_length):
            predictions = model(tokens)
            next_token = predictions[0, -1, :].argmax().item()

            generated_tokens.append(next_token)
            tokens = torch.cat(
                [tokens, torch.LongTensor([[next_token]]).to(device)], dim=1
            )

            if next_token == tokenizer.sep_token_id:
                break

    return tokenizer.decode(generated_tokens)

In [19]:
input_text = "how was the movie?"
generate_text(my_device, gpt_model, input_text)

'how was the movie?, and the film. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie. the movie.'