### Transformer으로 IMDB 리뷰 분리하기
- Transformer : Transformer는 RNN없이 순수 self_attention만으로 시퀀스를 처리하며, BERT와 같은 사전학습 언어 모델의 핵심 구조

In [1]:
# Library
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split 

In [2]:
# Hyper Parameter
num_words = 5000
max_len = 100
embedding_dim=128
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [3]:
# IMDB 데이터 로드 (5000개의 단어만 사용)
(train_input, train_target), (test_input, test_target) = imdb.load_data(num_words=num_words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


### 훈련데이터를 훈련과 검증으로 분할

In [4]:
train_input, val_input, train_target, val_target = train_test_split(
                                                        train_input,
                                                        train_target,
                                                        test_size=0.2,
                                                        random_state=42
)

----
### Sequence Padding
: 전체 자릿수를 100으로 가정했을 경우 한 문장에 3개 토큰만 있을 경우 나머지 97개는 비워지고 이를 0으로 채우는 과정

In [5]:
# 패딩 처리 (최대길이 100)
train_seq = pad_sequences(train_input, maxlen = max_len)
val_seq = pad_sequences(val_input, maxlen=max_len)

In [6]:
# Pytorch Tensor 변환
train_seq_tensor = torch.tensor(train_seq, dtype=torch.long)
train_target_tensor = torch.tensor(train_target, dtype=torch.float32)

val_seq_tensor = torch.tensor(val_seq, dtype=torch.long)
val_target_tensor = torch.tensor(val_target, dtype=torch.float32)


In [7]:
# Pytorch Dataloader 생성

train_dataset = TensorDataset(train_seq_tensor, train_target_tensor)
val_dataset = TensorDataset(val_seq_tensor, val_target_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### Positional Encoding(Transformer에서는 필수)
: 순서 정보 보존 

In [26]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        pe = pe.unsqueeze(0) # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return x
        

### 모델 정의

In [27]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, nhead, num_layers, hidden_dim, output_dim, max_len=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim, max_len)

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Linear(embed_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x) 
        embedded = self.pos_encoder(embedded) # positional encoding
        embedded = embedded.permute(1, 0, 2)  # Transformer (seq, batch, emb)
        out = self.transformer_encoder(embedded)
        out = out.mean(dim=0)
        out = self.fc(out)
        return self.sigmoid(out).squeeze(1)

### 학습함수

In [28]:
def train(model, loader, criterion, optimizer):
    model.train()
    total_loss, total_correct = 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        preds = model(x) # 예측값
        loss = criterion(preds, y) # 손실 계산
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += ((preds >= 0.5).float() == y).sum().item()

    accuracy = total_correct / len(loader.dataset)
    return total_loss / len(loader), accuracy

### 평가 함수

In [29]:
# 모델 평가 함수
def evaluate(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
    
            preds = model(x) # 예측값
            loss = criterion(preds, y) # 손실 계산
    
            total_loss += loss.item()
            total_correct += ((preds >= 0.5).float() == y).sum().item()

    accuracy = total_correct / len(loader.dataset)
    return total_loss / len(loader), accuracy

### 학습 실행

In [30]:
# 인스턴스 정의

model = TransformerClassifier(
        vocab_size = num_words,
        embed_dim = embedding_dim,
        nhead = 4,
        num_layers = 2,
        hidden_dim = 256,
        output_dim = 1,
        max_len = max_len
).to(device)

In [31]:
# 손실함수 및 옵티마이져 설정
criterion = nn.BCELoss() # Binary Cross Entropy
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [32]:
# 학습 실행

num_epochs = 20

best_val_loss = float('inf')

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    print(f'Epoch : {epoch+1}/{num_epochs}')
    print(f'- Train Loss : {train_loss:.4f} | {train_acc*100:.2f}%')
    print(f'- Val Loss   : {val_loss  :.4f} | {val_acc*100:.2f}%')

    # 모델 저장
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'transformer_imdb.pth')

Epoch : 1/20
- Train Loss : 0.6519 | 60.31%
- Val Loss   : 0.5726 | 70.12%
Epoch : 2/20
- Train Loss : 0.5298 | 73.47%
- Val Loss   : 0.5218 | 72.66%
Epoch : 3/20
- Train Loss : 0.4682 | 77.45%
- Val Loss   : 0.4625 | 77.56%
Epoch : 4/20
- Train Loss : 0.4356 | 79.56%
- Val Loss   : 0.4474 | 78.90%
Epoch : 5/20
- Train Loss : 0.4072 | 81.33%
- Val Loss   : 0.4239 | 80.20%
Epoch : 6/20
- Train Loss : 0.3862 | 82.59%
- Val Loss   : 0.4116 | 80.84%
Epoch : 7/20
- Train Loss : 0.3705 | 83.61%
- Val Loss   : 0.4075 | 81.22%
Epoch : 8/20
- Train Loss : 0.3549 | 84.31%
- Val Loss   : 0.4030 | 81.52%
Epoch : 9/20
- Train Loss : 0.3423 | 84.97%
- Val Loss   : 0.4090 | 81.48%
Epoch : 10/20
- Train Loss : 0.3300 | 85.98%
- Val Loss   : 0.3998 | 81.94%
Epoch : 11/20
- Train Loss : 0.3180 | 86.52%
- Val Loss   : 0.4145 | 81.76%
Epoch : 12/20
- Train Loss : 0.3079 | 86.92%
- Val Loss   : 0.4108 | 81.80%
Epoch : 13/20
- Train Loss : 0.2961 | 87.47%
- Val Loss   : 0.4243 | 81.86%
Epoch : 14/20
- Train

In [10]:
# 모델 파라미터 설정
vocab_size = num_words
embedding_dim = 128
hidden_dim = 64
output_dim = 1 # 감성 분류(0 or 1)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
model = AttentionLSTM(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
model

AttentionLSTM(
  (embedding): Embedding(5000, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (attn): Linear(in_features=64, out_features=1, bias=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

### 모델 학습 준비

In [12]:
# 손실함수 및 옵티마이져 설정
criterion = nn.BCELoss() # Binary Cross Entropy
optimizer = optim.Adam(model.parameters(), lr=1e-4)

### 모델 학습 함수

In [13]:
def train(model, loader, criterion, optimizer):
    model.train()
    total_loss, total_correct = 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        preds = model(x).squeeze(1) # 예측값
        loss = criterion(preds, y) # 손실 계산
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += ((preds >= 0.5).float() == y).sum().item()

    accuracy = total_correct / len(loader.dataset)
    return total_loss / len(loader), accuracy

In [14]:
# 모델 평가 함수
def evaluate(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
    
            preds = model(x).squeeze(1) # 예측값
            loss = criterion(preds, y) # 손실 계산
    
            total_loss += loss.item()
            total_correct += ((preds >= 0.5).float() == y).sum().item()

    accuracy = total_correct / len(loader.dataset)
    return total_loss / len(loader), accuracy

### 모델 학습 실행

In [15]:
num_epochs = 20

best_val_loss = float('inf')

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    print(f'Epoch : {epoch+1}/{num_epochs}')
    print(f'- Train Loss : {train_loss:.4f} | {train_acc*100:.2f}%')
    print(f'- Val Loss   : {val_loss  :.4f} | {val_acc*100:.2f}%')

    # 모델 저장
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_attention_model.pth')

Epoch : 1/20
- Train Loss : 0.6866 | 57.36%
- Val Loss   : 0.6786 | 61.06%
Epoch : 2/20
- Train Loss : 0.6399 | 64.93%
- Val Loss   : 0.5993 | 68.56%
Epoch : 3/20
- Train Loss : 0.5615 | 71.23%
- Val Loss   : 0.5518 | 72.00%
Epoch : 4/20
- Train Loss : 0.5145 | 74.84%
- Val Loss   : 0.5176 | 74.52%
Epoch : 5/20
- Train Loss : 0.4820 | 77.16%
- Val Loss   : 0.4976 | 75.68%
Epoch : 6/20
- Train Loss : 0.4577 | 78.94%
- Val Loss   : 0.4835 | 76.66%
Epoch : 7/20
- Train Loss : 0.4365 | 79.88%
- Val Loss   : 0.4674 | 77.84%
Epoch : 8/20
- Train Loss : 0.4189 | 80.83%
- Val Loss   : 0.4579 | 78.52%
Epoch : 9/20
- Train Loss : 0.4023 | 81.86%
- Val Loss   : 0.4557 | 78.74%
Epoch : 10/20
- Train Loss : 0.3876 | 82.81%
- Val Loss   : 0.4479 | 79.24%
Epoch : 11/20
- Train Loss : 0.3748 | 83.28%
- Val Loss   : 0.4443 | 79.12%
Epoch : 12/20
- Train Loss : 0.3634 | 83.94%
- Val Loss   : 0.4371 | 79.70%
Epoch : 13/20
- Train Loss : 0.3505 | 84.69%
- Val Loss   : 0.4370 | 80.44%
Epoch : 14/20
- Train

### 모델 평가

In [16]:
# 저장된 모델 로드
model.load_state_dict(torch.load('best_attention_model.pth', weights_only=False))
test_loss, test_acc = evaluate(model, val_loader, criterion)
print(f'Test Accuracy : {test_acc*100:.2f}%')

Test Accuracy : 81.26%


> 전체 댓글의 토큰중 50개만 선택하여 구성하였는데 거의 70%넘게 예측력이 나왔다. 