# IMBD 데이터

https://github.com/groovallstar/pytorch_rnn_tutorial/blob/main/8_2_torchtext_migration.ipynb

In [1]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split
from torch.utils.data import DataLoader

from torchtext.datasets import IMDB
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

tokenizer = get_tokenizer('basic_english')
train_iter, test_iter = IMDB(root='.data', split=('train', 'test'))

def train_valid_split(train_iterator, split_ratio=0.8, seed=42):
    train_count = int(split_ratio * len(train_iterator))
    valid_count = len(train_iterator) - train_count
    generator = torch.Generator().manual_seed(seed)
    train_set, valid_set = random_split(
        train_iterator, lengths=[train_count, valid_count],
        generator=generator)
    return train_set, valid_set

train_iter = to_map_style_dataset(train_iter)
test_iter = to_map_style_dataset(test_iter)

train_set, val_set = train_valid_split(train_iter)

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(
    iterator=yield_tokens(train_iter),
    min_freq=5,
    specials=['<unk>'],)
vocab.set_default_index(vocab['<unk>'])

def collate_batch(batch):
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x)

    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_tensor = pad_sequence(text_list, padding_value=1, batch_first=True)
    return text_tensor, label_list

train_dataloader = DataLoader(
    train_set, batch_size=64, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(
    val_set, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(
    test_iter, batch_size=64, shuffle=True, collate_fn=collate_batch)

In [2]:
train_set[0]

(2,
 'I saW this film while at Birmingham Southern College in 1975, when it was shown in combination with the Red Balloon. Both films are similar in their dream-like quality. The bulk of the film entails a fish swimming happily in his bowl while his new owner, a little boy, is away at school. A cat enters the room where the fish and his bowl are, and begins to warily stalk his "prey." The boy begins his walk home from school, and the viewer wonders whether he will arrive in time to save his fish friend. The fish becomes agitated by the cat\'s presence, and finally jumps out of the bowl! The cat quickly walks over to the fish, gently picks him up with his paws, and returns him to his bowl. The boy returns happily to his fish, none the wiser.<br /><br />The ending is amazing in both its irony and its technical complexity. It is hard to imagine how the director could\'ve pulled the technical feat back in 1959 -- it seems more a trick for 2003.<br /><br />If you can find it, watch it -- yo

In [3]:
len(train_set), len(val_set)

(20000, 5000)

In [4]:
len(train_dataloader), len(val_dataloader), len(test_dataloader) # 배치 사이즈 64

(313, 79, 391)

In [5]:
n_classes = 2
vocab_size = len(vocab)

In [6]:
vocab_size

30122

# 모델 정의

In [7]:
import torch.nn as nn

class Gru(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        """
        n_layers: 히든 레이어 개수
        hidden_dim: 히든 레이어 차원
        n_vocab: 사전 사이즈
        embed_dim: 임베딩된 데이터의 차원
        n_classes: 레이블 수
        dropout_p: 드랍아웃 비율
        """
        super(Gru, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)
        
    def forward(self, x):
        x = self.embed(x) # 텍스트를 단어 단위인 토큰으로 벡터 변환
        h_0 = self._init_state(batch_size=x.size(0)) # 아래 함수 참고
        x, _ = self.gru(x, h_0) # GRU의 리턴 값(배치 사이즈, 입력값 길이, 히든 스테이트의 길이) -> 텐서 형태
        h_t = x[:, -1, :] # 텐서로 크기가 변경, 마지막 히든 스테이트만 가져옴
        self.dropout(h_t)
        logit = self.out(h_t) # 배치 사이즈와 히든 스테이트의 크기 -> 배치 사이즈의 출력층의 크기로 변환
        return logit
    
    def _init_state(self, batch_size=1):
        "첫 번째 히든 스테이트를 0 벡터로 초기화"
        weight = next(self.parameters()).data # 첫 가중치 데이터 추출(텐서)
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_() # 현재 모델의 가중치와 같은 모양의 텐서의 값을 0으로 초기화

In [8]:
lr = 0.001
EPOCHS = 10

model = Gru(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 모델 학습 및 검증

In [9]:
import torch.nn.functional as f

best_val_loss = None
best_model = None

for epoch in range(1, EPOCHS+1):
    model.train()
    for i, (text, label) in enumerate(train_dataloader):
        text, label = text.to(DEVICE), label.to(DEVICE)
        optimizer.zero_grad()
        output = model(text)
        
        label.data.sub_(1) # <unk>:0 인 token 값 제거
        loss = f.cross_entropy(output, label)
        loss.backward()
        optimizer.step()
        
        if i == 100: break
    
    model.eval()

    val_loss_sum = 0
    val_correct = 0

    with torch.no_grad():
        for text, label in val_dataloader:
            text, label = text.to(DEVICE), label.to(DEVICE)
            label.data.sub_(1) # <unk>:0 인 token 값 제거
            output = model(text)
            val_loss_sum += f.cross_entropy(output, label, reduction='sum').item()
            pred = output.max(1)[1].view(label.size()).data
            val_correct += (pred == label.data).sum()

    val_loss = val_loss_sum / len(val_set)
    val_acc = val_correct / len(val_set) * 100
    
    print(f'[Epoch: {epoch:2d}] train loss: {loss.item():.4f} | val loss: {val_loss:.4f} val accuracy: {val_acc:.2f}%')
    
    if not best_val_loss or val_loss < best_val_loss:
        best_model = model
        best_val_loss = val_loss

[Epoch:  1] train loss: 0.7029 | val loss: 0.6939 val accuracy: 50.28%
[Epoch:  2] train loss: 0.6938 | val loss: 0.6930 val accuracy: 50.36%
[Epoch:  3] train loss: 0.6937 | val loss: 0.6947 val accuracy: 49.82%
[Epoch:  4] train loss: 0.6919 | val loss: 0.6934 val accuracy: 49.60%
[Epoch:  5] train loss: 0.7015 | val loss: 0.6991 val accuracy: 50.42%
[Epoch:  6] train loss: 0.7097 | val loss: 0.6969 val accuracy: 49.72%
[Epoch:  7] train loss: 0.6917 | val loss: 0.6931 val accuracy: 50.34%
[Epoch:  8] train loss: 0.6899 | val loss: 0.6934 val accuracy: 49.78%
[Epoch:  9] train loss: 0.6947 | val loss: 0.6927 val accuracy: 50.38%
[Epoch: 10] train loss: 0.6861 | val loss: 0.6934 val accuracy: 49.86%


## 최적 모델 저장

In [10]:
def train(model, optimizer, train_iter):
    model.train()
    for i, (text, label) in enumerate(train_iter):
        text, label = text.to(DEVICE), label.to(DEVICE)
        optimizer.zero_grad()
        output = model(text)
        
        label.data.sub_(1) # <unk>:0 인 token 값 제거
        loss = f.cross_entropy(output, label)
        loss.backward()
        optimizer.step()
            
    return loss

def evaluate(model, val_iter, len_val_data):
    model.eval()

    loss_sum = 0
    correct = 0

    with torch.no_grad():
        for text, label in val_iter:
            text, label = text.to(DEVICE), label.to(DEVICE)
            label.data.sub_(1) # <unk>:0 인 token 값 제거
            output = model(text)
            loss_sum += f.cross_entropy(output, label, reduction='sum').item()
            pred = output.max(1)[1].view(label.size()).data
            correct += (pred == label.data).sum()

    loss = loss_sum / len_val_data
    acc = correct / len_val_data * 100
    
    return loss, acc

In [11]:
class EarlyStopping:
    def __init__(self, patience=5):
        self.loss = float('inf')
        self.patience = 0
        self.patience_limit = patience
        
    def step(self, loss):
        if self.loss > loss:
            self.loss = loss
            self.patience = 0
        else:
            self.patience += 1
    
    def is_stop(self):
        return self.patience >= self.patience_limit

In [12]:
lr = 0.001
EPOCHS = 50

model = Gru(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

best_model = None
best_val_loss = None

es = EarlyStopping(patience=10)

for epoch in range(1, EPOCHS+1):
    loss = train(model, optimizer, train_dataloader)
    
    val_loss, val_acc = evaluate(model, val_dataloader, len(val_set))
    
    print(f'[Epoch: {epoch:2d}] train loss: {loss.item():.4f} | val loss: {val_loss:.4f} val accuracy: {val_acc:.2f}%')
    
    es.step(val_loss)
    if es.is_stop():
        break
    
    if not best_val_loss or val_loss < best_val_loss:
        best_model = model
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'bestmodel.pt')

[Epoch:  1] train loss: 0.6921 | val loss: 0.6930 val accuracy: 50.46%
[Epoch:  2] train loss: 0.6975 | val loss: 0.6929 val accuracy: 50.56%
[Epoch:  3] train loss: 0.6968 | val loss: 0.6986 val accuracy: 50.50%
[Epoch:  4] train loss: 0.3523 | val loss: 0.4016 val accuracy: 81.60%
[Epoch:  5] train loss: 0.2616 | val loss: 0.2689 val accuracy: 89.28%
[Epoch:  6] train loss: 0.0240 | val loss: 0.3060 val accuracy: 89.04%
[Epoch:  7] train loss: 0.0034 | val loss: 0.3735 val accuracy: 89.14%
[Epoch:  8] train loss: 0.0235 | val loss: 0.4406 val accuracy: 88.62%
[Epoch:  9] train loss: 0.0025 | val loss: 0.4832 val accuracy: 88.92%
[Epoch: 10] train loss: 0.1246 | val loss: 0.5812 val accuracy: 87.74%
[Epoch: 11] train loss: 0.0154 | val loss: 0.5610 val accuracy: 88.80%
[Epoch: 12] train loss: 0.0006 | val loss: 0.5824 val accuracy: 88.54%
[Epoch: 13] train loss: 0.0009 | val loss: 0.5890 val accuracy: 89.04%
[Epoch: 14] train loss: 0.0016 | val loss: 0.6209 val accuracy: 88.84%
[Epoch

## 모델 테스트

In [13]:
test_loss, test_acc = evaluate(best_model, test_dataloader, len(test_iter))
print(f'test loss: {test_loss:.2f} | test accuracy: {test_acc:.2f}')

test loss: 0.67 | test accuracy: 88.02


# 모델 불러오기

In [14]:
best_model_state = torch.load('bestmodel.pt')
best_model_loaded = Gru(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
best_model_loaded.load_state_dict(best_model_state)

test_loss, test_acc = evaluate(best_model_loaded, test_dataloader, len(test_iter))
print(f'test loss: {test_loss:.2f} | test accuracy: {test_acc:.2f}')

test loss: 0.27 | test accuracy: 88.51
