# RNN 텍스트 분류기

In [6]:
# 데이터 준비
corpus = [
    "자연어 처리는 재미있다",
    "Python이 더 쉽다",
    "자연어 처리 공부는 어렵다",
    "Python 활용법을 더 찾아보자"
]
labels = [1, 0, 0, 1]

# 토큰화
tokenized_corpus = [sentence.split() for sentence in corpus]

# 단어 사전
vocab = {}
for tokens in tokenized_corpus:
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab) + 1

# 문장 인덱싱
indexed_corpus = []
for tokens in tokenized_corpus:
    indexed_sent = [vocab[token] for token in tokens]
    indexed_corpus.append(indexed_sent)

# 패딩 처리
max_seq_len = max(len(seq) for seq in indexed_corpus)
def pad_sequence(seq, max_len):
    if len(seq) < max_len:
        seq = seq + [0] * (max_len - len(seq))
    return seq

padded_corpus = [pad_sequence(seq, max_seq_len) for seq in indexed_corpus]

In [7]:
import torch

inputs = torch.tensor(padded_corpus, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

In [8]:
# RNN 기반 텍스트 분류기 모델 정의
import torch.nn as nn

class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return self.sigmoid(self.fc(hidden[-1]))

In [9]:
vocab_size = len(vocab) + 1
embed_size = 128
hidden_size = 64
num_classes = 1

model = RNNClassifier(vocab_size, embed_size, hidden_size, num_classes)
print(model)

RNNClassifier(
  (embedding): Embedding(13, 128)
  (rnn): RNN(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [10]:
import torch.optim as optim

# 학습
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 20

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(inputs)
    # print(outputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{epochs} | Loss == {loss.item():.4f}')

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1/20 | Loss == 0.8071
Epoch 2/20 | Loss == 0.2709
Epoch 3/20 | Loss == 0.1041
Epoch 4/20 | Loss == 0.0559
Epoch 5/20 | Loss == 0.0321
Epoch 6/20 | Loss == 0.0190
Epoch 7/20 | Loss == 0.0123
Epoch 8/20 | Loss == 0.0086
Epoch 9/20 | Loss == 0.0064
Epoch 10/20 | Loss == 0.0050
Epoch 11/20 | Loss == 0.0040
Epoch 12/20 | Loss == 0.0033
Epoch 13/20 | Loss == 0.0027
Epoch 14/20 | Loss == 0.0023
Epoch 15/20 | Loss == 0.0019
Epoch 16/20 | Loss == 0.0017
Epoch 17/20 | Loss == 0.0015
Epoch 18/20 | Loss == 0.0013
Epoch 19/20 | Loss == 0.0011
Epoch 20/20 | Loss == 0.0010


In [13]:
test_texts = [
    "자연어 처리는 재미있어!",
    "Python 너무 어려워 ㅠㅠ"
    ]

def preprocess_sentence(sentence, vocab, max_len):
    tokens = sentence.split()
    indices = [vocab.get(token, 0) for token in tokens]
    indices = pad_sequence(indices, max_len)
    return torch.tensor(indices, dtype=torch.long)

test_inputs = []
for sent in test_texts:
    test_inputs.append(preprocess_sentence(sent, vocab, max_seq_len))
test_inputs = torch.stack(test_inputs)

In [14]:
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    print(outputs)

tensor([[0.9941],
        [0.8916]])
