# RNN기반 분류기

In [2]:
# 데이터 로딩
from sklearn.datasets import fetch_20newsgroups

categories = ['comp.graphics', 'sci.space', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
X = newsgroups.data
y = newsgroups.target
print(newsgroups.target_names)
print(X[0])
print(y[0])

['comp.graphics', 'rec.sport.baseball', 'sci.space']
From: kjenks@gothamcity.jsc.nasa.gov
Subject: Life on Mars???
Organization: NASA/JSC/GM2, Space Shuttle Program Office 
X-Newsreader: TIN [version 1.1 PL8]
Lines: 12

I know it's only wishful thinking, with our current President,
but this is from last fall:

     "Is there life on Mars?  Maybe not now.  But there will be."
        -- Daniel S. Goldin, NASA Administrator, 24 August 1992

-- Ken Jenks, NASA/JSC/GM2, Space Shuttle Program Office
      kjenks@gothamcity.jsc.nasa.gov  (713) 483-4368

     "The man who makes no mistakes does not usually make
      anything."
        -- Edward John Phelps, American Diplomat/Lawyer (1825-1895)

2


In [10]:
# 데이터전처리
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000    # 사용할 단어 사전 크기 (상위 빈도 단어만 유지)
max_len = 200         # 모델 입력으로 사용할 시퀀스 길이

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X)    # 토크나이저 단어 사전 학습
X_encoded = tokenizer.texts_to_sequences(X)  # 문서를 정수 시퀀스로 변환
X_padded = pad_sequences(X_encoded, maxlen=max_len)  # 길이를 max_len으로 통일(패딩/자르기)
print(X_padded.shape)    # (문서 수, max_len)

(2954, 200)


In [None]:
# 데이터분리/텐서변환
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# 전체 데이터에서 train/valid/test로 분리
X_train, X_test, y_train, y_test = train_test_split(torch.tensor(X_padded), torch.tensor(y), test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# dataset/dataloader
train_dataset = TensorDataset(X_train, y_train)  # 학습용(입력, 라벨) Dataset
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # 학습 데이터로더 생성 (셔플)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)     # 검증 데이터로더 생성 (고정)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)   # 테스트 데이터로더 생성 (고정)

In [None]:
# 모델 생성
import torch.nn as nn

# 임베딩 -> LSTM -> FC로 문장을 요약해 3개 클래스 로짓을 출력하는 분류기 모델
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        # embedding - lstm - dense
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # 토큰 ID -> 임베딩(0 토큰은 PAD: 학습사용안함)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)        # (B, T, *)
        self.fc = nn.Linear(hidden_size, 3)  # 마지막 은닉 -> 3 클래스 로짓 반환

    def forward(self, x):
        x = self.embedding(x)       # (B, T) -> (B, T, E)
        _, (h, c) = self.lstm(x)    # h:(L, B, H), C: (L, B, H)
        out = self.fc(h[-1])        # 마지막 레이어 은닉상태 (B, H) -> (B, 3)
        return out                  # softmax 전 로짓 반환

In [None]:
# 모델 학습
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # cuda 또는 cpu
embedding_dim = 100
hidden_size = 128

model = LSTMClassifier(vocab_size, embedding_dim, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 학습루프
train_losses, train_accs = [], []
val_losses, val_accs = [], []

epochs = 50
for epoch in range(epochs):

    # 학습
    model.train()
    train_loss, train_correct, train_total = 0, 0, 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.detach().cpu().item()
        pred = output.argmax(dim=1)  # 가장 큰 로짓 인덱스로 클래스 예측
        train_correct += (pred == y_batch).sum().detach().cpu().item()
        train_total += len(y_batch)

    train_loss /= len(train_loader)
    train_acc = train_correct / train_total
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    # 검증
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = model(X_batch)
            loss = criterion(output, y_batch)

            val_loss += loss.detach().cpu().item()
            pred = output.argmax(dim=1)  # 가장 큰 로짓 인덱스로 클래스 예측
            val_correct += (pred == y_batch).sum().detach().cpu().item()
            val_total += len(y_batch)

        val_loss /= len(val_loader)
        val_acc = val_correct / val_total
        val_losses.append(val_loss)
        val_accs.append(val_acc)

    # 출력(train_loss, val_loss)
    print(f'Epoch {epoch + 1}/{epochs}: '
          f'Train Loss {train_loss:.4f}, '
          f'Train Acc {train_acc:.4f}, '
          f'Val Loss {val_loss:.4f}, '
          f'Val Acc {val_acc:.4f}, ')


Epoch 1/50: Train Loss 1.0480, Train Acc 0.4788, Val Loss 0.9969, Val Acc 0.5222, 
Epoch 2/50: Train Loss 0.8941, Train Acc 0.6159, Val Loss 0.8736, Val Acc 0.6173, 
Epoch 3/50: Train Loss 0.6872, Train Acc 0.7243, Val Loss 0.7383, Val Acc 0.6850, 
Epoch 4/50: Train Loss 0.5185, Train Acc 0.8085, Val Loss 0.6547, Val Acc 0.7315, 
Epoch 5/50: Train Loss 0.3808, Train Acc 0.8635, Val Loss 0.6521, Val Acc 0.7146, 
Epoch 6/50: Train Loss 0.2608, Train Acc 0.9153, Val Loss 0.5672, Val Acc 0.7822, 
Epoch 7/50: Train Loss 0.1833, Train Acc 0.9418, Val Loss 0.5817, Val Acc 0.7780, 
Epoch 8/50: Train Loss 0.1265, Train Acc 0.9656, Val Loss 0.5555, Val Acc 0.7822, 
Epoch 9/50: Train Loss 0.0963, Train Acc 0.9735, Val Loss 0.5628, Val Acc 0.8055, 
Epoch 10/50: Train Loss 0.0888, Train Acc 0.9751, Val Loss 0.6335, Val Acc 0.7801, 
Epoch 11/50: Train Loss 0.0741, Train Acc 0.9767, Val Loss 0.6443, Val Acc 0.7717, 
Epoch 12/50: Train Loss 0.0616, Train Acc 0.9836, Val Loss 0.6387, Val Acc 0.8097, 
E

In [None]:
# 모델 평가
# - 정답, 모델 예측값을 가지고, classification_report 작성
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        output = model(X_batch)
        loss = criterion(output, y_batch)
        pred = output.argmax(dim=1)

        all_preds.extend(pred.detach().cpu().numpy())      # 배치 예측을 리스트에 누적
        all_labels.extend(y_batch.detach().cpu().numpy())  # 배치 정답을 리스트에 누적

print(classification_report(all_labels, all_preds, target_names=newsgroups.target_names))

                    precision    recall  f1-score   support

     comp.graphics       0.83      0.88      0.85       202
rec.sport.baseball       0.82      0.86      0.84       202
         sci.space       0.81      0.73      0.77       187

          accuracy                           0.82       591
         macro avg       0.82      0.82      0.82       591
      weighted avg       0.82      0.82      0.82       591



## 사전학습된 임베딩 적용하기

In [8]:
%pip install gensim -q

Note: you may need to restart the kernel to use updated packages.


- ted_en_fasttext.model : 모델 본체 (메타데이터 + 학습된 파라미터 경로)
- ted_en_fasttext.model.wv.vectors_ngrams.npy : FastText의 핵심 특징인 서브워드(n-gram) 벡터 행렬(Numpy 배열 파일)

In [None]:
from gensim.models import FastText

fasttext_model = FastText.load('ted_en_fasttext.model')
print(fasttext_model.vector_size)  # 단어 벡터 차원 (단어 임베딩 벡터의 차원수)

100


In [None]:
import numpy as np

embedding_dim = fasttext_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))  # (단어사전크기, 임베딩차원) 0 행렬 초기화

word_index = tokenizer.word_index # 단어 -> 인덱스 사전
word_index = {word:index \
              for word, index in word_index.items() \
                if index < vocab_size}  # vocab_size 범위 내 단어만 필터링해서 사용
print(len(word_index)) # 10000

for word, index in word_index.items():
    if word in fasttext_model.wv:  # FastText가 해당 단어 벡터를 가지고있으면
        embedding_matrix[index] = fasttext_model.wv[word]  # 해당 인덱스 위치에 임베딩 벡터를 채움

9999


In [None]:
# 모델 생성
import torch.nn as nn

# 사전학습 임베딩(embedding_matrix)으로 임베딩 레이어를 초기화한 LSTM 분류기
class LSTMClassifier2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_size):
        super().__init__()
        # embedding - lstm - dense
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))  # 사전학습 임베딩 가중치로 초기화
        self.embedding.weight.requires_grad = True    # 임베딩을 학습에 포함할지 여부 (True =미세조정, False=고정)

        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 3)

    def forward(self, x):
        x = self.embedding(x)       # (B, T) -> (B, T, E)
        _, (h, c) = self.lstm(x)    # h: (L, B, H)
        out = self.fc(h[-1])        # 마지막 레이어 은닉 사용 (B, H) -> (B, 3)
        return out

In [14]:
# 모델 학습
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # cuda 또는 cpu
embedding_dim = 100
hidden_size = 128

model = LSTMClassifier2(vocab_size, embedding_dim, embedding_matrix, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 학습루프
train_losses, train_accs = [], []
val_losses, val_accs = [], []

epochs = 100
for epoch in range(epochs):

    # 학습
    model.train()
    train_loss, train_correct, train_total = 0, 0, 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.detach().cpu().item()
        pred = output.argmax(dim=1)
        train_correct += (pred == y_batch).sum().detach().cpu().item()
        train_total += len(y_batch)

    train_loss /= len(train_loader)
    train_acc = train_correct / train_total
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    # 검증
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = model(X_batch)
            loss = criterion(output, y_batch)

            val_loss += loss.detach().cpu().item()
            pred = output.argmax(dim=1)
            val_correct += (pred == y_batch).sum().detach().cpu().item()
            val_total += len(y_batch)

        val_loss /= len(val_loader)
        val_acc = val_correct / val_total
        val_losses.append(val_loss)
        val_accs.append(val_acc)

    # 출력(train_loss, val_loss)
    print(f'Epoch {epoch + 1}/{epochs}: '
          f'Train Loss {train_loss:.4f}, '
          f'Train Acc {train_acc:.4f}, '
          f'Val Loss {val_loss:.4f}, '
          f'Val Acc {val_acc:.4f}, ')


Epoch 1/100: Train Loss 1.0987, Train Acc 0.3460, Val Loss 1.0951, Val Acc 0.3594, 
Epoch 2/100: Train Loss 1.0951, Train Acc 0.3735, Val Loss 1.0920, Val Acc 0.3573, 
Epoch 3/100: Train Loss 1.0916, Train Acc 0.3963, Val Loss 1.0885, Val Acc 0.4165, 
Epoch 4/100: Train Loss 1.0875, Train Acc 0.4519, Val Loss 1.0835, Val Acc 0.4841, 
Epoch 5/100: Train Loss 1.0815, Train Acc 0.4884, Val Loss 1.0755, Val Acc 0.4947, 
Epoch 6/100: Train Loss 1.0709, Train Acc 0.5063, Val Loss 1.0580, Val Acc 0.5264, 
Epoch 7/100: Train Loss 1.0103, Train Acc 0.5460, Val Loss 0.9119, Val Acc 0.6068, 
Epoch 8/100: Train Loss 0.9099, Train Acc 0.6228, Val Loss 0.8564, Val Acc 0.6448, 
Epoch 9/100: Train Loss 0.8446, Train Acc 0.6608, Val Loss 0.8696, Val Acc 0.6490, 
Epoch 10/100: Train Loss 1.0720, Train Acc 0.4439, Val Loss 1.0577, Val Acc 0.4778, 
Epoch 11/100: Train Loss 0.9016, Train Acc 0.6296, Val Loss 0.8597, Val Acc 0.6850, 
Epoch 12/100: Train Loss 0.7975, Train Acc 0.7127, Val Loss 0.7622, Val Ac

In [15]:
# 모델 평가
# - 정답, 모델 예측값을 가지고, classification_report 작성
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        output = model(X_batch)
        loss = criterion(output, y_batch)
        pred = output.argmax(dim=1)

        all_preds.extend(pred.detach().cpu().numpy())
        all_labels.extend(y_batch.detach().cpu().numpy())

print(classification_report(all_labels, all_preds, target_names=newsgroups.target_names))

                    precision    recall  f1-score   support

     comp.graphics       0.94      0.93      0.93       202
rec.sport.baseball       0.98      0.95      0.96       202
         sci.space       0.90      0.95      0.92       187

          accuracy                           0.94       591
         macro avg       0.94      0.94      0.94       591
      weighted avg       0.94      0.94      0.94       591

