### LSTM으로 Spam SMS 분류
: 훈련과 평가 함수를 따로 만들지 않고 사용하는 방법

In [1]:
# Module
import pandas as pd
import numpy as np
import requests
import zipfile
from io import BytesIO

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
# Load data: download the UCI SMS Spam Collection archive and read the
# tab-separated file stored inside it straight from memory.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here with a clear HTTP error; otherwise an error page from the server
# would only surface below as a confusing BadZipFile exception.
response.raise_for_status()
with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
    with zip_ref.open("SMSSpamCollection") as file:
        # The file has no header row, so the two column names are supplied here.
        # NOTE(review): default CSV quote handling may merge lines that contain
        # unbalanced double quotes; consider quoting=csv.QUOTE_NONE and confirm
        # the expected row count.
        data = pd.read_csv(file, sep="\t", names=['label', 'message'])

In [4]:
# Preview the first rows to confirm the label and message columns parsed as expected.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Dataset dimensions as (rows, columns).
data.shape

(5572, 2)

### Data 전처리

In [9]:
# Convert the string labels to integer class ids (the preview below shows ham -> 0, spam -> 1).
label_encoder = LabelEncoder()
# NOTE(review): this overwrites data['label'] in place, so re-running the cell
# refits the encoder on the already-encoded integers and the original
# 'ham'/'spam' class names are no longer available from label_encoder.
data['label'] = label_encoder.fit_transform(data['label'])
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Build a frequency-capped vocabulary (num_words=5000) from the messages.
# NOTE(review): the tokenizer is fit on the full corpus before the train/test
# split, so test-set vocabulary statistics influence the encoding.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['message'])

# Encode every message as word indices, then pad/truncate each sequence to a
# fixed length of 100 so the result is a rectangular integer array.
X = pad_sequences(tokenizer.texts_to_sequences(data['message']), maxlen=100)

# Integer class ids produced by the label-encoding step.
y = data['label'].to_numpy()

X

array([[   0,    0,    0, ...,   58, 4437,  144],
       [   0,    0,    0, ...,  472,    6, 1940],
       [   0,    0,    0, ...,  660,  392, 2998],
       ...,
       [   0,    0,    0, ...,   23,  107,  251],
       [   0,    0,    0, ...,  200,   12,   47],
       [   0,    0,    0, ...,    2,   61,  268]], dtype=int32)

In [11]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [12]:
# Train / test split: hold out 20% of the samples for evaluation.
# The fixed random_state makes the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### DataSet

In [13]:
class SpamDataset(Dataset):
    """Map-style dataset that pairs token-id sequences with their labels.

    Args:
        X: Array-like of integer token ids, one row per message.
        y: Array-like of labels, one entry per message.
    """

    def __init__(self, X, y):
        # Token ids are stored as int64; labels are stored as float32 because
        # this notebook feeds them to nn.BCELoss, which expects float targets.
        features = torch.tensor(X, dtype=torch.long)
        targets = torch.tensor(y, dtype=torch.float32)
        self.X, self.y = features, targets

    def __len__(self):
        """Return the number of samples (taken from the label tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(tokens, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target


In [14]:
# Wrap the train and test arrays as tensor-backed datasets.
train_dataset = SpamDataset(X_train, y_train)
test_dataset = SpamDataset(X_test, y_test)

In [15]:
# Mini-batches of 64 samples; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

### Model

In [20]:
class SpamClassifier(nn.Module):
    """Stacked two-layer LSTM classifier over token-id sequences.

    Pipeline: embedding -> LSTM -> dropout -> narrower LSTM -> last time
    step -> dropout -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the first LSTM; the second uses half of it.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        half_hidden = hidden_dim // 2  # the second LSTM narrows the representation

        # Index 0 is declared as the padding id for the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout1 = nn.Dropout(0.5)
        self.lstm2 = nn.LSTM(hidden_dim, half_hidden, batch_first=True)
        self.dropout2 = nn.Dropout(0.5)
        self.fc = nn.Linear(half_hidden, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape ``(batch, output_dim)`` for token ids ``x``."""
        # Look up an embedding vector for every token id.
        embedded = self.embedding(x)
        seq_out, _ = self.lstm1(embedded)
        seq_out, _ = self.lstm2(self.dropout1(seq_out))
        # Keep only the output at the final time step of each sequence.
        last_step = seq_out[:, -1, :]
        scores = self.fc(self.dropout2(last_step))
        # Squash the linear scores into the (0, 1) range as the final prediction.
        return self.sigmoid(scores)

### 모델 초기화

In [21]:
# Seed PyTorch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffle order are repeatable on a fresh Restart & Run All
# (same seed value as the train/test split). GPU kernels may still introduce
# small run-to-run differences.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# vocab_size must cover every index the tokenizer can emit (Tokenizer(num_words=5000)).
model = SpamClassifier(vocab_size=5000, embedding_dim=128, hidden_dim=64, output_dim=1).to(device)
# The model already applies a sigmoid, so plain binary cross-entropy on probabilities is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### 모델 훈련

In [22]:
# Train for 5 epochs, reporting the summed batch loss and the training accuracy per epoch.
for epoch in range(5):
    model.train()  # training mode: dropout layers are active
    epoch_loss = 0
    correct = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        # Drop only the trailing output dimension: (batch, 1) -> (batch,).
        # A bare .squeeze() would also remove the batch dimension when the
        # final batch holds a single sample, and BCELoss would then raise on
        # the shape mismatch with the (1,)-shaped target.
        outputs = model(batch_X).squeeze(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # Accumulates the per-batch loss values, so the printed figure is a
        # sum over batches rather than a per-sample average.
        epoch_loss += loss.item()
        # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 predictions.
        preds = (outputs > 0.5).float()
        correct += (preds == batch_y).sum().item()
    # Accuracy over the whole training set, from the training-mode forward passes above.
    acc = correct / len(train_dataset)
    print(f"Epoch : {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {acc*100:.2f}%")

Epoch : 1, Loss: 29.7642, Accuracy: 84.38%
Epoch : 2, Loss: 10.2404, Accuracy: 96.61%
Epoch : 3, Loss: 5.6853, Accuracy: 98.50%
Epoch : 4, Loss: 3.2061, Accuracy: 99.21%
Epoch : 5, Loss: 4.7283, Accuracy: 98.32%


### 평가

In [24]:
# Score the trained model on the held-out test set.
model.eval()  # evaluation mode: dropout layers are inactive
correct = 0
with torch.no_grad():  # gradients are not needed while scoring
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_X).squeeze()
        # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 predictions.
        preds = outputs.gt(0.5).float()
        correct += preds.eq(batch_y).sum().item()
# Prints the test-set accuracy as a percentage (the message text is Korean).
print(f'테스트 세트 정확도: {correct/len(test_dataset)*100:.2f}%')

테스트 세트 정확도: 97.94%
