# LSTM을 활용한 자연어 처리

# 데이터 전처리

데이터셋: IMDB 영화 리뷰 데이터

목표: 영화 리뷰가 긍정적인지 부정적인지 분류하는 신경망 개발

접근 방식: 자연어 처리에 적합한 RNN (순환 신경망) 사용. RNN 기법 중 LSTM 기법 활용.

**주의**: 해당 코드는 파이토치 1.11.0 버전, torchtext 0.12.0 버전, torchdata 0.3.0 버전을 사용. 아래의 코드를 사용하여 해당 버전들을 사용할수 있음. (주석 처리 되어있다면 해제하고 사용)

In [1]:
!pip install torch==1.11.0 torchtext==0.12.0 torchdata==0.3.0

Collecting torch==1.11.0
  Downloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtext==0.12.0
  Downloading torchtext-0.12.0-cp310-cp310-manylinux1_x86_64.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchdata==0.3.0
  Downloading torchdata-0.3.0-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext, torchdata
  Attempting uninstall: torch
    Found existing installation: torch 2.3.0+cu121
    Uninstalling torch-2.3.0+cu121:
      Successfully uninstalled torch-2.3.0+cu121
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.18.0
    Uninstalling torchtext-0.18.

In [2]:
import os
import time
import string
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, Dataset, random_split
from torch.autograd import Variable
from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

import torchdata
import torchtext
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [49]:
# 토크나이저 설정
tokenizer = get_tokenizer("basic_english")

# 어휘 생성 함수
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

# IMDB 데이터셋 로드
train_iter, test_iter = IMDB(split=('train', 'test'))

# 어휘 생성
vocab = build_vocab_from_iterator(yield_tokens(train_iter), max_tokens=10000, specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

불용어 제거를 위해 nltk를 사용합니다.

In [4]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
stopwords = set(stopwords.words('english'))

In [50]:
# 텍스트 전처리 및 레이블 변환
def text_pipeline(x):
    text = [token.lower() for token in tokenizer(x)]
    text = [token.replace("<br", "") for token in text]
    text = [''.join(c for c in token if c not in string.punctuation) for token in text]
    text = [token for token in text if token]
    text = [token for token in text if not token in stopwords]
    return text[:100] + ['<pad>'] * (100 - len(text))

def label_pipeline(x):
    return 1 if x == 'pos' else 0

In [51]:
# IMDB 데이터셋을 커스텀 Dataset으로 변환
class IMDBDataset(Dataset):
    def __init__(self, data_iter, text_transform, label_transform):
        self.data = list(data_iter)
        self.text_transform = text_transform
        self.label_transform = label_transform
        self.preprocessed_data = [
            {'text': self.text_transform(text), 'label': self.label_transform(label)}
            for label, text in self.data
        ]

    def __len__(self):
        return len(self.preprocessed_data)

    def __getitem__(self, idx):
        return self.preprocessed_data[idx]

train_iter, test_iter = IMDB(split=('train', 'test'))
train_dataset = IMDBDataset(train_iter, text_pipeline, label_pipeline)
test_dataset = IMDBDataset(test_iter, text_pipeline, label_pipeline)

In [52]:
# 데이터셋 분할
train_size = int(0.8 * len(train_dataset))
valid_size = len(train_dataset) - train_size
train_data, valid_data = random_split(train_dataset, [train_size, valid_size], generator=torch.Generator().manual_seed(0))

# 데이터 로더를 위한 collate 함수 설정
def collate_batch(batch):
    label_list, text_list = [], []
    for example in batch:
        label_list.append(example['label'])
        text_list.append(torch.tensor([vocab[token] for token in example['text']], dtype=torch.long))
    label_list = torch.tensor(label_list, dtype=torch.long)
    text_list = pad_sequence(text_list, batch_first=False, padding_value=vocab["<pad>"])
    return text_list, label_list

In [53]:
# 데이터 로더 생성
BATCH_SIZE = 50
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

# 데이터 로더 사용 예시
for batch in train_dataloader:
    texts, labels = batch
    print(f"Labels: {labels}")
    print(f"Texts: {texts}")
    break

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_dataset)}')

print(f"Unique tokens in TEXT vocabulary: {len(vocab)}")

Labels: tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
        1, 0])
Texts: tensor([[  81,   23,  472,  ..., 1213,  475, 3877],
        [  45,  589,  113,  ...,  332, 3214, 7343],
        [4392,  715,   21,  ...,  587,  711,  420],
        ...,
        [   0,  354,  487,  ...,    1,  230,    1],
        [4030,  540,   89,  ...,    1, 7838,    1],
        [1656,  116,  487,  ...,    1,  194,    1]])
Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000
Unique tokens in TEXT vocabulary: 10000


# LSTM 사용

In [57]:
# LABEL vocabulary는 이미 0과 1로 구성된 숫자로 처리됩니다.
label_vocab = {0: 'neg', 1: 'pos'}
print(label_vocab)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x, hidden, cell):
        x = self.embedding(x)
        x, (hidden, cell) = self.lstm(x, (hidden, cell))
        x = self.dropout(x[:, -1, :])  # We only take the output of the last LSTM cell
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x, (hidden, cell)

    def init_hidden_and_cell(self, batch_size):
        hidden = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).to(device)
        cell = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).to(device)
        return hidden, cell

{0: 'neg', 1: 'pos'}


In [34]:
device

device(type='cuda', index=0)

In [58]:
vocab_size = len(vocab)
embed_size = 100
hidden_size = 300
num_layers = 1
output_size = 2
dropout = 0.3

model = LSTMModel(vocab_size, embed_size, hidden_size, output_size, num_layers, dropout)
model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def training(epoch, model, trainloader, validloader):
    correct = 0
    total = 0
    running_loss = 0

    model.train()
    hidden, cell = model.init_hidden_and_cell(BATCH_SIZE)

    for texts, labels in trainloader:
        texts, labels = texts.to(device), labels.to(device)
        texts = torch.transpose(texts, 0, 1).contiguous()

        hidden, cell = hidden.detach(), cell.detach()

        y_pred, (hidden, cell) = model(texts, hidden, cell)

        loss = loss_fn(y_pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            correct += (y_pred.max(1)[1].view(labels.size()).data == labels.data).sum().item()
            total += labels.size(0)
            running_loss += loss.item()
    epoch_loss = running_loss / len(trainloader.dataset)
    epoch_acc = correct / total

    valid_correct = 0
    valid_total = 0
    valid_running_loss = 0

    model.eval()
    with torch.no_grad():
        # hidden, cell = model.init_hidden_and_cell(BATCH_SIZE)

        for texts, labels in validloader:
            texts, labels = texts.to(device), labels.to(device)
            texts = torch.transpose(texts, 0, 1).contiguous()
            y_pred, (hidden, cell) = model(texts, hidden, cell)
            loss = loss_fn(y_pred, labels)
            valid_correct += (y_pred.max(1)[1].view(labels.size()).data == labels.data).sum().item()
            valid_total += labels.size(0)
            valid_running_loss += loss.item()

    epoch_valid_loss = valid_running_loss / len(validloader.dataset)
    epoch_valid_acc = valid_correct / valid_total

    print('epoch: ', epoch,
          'loss： ', round(epoch_loss, 3),
          'accuracy:', round(epoch_acc, 3),
          'valid_loss： ', round(epoch_valid_loss, 3),
          'valid_accuracy:', round(epoch_valid_acc, 3)
          )
    return epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc

epochs = 10
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []
best_valid_loss = float('inf')

for epoch in range(epochs):
    epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc = training(epoch, model, train_dataloader, valid_dataloader)

    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'best-model.pt')

    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    valid_loss.append(epoch_valid_loss)
    valid_acc.append(epoch_valid_acc)

epoch:  0 loss：  0.014 accuracy: 0.504 valid_loss：  0.014 valid_accuracy: 0.509
epoch:  1 loss：  0.014 accuracy: 0.536 valid_loss：  0.013 valid_accuracy: 0.62
epoch:  2 loss：  0.012 accuracy: 0.677 valid_loss：  0.014 valid_accuracy: 0.552
epoch:  3 loss：  0.011 accuracy: 0.776 valid_loss：  0.011 valid_accuracy: 0.773
epoch:  4 loss：  0.01 accuracy: 0.822 valid_loss：  0.01 valid_accuracy: 0.789
epoch:  5 loss：  0.009 accuracy: 0.853 valid_loss：  0.01 valid_accuracy: 0.795
epoch:  6 loss：  0.009 accuracy: 0.873 valid_loss：  0.01 valid_accuracy: 0.806
epoch:  7 loss：  0.009 accuracy: 0.882 valid_loss：  0.01 valid_accuracy: 0.805
epoch:  8 loss：  0.008 accuracy: 0.895 valid_loss：  0.01 valid_accuracy: 0.806
epoch:  9 loss：  0.008 accuracy: 0.903 valid_loss：  0.01 valid_accuracy: 0.812
