In [7]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m52.8 MB/s[0m  [33m0:00:00[0m
[?25hDownloading joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.2 scikit-learn-1.7.2 threadpoolctl-3.6.0


In [1]:
import sentencepiece as spm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import os

# -------------------------------------------------------------
# 1. 데이터 로드
# -------------------------------------------------------------
train_path = os.path.expanduser("./ratings_train.txt")
test_path = os.path.expanduser("./ratings_test.txt")

train_df = pd.read_table(train_path).dropna()
test_df = pd.read_table(test_path).dropna()

# -------------------------------------------------------------
# 2. SentencePiece 학습
# -------------------------------------------------------------
corpus_path = "nsmc_corpus.txt"

with open(corpus_path, "w", encoding="utf8") as f:
    for t in train_df["document"]:
        f.write(str(t) + "\n")

spm.SentencePieceTrainer.Train(
    input=corpus_path,
    model_prefix="spm_nsmc",
    vocab_size=8000,
    character_coverage=0.9995,
    model_type="bpe"
)

sp = spm.SentencePieceProcessor()
sp.load("spm_nsmc.model")

# -------------------------------------------------------------
# 3. 텍스트 → 인덱스 시퀀스 변환
# -------------------------------------------------------------
def encode(text):
    ids = sp.encode_as_ids(str(text))
    return torch.tensor(ids, dtype=torch.long)

X_train_seq = [encode(t) for t in train_df["document"]]
X_test_seq = [encode(t) for t in test_df["document"]]

y_train = torch.tensor(train_df["label"].values, dtype=torch.float)
y_test = torch.tensor(test_df["label"].values, dtype=torch.float)

# -------------------------------------------------------------
# 4. 패딩
# -------------------------------------------------------------
MAX_LEN = 60
PAD_ID = 0

X_train_cut = [s[:MAX_LEN] for s in X_train_seq]
X_test_cut = [s[:MAX_LEN] for s in X_test_seq]

X_train_pad = pad_sequence(X_train_cut, batch_first=True, padding_value=PAD_ID)
X_test_pad = pad_sequence(X_test_cut, batch_first=True, padding_value=PAD_ID)

# -------------------------------------------------------------
# 5. Train / Validation split
# -------------------------------------------------------------
X_train, X_val, y_train_split, y_val = train_test_split(
    X_train_pad, y_train, test_size=0.2, random_state=42
)

train_ds = TensorDataset(X_train, y_train_split)
val_ds = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# -------------------------------------------------------------
# 6. LSTM 모델
# -------------------------------------------------------------
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_ID)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 8)
        self.act = nn.ReLU()
        self.fc2 = nn.Linear(8, 1)
        self.out = nn.Sigmoid()

    def forward(self, x):
        x = self.embed(x)
        _, (h, _) = self.lstm(x)
        x = h[-1]
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.out(x)
        return x.squeeze()

# -------------------------------------------------------------
# 7. 학습 루프
# -------------------------------------------------------------
device = torch.device("mps")

model = SentimentLSTM(
    vocab_size=sp.get_piece_size(),
    embedding_dim=128,
    hidden_dim=32
).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

def accuracy(pred, target):
    pred = torch.round(pred)
    return (pred == target).float().mean()

EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    total_acc = 0

    for Xb, yb in train_loader:
        Xb = Xb.to(device)
        yb = yb.to(device)

        pred = model(Xb)
        loss = criterion(pred, yb)
        acc = accuracy(pred, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()

    model.eval()
    val_loss = 0
    val_acc = 0

    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb = Xb.to(device)
            yb = yb.to(device)

            pred = model(Xb)
            loss = criterion(pred, yb)
            acc = accuracy(pred, yb)

            val_loss += loss.item()
            val_acc += acc.item()

    print(f"Epoch {epoch+1} | Train Loss {total_loss/len(train_loader):.4f} | "
          f"Train Acc {total_acc/len(train_loader):.4f} | "
          f"Val Loss {val_loss/len(val_loader):.4f} | "
          f"Val Acc {val_acc/len(val_loader):.4f}")


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: nsmc_corpus.txt
  input_format: 
  model_prefix: spm_nsmc
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differ

Epoch 1 | Train Loss 0.6919 | Train Acc 0.5099 | Val Loss 0.6940 | Val Acc 0.5033
Epoch 2 | Train Loss 0.6871 | Train Acc 0.5204 | Val Loss 0.6612 | Val Acc 0.6231
Epoch 3 | Train Loss 0.6482 | Train Acc 0.6339 | Val Loss 0.5884 | Val Acc 0.7068
Epoch 4 | Train Loss 0.4464 | Train Acc 0.7980 | Val Loss 0.3871 | Val Acc 0.8273
Epoch 5 | Train Loss 0.3332 | Train Acc 0.8589 | Val Loss 0.3573 | Val Acc 0.8444
Epoch 6 | Train Loss 0.2871 | Train Acc 0.8816 | Val Loss 0.3661 | Val Acc 0.8454
Epoch 7 | Train Loss 0.2513 | Train Acc 0.8989 | Val Loss 0.3719 | Val Acc 0.8446
Epoch 8 | Train Loss 0.2174 | Train Acc 0.9155 | Val Loss 0.3880 | Val Acc 0.8436
Epoch 9 | Train Loss 0.1856 | Train Acc 0.9304 | Val Loss 0.4094 | Val Acc 0.8395
Epoch 10 | Train Loss 0.1580 | Train Acc 0.9434 | Val Loss 0.4458 | Val Acc 0.8386
Epoch 11 | Train Loss 0.1335 | Train Acc 0.9542 | Val Loss 0.4802 | Val Acc 0.8360
Epoch 12 | Train Loss 0.1152 | Train Acc 0.9624 | Val Loss 0.5276 | Val Acc 0.8340
Epoch 13 | Tr

In [3]:
# -------------------------------------------------------------
# 8. 테스트/검증 성능 비교
# -------------------------------------------------------------
test_ds = TensorDataset(X_test_pad, y_test)
test_loader = DataLoader(test_ds, batch_size=64)

def evaluate(loader):
    model.eval()
    total_loss = 0.0
    total_acc = 0.0
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)
            acc = accuracy(preds, yb)
            total_loss += loss.item()
            total_acc += acc.item()
    return total_loss / len(loader), total_acc / len(loader)

val_loss, val_acc = evaluate(val_loader)
test_loss, test_acc = evaluate(test_loader)

print(f"Validation - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
print(f"Test        - Loss: {test_loss:.4f}, Acc: {test_acc:.4f}")


Validation - Loss: 0.7171, Acc: 0.8287
Test        - Loss: 0.7436, Acc: 0.8222


In [4]:
# -------------------------------------------------------------
# 9. Komoran 기반 토크나이저 모델 (SentencePiece 결과와 비교용)
# -------------------------------------------------------------
from collections import Counter
try:
    from konlpy.tag import Komoran
except ImportError as exc:
    raise RuntimeError("konlpy가 필요합니다. 먼저 !pip install konlpy 를 실행하세요.") from exc

komoran = Komoran()
PAD_ID_KOM = 0
UNK_ID_KOM = 1
MIN_FREQ = 5

def komoran_tokenize(text):
    return komoran.morphs(str(text))

token_counter = Counter()
for doc in train_df['document']:
    token_counter.update(komoran_tokenize(doc))

komoran_vocab = {"<pad>": PAD_ID_KOM, "<unk>": UNK_ID_KOM}
for token, freq in token_counter.items():
    if freq >= MIN_FREQ and token not in komoran_vocab:
        komoran_vocab[token] = len(komoran_vocab)

def komoran_encode(text):
    ids = [komoran_vocab.get(tok, UNK_ID_KOM) for tok in komoran_tokenize(text)]
    return torch.tensor(ids[:MAX_LEN], dtype=torch.long)

X_train_kom_seq = [komoran_encode(t) for t in train_df['document']]
X_test_kom_seq = [komoran_encode(t) for t in test_df['document']]

X_train_kom_pad = pad_sequence(
    [s[:MAX_LEN] for s in X_train_kom_seq],
    batch_first=True,
    padding_value=PAD_ID_KOM
)
X_test_kom_pad = pad_sequence(
    [s[:MAX_LEN] for s in X_test_kom_seq],
    batch_first=True,
    padding_value=PAD_ID_KOM
)

X_train_k, X_val_k, y_train_k, y_val_k = train_test_split(
    X_train_kom_pad, y_train, test_size=0.2, random_state=42
)

train_loader_kom = DataLoader(TensorDataset(X_train_k, y_train_k), batch_size=64, shuffle=True)
val_loader_kom = DataLoader(TensorDataset(X_val_k, y_val_k), batch_size=64)
test_loader_kom = DataLoader(TensorDataset(X_test_kom_pad, y_test), batch_size=64)

komoran_model = SentimentLSTM(
    vocab_size=len(komoran_vocab),
    embedding_dim=128,
    hidden_dim=32
).to(device)

kom_optimizer = optim.Adam(komoran_model.parameters(), lr=1e-3)

def run_epoch(model, loader, optimizer=None):
    is_train = optimizer is not None
    model.train() if is_train else model.eval()
    total_loss, total_acc = 0.0, 0.0
    ctx = torch.enable_grad() if is_train else torch.no_grad()
    with ctx:
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)
            acc = accuracy(preds, yb)
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            total_acc += acc.item()
    return total_loss / len(loader), total_acc / len(loader)

for epoch in range(5):
    train_loss, train_acc = run_epoch(komoran_model, train_loader_kom, kom_optimizer)
    val_loss, val_acc = run_epoch(komoran_model, val_loader_kom)
    print(f'[Komoran] Epoch {epoch+1} | Train {train_loss:.4f}/{train_acc:.4f} | Val {val_loss:.4f}/{val_acc:.4f}')

val_loss_k, val_acc_k = run_epoch(komoran_model, val_loader_kom)
test_loss_k, test_acc_k = run_epoch(komoran_model, test_loader_kom)
print(f'[Komoran] Validation - Loss: {val_loss_k:.4f}, Acc: {val_acc_k:.4f}')
print(f'[Komoran] Test        - Loss: {test_loss_k:.4f}, Acc: {test_acc_k:.4f}')


The operation couldn’t be completed. Unable to locate a Java Runtime.
Please visit http://www.java.com for information on installing Java.



CalledProcessError: Command '['/usr/libexec/java_home']' returned non-zero exit status 1.