In [67]:
import pandas as pd

kingtotal = pd.read_csv('josuncsv/king_0.csv', sep=';')
kingtotal

from kiwipiepy import Kiwi

kiwi = Kiwi()

def kiwi_preprocess(text):
    tokens = kiwi.tokenize(text)
    return ' '.join([str(token) for token in tokens])

kingtotal['text_kiwi'] = kingtotal['text'].apply(kiwi_preprocess)

In [68]:
targetDF = kingtotal.label
targetDF

0       0
1       0
2       0
3       0
4       0
       ..
2247    0
2248    0
2249    0
2250    0
2251    0
Name: label, Length: 2252, dtype: int64

In [69]:
featureDF = kingtotal[['text','text_kiwi']]
featureDF

Unnamed: 0,text,text_kiwi
0,임금이 수창궁(壽昌宮)에 거둥하였다.,"Token(form='임금', tag='NNG', start=0, len=2) To..."
1,이조 전서(吏曹典書) 유양(柳亮) ...,"Token(form='이조', tag='NNP', start=14, len=2) T..."
2,임금이 탄생일이므로 여러 신하들의 조하(朝賀)를 받고 사형(死刑)과 유형(流刑) 이...,"Token(form='임금', tag='NNG', start=0, len=2) To..."
3,우현보(禹玄寶)·이색(李穡)·설장수(偰長壽) 등 30인을 외방(外方)에 종편(從便...,"Token(form='우현', tag='NNP', start=1, len=2) To..."
4,공부 상정 도감(貢賦詳定都監)159) 에서 상서(上書...,"Token(form='공부', tag='NNG', start=0, len=2) To..."
...,...,...
2247,"사노(私奴) 오마대(吾麻大)가 그 아버지를 구타하였으므로, 명하여 이를 목 베게 하였다.","Token(form='사노', tag='NNG', start=0, len=2) To..."
2248,비가 내리었다.,"Token(form='비', tag='NNG', start=0, len=1) Tok..."
2249,비가 내리었다.,"Token(form='비', tag='NNG', start=0, len=1) Tok..."
2250,"임금이 서강(西江)에 거둥하였는데, 경상도의 조선(漕船) 20척이 도착 정박하였다.","Token(form='임금', tag='NNG', start=0, len=2) To..."


In [70]:
featureDF = featureDF.reset_index(drop=True)
targetDF = targetDF.reset_index(drop=True)

In [71]:
def yield_tokens(data_iter):
  for _, text in data_iter:
    tokens = kiwi.tokenize(text)
    yield tokens


In [72]:
tokenizer = kiwi_preprocess


In [73]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

In [74]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")


In [75]:
# 데이터셋 만들기
class KingDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        text_idxs = [word_to_idx[word] for word in kiwi_preprocess(text).split()]
        text_idxs = torch.tensor(text_idxs, dtype=torch.long)
        return text_idxs, torch.tensor(label, dtype=torch.long)

    @staticmethod
    def collate_fn(batch):
        texts, labels = zip(*batch)
        padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
        labels = torch.stack(labels)
        return padded_texts, labels

In [76]:
X = featureDF['text_kiwi']
y = targetDF
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [77]:
train_dataset = KingDataset(list(X_train), list(y_train))
val_dataset = KingDataset(list(X_val), list(y_val))
test_dataset = KingDataset(list(X_test), list(y_test))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=val_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)

In [78]:

# 단어 집합(vocabulary) 만들기
word_set = set()
for text in X_train:
    word_set.update(kiwi_preprocess(text).split())

word_to_idx = {word: idx for idx, word in enumerate(word_set)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

train_dataset = KingDataset(X_train, y_train)
val_dataset = KingDataset(X_val, y_val)
test_dataset = KingDataset(X_test, y_test)

In [79]:
# 데이터로더 만들기
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=val_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)


In [80]:
# 모델 정의
class KingClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(KingClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim, sparse=True)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        embedded = self.embedding(text)
        _, hidden = self.rnn(embedded)
        output = self.fc(hidden[-1])
        return output

In [81]:

# 하이퍼파라미터 설정
vocab_size = len(word_set)
embedding_dim = 100
hidden_dim = 128
num_classes = len(set(targetDF))

In [82]:
# 모델 인스턴스 생성
model = KingClassifier(vocab_size, embedding_dim, hidden_dim, num_classes).to(device)

# 옵티마이저와 손실 함수 정의
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [83]:
print("Text indices:", X_train.index)
print("Label indices:", y_train.index)

print("Text indices:", X_val.index)
print("Label indices:", y_val.index)

print("Text indices:", X_test.index)
print("Label indices:", y_test.index)

Text indices: Index([2019, 1300,  335, 1934, 1736,  476, 1697,  635,  993,   33,
       ...
        305,  202, 1858,  391, 1744,  894, 2105, 2239,  407,  347],
      dtype='int64', length=1440)
Label indices: Index([2019, 1300,  335, 1934, 1736,  476, 1697,  635,  993,   33,
       ...
        305,  202, 1858,  391, 1744,  894, 2105, 2239,  407,  347],
      dtype='int64', length=1440)
Text indices: Index([ 434,  201, 1917, 1176, 2204, 2220, 1170, 2013, 1398, 1564,
       ...
        568,  439, 1722,  802, 1728,  871,  379,  768,  373, 1682],
      dtype='int64', length=361)
Label indices: Index([ 434,  201, 1917, 1176, 2204, 2220, 1170, 2013, 1398, 1564,
       ...
        568,  439, 1722,  802, 1728,  871,  379,  768,  373, 1682],
      dtype='int64', length=361)
Text indices: Index([1596,  643,  807, 1171,  937, 1288,   56, 2154, 1269, 1138,
       ...
        316, 1842, 1877,  427,  374,  105, 1196,  121,  173, 1244],
      dtype='int64', length=451)
Label indices: Index([1596,  64

In [84]:
X_train = list(X_train)
y_train = list(y_train)
X_val = list(X_val)
y_val = list(y_val)
X_test = list(X_test)
y_test = list(y_test)

In [85]:
train_dataset = KingDataset(X_train, y_train)
val_dataset = KingDataset(X_val, y_val)
test_dataset = KingDataset(X_test, y_test)

In [86]:
def train(model, train_loader, val_loader, num_epochs):
    best_val_acc = 0.0
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_acc = 0.0
        for batch_idx, (data, labels) in enumerate(train_loader):
            data, labels = data.to(device), labels.to(device)  # 데이터를 mps로 이동
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            running_acc += (preds == labels).sum().item()
            if batch_idx % 100 == 99:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{len(train_loader)}, Train Loss: {running_loss/100:.4f}, Train Accuracy: {running_acc/((batch_idx+1)*batch_size):.4f}')
                running_loss = 0.0
                running_acc = 0.0
        val_loss, val_acc = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')

In [87]:
def evaluate(model, data_loader, device):
    model.eval()
    running_loss = 0.0
    running_acc = 0.0
    with torch.no_grad():
        for data, labels in data_loader:
            data, labels = data.to(device), labels.to(device)  # 데이터를 mps로 이동
            outputs = model(data)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            running_acc += (preds == labels).sum().item()
    epoch_loss = running_loss / len(data_loader)
    epoch_acc = running_acc / len(data_loader.dataset)
    return epoch_loss, epoch_acc

In [88]:
# 모델 학습
num_epochs = 10
train(model, train_loader, val_loader, num_epochs)

# 테스트 평가
test_loss, test_acc = evaluate(model, test_loader, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

KeyError: 1176

KeyError: 926

ValueError: `analyze` requires an instance of `str` or an iterable of `str`.