In [41]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

In [42]:
import pandas as pd

kingtotal = pd.read_csv('josuncsv/king_0.csv', sep=';')
kingtotal

from kiwipiepy import Kiwi

kiwi = Kiwi()

def kiwi_preprocess(text):
    tokens = kiwi.tokenize(text)
    return ' '.join([str(token) for token in tokens])

kingtotal['text_kiwi'] = kingtotal['text'].apply(kiwi_preprocess)

In [43]:
kingtotal

Unnamed: 0,label,year,month,day,text,text_kiwi
0,0,1년,10월,10일,임금이 수창궁(壽昌宮)에 거둥하였다.,"Token(form='임금', tag='NNG', start=0, len=2) To..."
1,0,1년,10월,10일,이조 전서(吏曹典書) 유양(柳亮) ...,"Token(form='이조', tag='NNP', start=14, len=2) T..."
2,0,1년,10월,11일,임금이 탄생일이므로 여러 신하들의 조하(朝賀)를 받고 사형(死刑)과 유형(流刑) 이...,"Token(form='임금', tag='NNG', start=0, len=2) To..."
3,0,1년,10월,12일,우현보(禹玄寶)·이색(李穡)·설장수(偰長壽) 등 30인을 외방(外方)에 종편(從便...,"Token(form='우현', tag='NNP', start=1, len=2) To..."
4,0,1년,10월,12일,공부 상정 도감(貢賦詳定都監)159) 에서 상서(上書...,"Token(form='공부', tag='NNG', start=0, len=2) To..."
...,...,...,...,...,...,...
2247,0,7년,윤5월,7일,"사노(私奴) 오마대(吾麻大)가 그 아버지를 구타하였으므로, 명하여 이를 목 베게 하였다.","Token(form='사노', tag='NNG', start=0, len=2) To..."
2248,0,7년,윤5월,7일,비가 내리었다.,"Token(form='비', tag='NNG', start=0, len=1) Tok..."
2249,0,7년,윤5월,9일,비가 내리었다.,"Token(form='비', tag='NNG', start=0, len=1) Tok..."
2250,0,7년,윤5월,9일,"임금이 서강(西江)에 거둥하였는데, 경상도의 조선(漕船) 20척이 도착 정박하였다.","Token(form='임금', tag='NNG', start=0, len=2) To..."


In [55]:
len(kingtotal.text_kiwi[1])

3607

In [44]:
from sklearn.model_selection import train_test_split
tokenizer = Kiwi()

In [45]:
targetDF = kingtotal.label
targetDF

0       0
1       0
2       0
3       0
4       0
       ..
2247    0
2248    0
2249    0
2250    0
2251    0
Name: label, Length: 2252, dtype: int64

In [46]:
featureDF = kingtotal.text
featureDF

0                                    임금이 수창궁(壽昌宮)에 거둥하였다.
1                     이조 전서(吏曹典書)              유양(柳亮) ...
2       임금이 탄생일이므로 여러 신하들의 조하(朝賀)를 받고 사형(死刑)과 유형(流刑) 이...
3        우현보(禹玄寶)·이색(李穡)·설장수(偰長壽) 등 30인을 외방(外方)에 종편(從便...
4       공부 상정 도감(貢賦詳定都監)159)                  에서 상서(上書...
                              ...                        
2247    사노(私奴) 오마대(吾麻大)가 그 아버지를 구타하였으므로, 명하여 이를 목 베게 하였다.
2248                                             비가 내리었다.
2249                                             비가 내리었다.
2250       임금이 서강(西江)에 거둥하였는데, 경상도의 조선(漕船) 20척이 도착 정박하였다.
2251    사신인 내사(內史) 노타내(盧他乃) 등이 중국 서울로 돌아가니, 임금이 여러 신하를...
Name: text, Length: 2252, dtype: object

In [47]:
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                    targetDF,
                                                    random_state=7,
                                                    test_size=0.3)

In [48]:
X_train

1645                  도평의사사(都評議使司)에 명하여, 서반 6품과 동반 7품 ...
1336    이조(吏曹)에서 조종(祖宗)을 현양하고 배필(配匹)을 중히 하기를 청하였다. "1....
1681          이달에 큰비가 내려 경상도에 물로 손실된 밭이 거의 1만 결(結)이나 되었다.
13                                         큰비가 오고 천둥이 쳤다.
389                                왜적(倭賊)이 교동(喬桐)에 침구하였다.
                              ...                        
211                                      목가(木稼)083)  하였다.
1603    간관(諫官)이 상언(上言)하였다. "배표(拜表)하는 예를 초지(草地)에서 행할 수 ...
537     내전에서 1백 8명의 중에게 밥을 먹이어 국사(國師)의 봉숭례(封崇禮)를 행하여, ...
1220     합문 사인(閤門舍人) 권효(權曉)를 경상도에 보내서 최운해(崔雲海)에게 궁온(宮醞...
175                   대사헌              남재(南在) 등이 상언(上言...
Name: text, Length: 1576, dtype: object

In [49]:
tokenizer = Kiwi()

In [50]:
def yield_tokens(data_iter):
    for text in data_iter:
        # print(text)
        yield iter(Okt.nouns(text))        

In [51]:
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<pad>"])
len(vocab)

NameError: name 'train_iter' is not defined

TypeError: build_vocab_from_iterator() got an unexpected keyword argument 'key'

TypeError: '<' not supported between instances of 'kiwipiepy.Token' and 'kiwipiepy.Token'

In [None]:
vocab(["<pad>", "<unk>", "here", "is", "an", "example"])

[0, 1, 476, 22, 31, 5298]

In [None]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1  # 0~3으로 변환

In [None]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device = 'cpu'
device

'cpu'

In [None]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for label, text in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [None]:
batch_size = 64
train_loader = DataLoader(train_iter, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_iter, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [None]:
for a in train_loader:
    print(a)
    break
for b in test_loader:
    print(b)
    break

(tensor([3, 3, 1, 3, 1, 1, 1, 3, 0, 1, 3, 0, 0, 0, 3, 2, 0, 3, 0, 1, 2, 3, 0, 0,
        3, 2, 1, 3, 1, 2, 0, 0, 3, 2, 0, 2, 0, 2, 3, 0, 3, 0, 0, 3, 1, 3, 0, 0,
        2, 3, 2, 3, 0, 1, 3, 0, 2, 2, 0, 0, 2, 3, 3, 2]), tensor([ 100, 1169, 3631,  ...,    2,    2,    2]), tensor([   0,   62,   97,  153,  221,  261,  310,  348,  407,  459,  513,  553,
         598,  635,  722,  768,  828,  875,  907,  989, 1053, 1099, 1149, 1167,
        1210, 1251, 1305, 1357, 1383, 1426, 1454, 1484, 1539, 1561, 1594, 1631,
        1664, 1701, 1749, 1787, 1809, 1843, 1868, 1914, 1953, 2004, 2042, 2086,
        2129, 2197, 2240, 2296, 2334, 2380, 2430, 2497, 2542, 2595, 2631, 2666,
        2715, 2757, 2792, 2829]))
(tensor([2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 3, 1, 1, 0, 3,
        0, 1, 0, 1, 0, 3, 2, 3, 0, 0, 2, 2, 1, 1, 1, 3]), tensor([ 870,   12,   84,  ..., 5499, 1330,    2]), tensor([   0,   28,   87

In [None]:
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
embed_dim = 300
num_epochs = 5
lr = 0.001

print(f"num_class : {num_class}    vocab_size : {vocab_size}")


num_class : 4    vocab_size : 95812


In [None]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super(TextClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.rnn = nn.RNN(embed_dim, hidden_dim)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [None]:
from torch import nn


class SentenceClassifier(nn.Module):
    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layer,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm",
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0
        )
        if model_type == "rnn":
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layer,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layer,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )

        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, num_class)
        else:
            self.classifier = nn.Linear(hidden_dim, num_class)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits


In [None]:
n_vocab = len(vocab)
hidden_dim = 64
embedding_dim = 128
n_layer = 2

classifier = SentenceClassifier(
    n_vocab=n_vocab, 
    hidden_dim=hidden_dim, 
    embedding_dim=embedding_dim, 
    n_layer=n_layer
).to(device)

classifier = TextClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_class=num_class
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [None]:
from torchinfo import summary

summary(classifier)


Layer (type:depth-idx)                   Param #
TextClassifier                           --
├─EmbeddingBag: 1-1                      28,743,600
├─RNN: 1-2                               23,424
├─Linear: 1-3                            1,204
Total params: 28,768,228
Trainable params: 28,768,228
Non-trainable params: 0

In [None]:
def train(model, dataloader):
    model.train()
    total_loss = 0
    total_acc = 0
    total_samples = 0

    for idx, (labels, texts, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(texts, offsets)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_loss += loss.item()
        acc = (outputs.argmax(1) == labels).float().sum()
        total_acc += acc.item()
        total_samples += labels.size(0)

    return total_loss / total_samples, total_acc / total_samples

def evaluate(model, dataloader, is_training=False):
    model.eval()
    total_loss = 0
    total_acc = 0
    total_samples = 0
    with torch.no_grad():
        for labels, texts, offsets in dataloader:
            outputs = model(texts, offsets)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * labels.size(0)
            acc = (outputs.argmax(1) == labels).float().sum()
            total_acc += acc.item()
            total_samples += labels.size(0)
    return total_loss / total_samples, total_acc / total_samples



In [None]:
def predict(model, text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text), dtype=torch.int64).to(device)
        text = text.unsqueeze(0)
        offsets = torch.tensor([0]).to(device)
        predicted_label = model(text, offsets)
        return predicted_label.argmax(1).item() + 1

In [None]:
best_valid_acc = 0
for epoch in range(num_epochs):
    train_loss, train_acc = train(classifier, train_loader)
    valid_loss, valid_acc = evaluate(classifier, test_loader)  # valid_loader 대신 test_loader 사용
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(classifier.state_dict(), 'best_model.pth')
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_acc:.4f}')

# 모델 평가
classifier.load_state_dict(torch.load('best_model.pth'))
test_loss, test_acc = evaluate(classifier, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 1, Train Loss: 0.0005, Train Acc: 0.9889, Valid Loss: 0.4608, Valid Acc: 0.8996
Epoch: 2, Train Loss: 0.0005, Train Acc: 0.9904, Valid Loss: 0.5020, Valid Acc: 0.8957
Epoch: 3, Train Loss: 0.0004, Train Acc: 0.9920, Valid Loss: 0.5397, Valid Acc: 0.8939
Epoch: 4, Train Loss: 0.0004, Train Acc: 0.9922, Valid Loss: 0.5794, Valid Acc: 0.8925
Epoch: 5, Train Loss: 0.0003, Train Acc: 0.9933, Valid Loss: 0.6172, Valid Acc: 0.8908
Test Loss: 0.4608, Test Acc: 0.8996
