In [2]:
!pip install PyKomoran

Collecting PyKomoran
[?25l  Downloading https://files.pythonhosted.org/packages/23/b0/ce6a46f311651ed64c39beb1cfe1c39a9906521139ace45430d08c489b62/PyKomoran-0.1.5-py3-none-any.whl (7.9MB)
[K     |████████████████████████████████| 7.9MB 6.3MB/s 
[?25hCollecting py4j==0.10.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/04/de/2d314a921ef4c20b283e1de94e0780273678caac901564df06b948e4ba9b/py4j-0.10.8.1-py2.py3-none-any.whl (196kB)
[K     |████████████████████████████████| 204kB 60.2MB/s 
[?25hInstalling collected packages: py4j, PyKomoran
Successfully installed PyKomoran-0.1.5 py4j-0.10.8.1


In [0]:
import json
from PyKomoran import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, fbeta_score
import torch
import numpy as np
import random

In [0]:
def set_seed(seed=777):
  """Seed every random number generator this notebook relies on.

  Args:
    seed: Integer seed applied to Python's `random`, NumPy and PyTorch
      (CPU, plus all CUDA devices when CUDA is available). Defaults to 777,
      the value this notebook has always used.
  """
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed()

In [0]:
# Speech-act classes; a label's position in this list is its integer class id.
label_list = [
    'opening', 'request', 'wh-question', 'yn-question',
    'inform', 'affirm', 'ack', 'expressive',
]
label_map = dict(zip(label_list, range(len(label_list))))

In [0]:
komoran = Komoran("EXP")

with open('./SpeechAct_tr.json', 'r', encoding='utf-8') as read_file:
    data_tr = json.load(read_file)

# One morpheme string and one raw label per utterance, kept index-aligned.
tfidf_word_tr = []
tfidf_label_tr = []
for key_tr in data_tr:
  # Skip entries whose value is 0; only list-like values are iterated below.
  if data_tr[key_tr] == 0:
    continue
  for utterance_tr in data_tr[key_tr]:
    # utterance[1] is the text handed to the tagger, utterance[2] its label.
    # Only morphemes tagged NNP / NNG / VV are kept and joined with spaces.
    komoran_text_tr = ' '.join(komoran.get_morphes_by_tags(utterance_tr[1],
                                                        tag_list=['NNP', 'NNG', 'VV']))
    tfidf_word_tr.append(komoran_text_tr)
    komoran_label_tr = utterance_tr[2]
    tfidf_label_tr.append(komoran_label_tr)

# Unlike scikit-learn's default token pattern, this one also keeps
# single-character tokens.
tfidfvect_tr = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b', stop_words=None)
# Fit once, then transform once (previously the corpus was transformed twice
# and the first result thrown away).
tfidfvect_tr.fit(tfidf_word_tr)
tfidf_matrix_tr = tfidfvect_tr.transform(tfidf_word_tr).toarray()

# Drop utterances whose label is not in label_map from BOTH the labels and the
# feature rows, so the two lists can never fall out of alignment.
kept_rows_tr = [row for row, raw_label in enumerate(tfidf_label_tr) if raw_label in label_map]
train_label_list = [label_map[tfidf_label_tr[row]] for row in kept_rows_tr]
train_tfidf_list = tfidf_matrix_tr[kept_rows_tr].tolist()

In [0]:
with open('./SpeechAct_te.json', 'r', encoding='utf-8') as read_file:
    data_te = json.load(read_file)

# One morpheme string and one raw label per utterance, kept index-aligned.
tfidf_word_te = []
tfidf_label_te = []
for key_te in data_te:
  # Skip entries whose value is 0; only list-like values are iterated below.
  if data_te[key_te] == 0:
    continue
  for utterance_te in data_te[key_te]:
    # utterance[1] is the text handed to the tagger, utterance[2] its label.
    komoran_text_te = ' '.join(komoran.get_morphes_by_tags(utterance_te[1],
                                                        tag_list=['NNP', 'NNG', 'VV']))
    tfidf_word_te.append(komoran_text_te)
    komoran_label_te = utterance_te[2]
    tfidf_label_te.append(komoran_label_te)

# The test split must be encoded with the vocabulary and IDF weights learned
# from the TRAINING corpus. The vectorizer fitted in the training cell already
# holds exactly that, so reuse it instead of refitting an identical copy.
# The old name is kept for any code that refers to it.
tfidfvect_te = tfidfvect_tr
tfidf_matrix_te = tfidfvect_te.transform(tfidf_word_te).toarray()

# Drop utterances whose label is not in label_map from BOTH the labels and the
# feature rows, so the two lists can never fall out of alignment.
kept_rows_te = [row for row, raw_label in enumerate(tfidf_label_te) if raw_label in label_map]
test_label_list = [label_map[tfidf_label_te[row]] for row in kept_rows_te]
test_tfidf_list = tfidf_matrix_te[kept_rows_te].tolist()

In [8]:
# Sanity check: each split should show the same count for features and labels.
for split_features, split_labels in ((train_tfidf_list, train_label_list),
                                     (test_tfidf_list, test_label_list)):
  print(len(split_features), len(split_labels))

5825 5825
6671 6671


In [0]:
# Turn the Python lists into tensors; torch.tensor infers each dtype from the data.
(train_tfidf_tensor,
 train_label_tensor,
 test_tfidf_tensor,
 test_label_tensor) = map(torch.tensor, (train_tfidf_list,
                                         train_label_list,
                                         test_tfidf_list,
                                         test_label_list))

In [10]:
# Show the shape of every tensor, in train-features / train-labels /
# test-features / test-labels order.
for split_tensor in (train_tfidf_tensor, train_label_tensor,
                     test_tfidf_tensor, test_label_tensor):
  print(split_tensor.shape)

torch.Size([5825, 743])
torch.Size([5825])
torch.Size([6671, 743])
torch.Size([6671])


In [0]:
class Perceptron(torch.nn.Module):
  """Single linear layer that maps a TF-IDF vector to one raw score per label."""

  def __init__(self, tfidf_size, num_label):
    """Build the layer.

    Args:
      tfidf_size: Number of input features (length of one TF-IDF vector).
      num_label: Number of output scores (one per class).
    """
    super().__init__()
    self.linear = torch.nn.Linear(in_features=tfidf_size, out_features=num_label)

  def forward(self, tfidf_input):
    """Return the linear layer's output for `tfidf_input` (no activation applied)."""
    return self.linear(tfidf_input)

In [14]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# train_tfidf_tensor shape = (number of utterances, tfidf_size)
model = Perceptron(train_tfidf_tensor.size(1), len(label_list))

# Move the model to the selected device; the returned module is the cell's output.
model.to(device)

Perceptron(
  (linear): Linear(in_features=743, out_features=8, bias=True)
)

In [0]:
# Adam updates every model parameter; cross-entropy is the training loss.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

In [0]:
# Bundle each split's inputs and labels into a single TensorDataset.
Train_dataset = torch.utils.data.TensorDataset(train_tfidf_tensor, train_label_tensor)
Test_dataset = torch.utils.data.TensorDataset(test_tfidf_tensor, test_label_tensor)

# The DataLoaders hand the data out batch_size items at a time.
# Training batches are shuffled; test batches keep the dataset order.
train_DataLoader = torch.utils.data.DataLoader(dataset=Train_dataset, batch_size=4, shuffle=True)
test_DataLoader = torch.utils.data.DataLoader(dataset=Test_dataset, batch_size=1, shuffle=False)

In [19]:
# Train
model.train()
model.zero_grad()
for epoch in range(500):
  # Running sum of the per-batch losses for this epoch.
  epoch_loss = 0
  for tfidf_batch, label_batch in train_DataLoader:
    # Each batch is (tfidf data, label); move both to the model's device.
    tfidf_batch = tfidf_batch.to(device)
    label_batch = label_batch.to(device)

    batch_loss = criterion(model(tfidf_batch), label_batch)
    epoch_loss += batch_loss.item()

    # Backpropagate, update the weights, then clear gradients for the next batch.
    batch_loss.backward()
    optimizer.step()
    model.zero_grad()
  # Report every 10th epoch (the printed index is zero-based: 9, 19, ...).
  if epoch % 10 == 9:
    print(epoch, epoch_loss)
# Leave training mode; the returned module is the cell's output.
model.eval()

9 1103.7283244729042
19 958.6925388872623
29 903.0958309993148
39 872.0457537174225
49 851.3249836042523
59 837.1129299998283
69 826.3753804266453
79 818.3898889869452
89 812.0431623831391
99 805.9857422597706
109 802.1300911083817
119 797.4468386806548
129 795.1456507565454
139 792.2450955696404
149 788.5901269316673
159 786.6082487776875
169 785.295797586441
179 782.9158060178161
189 781.4034805325791
199 780.4541671387851
209 779.0619646348059
219 777.9139478579164
229 776.5090017709881
239 776.1416970267892
249 775.208323545754
259 774.3706013076007
269 773.6660101357847
279 772.7106293775141
289 772.2353050597012
299 771.6814183667302
309 771.2304613254964
319 770.4629209861159
329 770.2456400748342
339 769.7179885618389
349 770.1528772823513
359 769.3615378402174
369 768.7022183034569
379 768.3300602324307
389 768.3663168661296
399 767.8026814833283
409 767.5243470594287
419 767.3848301488906
429 766.9996184678748
439 766.8820808157325
449 766.8889205269516
459 766.5046111643314


Perceptron(
  (linear): Linear(in_features=743, out_features=8, bias=True)
)

In [0]:
# Test
model.eval()
pred_batches = []
label_batches = []
# Inference only: gradients are not tracked anywhere in this loop.
with torch.no_grad():
  for batch in test_DataLoader:
    # batch : [tfidf_data, label]
    batch = tuple(t.to(device) for t in batch)
    y_pred = model(batch[0])

    pred_batches.append(y_pred.cpu().numpy())
    label_batches.append(batch[1].cpu().numpy())

# Join the per-batch arrays once at the end. Calling np.append inside the loop
# recopied everything gathered so far on every batch.
label = np.concatenate(label_batches, axis=0)
# The predicted class is the index of the highest score in each row.
pred = np.argmax(np.concatenate(pred_batches, axis=0), axis=1)

In [30]:
# Print "True" when there is exactly one prediction per test label.
if pred.shape[0] == test_label_tensor.shape[0]:
  print("True")

True


In [0]:
def eval(true, pred):
    """Score predictions with accuracy plus macro/micro precision, recall and F1.

    NOTE: this function's name shadows Python's built-in `eval` in this notebook.

    Args:
        true: Ground-truth class ids.
        pred: Predicted class ids, in the same order as `true`.

    Returns:
        A tuple `(acc, precision, recall, f1)`. `acc` is the accuracy score;
        each of the other three is a two-item list `[macro, micro]`.
    """
    acc = accuracy_score(true, pred)
    scorers = (precision_score, recall_score, f1_score)
    # For every metric, evaluate the macro average first and the micro average second.
    precision, recall, f1 = (
        [scorer(true, pred, average=mode) for mode in ('macro', 'micro')]
        for scorer in scorers
    )
    return acc, precision, recall, f1

In [0]:
# Confusion matrix of the test labels against the predicted class ids.
conf_mat = confusion_matrix(y_true=test_label_tensor, y_pred=pred)

In [0]:
# Accuracy plus [macro, micro] precision / recall / F1 on the test split.
evaluation = eval(true=test_label_tensor, pred=pred)

In [35]:
# Display the (accuracy, precision, recall, f1) tuple returned by eval().
evaluation

(0.7949332933593164,
 [0.7436745887350917, 0.7949332933593164],
 [0.732319876517449, 0.7949332933593164],
 [0.7307183367455284, 0.7949332933593164])

In [37]:
save_file_name = '2019711752_윤민형_MLP'
with open('./'+save_file_name+'.txt', 'w', encoding='utf-8', newline='') as writer_text:
    # evaluation is (acc, precision, recall, f1); accuracy is skipped here and
    # each remaining entry is a [macro, micro] pair.
    # The names are kept in `metric_names` so the built-in `list` is no longer
    # rebound for the rest of the notebook.
    metric_names = ['precision', 'recall', 'f1-score']
    for metric_name, (macro_score, micro_score) in zip(metric_names, evaluation[1:]):
        # Scores are written as percentages, one blank line after each metric.
        writer_text.write('Macro average ' + metric_name + ' : '
                          + str(macro_score*100) + '%' + '\n')
        writer_text.write('Micro average ' + metric_name + ' : '
                          + str(micro_score*100) + '%' + '\n\n')
    print("[저장 완료]")

[저장 완료]
