In [1]:
!pip install PyKomoran

Collecting PyKomoran
[?25l  Downloading https://files.pythonhosted.org/packages/23/b0/ce6a46f311651ed64c39beb1cfe1c39a9906521139ace45430d08c489b62/PyKomoran-0.1.5-py3-none-any.whl (7.9MB)
[K     |████████████████████████████████| 7.9MB 2.4MB/s 
[?25hCollecting py4j==0.10.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/04/de/2d314a921ef4c20b283e1de94e0780273678caac901564df06b948e4ba9b/py4j-0.10.8.1-py2.py3-none-any.whl (196kB)
[K     |████████████████████████████████| 204kB 28.6MB/s 
[?25hInstalling collected packages: py4j, PyKomoran
Successfully installed PyKomoran-0.1.5 py4j-0.10.8.1


In [0]:
import json
from PyKomoran import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, fbeta_score
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import random

In [0]:
def set_seed():
  random.seed(777)
  np.random.seed(777)
  torch.manual_seed(777)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(777)

set_seed()

In [0]:
# Train Data 와 Test Data 의 발화를 형태소 분석
komoran = Komoran("EXP")

with open('SpeechAct_tr.json', 'r', encoding='utf-8') as read_file:
    data_tr = json.load(read_file)
with open('SpeechAct_te.json', 'r', encoding='utf-8') as read_file:
    data_te = json.load(read_file)

In [0]:
docs_tr = []
words_tr = []
voca_tr = []
for key in data_tr.keys():
    for message in data_tr[key]:
        komoran_text = komoran.get_plain_text(message[1])
        komoran_text = komoran_text.split(' ')
        for voc in komoran_text:
          voca_tr.append(voc)
        #print(komoran_text)
        words_tr.append(komoran_text)
    docs_tr.append(words_tr)

In [0]:
docs_te = []
words_te = []
for key in data_te.keys():
    for message in data_te[key]:
        komoran_text = komoran.get_plain_text(message[1])
        komoran_text = komoran_text.split(' ')
        words_te.append(komoran_text)
    docs_te.append(words_te)

In [7]:
print(len(docs_tr), len(docs_te))
print(len(words_tr), len(words_te))
print(len(voca_tr))

260 40
5825 6671
48472


In [0]:
# 형태소 분석한 Train Data 의 발화를 이용하여 word2idx 구축
words_dic = sorted(set(voca_tr))
word2index = {}
word2index['<PAD>'] = 0
word2index['<UKN>'] = 1
for idx, word in enumerate(words_dic):
  word2index[word] = idx + 2

In [0]:
# Train data label -> label2idx구축
label2idx = {'opening' : 0, 'request' : 1, 'wh-question' : 2, 'yn-question' : 3,
             'inform' : 4, 'affirm' : 5, 'ack' : 6, 'expressive' : 7}

def labellist(path):
  with open(path, 'r', encoding='utf-8') as read_file:
    data = json.load(read_file)

  label_list = []
  for idx, labels in enumerate(data.values()):
    for label in labels:
      label_list.append(label2idx[label[2]])
  return label_list

In [0]:
label_list_tr = labellist('SpeechAct_tr.json')
label_list_te = labellist('SpeechAct_te.json')

In [0]:
# emd = nn.Embedding(num_embeddings = len(word2index) ,embedding_dim = 128)
word_list_tr = []
for key in data_tr:
  if 0 < len(data_tr[key]):
    sentence_list = [i[1] for i in data_tr[key]]
    for sentence in sentence_list:
      index_word = []
      if sentence:
        o_word = komoran.get_plain_text(sentence)
        for word in o_word.split(' '):
          index_word.append(word2index[word])      
      word_list_tr.append(index_word)

In [12]:
word2index.keys()

dict_keys(['<PAD>', '<UKN>', '!/SF', "'/SS", '(/SS', ',/SP', '-/SS', './SF', '0/SO', '000/SN', '08/SN', '1/SN', '10/SN', '10분/NNP', '10월', '10일/NNP', '11/SN', '11월', '11일/NNP', '12/SN', '12월', '12일/NNP', '13/SN', '13일/NNP', '14/SN', '14일/NNP', '15/SN', '15일/NNP', '16/SN', '16일/NNP', '17일/NNP', '18/SN', '18일/NNP', '19/SN', '19일/NNP', '1일/NNP', '2/SN', '20/SN', '20일/NNP', '22일/NNP', '23/SN', '23일/NNP', '24/SN', '24일/NNP', '25일/NNP', '26일/NNP', '27/SN', '27일/NNP', '28/SN', '28일/NNP', '29일/NNP', '2시/NNP', '2일/NNP', '3/SN', '30/SN', '30일/NNP', '31/SN', '33/SN', '3월', '3일/NNP', '4/SN', '46/SN', '4월', '4월/NNP', '4일/NNP', '5/SN', '50/SN', '5월', '5일/NNP', '6/SN', '60/SN', '6월', '6일/NNP', '7/SN', '7월', '7월/NNP', '7일/NNP', '8/SN', '8월', '8일/NNP', '9/SN', '9월', '9월/NNP', '9일/NNP', '?/SF', 'CGV/SL', 'MMC/SL', 'ㄴ/ETM', 'ㄴ/JX', 'ㄴ가/EF', 'ㄴ가요/EF', 'ㄴ다/EF', 'ㄴ다고/EC', 'ㄴ다고/EF', 'ㄴ단다/EF', 'ㄴ데/EC', 'ㄴ데/EF', 'ㄴ데요/EF', 'ㄴ지/EC', 'ㄹ/ETM', 'ㄹ건데/EF', 'ㄹ걸/EC', 'ㄹ게/EC', 'ㄹ게/EF', 'ㄹ게요/EF', 'ㄹ까/EF', 'ㄹ까요/EF', 'ㄹ래/E

In [0]:
# emd = nn.Embedding(num_embeddings = len(word2index) ,embedding_dim = 128)
word_list_te = []
for key in data_te:
  sentence_list = [i[1] for i in data_te[key]]
  for sentence in sentence_list:
    index_word = []
    if sentence:
      o_word = komoran.get_plain_text(sentence)
      for word in o_word.split(' '):
        if word not in word2index.keys():
          index_word.append(word2index['<UKN>'])
        else:
          index_word.append(word2index[word])
    word_list_te.append(index_word)

In [14]:
print(len(word_list_tr))
print(len(label_list_tr))
print(len(word_list_te))
print(len(label_list_te))

5825
5825
6671
6671


In [0]:
# list를 torch.tensor로 만드려면 list의 길이가 같아야 함 그래서 0으로 패딩
max_len = 50
word_list_tr_ = np.array([i + [0] * (max_len - len(i)) for i in word_list_tr])
word_list_te_ = np.array([i + [0] * (max_len - len(i)) for i in word_list_te])

In [16]:
word_list_tr_[:5]

array([[ 746,  738, 1011, 1004,  783,  332,   84,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [ 311,    5, 1011, 1004,  783,  693,    7,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [ 746,  738,  987, 1261, 1078, 1212,  735, 1092,  802,    7,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   

In [17]:
label_list_tr[:4]

[0, 0, 1, 2]

In [0]:
word_tensor_tr = torch.tensor(word_list_tr_)
label_tensor_tr = torch.tensor(label_list_tr)
word_tensor_te = torch.tensor(word_list_te_)
label_tensor_te = torch.tensor(label_list_te)

In [19]:
print(word_tensor_tr.size(), label_tensor_tr.size())
print(word_tensor_te.size(), label_tensor_te.size())

torch.Size([5825, 50]) torch.Size([5825])
torch.Size([6671, 50]) torch.Size([6671])


In [0]:
# 6 번째 과제 , ‘Multi Layer Perceptron 을 활용한 화행 분석 ’ 참조
# 8 번째 과제 , ‘Convolution Neural Networks 를 활용한 화행 분석 (2)’ 참조
epochs = 100
dropout = 0.5
learning_rate = 0.001

class CNN(torch.nn.Module):
  def __init__(self, vocab_size, num_labels):
    super(CNN, self).__init__()
    self.word_embed = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=128, padding_idx=0)
    
    self.conv1 = nn.Conv1d(in_channels=128, out_channels=32, kernel_size=3, stride=1)
    self.conv2 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=3, stride=1)
    self.relu = nn.ReLU()
    self.maxp1 = nn.MaxPool1d(3)
    self.maxp2 = nn.MaxPool1d(12)
    
    self.dropout = nn.Dropout(dropout)
    self.fc1 = nn.Linear(16 * 3, num_labels, bias=True)

  def forward(self, inputs):
    embedded = self.word_embed(inputs).permute(0, 2, 1)
    x = self.maxp1(self.relu(self.conv1(embedded)))
    x = self.maxp2(self.relu(self.conv2(x))).squeeze(2)
    x = self.dropout(torch.cat((x,x,x), dim = 1))

    pred = self.fc1(x)

    return pred

In [199]:
# GPU 가능 여부 및 선택
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# train_tfidf_tensor shape = (발화 수, tfidf_size)
model = CNN(vocab_size = len(word2index), num_labels=8)

# model을 GPU로 이동
model.to(device)

CNN(
  (word_embed): Embedding(1277, 128, padding_idx=0)
  (conv1): Conv1d(128, 32, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(32, 16, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (maxp1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (maxp2): MaxPool1d(kernel_size=12, stride=12, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.6, inplace=False)
  (fc1): Linear(in_features=48, out_features=8, bias=True)
)

In [0]:
# Train data를 이용 CNN 모델 학습
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# TensorDataset을 이용하여 Input/output data를 하나로 묶음
Train_dataset = torch.utils.data.TensorDataset(word_tensor_tr, label_tensor_tr)
Test_dataset = torch.utils.data.TensorDataset(word_tensor_te, label_tensor_te)

# DataLoader를 선언하고 batch size 만큼 데이터를 가지고 와서 학습
# Shuffle 여부 결정
train_DataLoader = torch.utils.data.DataLoader(Train_dataset, shuffle=True, batch_size=4)
test_DataLoader = torch.utils.data.DataLoader(Test_dataset, shuffle=False, batch_size=1)

In [201]:
model.train(True)
model.zero_grad()
for epoch in range(epochs):
  epoch_loss = 0
  for batch in train_DataLoader:
    # batch : (tfidf data, label)
    batch = tuple(t.to(device) for t in batch)
    y_pred = model(batch[0])

    loss = criterion(y_pred, batch[1])
    epoch_loss += loss.item()

    loss.backward()
    optimizer.step()
    model.zero_grad()
  if (epoch+1) % 10 == 0:
    print(epoch, epoch_loss)
model.train(False)

9 705.4591852826998
19 642.4179804844316
29 683.8232677261985
39 622.4573948900797
49 698.0531187830784
59 774.7222020104527
69 949.1826521605253


CNN(
  (word_embed): Embedding(1277, 128, padding_idx=0)
  (conv1): Conv1d(128, 32, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(32, 16, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (maxp1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (maxp2): MaxPool1d(kernel_size=12, stride=12, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.6, inplace=False)
  (fc1): Linear(in_features=48, out_features=8, bias=True)
)

In [0]:
'''
# Test model
model.eval()
with torch.no_grad():
  correct = 0
  total = 0
  for images, labels in test_DataLoader:
    images = images.to(device)
    labels = labels.to(device)
    outputs = model(images)
    _, pred = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (pred == labels).sum().item()

  print('Test Accuracy of the model on the  : {}'.format(100*correct/total))
'''

Test Accuracy of the model on the  : 94.99325438464997


In [0]:
 # Test
model.eval()
pred = None
label = None
for batch in test_DataLoader:
  # batch : [tfidf_data, label]
  batch = tuple(t.to(device) for t in batch)

  # gradient를 계산하지 않도록 선언
  with torch.no_grad():
    y_pred = model(batch[0])

  if pred is None:
    pred = y_pred.detach().cpu().numpy()
    label = batch[1].detach().cpu().numpy()
  else:
    pred = np.append(pred, y_pred.detach().cpu().numpy(), axis=0)
    label = np.append(label, batch[1].detach().cpu().numpy(), axis=0)

pred = np.argmax(pred, axis=1)

In [203]:
if len(label_tensor_te) == len(pred):
  print("True")

True


In [0]:
def eval(true, pred):
    ave = ['macro', 'micro']
    precision = []
    recall = []
    fbeta = []
    f1 = []
    acc = accuracy_score(true, pred)
    for i in ave:
        precision.append(precision_score(true, pred, average=i))
        recall.append(recall_score(true, pred, average=i))
        f1.append(f1_score(true, pred, average=i))
    return acc, precision, recall, f1

In [0]:
conf_mat = confusion_matrix(label_tensor_te, pred)

In [206]:
evaluation = eval(label_tensor_te, pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [207]:
evaluation

(0.8506970469195023,
 [0.7005980489636119, 0.8506970469195023],
 [0.5932539220143238, 0.8506970469195023],
 [0.630922640501747, 0.8506970469195023])

In [208]:
save_file_name = '2019711752_윤민형_CNN_EXPERIMENT_3'
with open('./'+save_file_name+'.txt', 'w', encoding='utf-8', newline='') as writer_text:
    list = ['precision', 'recall', 'f1-score']
    writer_text.writelines('epochs : ' + str(epochs) +'\n' + 'dropout : ' + str(dropout) + '\n' + 'learning_rate : ' + str(learning_rate) + '\n\n')
    for idx, k in enumerate(range(len(evaluation[1:]))):
        a = np.round(evaluation[k+1][0]*100, 4)
        b = np.round(evaluation[k+1][1]*100, 4)
        writer_text.writelines('Macro average ' + str(list[idx]) +' : ' + str(a) +'%' + '\n')
        writer_text.writelines('Micro averate ' + str(list[idx]) +' : ' + str(b) +'%'+ '\n\n')
    #writer_text.close()
    print("[저장 완료]")

[저장 완료]
