# konlpy(Korean Natural Language Processing in Python)란?
## : 한국어 텍스트 데이터 처리(형태소 분석, 품사 태깅, 명사 추출)

In [3]:
from konlpy.tag import Okt

okt = Okt()

text = '성심당 망고시루를 먹어보고 싶었는데 타이밍을 놓쳤어요.'

morphs = okt.morphs(text)
print('Morphs:', morphs)

Morphs: ['성심당', '망고', '시루', '를', '먹어', '보고', '싶었는데', '타이밍', '을', '놓쳤어요', '.']


In [4]:
from konlpy.tag import Okt

okt = Okt()

text = '한글 자연어 처리가 필요하다. 하지만 한국어는 굉장히 복잡한 구조로 이루어져 있다.'

# 형태소 분석
morphs = okt.morphs(text)
print('Morphs:', morphs)

# 품사태깅
pos = okt.pos(text)
print('POS:', pos)

# 명사 추출
nouns = okt.nouns(text)
print('Nouns:', nouns)

# 어간 추출
stemmed = okt.morphs(text, stem=True)
print('Stemmed:', stemmed)

Morphs: ['한글', '자연어', '처리', '가', '필요하다', '.', '하지만', '한국어', '는', '굉장히', '복잡한', '구조', '로', '이루어져', '있다', '.']
POS: [('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('가', 'Josa'), ('필요하다', 'Adjective'), ('.', 'Punctuation'), ('하지만', 'Conjunction'), ('한국어', 'Noun'), ('는', 'Josa'), ('굉장히', 'Adjective'), ('복잡한', 'Adjective'), ('구조', 'Noun'), ('로', 'Josa'), ('이루어져', 'Verb'), ('있다', 'Adjective'), ('.', 'Punctuation')]
Nouns: ['한글', '자연어', '처리', '한국어', '구조']
Stemmed: ['한글', '자연어', '처리', '가', '필요하다', '.', '하지만', '한국어', '는', '굉장하다', '복잡하다', '구조', '로', '이루어지다', '있다', '.']


In [5]:
import os
import re
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

from konlpy.tag import Okt

FILTERS = "([~.,!?\"':;)(])"
PAD = '<PAD>'
SOS = '<SOS>'
END = '<END>'
UNK = '<UNK>'

PAD_INDEX = 0
DOD_INDEX = 1
END_INDEX = 2
UNK_INDEX = 3

MARKER = [PAD, SOS, END, UNK]
CHANGE_FILTER = re.compile(FILTERS)

MAX_SEQUENCE = 25

In [6]:
def load_data(path):
    data_df = pd.read_csv(path, header=0)
    question, answer = list(data_df['Q']), list(data_df['A'])

    return question, answer

In [7]:
path = 'ChatBotData.csv_short'

inputs, outputs = load_data(path)

In [9]:
inputs[:5], outputs[:5]

(['가끔 궁금해', '가끔 뭐하는지 궁금해', '가끔은 혼자인게 좋다', '가난한 자의 설움', '가만 있어도 땀난다'],
 ['그 사람도 그럴 거예요.',
  '그 사람도 그럴 거예요.',
  '혼자를 즐기세요.',
  '돈은 다시 들어올 거예요.',
  '땀을 식혀주세요.'])

In [11]:
def load_vocabulary(path, vocab_path, tokenize_as_morph=False):
    vocabulary_list = []

    if not os.path.exists(vocab_path):
        if os.path.exists(path) :
            data_df = pd.read_csv(path, encoding='utf-8')
            question, answer = list(data_df['Q']), list(data_df['A'])
            if tokenize_as_morph:
                question = prepro_like_morphlized(question)
                answer = prepro_like_morphlized(answer)

            data = []

            data.extend(question)
            data.extend(answer)

            words = data_tokenizer(data)
            words = list(set(words))

            words[:0] = MARKER

        with open(vocab_path, 'w', encoding='utf-8') as vocabulary_file:
            for word in words:
                vocabulary_file.write(word + '\n')

    with open(vocab_path, 'r', encoding='utf-8') as vocabulary_file:
        for line in vocabulary_file:
            vocabulary_list.append(line.strip())

    char2idx, idx2char = make_vocabulary(vocabulary_list)

    return char2idx, idx2char, len(char2idx)

In [12]:
def prepro_like_morphlized(data):
    morph_analyzer = Okt()
    result_data = list()
    for seq in tqdm(data):
        morplized_seq = ' '.join(morph_analyzer.morphs(seq.replaxe(' ', '')))
        result_data.append(morpglized_seq)

    return result_data

In [13]:
def data_tokenizer(data):
    words = []
    for sentence in data:
        sentence = re.sub(CHANGE_FILTER, '', sentence)
        for word in sentence.split():
            words.append(word)

    return [word for word in words if word]

In [16]:
def make_vocabulary(vocabulary_list):
    char2idx = {char:idx for idx, char in enumerate(vocabulary_list)}
    idx2char = {idx: char for idx, char in enumerate(vocabulary_list)}

    return char2idx, idx2char

In [17]:
vocab_path = 'vocabulary2.txt'

char2idx, idx2char, vocab_size = load_vocabulary(path, vocab_path, tokenize_as_morph=False)

In [18]:
print(char2idx)

{'<PAD>': 0, '<SOS>': 1, '<END>': 2, '<UNK>': 3, '질린다': 4, '달에는': 5, '궁금해': 6, '가상화폐': 7, '운동만': 8, '돌아가서': 9, '열': 10, '훈훈해': 11, '식혀주세요': 12, '비싼데': 13, '좋을': 14, '선물로': 15, '감기': 16, '새출발': 17, '절약해봐요': 18, '뭐가': 19, '어서': 20, '따뜻하게': 21, '좀': 22, '그럴': 23, '뭐하는지': 24, '보인다': 25, '같아요': 26, '생일인데': 27, '가난한': 28, '집에': 29, '뭘': 30, '마세요': 31, '더': 32, '다시': 33, '같아': 34, '걸리겠어': 35, '나왔다': 36, '땀을': 37, '설움': 38, '운동': 39, '해보세요': 40, '사세요': 41, '승진': 42, '생각해보세요': 43, '망함': 44, '때까지': 45, '너무': 46, '가끔은': 47, '가스비': 48, '적당히': 49, '가스불': 50, '설득해보세요': 51, '전생에': 52, '자의': 53, '나라를': 54, '필요한': 55, '가끔': 56, '돈은': 57, '집착하지': 58, '교회': 59, '나갔어': 60, '빨리': 61, '혼자인게': 62, '또': 63, '구하셨나요': 64, '싶어': 65, '안': 66, '땀난다': 67, '믿어줘': 68, '게': 69, '나': 70, '좋다': 71, '마음을': 72, '켜놓고': 73, '즐기세요': 74, '다음': 75, '켜고': 76, '거예요': 77, '따라': 78, '남자친구가': 79, '갔어': 80, '좋을까': 81, '운동을': 82, '남자친구': 83, '필요했던': 84, '그': 85, '해': 86, '잊고': 87, '나온거': 88, '바빠': 89, '혼자를': 90, '많이': 91, '나오세요': 92,

In [20]:
def enc_processing(value, dictionary, tokenize_as_morph=False):
    sequences_input_index = [] # 인덱스 값
    sequences_length = [] # 문장 길이

    # 형태소 토크나이징
    if tokenize_as_morph:
        value = prepro_like_morphlized(value)

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, '', sequence)
        sequence_index = []

        # UNK
        for word in sequence.split():
            if dictionary.get(word) is not None:
                sequence_index.extend([dictionary[word]])
            else :
                sequence_index.extend([dictionary[UNK]])

        # 문장 길이 제한
        if len(sequence_index) > MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE]
            
        sequences_length.append(len(sequence_index))
        # PAD
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_input_index.append(sequence_index)

        #sequences_length.append(len(sequence_index))

    return np.asarray(sequences_input_index), sequences_length

In [22]:
index_inputs, inputs_seq_len = enc_processing(inputs, char2idx, tokenize_as_morph=False)

In [23]:
print(index_inputs[:3], inputs_seq_len)

[[56  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [56 24  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [47 62 71  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]] [2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 3, 4, 3]


In [27]:
def dec_output_processing(value, dictionary, tokenize_as_morph=False):
    sequences_output_index = []
    sequences_length = []

    if tokenize_as_morph:
        value = prepro_like_morphlized(value)

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, '', sequence)
        sequence_index = [dictionary[SOS]] + [dictionary[word] if word in dictionary else dictionary[UNK] for word in sequence.split()]

        if len(sequence_index) > MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE]

        sequences_length.append(len(sequence_index))
        
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_output_index.append(sequence_index)

    return np.asarray(sequences_output_index), sequences_length

In [28]:
index_outputs, output_seq_len = dec_output_processing(outputs, char2idx, tokenize_as_morph=False)

In [29]:
print(index_outputs[:3], output_seq_len)

[[  1  85 105  23  77   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [  1  85 105  23  77   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]
 [  1  90  74   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0]] [5, 5, 3, 5, 3, 5, 6, 6, 5, 3, 5, 4, 5, 7, 4, 4, 4, 4, 4, 4]


In [30]:
def dec_target_processing(value, dictionary, tokenize_as_morph=False):
    sequences_target_index = []

    if tokenize_as_morph:
        value = prepro_like_morphlized(value)

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, '', sequence)
        sequence_index = [dictionary[word] if word in dictionary else dictionary[UNK] for word in sequence.split()]

        if len(sequence_index) >= MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE-1] + [dictionary[END]] # 기존보다 1만큼 길이를 줄이고 END 토큰 추가
        else:
            sequence_index += [dictionary[END]] # 넘지 않을 때, padding 붙여준 다음 END를 바로 붙여준다.(PAD 토큰 추가할 필요없이 문장 마지막에 END 토큰 추가)

        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]] # PAD 토큰 추가
        sequences_target_index.append(sequence_index)

    return np.asarray(sequences_target_index)

In [31]:
index_targets = dec_target_processing(outputs, char2idx, tokenize_as_morph=False)

In [32]:
data_configs = {}
data_configs['char2idx'] = char2idx
data_configs['idx2char'] = idx2char
data_configs['vocab_size'] = vocab_size
data_configs['pad_symbol'] = PAD
data_configs['sos_symbol'] = SOS
data_configs['end_symbol'] = END
data_configs['unk_symbol'] = UNK

In [33]:
DATA_IN_PATH = '/Users/jeon-yewon/Desktop/데이터 분석 강의/부트캠프/12주차/data_in/'
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

In [34]:
np.save(open(DATA_IN_PATH + TRAIN_INPUTS, 'wb'), index_inputs)
np.save(open(DATA_IN_PATH + TRAIN_OUTPUTS, 'wb'), index_outputs)
np.save(open(DATA_IN_PATH + TRAIN_TARGETS, 'wb'), index_targets)

json.dump(data_configs, open(DATA_IN_PATH + DATA_CONFIGS, 'w'))