In [77]:
from konlpy.tag import Okt
import numpy as np
import os, re, kss
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from hanspell import spell_checker

DATA_PATH = './somelight_2.csv' # somelight_1.csv 중 처음부터 50개 데이터
encoding = 'CP949'
print("File Size: "+str(round(os.path.getsize(DATA_PATH)/1000000, 2))+"MB")

dataset = pd.read_csv(DATA_PATH, header=0, encoding=encoding)
dataset.describe()

File Size: 0.03MB


Unnamed: 0,결과
count,50.0
mean,0.86
std,0.947822
min,0.0
25%,0.0
50%,0.0
75%,2.0
max,2.0


In [78]:
okt = Okt() 

stop_word = pd.read_csv('./ko_stopword.txt', header=0)
stop_words = set(stop_word)

In [79]:
def preprocessing(text, okt, re_stopword = True):
    spelled_sent = spell_checker.check(text)
    hanspell_sent = spelled_sent.checked
    retext_1 = re.sub("[a-zA-Zㄱ-ㅎㅏ-ㅣ!?.]", "", hanspell_sent)
    retext_2 = okt.morphs(retext_1, stem=True)
    
    if re_stopword:
        clean_text = [token for token in retext_2 if not token in stop_words]
    
    clean_text = ' '.join(clean_text)
    return clean_text

In [93]:
clean_sent = []
for re_text in dataset['본문']:
    if type(re_text) == str:
        clean_text=''
        for sent in kss.split_sentences(re_text):
            clean_text += preprocessing(sent, okt) + ' '
        clean_sent.append(clean_text)
    else:
        clean_sent.append([])

In [94]:
clean_sent[0]

'나 유치원 때 잠깐 사물함 뒤쪽 에 뭐 좀 넣다 문 을 탁 닫다 하다 남자 애가 있다 거야 그리고 날 그 윽 하 게 보더 니 ” 이 것 만은 잊다 마 ” 라고 하다 볼 에 뽀뽀 해주다 가다 '

In [104]:
word_index = []
for sent in clean_sent:
    word_to_index = {word: index for index, word in enumerate(sent)}
    word_index.append(word_to_index)
print(word_index[0])

{'나': 0, ' ': 105, '유': 2, '치': 3, '원': 4, '때': 6, '잠': 8, '깐': 9, '사': 11, '물': 12, '함': 13, '뒤': 15, '쪽': 16, '에': 94, '뭐': 20, '좀': 22, '넣': 24, '다': 104, '문': 27, '을': 29, '탁': 31, '닫': 33, '하': 89, '남': 39, '자': 40, '애': 42, '가': 103, '있': 45, '거': 48, '야': 49, '그': 57, '리': 52, '고': 87, '날': 55, '윽': 59, '게': 63, '보': 65, '더': 66, '니': 68, '”': 84, '이': 72, '것': 74, '만': 76, '은': 77, '잊': 79, '마': 82, '라': 86, '볼': 92, '뽀': 97, '해': 99, '주': 100}


In [105]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent)
encoded = tokenizer.texts_to_sequences(clean_sent)

In [106]:
padded = pad_sequences(encoded, padding='post', maxlen = max(len(item) for item in encoded))
type(padded)
padded.shape

(50, 787)

In [107]:
np.savetxt("clean_db.csv", padded, delimiter=",", fmt="%d")