In [2]:
import os
import sys
import re

import numpy as np
import pandas as pd

import json

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')

* basic
    * 가장 기초적 전처리
    * punctuation 제거
* spell check
    * 오탈자 교정
    * 줄임말 원형 복원 (i'm not happy -> I am not happy)
* part of speech
    * 형태소분석
    * noun, adjective, verb, adverb
* stemming
    * 형태소 분석 후 동사 원형 복원
* stopwords

In [3]:
review = pd.read_csv("data/review_datal_all.csv")
review = pd.DataFrame(review)

In [4]:
test = list(review.text)
len(test) #15개의 데이터

6653

In [5]:
#제일 처음에 사용하던 전처리
"""
#1.띄워쓰기
from pykospacing import spacing  
space_doc=[]
for doc in test:
    space_doc.append(spacing(doc))
#2. 맞춤법과 한글만 남기기
from hanspell import spell_checker
checked_doc =[]
for doc in test:
    check = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc) #한글
    check = spell_checker.check(check).checked #맞춤법
    checked_doc.append(check) 
    
#3.stopword
path = 'data/stopwords-ko.txt'
with open(path,encoding="utf-8") as f:
    stopwords =[word for word in f]

#4.tokenization - 품사 (형용사, 동사에 감성분석)
tokenized_doc=[]
word_doc = []
tokenization = input()
from konlpy.tag import Okt
tokenizer = Okt()
for doc in checked_doc:
    if(tokenization == "품사"):
        token = [pair for pair in tokenizer.pos(doc) if pair[0] not in stopwords and len(pair[0]) > 1] 
        words = [word for word, pos in token]
        tokenized_doc.append(words)
        word_doc.append(token)
    if(tokenization == "단어"):
        token = [word for word in doc.split() if word not in stopwords and len(word) > 1] 
        tokenized_doc.append(token)
    if(tokenization == "형태소"):
        token = [word for word in tokenizer.morphs(doc,stem=True) if word not in stopwords and len(word) > 1] 
        tokenized_doc.append(token)
print(len(tokenized_doc))
print(len(word_doc)) #품사일때
"""

'\n#1.띄워쓰기\nfrom pykospacing import spacing  \nspace_doc=[]\nfor doc in test:\n    space_doc.append(spacing(doc))\n#2. 맞춤법과 한글만 남기기\nfrom hanspell import spell_checker\nchecked_doc =[]\nfor doc in test:\n    check = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc) #한글\n    check = spell_checker.check(check).checked #맞춤법\n    checked_doc.append(check) \n    \n#3.stopword\npath = \'data/stopwords-ko.txt\'\nwith open(path,encoding="utf-8") as f:\n    stopwords =[word for word in f]\n\n#4.tokenization - 품사 (형용사, 동사에 감성분석)\ntokenized_doc=[]\nword_doc = []\ntokenization = input()\nfrom konlpy.tag import Okt\ntokenizer = Okt()\nfor doc in checked_doc:\n    if(tokenization == "품사"):\n        token = [pair for pair in tokenizer.pos(doc) if pair[0] not in stopwords and len(pair[0]) > 1] \n        words = [word for word, pos in token]\n        tokenized_doc.append(words)\n        word_doc.append(token)\n    if(tokenization == "단어"):\n        token = [word for word in doc.split() if word not in stopwords and len(

# basic and check spelling

In [6]:
#1. korean
import re
clean_doc =[]
for doc in test:
    doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc) #한글
    clean_doc.append(doc)

In [7]:
'''
normalize-soynlp
띄워쓰기 pykospacing
맞춤법 hanspell
외래어 사전 다운로드
'''
!pip install soynlp 
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git
!pip install git+https://github.com/ssut/py-hanspell.git
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1RNYpLE-xbMCGtiEHIoNsCmfcyJP3kLYn" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1RNYpLE-xbMCGtiEHIoNsCmfcyJP3kLYn" -o confused_loanwords.txt

Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /private/var/folders/0y/sw67dqxs30d9l05l8m7j8kfm0000gn/T/pip-req-build-kodk6rbx
  Running command git clone -q https://github.com/haven-jeon/PyKoSpacing.git /private/var/folders/0y/sw67dqxs30d9l05l8m7j8kfm0000gn/T/pip-req-build-kodk6rbx
Collecting argparse>=1.4.0
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0


Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /private/var/folders/0y/sw67dqxs30d9l05l8m7j8kfm0000gn/T/pip-req-build-ii3c74yz
  Running command git clone -q https://github.com/ssut/py-hanspell.git /private/var/folders/0y/sw67dqxs30d9l05l8m7j8kfm0000gn/T/pip-req-build-ii3c74yz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   408    0   408    0     0    438      0 --:--:-- --:--:-- --:--:--   437
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 19779  100 19779    0     0   9954      0  0:00:01  0:00:01 --:--:--  9954


In [9]:
lownword_map = {}
lownword_data = open('confused_loanwords.txt', 'r', encoding='utf-8')

lines = lownword_data.readlines()

for line in lines:
    line = line.strip()
    miss_spell = line.split('\t')[0]
    ori_word = line.split('\t')[1]
    lownword_map[miss_spell] = ori_word

In [12]:
pip install numpy==1.19.2

Collecting numpy==1.19.2
  Downloading numpy-1.19.2-cp38-cp38-macosx_10_9_x86_64.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 415 kB/s eta 0:00:01
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.0
    Uninstalling numpy-1.19.0:
      Successfully uninstalled numpy-1.19.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyldavis 3.3.1 requires numpy>=1.20.0, but you have numpy 1.19.2 which is incompatible.[0m
Successfully installed numpy-1.19.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
from pykospacing import spacing
from hanspell import spell_checker
from soynlp.normalizer import *

In [11]:
spell_preprocessed_doc = []
for doc in clean_doc:
    spaced_doc = spacing(doc) #띄워쓰기
    spelled_doc = spell_checker.check(spaced_doc) #맞춤법
    checked_doc = spelled_doc.checked
    normalized_doc = repeat_normalize(checked_doc) #정규화
    for lownword in lownword_map:
        normalized_doc = normalized_doc.replace(lownword, lownword_map[lownword])
    spell_preprocessed_doc.append(normalized_doc)

In [13]:
review["spell_preprocessed"] = spell_preprocessed_doc

# tokenizatioin and stemming
- mecab (stemming까지 추가해서 비교해볼 예정)
- khaiii

In [15]:
#Mecab 
#3.stopword
path = 'data/stopwords-ko.txt'
with open(path,encoding="utf-8") as f:
    stopwords =[word for word in f]

#4.tokenization - 품사 (형용사, 동사에 감성분석)
tokenized_doc=[]
word_doc = []
from konlpy.tag import Mecab
tokenizer = Mecab()
for doc in spell_preprocessed_doc:
    token = [pair for pair in tokenizer.pos(doc) if pair[0] not in stopwords and len(pair[0]) > 1] 
    words = [word for word, pos in token]
    tokenized_doc.append(words)
    word_doc.append(token)

In [16]:
#khaiii
from khaiii import KhaiiiApi
api = KhaiiiApi()

significant_tags = ['NNG', 'NNP', 'NNB', 'VV', 'VA', 'VX', 'MAG', 'MAJ', 'XSV', 'XSA']

def pos_text(texts):
    corpus = []
    error = []
    for sent in texts:
        pos_tagged = ''
        try:
            words = api.analyze(sent)
            for word in words:
                for morph in word.morphs:
                    if morph.tag in significant_tags:
                        pos_tagged += morph.lex + '/' + morph.tag + ' '
            corpus.append(pos_tagged.strip())
        except:
                error.append(sent)
                corpus.append("")
    return corpus, error

In [17]:
pos_tagged_corpus, error = pos_text(spell_preprocessed_doc)
print(len(pos_tagged_corpus),len(error)) #error는 수작업..?

6653 102


동사를 원형으로 복원하도록 하겠습니다.
규칙은 다음과 같습니다.

1. NNG|NNP|NNB + XSV|XSA --> NNG|NNP|NNB + XSV|XSA + 다
2. NNG|NNP|NNB + XSA + VX --> NNG|NNP + XSA + 다
3. VV --> VV + 다
4. VX --> VX + 다

In [18]:
p1 = re.compile('[가-힣A-Za-z0-9]+/NN. [가-힣A-Za-z0-9]+/XS.')
p2 = re.compile('[가-힣A-Za-z0-9]+/NN. [가-힣A-Za-z0-9]+/XSA [가-힣A-Za-z0-9]+/VX')
p3 = re.compile('[가-힣A-Za-z0-9]+/VV')
p4 = re.compile('[가-힣A-Za-z0-9]+/VX')

In [19]:
def stemming_text(text):
    corpus = []
    for sent in text:
        ori_sent = sent
        mached_terms = re.findall(p1, ori_sent)
        for terms in mached_terms:
            ori_terms = terms
            modi_terms = ''
            for term in terms.split(' '):
                lemma = term.split('/')[0]
                tag = term.split('/')[-1]
                modi_terms += lemma
            modi_terms += '다/VV'
            ori_sent = ori_sent.replace(ori_terms, modi_terms)
        
        mached_terms = re.findall(p2, ori_sent)
        for terms in mached_terms:
            ori_terms = terms
            modi_terms = ''
            for term in terms.split(' '):
                lemma = term.split('/')[0]
                tag = term.split('/')[-1]
                if tag != 'VX':
                    modi_terms += lemma
            modi_terms += '다/VV'
            ori_sent = ori_sent.replace(ori_terms, modi_terms)

        mached_terms = re.findall(p3, ori_sent)
        for terms in mached_terms:
            ori_terms = terms
            modi_terms = ''
            for term in terms.split(' '):
                lemma = term.split('/')[0]
                tag = term.split('/')[-1]
                modi_terms += lemma
            if '다' != modi_terms[-1]:
                modi_terms += '다'
            modi_terms += '/VV'
            ori_sent = ori_sent.replace(ori_terms, modi_terms)

        mached_terms = re.findall(p4, ori_sent)
        for terms in mached_terms:
            ori_terms = terms
            modi_terms = ''
            for term in terms.split(' '):
                lemma = term.split('/')[0]
                tag = term.split('/')[-1]
                modi_terms += lemma
            if '다' != modi_terms[-1]:
                modi_terms += '다'
            modi_terms += '/VV'
            ori_sent = ori_sent.replace(ori_terms, modi_terms)
        corpus.append(ori_sent)
    return corpus

In [20]:
stemming_corpus = stemming_text(pos_tagged_corpus)

In [21]:
for i in range(0, 30):
    print(stemming_corpus[i])

판례/NNG 평석/NNG 잘/MAG 하/XSV 성적/NNG 잘/MAG 주다/VV 것/NNB 같/VA 중간/NNG 개/NNG 망치다/VV 기말/NNG 틀리다/VV 문제/NNG 개/NNB 있다/VV 판례/NNG 평석/NNG 과제/NNG 평가/NNG 전부/NNG 성적/NNG 결국/NNG 나오다/VV
과제/NNG 되다/VV 귀찮/VA 많/VA 시험/NNG 말발/NNG 좋/VA 되다/VV 시험/NNG 망치다/VV 과제/NNG 열심히/MAG 하다/VV 점수/NNG 잘/MAG 받다/VV
수업/NNG 띄우다/VV 놓다/VV 하다/VV 보통/NNG 교안/NNG 없/VA 관련/NNG 사례/NNG 중심/NNG 많이/MAG 얘기하다/VV 심/NNG 수업/NNG 잘/MAG 듣다/VV 시험/NNG 잘/MAG 보다/VV 수/NNB 있다/VV 성적/NNG 후하다/VV 주다/VV 편/NNB 다만/MAG 과제/NNG 분량/NNG 채우다/VV 쓰다/VV 쉽/VA 않다/VV
교재/NNG 없/VA 내용/NNG 많이/MAG 얘기하다/VV 주다/VV 말씀하다/VV 내용/NNG 시험/NNG 문제/NNG 내다/VV 교재/NNG 그렇/VA 크/VA 필요/NNG 없/VA 과제/NNG 좀/MAG 시간/NNG 오래/MAG 걸리다/VV 하다/VV 데/NNG 판례/NNG 평석/NNG 민사/NNG 소송법/NNG 강의/NNG 내주다/VV 어차피/MAG 내년/NNG 하다/VV 되다/VV 계속하다/VV 되다/VV 거/NNB 연습하다/VV 다/MAG 셈/NNB 치다/VV 해/NNG 보다/VV 길/NNG 점수/NNG 나/NNB 름/NNG 잘/MAG 주다/VV 것/NNB 같/VA 방청/NNG 재미있/VA 상당히/MAG 만족하다/VV 강의/NNG 강의/NNG 내용/NNG 재밌/VA 시험/NNG 문제/NNG 괜찮/VA 학점/NNG 원래/NNG 잘/MAG 주다/VV 교수/NNG 법학/NNG 과/NNG 들어오다/VV 강의/NNG 듣다/VV 전/NNG 수강하다/VV 최적되다/VV 과목/NNG 듯/NNB
사이버/NNG 강의라/NNG 괜찮

In [22]:
stemming_corpus[0]

'판례/NNG 평석/NNG 잘/MAG 하/XSV 성적/NNG 잘/MAG 주다/VV 것/NNB 같/VA 중간/NNG 개/NNG 망치다/VV 기말/NNG 틀리다/VV 문제/NNG 개/NNB 있다/VV 판례/NNG 평석/NNG 과제/NNG 평가/NNG 전부/NNG 성적/NNG 결국/NNG 나오다/VV'

In [23]:
corpus =[]
for doc in stemming_corpus:
    doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc) #한글
    corpus.append(doc)

In [24]:
review["corpus"] = corpus

In [25]:
review.to_csv("data/preprocessing.csv")

In [114]:
corpus[0]

'판례 평석 잘 하 성적 잘 주다 것 같 중간 개 망치다 기말 틀리다 문제 개 있다 판례 평석 과제 평가 전부 성적 결국 나오다'

In [115]:
#전처리 파일로 저장
with open('data/tokenized_doc.txt','w') as f:
    for doc in corpus:
        f.writelines(doc)
        f.write('\n')

In [119]:
#corpus파일 불러오기
data =[]
with open('data/tokenized_doc.txt','r') as f:
    for doc in f:
        doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc)
        data.append(doc)

In [None]:
#lemmatization ex.개, 겁나 -> 너무