# 텍스트 전처리( Text preprocessing )

# 토큰화( Tokenization )

## 단어 토큰화( Word Tokenization )

In [1]:
# Sample sentence for the word-tokenizer comparison below; it contains two
# apostrophes ("Don't", "Jone's"), which the tokenizers handle differently.
text = "Don't be fooled by the dark sounding name, Jone's Orphanage is as cheery as cheery goes for a pastry shop."

## nltk package의 토큰화 메소드

In [2]:
from nltk.tokenize import word_tokenize

In [3]:
# Split the sample into word tokens with NLTK's word_tokenize.
# In the recorded output, "Don't" becomes 'Do' + "n't" and "Jone's" becomes
# 'Jone' + "'s"; the comma and final period are separate tokens.
print( word_tokenize( text ) )

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [4]:
from nltk.tokenize import WordPunctTokenizer

In [5]:
# Tokenize the same sample with WordPunctTokenizer for comparison.
# In the recorded output the apostrophe is a token of its own:
# "Don't" -> 'Don', "'", 't' and "Jone's" -> 'Jone', "'", 's'.
print( WordPunctTokenizer().tokenize( text ) )

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


## keras의 토큰화 메소드

In [6]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [7]:
# Tokenize the same sample with Keras' text_to_word_sequence.
# In the recorded output every token is lower-cased, the comma and period are
# dropped, and the apostrophes are kept ("don't", "jone's").
print( text_to_word_sequence( text ) )

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


## 표준 토큰화 예

In [8]:
# Sample for the Treebank tokenizer: it contains a hyphenated word
# ("home-based"), a contraction ("doesn't") and a period in mid-text ("ideal.").
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."

In [9]:
from nltk.tokenize import TreebankWordTokenizer

In [10]:
# Create the Treebank tokenizer instance used by the next cell.
tokenizer = TreebankWordTokenizer()

In [11]:
# Tokenize the sample with the Treebank tokenizer.
# In the recorded output 'home-based' stays a single token, "doesn't" becomes
# 'does' + "n't", and the mid-text period stays attached ('ideal.') while the
# final period is split off.
print( tokenizer.tokenize( text ) )

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


## 문장 토큰화( Sentence Tokenization )

In [12]:
# Multi-sentence English paragraph used as input for sentence tokenization.
# NOTE(review): "mae sure" looks like a typo for "make sure"; it is left as-is
# because the recorded outputs below reproduce the string verbatim.
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to mae sure no one was near."
# Echo the raw paragraph so it can be compared with the split result.
print( text )

His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to mae sure no one was near.


In [13]:
from nltk.tokenize import sent_tokenize

In [14]:
# Split the paragraph into sentences with NLTK's sent_tokenize.
# The recorded output is a list of five sentence strings.
print( sent_tokenize( text ) )

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to mae sure no one was near.']


In [15]:
# Harder case for sentence splitting: periods appear inside the abbreviation
# "Ph.D." as well as at a real sentence boundary.
text = "I am actively looking for Ph.D. students. and you are a Ph.D student"

In [16]:
# Sentence-split the abbreviation sample.
# In the recorded output the periods inside "Ph.D." do not end a sentence; the
# text is split only after "students.".
print( sent_tokenize( text ) )

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student']


In [None]:
!pip install kss

In [17]:
import kss

In [19]:
# Korean sample paragraph used as input for the kss sentence splitter.
text = '딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?'
# Echo the raw paragraph so it can be compared with the split result.
print( text )

딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?


In [20]:
# Split the Korean paragraph into sentences with kss.
# The recorded output is a list of four sentence strings.
print( kss.split_sentences( text ) )

['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어려워요.', '농담아니에요.', '이제 해보면 알걸요?']


## NLTK와 KoNLPy를 이용한 영어, 한국어 토큰화 실습

In [21]:
from nltk.tokenize import word_tokenize

In [25]:
# Reuse the "Ph.D." sample, this time as input for word tokenization and
# part-of-speech tagging in the cells that follow.
text = "I am actively looking for Ph.D. students. and you are a Ph.D student"
print( text )

I am actively looking for Ph.D. students. and you are a Ph.D student


In [26]:
# Word-tokenize the "Ph.D." sample.
# In the recorded output 'Ph.D.' and 'Ph.D' are each kept as one token, while
# the period after "students" is split off as its own token.
print( word_tokenize( text ) )

['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D', 'student']


In [27]:
from nltk.tag import pos_tag

In [28]:
# Tokenize the sample, then tag each token with its part of speech.
# pos_tag is left as a bare final expression so the notebook displays its
# value: a list of (token, tag) pairs such as ('I', 'PRP').
x = word_tokenize( text )
pos_tag( x )

[('I', 'PRP'),
 ('am', 'VBP'),
 ('actively', 'RB'),
 ('looking', 'VBG'),
 ('for', 'IN'),
 ('Ph.D.', 'NNP'),
 ('students', 'NNS'),
 ('.', '.'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('a', 'DT'),
 ('Ph.D', 'NNP'),
 ('student', 'NN')]

## 한국어 NLP에서 형태소 분석기를 사용한다는 것은 단어 토큰화가 아니라 형태소( morpheme ) 단위의
## 형태소 토큰화( morpheme tokenization )를 수행함을 뜻한다.

### Okt( 이전 Twitter )를 이용한 토큰화

In [29]:
from konlpy.tag import Okt

In [31]:
# Create the Okt analyzer; the following cells reuse this instance.
okt = Okt()
# Split the sentence into morphemes. In the recorded output particles are
# separated from their nouns (e.g. '연휴' + '에는', '여행' + '을').
print( okt.morphs( '열심히 코딩한 당신, 연휴에는 여행을 가봐요' ) )

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']


In [32]:
# Tag each morpheme with its part of speech using Okt.
# The recorded output is a list of (morpheme, tag) pairs with tags such as
# 'Noun', 'Josa', 'Adverb' and 'Verb'.
print( okt.pos( '열심히 코딩한 당신, 연휴에는 여행을 가봐요' ) )

[('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]


In [33]:
# Extract only the nouns from the sentence using Okt.
# The recorded output contains four nouns: '코딩', '당신', '연휴', '여행'.
print( okt.nouns( '열심히 코딩한 당신, 연휴에는 여행을 가봐요' ) )

['코딩', '당신', '연휴', '여행']


### 꼬꼬마를 이용한 토큰화

In [34]:
from konlpy.tag import Kkma

In [35]:
# Create the Kkma analyzer; the following cells reuse this instance.
kkma = Kkma()
# Split the same sentence into morphemes with Kkma. In the recorded output the
# split is finer than Okt's: '하' + 'ㄴ' (Okt: '한'), '에' + '는' (Okt: '에는'),
# '가보' + '아요' (Okt: '가봐요').
print( kkma.morphs( '열심히 코딩한 당신, 연휴에는 여행을 가봐요' ) )

['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']


In [36]:
# Tag each morpheme with its part of speech using Kkma.
# The recorded output is a list of (morpheme, tag) pairs with abbreviated tags
# such as 'NNG', 'JKO' and 'VV', unlike Okt's spelled-out tag names.
print( kkma.pos( '열심히 코딩한 당신, 연휴에는 여행을 가봐요' ) )

[('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]


In [37]:
# Extract only the nouns from the sentence using Kkma.
# The recorded output lists the same four nouns that Okt returned for this
# sentence: '코딩', '당신', '연휴', '여행'.
print( kkma.nouns( '열심히 코딩한 당신, 연휴에는 여행을 가봐요' ) )

['코딩', '당신', '연휴', '여행']
