## 1. Word Tokenization

In [1]:
# Sample sentence for comparing the word tokenizers below. It contains a
# hyphenated word ("home-based"), a contraction ("doesn't") and a period in
# the middle of the text, which are the cases where the tokenizers differ.
sent = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."

In [2]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [3]:
# NLTK word_tokenize: in the output below "home-based" stays one token,
# "doesn't" becomes 'does' + "n't", and both periods are separate tokens.
word_tokens = word_tokenize(sent)
print(word_tokens)

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal', '.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


In [4]:
# WordPunctTokenizer: in the output below punctuation is split off as its own
# token, so "home-based" -> 'home', '-', 'based' and
# "doesn't" -> 'doesn', "'", 't'.
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(sent))

['Starting', 'a', 'home', '-', 'based', 'restaurant', 'may', 'be', 'an', 'ideal', '.', 'it', 'doesn', "'", 't', 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


In [5]:
# Keras text_to_word_sequence: in the output below every token is lowercased,
# the hyphen and periods are dropped, and "doesn't" is kept intact.
keras_tokens = text_to_word_sequence(sent)
print(keras_tokens)

['starting', 'a', 'home', 'based', 'restaurant', 'may', 'be', 'an', 'ideal', 'it', "doesn't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own']


### Treebank Tokenization (표준 방법 중 하나)
    규칙 1. 하이픈(-)으로 연결된 단어는 하나의 토큰으로 유지한다. (예: home-based)
    규칙 2. don't와 같이 아포스트로피(')로 접어(clitic)가 붙은 단어는 분리한다. (예: doesn't → does, n't)

In [6]:
from nltk.tokenize import TreebankWordTokenizer

# Penn Treebank style tokenization of the sample sentence.
# In the output below "home-based" stays whole and "doesn't" is split into
# 'does' + "n't". Only the final period is split off; the mid-text period
# stays attached to its word ('ideal.').
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(sent)
print(treebank_tokens)

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


## 2. Sentence Tokenization 

#### English

In [7]:
# English sample for sentence splitting: it mixes periods inside the
# abbreviation "Ph.D." with periods that actually end a sentence.
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."

In [8]:
from nltk.tokenize import sent_tokenize

# Split the English sample into sentences. In the output below the periods
# inside "Ph.D." do not cause a split; the text comes back as two sentences.
english_sentences = sent_tokenize(text)
print(english_sentences)

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


#### Korean

In [9]:
# Korean sample for sentence splitting: four short sentences, three ending
# with a period and the last one ending with a question mark.
text='딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?'

In [10]:
from kss import split_sentences

# Split the Korean sample into sentences with kss; the output below shows
# the four sentences as separate list items.
korean_sentences = split_sentences(text)
print(korean_sentences)

['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어려워요.', '농담아니에요.', '이제 해보면 알걸요?']


## 3. Part-of-speech tagging

#### English

In [11]:
# Re-assign the English sample for POS tagging: `text` was overwritten with
# the Korean sample in the sentence-tokenization section above.
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."

In [12]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.tag import pos_tag

# Tokenize the English sample, then tag each token with its part of speech.
# pos_tag prints a list of (token, tag) pairs.
# NOTE(review): the tokenizer runs over the whole two-sentence text, and the
# recorded output below shows 'students.' kept as a single token (tagged NN).
# Consider splitting into sentences first (e.g. sent_tokenize) if the inner
# period should be a separate token -- confirm whether this is intentional.
tokenizer = TreebankWordTokenizer()
words = tokenizer.tokenize(text)
tagged_words = pos_tag(words)
print(tagged_words)

[('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students.', 'NN'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D', 'NNP'), ('student', 'NN'), ('.', '.')]


In [2]:
# Korean sample sentence for the Okt morphological analyzer
# (roughly: "You who coded hard, go on a trip over the holidays").
text = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"

In [3]:
from konlpy.tag import Okt

# Create the Okt analyzer once; the following cells reuse `okt`.
okt = Okt()

# morphs(): the output below is a flat list of the sentence's morphemes,
# with particles such as '에는' and '을' split from their nouns.
morphemes = okt.morphs(text)
print(morphemes)

['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']


In [5]:
# pos(): the output below pairs each morpheme with its tag
# (e.g. 'Noun', 'Josa', 'Verb', 'Punctuation').
tagged_morphemes = okt.pos(text)
print(tagged_morphemes)

[('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]


In [6]:
# nouns(): the output below keeps only the noun morphemes of the sentence.
noun_tokens = okt.nouns(text)
print(noun_tokens)

['코딩', '당신', '연휴', '여행']
