# NLTK 

In [2]:
import nltk
# Fetch the 'punkt' tokenizer data package into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/younghun/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## 토큰화

In [3]:
# Sentence tokenization
from nltk import sent_tokenize

# Adjacent string literals are joined at compile time, so the source
# indentation stays out of the data. (A backslash continuation inside the
# quotes would embed the next line's leading spaces in the string.)
text_sample = (
    'The Matrix is everywhere its all around us, here even in this room. '
    'You can see it out your window or on your television. '
    'You feel it when you go to work, or go to church or pay your taxes.'
)

# sent_tokenize returns a list with one string per detected sentence.
sentences = sent_tokenize(text=text_sample)
print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [4]:
# Word tokenization
# Punctuation such as commas and periods comes out as separate tokens.
from nltk import word_tokenize

sentence = 'The Matrix is everywhere its all around us, here even in this room.'
words = word_tokenize(sentence)
print(f"{type(words)} {len(words)}")
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [5]:
# 여러 문장들에 대한 단어 토큰화
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    """Split text into sentences and word-tokenize each sentence.

    Args:
        text: Raw text that may contain several sentences.

    Returns:
        A list with one entry per sentence; each entry is the list of
        word tokens for that sentence.
    """
    # Sentence split first, then apply the word tokenizer to every sentence.
    return list(map(word_tokenize, sent_tokenize(text)))

# Tokenize the sample paragraph: one list of word tokens per sentence.
word_tokens = tokenize_text(text_sample)
print(f"{type(word_tokens)} {len(word_tokens)}")
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


In [7]:
# N-gram tokenization
from nltk import ngrams

sentence = 'The Matrix is everywhere its all around us, here even in this room.'
words = word_tokenize(sentence)

# ngrams() works on a sequence of tokens, so word-tokenize before applying it.
all_ngrams = ngrams(words, 3)
# Materialize the lazy iterator in one step. The result gets its own name so
# the imported ngrams() function is not rebound to a list (which would make
# any later ngrams(...) call fail).
trigrams = list(all_ngrams)
print(trigrams)

[('The', 'Matrix', 'is'), ('Matrix', 'is', 'everywhere'), ('is', 'everywhere', 'its'), ('everywhere', 'its', 'all'), ('its', 'all', 'around'), ('all', 'around', 'us'), ('around', 'us', ','), ('us', ',', 'here'), (',', 'here', 'even'), ('here', 'even', 'in'), ('even', 'in', 'this'), ('in', 'this', 'room'), ('this', 'room', '.')]


## 불용어(Stopwords) 제거

In [8]:
import nltk
# Fetch the 'stopwords' corpus into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/younghun/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Inspect the English stopword list: its size and its first 20 entries.
# Load the list once and reuse it instead of reading the corpus for each print.
english_stopwords = nltk.corpus.stopwords.words('english')
print('영어 불용어 개수:', len(english_stopwords))
print(english_stopwords[:20])

영어 불용어 개수: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [13]:
# Remove stopwords from the word-tokenized sentences using NLTK.
import nltk
stopwords = nltk.corpus.stopwords.words('english')
# A set gives constant-time membership tests; the list stays available under
# its original name.
stopword_set = set(stopwords)
# For each tokenized sentence, lowercase every token and keep the ones that are
# not stopwords. The comprehension keeps its loop variables local, so the
# module-level `sentence` string is not overwritten.
all_tokens = [
    [token for token in map(str.lower, sentence_tokens) if token not in stopword_set]
    for sentence_tokens in word_tokens
]
print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


## Stemming, Lemmatization

- 둘 다 원형, 어근을 추출하는 방법
- Lemmatization이 더 정교한 방법임

In [14]:
from nltk.stem import LancasterStemmer

# Stem several inflected forms of the same word, one word family per line,
# to compare what the Lancaster stemmer reduces each of them to.
stemmer = LancasterStemmer()

print(*map(stemmer.stem, ('working', 'works', 'worked')))
print(*map(stemmer.stem, ('amusing', 'amuses', 'amused')))
print(*map(stemmer.stem, ('happier', 'happiest')))
print(*map(stemmer.stem, ('fancier', 'fanciest')))

work work work
amus amus amus
happy happiest
fant fanciest


In [17]:
# Fetch the 'wordnet' corpus into the local nltk_data directory.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/younghun/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()
# The second argument is the part-of-speech tag: 'v' for the verb forms and
# 'a' for the adjective forms below.
print(*(lemma.lemmatize(word, 'v') for word in ('amusing', 'amuses', 'amused')))
print(*(lemma.lemmatize(word, 'a') for word in ('happier', 'happiest')))
print(*(lemma.lemmatize(word, 'a') for word in ('fancier', 'fanciest')))

amuse amuse amuse
happy happy
fancy fancy
