# NLTK 패키지의 sent_tokenize를 이용하여 문장 토큰화를 진행
## 1. 패키지 임포트

In [1]:
from nltk import sent_tokenize
import nltk
# Fetch the 'punkt' resource into the local nltk_data directory; the call
# reports "already up-to-date" and returns True when it is already installed.
# NOTE(review): presumably this is the model the tokenizers below rely on;
# newer NLTK releases may look for 'punkt_tab' instead - confirm against the
# installed NLTK version.
nltk.download('punkt')

C:\Users\taeeu\anaconda3\lib\site-packages\numpy\.libs\libopenblas.el2c6ple4zyw3eceviv3oxxgrn2nrfm2.gfortran-win_amd64.dll
C:\Users\taeeu\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\taeeu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2. 문장 토큰화

In [2]:
# Multi-line sample text; it is reused by later cells as well.
text_sample = '''The Matrix is everywhere its all around us,
here even in this room. you can see it out your window or on your television.
you feel it when you go to work, or go to church or pay your taxes.'''

# Split the sample into sentences: sent_tokenize(text="text to split")
sentences = sent_tokenize(text=text_sample)

print("결과 :", sentences)

# Count how many sentences the text was split into
print("문장 개수 :", len(sentences))

결과 : ['The Matrix is everywhere its all around us,\nhere even in this room.', 'you can see it out your window or on your television.', 'you feel it when you go to work, or go to church or pay your taxes.']
문장 개수 : 3


## 3. 단어 토큰화

In [3]:
from nltk import word_tokenize

# Single-sentence sample to split into word-level tokens
sentence = "The Matrix is everywhere its all around us, here even in this room"
words = word_tokenize(sentence)

# Show the container type and the tokens; in the recorded output the result
# is a list and the comma appears as its own token.
print(type(words))
print("결과 :", words)

<class 'list'>
결과 : ['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room']


## 4. 문장 토큰화와 단어 토큰화의 결합

In [7]:
from nltk import word_tokenize, sent_tokenize

# Helper: split text into sentences, then split every sentence into words
def tokenize_text(text):
    """Tokenize ``text`` into a list of word-token lists, one per sentence.

    The text is first split into sentences with ``sent_tokenize``; each
    sentence is then split into word tokens with ``word_tokenize``.
    """
    tokens_per_sentence = []
    for one_sentence in sent_tokenize(text):
        tokens_per_sentence.append(word_tokenize(one_sentence))
    return tokens_per_sentence

# Run the helper on text_sample
word_tokens =  tokenize_text(text_sample)

# Show the returned value: its type, the number of sentences, and the nested
# per-sentence token lists
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['you', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['you', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


## 5. StopWords 제거

In [10]:
import nltk

# Load the English stop-word list. The corpus is fetched only when it is
# missing, so the cell also works on a machine where it was never downloaded
# and produces no extra output when the data is already present.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

# Build the set once for O(1) membership tests; `stopwords` itself stays a
# list under its original name for any other cell that uses it.
stopword_set = set(stopwords)

# For every tokenized sentence: lowercase each token and keep only the ones
# that are not stop words. Punctuation tokens are not stop words, so they
# are kept, exactly as before.
all_tokens = [
    [token for token in map(str.lower, sentence_tokens)
     if token not in stopword_set]
    for sentence_tokens in word_tokens
]

print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


## 6. Stemming과 Lemmatization - 어근 추출

In [11]:
from nltk.stem import LancasterStemmer

# Create the LancasterStemmer instance used for stemming and bind it to the
# name `stemmer`
stemmer = LancasterStemmer()

In [12]:
# Stem each group of related word forms with stemmer.stem(word) and print
# one group per line
word_groups = (
    ('working', 'works', 'worked'),
    ('happiest', 'happier'),
    ('fancier', 'fanciest'),
    ('amuses', 'amusing', 'amused'),
)
for group in word_groups:
    print(*[stemmer.stem(form) for form in group])

work work work
happiest happy
fant fanciest
amus amus amus


In [16]:
from nltk.stem import WordNetLemmatizer
import nltk
# Download the 'wordnet' and 'omw-1.4' resources before creating the lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# Create the WordNetLemmatizer instance used for lemmatization and bind it to
# the name `lemma`
lemma = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\taeeu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
# Lemmatize each group with lemma.lemmatize(word, pos) and print one group
# per line; every entry pairs a part-of-speech tag with its word forms
pos_groups = (
    ('v', ('amusing', 'amuses', 'amused')),
    ('a', ('happier', 'happiest')),
    ('a', ('fancier', 'fanciest')),
)
for pos_tag, forms in pos_groups:
    print(*[lemma.lemmatize(form, pos_tag) for form in forms])

amuse amuse amuse
happy happy
fancy fancy
