In [2]:
# Corpus 정의

text = '''Beneath it were the words: “Stay Hungry. Stay Foolish.” 
It was their farewell message as they signed off. Stay Hungry. 
Stay Foolish. And I have always wished that for myself. 
And now, as you graduate to begin anew, I wish that for you.'''

In [3]:
# 특수문자 제거(Cleaning)

import string     # 
import re         # 파이썬의 정규식 모듈 re - 문자와 스페이스를 제외한 나머지 특수 문자들을 제거

clean = re.sub('[^\w\s]', '', text) # 문자 스페이스를 제외한 나머지 (^) 특수 문자 제거
print(clean)


Beneath it were the words Stay Hungry Stay Foolish 
It was their farewell message as they signed off Stay Hungry 
Stay Foolish And I have always wished that for myself 
And now as you graduate to begin anew I wish that for you


In [4]:
clean = re.sub('\n', '', clean)     # 개행 문자도 제거
print(clean)

Beneath it were the words Stay Hungry Stay Foolish It was their farewell message as they signed off Stay Hungry Stay Foolish And I have always wished that for myself And now as you graduate to begin anew I wish that for you


In [21]:
# 토큰화(Tokenization)

import nltk
nltk.download('punkt')   # punkt 모듈 다운로드 -> sent_tokenize 와 word_tokenize 함수를 사용하여 모듈화 작업
nltk.download('all')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_da

True

In [40]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [41]:
send_tokens = sent_tokenize(text)    # 주어진 text를 sentence 단위로 tokenize
print(send_tokens)

# 에러 해결 : nltk.download('all') - 모두 설치한 경우 해결 (240908)

['Beneath it were the words: “Stay Hungry.', 'Stay Foolish.” \nIt was their farewell message as they signed off.', 'Stay Hungry.', 'Stay Foolish.', 'And I have always wished that for myself.', 'And now, as you graduate to begin anew, I wish that for you.']


In [42]:
tokens = word_tokenize(clean)    # 주어진 text를 word 단위로 tokenize
print(tokens)

['Beneath', 'it', 'were', 'the', 'words', 'Stay', 'Hungry', 'Stay', 'Foolish', 'It', 'was', 'their', 'farewell', 'message', 'as', 'they', 'signed', 'off', 'Stay', 'Hungry', 'Stay', 'Foolish', 'And', 'I', 'have', 'always', 'wished', 'that', 'for', 'myself', 'And', 'now', 'as', 'you', 'graduate', 'to', 'begin', 'anew', 'I', 'wish', 'that', 'for', 'you']


In [43]:
# 불용어 제거 (Stopword Elimination)

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]   # 불용어 사전에 있는 단어는 제외
print(tokens)

['Beneath', 'words', 'Stay', 'Hungry', 'Stay', 'Foolish', 'It', 'farewell', 'message', 'signed', 'Stay', 'Hungry', 'Stay', 'Foolish', 'And', 'I', 'always', 'wished', 'And', 'graduate', 'begin', 'anew', 'I', 'wish']


In [45]:
tokens = [token for token in tokens if len(token) >= 3]   # 세 글자 이상으로 구성된 단어만 토큰으로 사용
print(tokens)

['Beneath', 'words', 'Stay', 'Hungry', 'Stay', 'Foolish', 'farewell', 'message', 'signed', 'Stay', 'Hungry', 'Stay', 'Foolish', 'And', 'always', 'wished', 'And', 'graduate', 'begin', 'anew', 'wish']


In [46]:
# 소문자화 (lower capitalization)

tokens = [token.lower() for token in tokens]
print(tokens)


['beneath', 'words', 'stay', 'hungry', 'stay', 'foolish', 'farewell', 'message', 'signed', 'stay', 'hungry', 'stay', 'foolish', 'and', 'always', 'wished', 'and', 'graduate', 'begin', 'anew', 'wish']


In [48]:
nltk.download('averaged_perceptron_tagger')    # 형태소 분석을 위한 도구 : 형태소 분석기

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wonta\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [49]:
from nltk.tag import pos_tag

tagged_list = pos_tag(set(tokens))   # set 함수 : 중복된 단어는 제외 -> 한 개만 포함
print(tagged_list)

[('hungry', 'JJ'), ('anew', 'RB'), ('wished', 'VBN'), ('signed', 'VBN'), ('wish', 'JJ'), ('stay', 'NN'), ('message', 'NN'), ('always', 'RB'), ('farewell', 'VBP'), ('beneath', 'NN'), ('words', 'NNS'), ('begin', 'VBP'), ('graduate', 'JJ'), ('foolish', 'NN'), ('and', 'CC')]


In [None]:
# 참고 : 형태소 분석 기호 - https://cheris8.github.io/data%20analysis/TP-Morpheme-Analysis/