In [1]:
# One-time environment setup: uncomment to install or upgrade NLTK.
# ! pip3 install -U nltk

In [2]:
# One-time setup: download NLTK's 'punkt' package into the local nltk_data
# directory. Re-running is harmless; NLTK reports it as already up-to-date.
# NOTE(review): presumably required by the sent_tokenize / word_tokenize cells;
# newer NLTK releases may additionally need 'punkt_tab' — confirm against the
# installed version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yyoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## load data

In [3]:
# Load the entire sample text into a single string for the tokenizer cells.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding.
with open('./data/text.txt', 'r', encoding='utf-8') as fp:
    texts = fp.read()
texts

"Before we proceed any further, hear me speak.\nSpeak, speak.\nYou are all resolved rather to die than to famish?\nResolved. resolved.\nFirst, you know Caius Marcius is chief enemy to the people.\nWe know't, we know't.\nLet us kill him, and we'll have corn at our own price.\n"

## tokenize by sentence

In [4]:
# Split the raw text into a list of sentences and display it.
# The language is spelled out; 'english' is also the library default.
from nltk import sent_tokenize
sentences = sent_tokenize(texts, language='english')
sentences

['Before we proceed any further, hear me speak.',
 'Speak, speak.',
 'You are all resolved rather to die than to famish?',
 'Resolved.',
 'resolved.',
 'First, you know Caius Marcius is chief enemy to the people.',
 "We know't, we know't.",
 "Let us kill him, and we'll have corn at our own price."]

## tokenize by word

In [5]:
# Break the raw text into word-level tokens and display the list.
# As the output shows, punctuation marks come out as separate tokens.
from nltk import word_tokenize
words = word_tokenize(texts, language='english')
words

['Before',
 'we',
 'proceed',
 'any',
 'further',
 ',',
 'hear',
 'me',
 'speak',
 '.',
 'Speak',
 ',',
 'speak',
 '.',
 'You',
 'are',
 'all',
 'resolved',
 'rather',
 'to',
 'die',
 'than',
 'to',
 'famish',
 '?',
 'Resolved',
 '.',
 'resolved',
 '.',
 'First',
 ',',
 'you',
 'know',
 'Caius',
 'Marcius',
 'is',
 'chief',
 'enemy',
 'to',
 'the',
 'people',
 '.',
 'We',
 "know't",
 ',',
 'we',
 "know't",
 '.',
 'Let',
 'us',
 'kill',
 'him',
 ',',
 'and',
 'we',
 "'ll",
 'have',
 'corn',
 'at',
 'our',
 'own',
 'price',
 '.']

## token frequency analysis

In [6]:
# Count how often each token occurs, then convert the FreqDist into a plain
# dict mapping token -> count and display it.
from nltk import FreqDist
word_freq = FreqDist(words)
token_dict = dict(word_freq)
token_dict

{'Before': 1,
 'we': 3,
 'proceed': 1,
 'any': 1,
 'further': 1,
 ',': 5,
 'hear': 1,
 'me': 1,
 'speak': 2,
 '.': 7,
 'Speak': 1,
 'You': 1,
 'are': 1,
 'all': 1,
 'resolved': 2,
 'rather': 1,
 'to': 3,
 'die': 1,
 'than': 1,
 'famish': 1,
 '?': 1,
 'Resolved': 1,
 'First': 1,
 'you': 1,
 'know': 1,
 'Caius': 1,
 'Marcius': 1,
 'is': 1,
 'chief': 1,
 'enemy': 1,
 'the': 1,
 'people': 1,
 'We': 1,
 "know't": 2,
 'Let': 1,
 'us': 1,
 'kill': 1,
 'him': 1,
 'and': 1,
 "'ll": 1,
 'have': 1,
 'corn': 1,
 'at': 1,
 'our': 1,
 'own': 1,
 'price': 1}

In [7]:
# One-time environment setup: uncomment to install pandas.
# ! pip3 install pandas

In [8]:
# Wrap the token counts in a Series (index = token, value = count) and
# display them from the most to the least frequent token.
import pandas as pd
texts_token_series = pd.Series(token_dict)
texts_token_series.sort_values(ascending=False)

.           7
,           5
we          3
to          3
speak       2
know't      2
resolved    2
Before      1
us          1
chief       1
enemy       1
the         1
people      1
We          1
Let         1
kill        1
Marcius     1
him         1
and         1
'll         1
have        1
corn        1
at          1
our         1
own         1
is          1
you         1
Caius       1
are         1
proceed     1
any         1
further     1
hear        1
me          1
Speak       1
You         1
all         1
know        1
rather      1
die         1
than        1
famish      1
?           1
Resolved    1
First       1
price       1
dtype: int64

In [9]:
# Tokenize the same text with WordPunctTokenizer. As the printed output shows,
# it splits around apostrophes ("know't" -> 'know', "'", 't'), whereas the
# word_tokenize result kept "know't" as a single token.
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
token = tokenizer.tokenize(texts)
print(token)

['Before', 'we', 'proceed', 'any', 'further', ',', 'hear', 'me', 'speak', '.', 'Speak', ',', 'speak', '.', 'You', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish', '?', 'Resolved', '.', 'resolved', '.', 'First', ',', 'you', 'know', 'Caius', 'Marcius', 'is', 'chief', 'enemy', 'to', 'the', 'people', '.', 'We', 'know', "'", 't', ',', 'we', 'know', "'", 't', '.', 'Let', 'us', 'kill', 'him', ',', 'and', 'we', "'", 'll', 'have', 'corn', 'at', 'our', 'own', 'price', '.']


In [10]:
# Attach a part-of-speech tag to every WordPunct token; the displayed result
# is a list of (token, tag) pairs.
# NOTE(review): pos_tag needs a tagger model from nltk_data and no visible cell
# downloads one — confirm it is installed on a fresh machine
# (e.g. nltk.download('averaged_perceptron_tagger')).
from nltk import pos_tag
pos_tag(token)

[('Before', 'IN'),
 ('we', 'PRP'),
 ('proceed', 'VBP'),
 ('any', 'DT'),
 ('further', 'JJ'),
 (',', ','),
 ('hear', 'VB'),
 ('me', 'PRP'),
 ('speak', 'JJ'),
 ('.', '.'),
 ('Speak', 'NNP'),
 (',', ','),
 ('speak', 'NN'),
 ('.', '.'),
 ('You', 'PRP'),
 ('are', 'VBP'),
 ('all', 'DT'),
 ('resolved', 'VBD'),
 ('rather', 'RB'),
 ('to', 'TO'),
 ('die', 'VB'),
 ('than', 'IN'),
 ('to', 'TO'),
 ('famish', 'VB'),
 ('?', '.'),
 ('Resolved', 'NNP'),
 ('.', '.'),
 ('resolved', 'VBN'),
 ('.', '.'),
 ('First', 'NNP'),
 (',', ','),
 ('you', 'PRP'),
 ('know', 'VBP'),
 ('Caius', 'NNP'),
 ('Marcius', 'NNP'),
 ('is', 'VBZ'),
 ('chief', 'JJ'),
 ('enemy', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('people', 'NNS'),
 ('.', '.'),
 ('We', 'PRP'),
 ('know', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 (',', ','),
 ('we', 'PRP'),
 ('know', 'VBP'),
 ("'", "''"),
 ('t', 'NN'),
 ('.', '.'),
 ('Let', 'VB'),
 ('us', 'PRP'),
 ('kill', 'VB'),
 ('him', 'PRP'),
 (',', ','),
 ('and', 'CC'),
 ('we', 'PRP'),
 ("'", "''"),
 ('ll', 'NNS')

## tag descriptions

In [11]:
# One-time setup: download NLTK's 'tagsets' package into nltk_data.
# NOTE(review): presumably this backs the nltk.help.upenn_tagset lookups — confirm.
import nltk
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /Users/yyoo/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [12]:
# Print the UPenn tagset entry (definition plus example words) for 'PRP'.
import nltk.help as nltk_help
nltk_help.upenn_tagset('PRP')  # personal pronoun

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [13]:
# Print the UPenn tagset entry (definition plus example words) for 'JJ'.
nltk_help.upenn_tagset('JJ')  # adjective

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


## remove stop words

In [14]:
# One-time setup: download NLTK's 'stopwords' corpus into nltk_data; it is
# read through nltk.corpus.stopwords in the following cells.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/yyoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Peek at the English stop-word list: show every 18th entry rather than
# dumping the whole list.
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
english_stopwords[::18]

['i', 'him', 'which', 'had', 'at', 'in', 'both', 's', 'aren', "mightn't"]

In [16]:
import string

# Build the removal list: every character of string.punctuation, the NLTK
# English stop words, and the newline character.
punct = list(string.punctuation)  # one single-character string per mark

words_to_remove = punct + stopwords.words('english') + ['\n']

In [17]:
# Drop punctuation and stop words from the WordPunct tokens.
# Membership is tested against a set built once, instead of scanning the
# words_to_remove list for every token; the filtered result is unchanged.
# NOTE(review): the comparison is case-sensitive, so capitalised stop words
# such as 'Before', 'You' and 'We' are kept (see the printed output) — confirm
# whether lower-casing before the lookup is wanted.
removal_lookup = set(words_to_remove)
token = [word for word in token
         if word not in removal_lookup]
print(token)

['Before', 'proceed', 'hear', 'speak', 'Speak', 'speak', 'You', 'resolved', 'rather', 'die', 'famish', 'Resolved', 'resolved', 'First', 'know', 'Caius', 'Marcius', 'chief', 'enemy', 'people', 'We', 'know', 'know', 'Let', 'us', 'kill', 'corn', 'price']
