# 문장 토큰화
- 영어 : nltk, spaCy

## 영어 문장 토큰화

In [1]:
# Three-sentence sample used to try out period-based sentence splitting.
text = (
    "I never thought through love we'd be. "
    "Making one as lovely as she. "
    "But isn't she lovely made from love."
)

- 문장 토큰화(Sentence Tokenization)는 마침표(.), 느낌표(!), 물음표(?)로 할 수 있음.
- !, ?는 문장이 끝났다는 확실한 기준이 될 수 있으나, 마침표는 약어(Ph.D.)나 IP 주소처럼 문장의 끝이 아닌 곳에도 쓰이므로 항상 기준이 되지는 않음.

In [2]:
# Naive sentence tokenization: cut the text at every period.
# str.split removes the periods themselves and leaves a trailing empty string
# because the text ends with a period.
tokenized_sentence = text.split(".")
tokenized_sentence

["I never thought through love we'd be",
 ' Making one as lovely as she',
 " But isn't she lovely made from love",
 '']

In [3]:
# Counter-example for period splitting: the periods inside "Ms.Lee", "Ph.D."
# and the IP address are not sentence boundaries.
text = (
    "My name is Ms.Lee. "
    "I'm a Ph.D. student. "
    "My IP Address is 192.168.1.51"
)
text.split(".")

['My name is Ms',
 'Lee',
 " I'm a Ph",
 'D',
 ' student',
 ' My IP Address is 192',
 '168',
 '1',
 '51']

In [4]:
import nltk
# Download the "punkt" tokenizer data that NLTK's tokenizers rely on.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
from nltk.tokenize import sent_tokenize

# Unlike the plain period split, this keeps "Ms.Lee.", "Ph.D." and the
# IP address intact and returns three sentences.
sent_tokenize(text)

['My name is Ms.Lee.', "I'm a Ph.D. student.", 'My IP Address is 192.168.1.51']

In [6]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline.
nlp = spacy.load("en_core_web_sm")

# Tokenize the input text
doc = nlp(text)

# Show the sentences spaCy detected in the document.
list(doc.sents)

[My name is Ms.Lee., I'm a Ph.D. student., My IP Address is 192.168.1.51]

In [7]:
# Print each sentence detected by spaCy on its own line.
for sentence in doc.sents:
    print(sentence)

My name is Ms.Lee.
I'm a Ph.D. student.
My IP Address is 192.168.1.51


## 영어 단어 토큰화

In [8]:
# Word tokenization by whitespace only: punctuation stays glued to the
# neighbouring word (e.g. "gone?").
text = (
    "All of my change I spent on you "
    "Where have the times gone?"
)
text.split()

['All',
 'of',
 'my',
 'change',
 'I',
 'spent',
 'on',
 'you',
 'Where',
 'have',
 'the',
 'times',
 'gone?']

nltk에서 기본적으로 제공하는 Tokenizer

In [10]:
# Ensure the "punkt" data is present; nothing is re-downloaded when it is already up to date.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk.tokenize import word_tokenize

# Unlike str.split(), word_tokenize separates the trailing "?" into its own token.
tokenized_word = word_tokenize(text)
tokenized_word

['All',
 'of',
 'my',
 'change',
 'I',
 'spent',
 'on',
 'you',
 'Where',
 'have',
 'the',
 'times',
 'gone',
 '?']

### TreebankWordTokenizer
- Penn Treebank 토큰화 규칙을 따르는 표준 토크나이저

In [12]:
from nltk.tokenize import TreebankWordTokenizer

# Create the tokenizer object
tokenizer = TreebankWordTokenizer()
# Tokenize the same text; the result is bound to tokenized_word for the lemmatization step.
tokenized_word = tokenizer.tokenize(text)
tokenized_word

['All',
 'of',
 'my',
 'change',
 'I',
 'spent',
 'on',
 'you',
 'Where',
 'have',
 'the',
 'times',
 'gone',
 '?']

In [13]:
# When cleaning a sentence, find the lemma (dictionary base form, not the stem)
# of each word and rebuild the sentence from those lemmas.
import nltk
nltk.download('punkt')
# WordNet data plus the Open Multilingual WordNet package, needed for WordNet-based lemmatization.
nltk.download('wordnet')
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [14]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# 1. Lemma of each word: lowercase the token, then lemmatize it with the
#    part-of-speech argument "v".
word_lemmas = [
    lemmatizer.lemmatize(token.lower(), "v") for token in tokenized_word
]
print(word_lemmas)

# 2. Join the lemmas back together, separated by single spaces.
print(" ".join(word_lemmas))

['all', 'of', 'my', 'change', 'i', 'spend', 'on', 'you', 'where', 'have', 'the', 'time', 'go', '?']
all of my change i spend on you where have the time go ?


In [15]:
import spacy

# Re-parse the current `text` with spaCy so that each token's lemma can be read.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

In [16]:
# Collect spaCy's lemma for every token, then join them with single spaces.
word_lemmas = [token.lemma_ for token in doc]
" ".join(word_lemmas)

'all of my change I spend on you where have the time go ?'

# Keras의 Tokenizer를 이용해서 정수 인코딩

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Keras Tokenizer created with all-default settings (no num_words / oov_token passed).
# This rebinds `tokenizer`, which previously held the NLTK TreebankWordTokenizer.
tokenizer = Tokenizer()

In [19]:
# Sample corpus for integer encoding: one sentence per line, each ending with a period.
text = """Isn't she lovely.
Isn't she wonderful.
Isn't she precious.
Less than one minute old.
I never thought through love we'd be.
Making one as lovely as she.
But isn't she lovely made from love.
Isn't she pretty.
Truly the angel's best.
Boy, I'm so happy.
We have been heaven blessed.
I can't believe what God has done.
Through us he's given life to one.
But isn't she lovely made from love.
Isn't she lovely.
Life and love are the same.
Life is Aisha.
The meaning of her name.
Londie, it could have not been done.
Without you who conceived the one.
That's so very lovely made from love."""

In [20]:
# Split the corpus into a list of sentences with NLTK's sent_tokenize.
sent_tokens = sent_tokenize(text)
print(sent_tokens)

["Isn't she lovely.", "Isn't she wonderful.", "Isn't she precious.", 'Less than one minute old.', "I never thought through love we'd be.", 'Making one as lovely as she.', "But isn't she lovely made from love.", "Isn't she pretty.", "Truly the angel's best.", "Boy, I'm so happy.", 'We have been heaven blessed.', "I can't believe what God has done.", "Through us he's given life to one.", "But isn't she lovely made from love.", "Isn't she lovely.", 'Life and love are the same.', 'Life is Aisha.', 'The meaning of her name.', 'Londie, it could have not been done.', 'Without you who conceived the one.', "That's so very lovely made from love."]


In [21]:
# Passing the sentence corpus to fit_on_texts() builds the vocabulary based on word frequency.
#  Indices follow frequency: more frequent words get lower indices, less frequent words get higher ones.
tokenizer.fit_on_texts(sent_tokens)

In [22]:
# Inspect the vocabulary (word -> integer index)
print(tokenizer.word_index)

{'she': 1, "isn't": 2, 'lovely': 3, 'love': 4, 'one': 5, 'the': 6, 'made': 7, 'from': 8, 'life': 9, 'i': 10, 'through': 11, 'as': 12, 'but': 13, 'so': 14, 'have': 15, 'been': 16, 'done': 17, 'wonderful': 18, 'precious': 19, 'less': 20, 'than': 21, 'minute': 22, 'old': 23, 'never': 24, 'thought': 25, "we'd": 26, 'be': 27, 'making': 28, 'pretty': 29, 'truly': 30, "angel's": 31, 'best': 32, 'boy': 33, "i'm": 34, 'happy': 35, 'we': 36, 'heaven': 37, 'blessed': 38, "can't": 39, 'believe': 40, 'what': 41, 'god': 42, 'has': 43, 'us': 44, "he's": 45, 'given': 46, 'to': 47, 'and': 48, 'are': 49, 'same': 50, 'is': 51, 'aisha': 52, 'meaning': 53, 'of': 54, 'her': 55, 'name': 56, 'londie': 57, 'it': 58, 'could': 59, 'not': 60, 'without': 61, 'you': 62, 'who': 63, 'conceived': 64, "that's": 65, 'very': 66}


In [23]:
# Inspect the word frequencies (word -> number of occurrences)
print(tokenizer.word_counts)

OrderedDict([("isn't", 7), ('she', 8), ('lovely', 6), ('wonderful', 1), ('precious', 1), ('less', 1), ('than', 1), ('one', 4), ('minute', 1), ('old', 1), ('i', 2), ('never', 1), ('thought', 1), ('through', 2), ('love', 5), ("we'd", 1), ('be', 1), ('making', 1), ('as', 2), ('but', 2), ('made', 3), ('from', 3), ('pretty', 1), ('truly', 1), ('the', 4), ("angel's", 1), ('best', 1), ('boy', 1), ("i'm", 1), ('so', 2), ('happy', 1), ('we', 1), ('have', 2), ('been', 2), ('heaven', 1), ('blessed', 1), ("can't", 1), ('believe', 1), ('what', 1), ('god', 1), ('has', 1), ('done', 2), ('us', 1), ("he's", 1), ('given', 1), ('life', 3), ('to', 1), ('and', 1), ('are', 1), ('same', 1), ('is', 1), ('aisha', 1), ('meaning', 1), ('of', 1), ('her', 1), ('name', 1), ('londie', 1), ('it', 1), ('could', 1), ('not', 1), ('without', 1), ('you', 1), ('who', 1), ('conceived', 1), ("that's", 1), ('very', 1)])


In [24]:
# Encoding (text -> integers)
#  Use the texts_to_sequences method to turn word tokens into integers.
# Corpus: a collection of text or speech data.
# The input is passed as a corpus, i.e. a list of texts; the result is a nested list
# with one list of word indices per input text.
tokenizer.texts_to_sequences(["she is lovely", "life is lovely"]) # "she is lovely" => ["she", "is", "lovely"]

[[1, 51, 3], [9, 51, 3]]

In [25]:
# Decoding (integers -> text)
tokenizer.sequences_to_texts([[1, 5, 6, 31, 50, 56]])

["she one the angel's same name"]

# OOV(Out of Vocabulary)

In [26]:
# Use only the 5 most frequent tokens
vocab_size = 5

tokenizer = Tokenizer(num_words=vocab_size + 2, oov_token="<oov>") # vocab_size + 2: the 5 vocabulary words plus the pad and oov entries
tokenizer.fit_on_texts(sent_tokens)

In [27]:
# "is" and "bowwow" are not among the kept words, so both are encoded as the <oov> index 1.
tokenizer.texts_to_sequences(["she is bowwow"])

[[2, 1, 1]]

In [28]:
# word_index still lists every word seen during fitting; "<oov>" takes index 1.
print(tokenizer.word_index)

{'<oov>': 1, 'she': 2, "isn't": 3, 'lovely': 4, 'love': 5, 'one': 6, 'the': 7, 'made': 8, 'from': 9, 'life': 10, 'i': 11, 'through': 12, 'as': 13, 'but': 14, 'so': 15, 'have': 16, 'been': 17, 'done': 18, 'wonderful': 19, 'precious': 20, 'less': 21, 'than': 22, 'minute': 23, 'old': 24, 'never': 25, 'thought': 26, "we'd": 27, 'be': 28, 'making': 29, 'pretty': 30, 'truly': 31, "angel's": 32, 'best': 33, 'boy': 34, "i'm": 35, 'happy': 36, 'we': 37, 'heaven': 38, 'blessed': 39, "can't": 40, 'believe': 41, 'what': 42, 'god': 43, 'has': 44, 'us': 45, "he's": 46, 'given': 47, 'to': 48, 'and': 49, 'are': 50, 'same': 51, 'is': 52, 'aisha': 53, 'meaning': 54, 'of': 55, 'her': 56, 'name': 57, 'londie': 58, 'it': 59, 'could': 60, 'not': 61, 'without': 62, 'you': 63, 'who': 64, 'conceived': 65, "that's": 66, 'very': 67}


## Padding

In [29]:
# Encode every sentence as a list of indices; the resulting sequences have different lengths.
integer_tokens = tokenizer.texts_to_sequences(sent_tokens)
print(integer_tokens)

[[3, 2, 4], [3, 2, 1], [3, 2, 1], [1, 1, 6, 1, 1], [1, 1, 1, 1, 5, 1, 1], [1, 6, 1, 4, 1, 2], [1, 3, 2, 4, 1, 1, 5], [3, 2, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 6], [1, 3, 2, 4, 1, 1, 5], [3, 2, 4], [1, 1, 5, 1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 6], [1, 1, 1, 4, 1, 1, 5]]


In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pre-padding is the default.
# padding='post' : zeros are appended at the end; with 'pre' they are added at the front.
# truncating='post' : sequences longer than maxlen lose their trailing items; with 'pre' the front is cut.
# Neither argument is passed here, so both padding and truncation happen at the front.
padded_tokens = pad_sequences(integer_tokens, maxlen=5)
padded_tokens

array([[0, 0, 3, 2, 4],
       [0, 0, 3, 2, 1],
       [0, 0, 3, 2, 1],
       [1, 1, 6, 1, 1],
       [1, 1, 5, 1, 1],
       [6, 1, 4, 1, 2],
       [2, 4, 1, 1, 5],
       [0, 0, 3, 2, 1],
       [0, 1, 1, 1, 1],
       [0, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 6],
       [2, 4, 1, 1, 5],
       [0, 0, 3, 2, 4],
       [1, 5, 1, 1, 1],
       [0, 0, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 6],
       [1, 4, 1, 1, 5]], dtype=int32)