<a href="https://colab.research.google.com/github/ttury/Deep-Learning-For-Natural-Language-Processing/blob/master/Integer_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 정수 인코딩(Integer Encoding)
<br/>

## dictionary 자료형
---

> 단어를 빈도수 순으로 정렬한 단어 집합을 만들어 인코딩함

<br/>

## Counter
---

`from collections import Counter`

> 중복이 포함된 단어 리스트를 Counter의 입력으로 사용하면 중복을 제거하고 각 단어의 빈도수를 기록함

<br/>

## NLTK -> FreqDist
---

`from nltk import FreqDist`

> Counter와 같은 빈도수 계산 도구

<br/>

## Keras -> Tokenizer
---

`from tensorflow.keras.preprocessing.text import Tokenizer`

> fit_on_texts()로 빈도수 기준 단어 집합(word_index)을 만들고, texts_to_sequences()로 정수 인코딩


In [2]:
# Fetch the NLTK data packages used by the cells below.
# The recorded output shows "punkt" unpacked under tokenizers/ and "stopwords"
# under corpora/; presumably these back sent_tokenize/word_tokenize and
# nltk.corpus.stopwords respectively.
import nltk
nltk.download("punkt")
# Last expression of the cell: its return value is what the notebook echoes.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Integer encoding with a plain dict.
#
# Pipeline: split the text into sentences, tokenize each sentence, drop stop
# words and very short tokens, count word frequencies in a dict, rank words by
# frequency, keep the top `vocab_size` ranks, and finally map every sentence to
# a list of integer indices (unknown words map to the 'OOV' index).

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

sent_tokens = sent_tokenize(text)
print(sent_tokens)

vocab = dict()
words_by_sent = list()
stop_words = set(stopwords.words('english'))

# Build words_by_sent (per-sentence word lists with stop words removed)
# and vocab (word -> frequency).
for sent_token in sent_tokens:
  result = list()

  for word_token in word_tokenize(sent_token):
    word_token = word_token.lower()
    # Skip stop words and tokens of length <= 2 (this also drops punctuation).
    if word_token in stop_words or len(word_token) <= 2:
      continue
    result.append(word_token)
    vocab[word_token] = vocab.get(word_token, 0) + 1

  words_by_sent.append(result)

print(words_by_sent)
print()

print(vocab)
print(vocab["barber"])
print()

# Sort the vocabulary by frequency, most frequent first.
vocab_sorted = sorted(vocab.items(), key = lambda x:x[1], reverse = True)
print(vocab_sorted)
print()

# Assign rank indices (1 = most frequent) and drop words that occur only once.
vocab_processed = dict()
i = 0
for (word, frequency) in vocab_sorted:
  if frequency > 1:
    i += 1
    vocab_processed[word] = i

print(vocab_processed)

# Keep only the top `vocab_size` words. The values of vocab_processed are rank
# indices (not frequencies), so everything ranked below vocab_size is removed.
vocab_size = 5
words_frequency = [word for word, index in vocab_processed.items() if index > vocab_size]
for word in words_frequency:
  del vocab_processed[word]
print(vocab_processed)
print()

# Integer-encode every sentence; words outside the vocabulary get the OOV index.
vocab_processed['OOV'] = len(vocab_processed) + 1
oov_index = vocab_processed['OOV']

# Iterate over words_by_sent (the per-sentence word lists built above). The
# previous code iterated over `words`, which is not defined in this cell.
encoded = [
  [vocab_processed.get(word_token, oov_index) for word_token in word_tokens]
  for word_tokens in words_by_sent
]

print(encoded)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']
[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]

{'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}
8

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('goo

In [None]:
# Integer encoding with collections.Counter.
#
# Same preprocessing as the dict-based cell; the frequency table is built by
# Counter and the top `vocab_size` words are taken with most_common().

from collections import Counter
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
sent_tokens = sent_tokenize(text)
# Use a set so the per-token membership test is O(1), as in the dict-based cell.
stop_words = set(stopwords.words('english'))

words_by_sent = list()

for sent_token in sent_tokens:
  # Lowercase each token, then drop stop words and tokens of length <= 2.
  lowered = (token.lower() for token in word_tokenize(sent_token))
  result = [
    word_token
    for word_token in lowered
    if word_token not in stop_words and len(word_token) > 2
  ]
  words_by_sent.append(result)

# words_by_sent -> words (one flat word list, sentence boundaries removed).
print(words_by_sent)
# Flatten in a single pass; sum(words_by_sent, []) copies the accumulated list
# on every addition.
words = [word for sent_words in words_by_sent for word in sent_words]
print(words)
print()

# Build the Counter (word -> frequency).
vocab = Counter(words)
print(vocab)
print(vocab["barber"])
print()

# Keep the `vocab_size` most frequent words; vocab becomes a list of
# (word, frequency) pairs from here on.
vocab_size = 5
vocab = vocab.most_common(vocab_size)
print(vocab)
print()

# Index the words: 1 = most frequent.
vocab_processed = {
  word: index for index, (word, frequency) in enumerate(vocab, start=1)
}

print(vocab_processed)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']

Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})
8

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person'

In [None]:
# Integer encoding with nltk.FreqDist.
#
# Same preprocessing as the other cells; FreqDist plays the role of Counter.

from nltk import FreqDist
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
sent_tokens = sent_tokenize(text)
# Use a set so the per-token membership test is O(1), as in the dict-based cell.
stop_words = set(stopwords.words('english'))
words_by_sentence = list()

for sent_token in sent_tokens:
  result = list()

  for word_token in word_tokenize(sent_token):
    word_token = word_token.lower()
    # Skip stop words and tokens of length <= 2 (this also drops punctuation).
    if word_token in stop_words or len(word_token) <= 2:
      continue
    result.append(word_token)

  words_by_sentence.append(result)

# Build the FreqDist from the flattened word list.
# np.hstack yields numpy.str_ elements; .tolist() converts them back to plain
# Python str so the vocabulary keys print as 'barber' rather than
# np.str_('barber') on NumPy >= 2. (Equivalent to sum(words_by_sentence, []).)
words = np.hstack(words_by_sentence).tolist()
vocab = FreqDist(words)
print(vocab)
print(vocab["barber"])
print()

# Keep the `vocab_size` most frequent words; vocab becomes a list of
# (word, frequency) pairs from here on.
vocab_size = 5
vocab = vocab.most_common(vocab_size)
print(vocab)
print()

# Index the words: 1 = most frequent.
vocab_processed = {
  word: index for index, (word, frequency) in enumerate(vocab, start=1)
}
print(vocab_processed)

<FreqDist with 13 samples and 36 outcomes>
8

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [12]:
# Integer encoding with the Keras Tokenizer.
#
# Same preprocessing as the other cells; the Tokenizer builds the
# frequency-ranked word index and performs the encoding itself.

from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
sent_tokens = sent_tokenize(text)
# Use a set so the per-token membership test is O(1), as in the dict-based cell.
stop_words = set(stopwords.words('english'))
words_by_sent = list()

for sent_token in sent_tokens:
  result = list()

  for word_token in word_tokenize(sent_token):
    word_token = word_token.lower()
    # Skip stop words and tokens of length <= 2 (this also drops punctuation).
    if word_token in stop_words or len(word_token) <= 2:
      continue
    result.append(word_token)

  words_by_sent.append(result)

print(words_by_sent)
print()

# Build a Tokenizer over the full vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words_by_sent)

# Word index (rank by frequency), raw word counts, and the encoded sentences.
print(tokenizer.word_index)
print(tokenizer.word_counts)
print(tokenizer.texts_to_sequences(words_by_sent))
print()

# Keep only the top `vocab_size` words.
# Per the Keras docs only indices below num_words are kept and index 0 is
# reserved, hence vocab_size + 1.
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 1)
tokenizer.fit_on_texts(words_by_sent)

# num_words only takes effect in texts_to_sequences; word_index and
# word_counts still list every word.
print(tokenizer.word_index)
print(tokenizer.word_counts)
print(tokenizer.texts_to_sequences(words_by_sent))
print()

# To keep out-of-vocabulary words as an explicit OOV index, pass oov_token.
# The OOV token takes an index of its own (printed below), so one more slot is
# needed: vocab_size + 2.
tokenizer = Tokenizer(num_words = vocab_size + 2, oov_token = "OOV")
tokenizer.fit_on_texts(words_by_sent)

oov_index = tokenizer.word_index["OOV"]
print(f"단어 OOV의 인덱스 : {oov_index}")
print(tokenizer.texts_to_sequences(words_by_sent))

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}
OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])
[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'w