<a href="https://colab.research.google.com/github/ttury/Deep-Learning-For-Natural-Language-Processing/blob/master/Padding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 패딩(Padding)
---
> 문장의 길이를 임의로 동일하게 맞춰주는 작업

> 정수 인코딩한 단어 집합을 행렬로 만들어 연산하기 쉽게 하기 위해 필요

<br/>

## Numpy(ndarray)
---
> 반복문으로 패딩한 뒤 ndarray를 사용해 행렬로 만듦

<br/>

## Keras(pad_sequences)
---
> 정수 인코딩이 끝난 단어 집합에 대해 패딩과 행렬 변환을 한 번에 수행함

> 기본적으로 패딩 시 앞을 0으로 채우기 때문에, 뒤를 0으로 채우려면 `padding='post'` 인자를 지정해야 함





In [3]:
# Fetch the NLTK data packages used by the cells below.
import nltk
nltk.download('stopwords')  # corpus read later through stopwords.words('english')
# 'punkt' is a tokenizer package; presumably required by sent_tokenize /
# word_tokenize in the following cells -- confirm against the installed NLTK version.
# As the last expression of the cell, this call's return value is what gets displayed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
# Numpy

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
sent_tokens = sent_tokenize(text)
stop_words = stopwords.words('english')

# One list of cleaned words per sentence: every token is lowercased, then kept
# only if it is not a stop word and is longer than two characters.
words_by_sent = [
  [
    lowered
    for lowered in map(str.lower, word_tokenize(sentence))
    if lowered not in stop_words and len(lowered) > 2
  ]
  for sentence in sent_tokens
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(words_by_sent)  # build the vocabulary based on word frequency
encoded = tokenizer.texts_to_sequences(words_by_sent)

# Target width = number of encoded words in the longest sentence.
max_len = max(map(len, encoded))
print(max_len)

# Pad in place: fill the free space at the end of each sequence with 0.
for sequence in encoded:
  sequence.extend([0] * (max_len - len(sequence)))

# All rows now share the same length, so they form a rectangular matrix.
padded_np = np.array(encoded)
print(padded_np)

7
[[ 1  5  0  0  0  0  0]
 [ 1  8  5  0  0  0  0]
 [ 1  3  5  0  0  0  0]
 [ 9  2  0  0  0  0  0]
 [ 2  4  3  2  0  0  0]
 [ 3  2  0  0  0  0  0]
 [ 1  4  6  0  0  0  0]
 [ 1  4  6  0  0  0  0]
 [ 1  4  2  0  0  0  0]
 [ 7  7  3  2 10  1 11]
 [ 1 12  3 13  0  0  0]]


In [39]:
# keras -> pad_sequences

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
stop_words = stopwords.words('english')
sent_tokens = sent_tokenize(text)

# One list of cleaned words per sentence: every token is lowercased, then kept
# only if it is not a stop word and is longer than two characters.
words_by_sent = [
  [
    lowered
    for lowered in map(str.lower, word_tokenize(sentence))
    if lowered not in stop_words and len(lowered) > 2
  ]
  for sentence in sent_tokens
]

# Integer-encode the cleaned sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words_by_sent)
encoded = tokenizer.texts_to_sequences(words_by_sent)
print(encoded, end="\n\n")

# Padding with pad_sequences using its default settings.
padded = pad_sequences(encoded)
print(padded, end="\n\n")

# Fill with 0 at the back instead.
padded_post = pad_sequences(encoded, padding = "post")
print(padded_post, end="\n\n")

# Limit the length: sequences longer than maxlen lose data.
padded_maxlen = pad_sequences(encoded, padding = "post", maxlen = 5)
print(padded_maxlen, end="\n\n")

# An integer that the vocabulary encoding does not use, for use as the pad value.
last_value = 1 + len(tokenizer.word_index)
print(last_value, end="\n\n")

# Same post-padding, but filled with last_value rather than 0.
padded = pad_sequences(encoded, padding = 'post', value = last_value)
print(padded)

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]

[[ 0  0  0  0  0  1  5]
 [ 0  0  0  0  1  8  5]
 [ 0  0  0  0  1  3  5]
 [ 0  0  0  0  0  9  2]
 [ 0  0  0  2  4  3  2]
 [ 0  0  0  0  0  3  2]
 [ 0  0  0  0  1  4  6]
 [ 0  0  0  0  1  4  6]
 [ 0  0  0  0  1  4  2]
 [ 7  7  3  2 10  1 11]
 [ 0  0  0  1 12  3 13]]

[[ 1  5  0  0  0  0  0]
 [ 1  8  5  0  0  0  0]
 [ 1  3  5  0  0  0  0]
 [ 9  2  0  0  0  0  0]
 [ 2  4  3  2  0  0  0]
 [ 3  2  0  0  0  0  0]
 [ 1  4  6  0  0  0  0]
 [ 1  4  6  0  0  0  0]
 [ 1  4  2  0  0  0  0]
 [ 7  7  3  2 10  1 11]
 [ 1 12  3 13  0  0  0]]

[[ 1  5  0  0  0]
 [ 1  8  5  0  0]
 [ 1  3  5  0  0]
 [ 9  2  0  0  0]
 [ 2  4  3  2  0]
 [ 3  2  0  0  0]
 [ 1  4  6  0  0]
 [ 1  4  6  0  0]
 [ 1  4  2  0  0]
 [ 3  2 10  1 11]
 [ 1 12  3 13  0]]

14

[[ 1  5 14 14 14 14 14]
 [ 1  8  5 14 14 14 14]
 [ 1  3  5 14 14 14 14]
 [ 9  2 14 14 14 14 14]
 [ 2  4  3  2 14 14 14]
 [ 3  2 