# Padding
- 자연어 처리에서 각 문장(문서)의 길이는 다를 수 있음
- 그러나 언어모델은 고정된 길이의 데이터를 효율적으로 처리함
    - → 모든 문장의 길이를 동일하게 맞춰주는 작업, 즉 패딩(padding)이 필요함

**패딩 이점**
1. 일관된 입력 형식
2. 병렬 연산 최적화
3. 유연한 데이터 처리

In [2]:
# Toy corpus: each inner list is one already-tokenized sentence.
preprocessed_sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]

### 직접 구현

In [4]:
#!pip install torch

Collecting torch
  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.2
    Uninstalling sympy-1.13.2:
      Successfully uninstalled sympy-1.13.2
Successfully installed sympy-1.13.1 torch-2.6.0


In [None]:
import torch
from collections import Counter

class TokenizerForPadding:
    """Minimal word-level tokenizer that maps tokens to integer ids.

    Ids start at 1, so 0 is never assigned to a token and stays free for use
    as a padding value. Id 1 is always the out-of-vocabulary (OOV) token; the
    remaining ids are handed out in descending order of word frequency.

    Args:
        num_words: Upper bound on the id space. When set, only the
            ``num_words - 2`` most frequent words are kept, so together with
            the OOV token the largest id is ``num_words - 1``. ``None`` (or
            any falsy value) keeps every word.
        oov_token: Token that stands in for words missing from the vocabulary.
    """

    def __init__(self, num_words=None, oov_token='<OOV>'):
        self.num_words = num_words  # supplied when the instance is created
        self.oov_token = oov_token
        self.word_index = {}  # word -> id
        self.index_word = {}  # id -> word (inverse of word_index)
        self.word_counts = Counter()

    def fit_on_texts(self, texts):
        """Count word frequencies in ``texts`` and rebuild both vocab mappings.

        Counts accumulate across calls; empty/falsy tokens are ignored.

        Args:
            texts: Iterable of tokenized sentences (iterables of words).
        """
        # Update the running frequency table.
        for sentence in texts:
            self.word_counts.update(word for word in sentence if word)

        # Frequency-based vocabulary, limited to the num_words budget.
        # Clamp at 0 so a tiny num_words never produces a negative count.
        limit = max(self.num_words - 2, 0) if self.num_words else None
        vocab = [self.oov_token] + [word for word, _ in self.word_counts.most_common(limit)]

        self.word_index = {word: idx for idx, word in enumerate(vocab, start=1)}
        # word_index values are already 1-based, so invert them as-is
        # (adding 1 again would shift every reverse lookup by one).
        self.index_word = {idx: word for word, idx in self.word_index.items()}

    def texts_to_sequences(self, texts):
        """Encode tokenized sentences as lists of integer ids.

        Words missing from the vocabulary are mapped to the OOV token's id.

        Args:
            texts: Iterable of tokenized sentences (iterables of words).

        Returns:
            A list with one list of ids per sentence.

        Raises:
            KeyError: If a non-empty sentence is encoded before
                ``fit_on_texts`` has registered the OOV token.
        """
        return [
            [self.word_index.get(word, self.word_index[self.oov_token]) for word in sentence]
            for sentence in texts
        ]


In [25]:
def pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre', value=0):
    """Pad or truncate every sequence to a common length and return a tensor.

    Args:
        sequences: The whole corpus, as a collection of token-id sequences.
        maxlen: Target length (number of tokens per sentence). When ``None``,
            the length of the longest sequence is used.
        padding: ``'pre'`` puts the fill values before the tokens, ``'post'``
            puts them after.
        truncating: ``'pre'`` drops tokens from the front of an over-long
            sequence (keeping the last ``maxlen``), ``'post'`` drops them from
            the end (keeping the first ``maxlen``).
        value: Fill value used for padding.

    Returns:
        A ``torch.Tensor`` built from the equal-length rows.

    Raises:
        ValueError: If ``padding`` or ``truncating`` is not ``'pre'``/``'post'``,
            or if ``sequences`` is empty while ``maxlen`` is ``None``.
    """
    # Reject typos instead of silently treating them as 'post'.
    if padding not in ('pre', 'post'):
        raise ValueError(f"padding must be 'pre' or 'post', got {padding!r}")
    if truncating not in ('pre', 'post'):
        raise ValueError(f"truncating must be 'pre' or 'post', got {truncating!r}")

    if maxlen is None:
        maxlen = max(len(seq) for seq in sequences)

    padded_sequences = []
    for seq in sequences:
        seq = list(seq)  # accept tuples etc.; also avoids touching the caller's list
        if len(seq) > maxlen:
            if truncating == 'pre':
                # Explicit start index: seq[-maxlen:] would return the whole
                # sequence when maxlen == 0, because -0 == 0.
                seq = seq[len(seq) - maxlen:]
            else:
                seq = seq[:maxlen]
        else:
            filler = [value] * (maxlen - len(seq))
            seq = filler + seq if padding == 'pre' else seq + filler

        padded_sequences.append(seq)

    return torch.tensor(padded_sequences)

# Show the integer-encoded sentences. In file order, `sequences` is assigned by
# a later cell (In [13]), which was executed before this one.
sequences

[[2, 6],
 [2, 9, 6],
 [2, 4, 6],
 [10, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 7],
 [2, 5, 7],
 [2, 5, 3],
 [8, 8, 4, 3, 11, 2, 12],
 [2, 13, 4, 14]]

In [27]:
# Defaults: pad on the left ('pre') with 0 up to the longest sequence length.
padded = pad_sequences(sequences)
padded

tensor([[ 0,  0,  0,  0,  0,  2,  6],
        [ 0,  0,  0,  0,  2,  9,  6],
        [ 0,  0,  0,  0,  2,  4,  6],
        [ 0,  0,  0,  0,  0, 10,  3],
        [ 0,  0,  0,  3,  5,  4,  3],
        [ 0,  0,  0,  0,  0,  4,  3],
        [ 0,  0,  0,  0,  2,  5,  7],
        [ 0,  0,  0,  0,  2,  5,  7],
        [ 0,  0,  0,  0,  2,  5,  3],
        [ 8,  8,  4,  3, 11,  2, 12],
        [ 0,  0,  0,  2, 13,  4, 14]])

In [None]:
padded = pad_sequences(sequences, padding='post')    # with padding='post' the zeros go after the tokens
padded

tensor([[ 2,  6,  0,  0,  0,  0,  0],
        [ 2,  9,  6,  0,  0,  0,  0],
        [ 2,  4,  6,  0,  0,  0,  0],
        [10,  3,  0,  0,  0,  0,  0],
        [ 3,  5,  4,  3,  0,  0,  0],
        [ 4,  3,  0,  0,  0,  0,  0],
        [ 2,  5,  7,  0,  0,  0,  0],
        [ 2,  5,  7,  0,  0,  0,  0],
        [ 2,  5,  3,  0,  0,  0,  0],
        [ 8,  8,  4,  3, 11,  2, 12],
        [ 2, 13,  4, 14,  0,  0,  0]])

In [29]:
# maxlen=5 with the default truncating='pre': a longer sequence keeps only its last 5 ids.
padded = pad_sequences(sequences, padding='post', maxlen=5)
padded

tensor([[ 2,  6,  0,  0,  0],
        [ 2,  9,  6,  0,  0],
        [ 2,  4,  6,  0,  0],
        [10,  3,  0,  0,  0],
        [ 3,  5,  4,  3,  0],
        [ 4,  3,  0,  0,  0],
        [ 2,  5,  7,  0,  0],
        [ 2,  5,  7,  0,  0],
        [ 2,  5,  3,  0,  0],
        [ 4,  3, 11,  2, 12],
        [ 2, 13,  4, 14,  0]])

In [32]:
# truncating='post': a longer sequence keeps only its first 5 ids.
padded = pad_sequences(sequences, padding='post', maxlen=5, truncating='post')
padded

tensor([[ 2,  6,  0,  0,  0],
        [ 2,  9,  6,  0,  0],
        [ 2,  4,  6,  0,  0],
        [10,  3,  0,  0,  0],
        [ 3,  5,  4,  3,  0],
        [ 4,  3,  0,  0,  0],
        [ 2,  5,  7,  0,  0],
        [ 2,  5,  7,  0,  0],
        [ 2,  5,  3,  0,  0],
        [ 8,  8,  4,  3, 11],
        [ 2, 13,  4, 14,  0]])

In [13]:
# Build the vocabulary from the toy corpus (num_words=15) and encode every
# sentence as a list of integer ids; the last expression displays the result.
tokenizer = TokenizerForPadding(num_words=15)
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[2, 6],
 [2, 9, 6],
 [2, 4, 6],
 [10, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 7],
 [2, 5, 7],
 [2, 5, 3],
 [8, 8, 4, 3, 11, 2, 12],
 [2, 13, 4, 14]]

### keras Tokenizer 이용

In [34]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Same fit/encode steps with the Keras Tokenizer. This rebinds `tokenizer` and
# `sequences`; no OOV token is passed, and per the output below ids start at 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [36]:
# NOTE: this import rebinds the name `pad_sequences`, shadowing the hand-written
# function defined earlier in this file.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Called with defaults; per the output below, the result is a left-padded array.
padded = pad_sequences(sequences)
padded

array([[ 0,  0,  0,  0,  0,  1,  5],
       [ 0,  0,  0,  0,  1,  8,  5],
       [ 0,  0,  0,  0,  1,  3,  5],
       [ 0,  0,  0,  0,  0,  9,  2],
       [ 0,  0,  0,  2,  4,  3,  2],
       [ 0,  0,  0,  0,  0,  3,  2],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  2],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 0,  0,  0,  1, 12,  3, 13]])

##### 어린왕자 데이터 샘플 패딩처리

1. 텍스트 전처리 (토큰화/불용어처리/정제/정규화)
2. 정수 인코딩 Tokenizer (tensorflow.keras)
3. 패딩 처리 pad_sequences (tensorflow.keras)