# 정수 인코딩 (Integer Encoding)
- 자연어 처는 텍스트 데이터를 숫자 변환하여 컴퓨터가 이해할 수 있도록 만드는 것이 핵심
- 정수 인코딩을 수행하여 텍스트 데이터에 고유한 인덱스를 부여 (1~5000)
- 이러한 인코딩 과정은 전처리 과정에서 필수적이며 각 단어의 등장 빈도에 따라 인덱스를 부여하는 것이 일반적
- 단어 수를 5000개 

In [53]:
raw_text = """The Little Prince, written by Antoine de Saint-Exupéry, is a poetic tale about a young prince who travels from his home planet to Earth. The story begins with a pilot stranded in the Sahara Desert after his plane crashes. While trying to fix his plane, he meets a mysterious young boy, the Little Prince.

The Little Prince comes from a small asteroid called B-612, where he lives alone with a rose that he loves deeply. He recounts his journey to the pilot, describing his visits to several other planets. Each planet is inhabited by a different character, such as a king, a vain man, a drunkard, a businessman, a geographer, and a fox. Through these encounters, the Prince learns valuable lessons about love, responsibility, and the nature of adult behavior.

On Earth, the Little Prince meets various creatures, including a fox, who teaches him about relationships and the importance of taming, which means building ties with others. The fox's famous line, "You become responsible, forever, for what you have tamed," resonates with the Prince's feelings for his rose.

Ultimately, the Little Prince realizes that the essence of life is often invisible and can only be seen with the heart. After sharing his wisdom with the pilot, he prepares to return to his asteroid and his beloved rose. The story concludes with the pilot reflecting on the lessons learned from the Little Prince and the enduring impact of their friendship.

The narrative is a beautifully simple yet profound exploration of love, loss, and the importance of seeing beyond the surface of things."""

### 토큰화 + 정제/정규화

In [54]:
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords

# 문장 토큰화
sentences = sent_tokenize(raw_text)

# 영문 불용어 리스트
en_stopwords = stopwords.words('english')

# 단어사전 
vocab = {}

# 토큰화/정제/정규화 처리 결과 
preprocessed_sentences = []

for sentence in sentences:
    sentence = sentence.lower()         # 대소문자 정규화 (소문자 변환)
    tokens = word_tokenize(sentence)    # 토큰화
    tokens = [token for token in tokens if token not in en_stopwords]   # 불용어 제거
    tokens = [token for token in tokens if len(token) > 2]              # 단어 길이가 2 이하면 삭제 

    for token in tokens:
        if token not in vocab:
            vocab[token] = 1
        else:
            vocab[token] += 1           # {token : 빈도수}
    
    preprocessed_sentences.append(tokens)

vocab
# preprocessed_sentences

{'little': 6,
 'prince': 9,
 'written': 1,
 'antoine': 1,
 'saint-exupéry': 1,
 'poetic': 1,
 'tale': 1,
 'young': 2,
 'travels': 1,
 'home': 1,
 'planet': 2,
 'earth': 2,
 'story': 2,
 'begins': 1,
 'pilot': 4,
 'stranded': 1,
 'sahara': 1,
 'desert': 1,
 'plane': 2,
 'crashes': 1,
 'trying': 1,
 'fix': 1,
 'meets': 2,
 'mysterious': 1,
 'boy': 1,
 'comes': 1,
 'small': 1,
 'asteroid': 2,
 'called': 1,
 'b-612': 1,
 'lives': 1,
 'alone': 1,
 'rose': 3,
 'loves': 1,
 'deeply': 1,
 'recounts': 1,
 'journey': 1,
 'describing': 1,
 'visits': 1,
 'several': 1,
 'planets': 1,
 'inhabited': 1,
 'different': 1,
 'character': 1,
 'king': 1,
 'vain': 1,
 'man': 1,
 'drunkard': 1,
 'businessman': 1,
 'geographer': 1,
 'fox': 3,
 'encounters': 1,
 'learns': 1,
 'valuable': 1,
 'lessons': 2,
 'love': 2,
 'responsibility': 1,
 'nature': 1,
 'adult': 1,
 'behavior': 1,
 'various': 1,
 'creatures': 1,
 'including': 1,
 'teaches': 1,
 'relationships': 1,
 'importance': 2,
 'taming': 1,
 'means': 1,
 '

In [55]:
preprocessed_sentences

[['little',
  'prince',
  'written',
  'antoine',
  'saint-exupéry',
  'poetic',
  'tale',
  'young',
  'prince',
  'travels',
  'home',
  'planet',
  'earth'],
 ['story',
  'begins',
  'pilot',
  'stranded',
  'sahara',
  'desert',
  'plane',
  'crashes'],
 ['trying',
  'fix',
  'plane',
  'meets',
  'mysterious',
  'young',
  'boy',
  'little',
  'prince'],
 ['little',
  'prince',
  'comes',
  'small',
  'asteroid',
  'called',
  'b-612',
  'lives',
  'alone',
  'rose',
  'loves',
  'deeply'],
 ['recounts',
  'journey',
  'pilot',
  'describing',
  'visits',
  'several',
  'planets'],
 ['planet',
  'inhabited',
  'different',
  'character',
  'king',
  'vain',
  'man',
  'drunkard',
  'businessman',
  'geographer',
  'fox'],
 ['encounters',
  'prince',
  'learns',
  'valuable',
  'lessons',
  'love',
  'responsibility',
  'nature',
  'adult',
  'behavior'],
 ['earth',
  'little',
  'prince',
  'meets',
  'various',
  'creatures',
  'including',
  'fox',
  'teaches',
  'relationships'

In [56]:
vocab

{'little': 6,
 'prince': 9,
 'written': 1,
 'antoine': 1,
 'saint-exupéry': 1,
 'poetic': 1,
 'tale': 1,
 'young': 2,
 'travels': 1,
 'home': 1,
 'planet': 2,
 'earth': 2,
 'story': 2,
 'begins': 1,
 'pilot': 4,
 'stranded': 1,
 'sahara': 1,
 'desert': 1,
 'plane': 2,
 'crashes': 1,
 'trying': 1,
 'fix': 1,
 'meets': 2,
 'mysterious': 1,
 'boy': 1,
 'comes': 1,
 'small': 1,
 'asteroid': 2,
 'called': 1,
 'b-612': 1,
 'lives': 1,
 'alone': 1,
 'rose': 3,
 'loves': 1,
 'deeply': 1,
 'recounts': 1,
 'journey': 1,
 'describing': 1,
 'visits': 1,
 'several': 1,
 'planets': 1,
 'inhabited': 1,
 'different': 1,
 'character': 1,
 'king': 1,
 'vain': 1,
 'man': 1,
 'drunkard': 1,
 'businessman': 1,
 'geographer': 1,
 'fox': 3,
 'encounters': 1,
 'learns': 1,
 'valuable': 1,
 'lessons': 2,
 'love': 2,
 'responsibility': 1,
 'nature': 1,
 'adult': 1,
 'behavior': 1,
 'various': 1,
 'creatures': 1,
 'including': 1,
 'teaches': 1,
 'relationships': 1,
 'importance': 2,
 'taming': 1,
 'means': 1,
 '

### 빈도수 기반 정제

In [57]:
# 빈도수 기반 역순 정렬
vocab_sorted = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
vocab_sorted

[('prince', 9),
 ('little', 6),
 ('pilot', 4),
 ('rose', 3),
 ('fox', 3),
 ('young', 2),
 ('planet', 2),
 ('earth', 2),
 ('story', 2),
 ('plane', 2),
 ('meets', 2),
 ('asteroid', 2),
 ('lessons', 2),
 ('love', 2),
 ('importance', 2),
 ('written', 1),
 ('antoine', 1),
 ('saint-exupéry', 1),
 ('poetic', 1),
 ('tale', 1),
 ('travels', 1),
 ('home', 1),
 ('begins', 1),
 ('stranded', 1),
 ('sahara', 1),
 ('desert', 1),
 ('crashes', 1),
 ('trying', 1),
 ('fix', 1),
 ('mysterious', 1),
 ('boy', 1),
 ('comes', 1),
 ('small', 1),
 ('called', 1),
 ('b-612', 1),
 ('lives', 1),
 ('alone', 1),
 ('loves', 1),
 ('deeply', 1),
 ('recounts', 1),
 ('journey', 1),
 ('describing', 1),
 ('visits', 1),
 ('several', 1),
 ('planets', 1),
 ('inhabited', 1),
 ('different', 1),
 ('character', 1),
 ('king', 1),
 ('vain', 1),
 ('man', 1),
 ('drunkard', 1),
 ('businessman', 1),
 ('geographer', 1),
 ('encounters', 1),
 ('learns', 1),
 ('valuable', 1),
 ('responsibility', 1),
 ('nature', 1),
 ('adult', 1),
 ('behavio

In [58]:
# 인덱스 단어사전 생성
word_to_index = {word: i+1 for i, (word, cnt) in enumerate(vocab_sorted)}
word_to_index

{'prince': 1,
 'little': 2,
 'pilot': 3,
 'rose': 4,
 'fox': 5,
 'young': 6,
 'planet': 7,
 'earth': 8,
 'story': 9,
 'plane': 10,
 'meets': 11,
 'asteroid': 12,
 'lessons': 13,
 'love': 14,
 'importance': 15,
 'written': 16,
 'antoine': 17,
 'saint-exupéry': 18,
 'poetic': 19,
 'tale': 20,
 'travels': 21,
 'home': 22,
 'begins': 23,
 'stranded': 24,
 'sahara': 25,
 'desert': 26,
 'crashes': 27,
 'trying': 28,
 'fix': 29,
 'mysterious': 30,
 'boy': 31,
 'comes': 32,
 'small': 33,
 'called': 34,
 'b-612': 35,
 'lives': 36,
 'alone': 37,
 'loves': 38,
 'deeply': 39,
 'recounts': 40,
 'journey': 41,
 'describing': 42,
 'visits': 43,
 'several': 44,
 'planets': 45,
 'inhabited': 46,
 'different': 47,
 'character': 48,
 'king': 49,
 'vain': 50,
 'man': 51,
 'drunkard': 52,
 'businessman': 53,
 'geographer': 54,
 'encounters': 55,
 'learns': 56,
 'valuable': 57,
 'responsibility': 58,
 'nature': 59,
 'adult': 60,
 'behavior': 61,
 'various': 62,
 'creatures': 63,
 'including': 64,
 'teaches'

In [59]:
# 인덱스 단어사전2 생성
index_to_word = {i+1: word for i, (word, cnt) in enumerate(vocab_sorted)}
index_to_word

{1: 'prince',
 2: 'little',
 3: 'pilot',
 4: 'rose',
 5: 'fox',
 6: 'young',
 7: 'planet',
 8: 'earth',
 9: 'story',
 10: 'plane',
 11: 'meets',
 12: 'asteroid',
 13: 'lessons',
 14: 'love',
 15: 'importance',
 16: 'written',
 17: 'antoine',
 18: 'saint-exupéry',
 19: 'poetic',
 20: 'tale',
 21: 'travels',
 22: 'home',
 23: 'begins',
 24: 'stranded',
 25: 'sahara',
 26: 'desert',
 27: 'crashes',
 28: 'trying',
 29: 'fix',
 30: 'mysterious',
 31: 'boy',
 32: 'comes',
 33: 'small',
 34: 'called',
 35: 'b-612',
 36: 'lives',
 37: 'alone',
 38: 'loves',
 39: 'deeply',
 40: 'recounts',
 41: 'journey',
 42: 'describing',
 43: 'visits',
 44: 'several',
 45: 'planets',
 46: 'inhabited',
 47: 'different',
 48: 'character',
 49: 'king',
 50: 'vain',
 51: 'man',
 52: 'drunkard',
 53: 'businessman',
 54: 'geographer',
 55: 'encounters',
 56: 'learns',
 57: 'valuable',
 58: 'responsibility',
 59: 'nature',
 60: 'adult',
 61: 'behavior',
 62: 'various',
 63: 'creatures',
 64: 'including',
 65: 'teac

In [60]:
vocab_size = 15
word_to_index = {word: index for word, index in word_to_index.items() if index <= vocab_size}
word_to_index

{'prince': 1,
 'little': 2,
 'pilot': 3,
 'rose': 4,
 'fox': 5,
 'young': 6,
 'planet': 7,
 'earth': 8,
 'story': 9,
 'plane': 10,
 'meets': 11,
 'asteroid': 12,
 'lessons': 13,
 'love': 14,
 'importance': 15}

### OOV 처리

**(OOV)Out Of Vocabulary**: 단어사전에 정의되지 않은 단어를 가르키는 키워드

In [61]:
word_to_index['OOV'] = len(word_to_index) + 1
word_to_index

{'prince': 1,
 'little': 2,
 'pilot': 3,
 'rose': 4,
 'fox': 5,
 'young': 6,
 'planet': 7,
 'earth': 8,
 'story': 9,
 'plane': 10,
 'meets': 11,
 'asteroid': 12,
 'lessons': 13,
 'love': 14,
 'importance': 15,
 'OOV': 16}

### 수열처리 (=정수 인코딩)

In [62]:
encoded_sentences = []
oov_idx = word_to_index['OOV']

for sentence in preprocessed_sentences:
    encoded_sentence = [word_to_index.get(token, oov_idx) for token in sentence]
    print(sentence)
    print(encoded_sentence)
    print()
    encoded_sentences.append(encoded_sentences)

['little', 'prince', 'written', 'antoine', 'saint-exupéry', 'poetic', 'tale', 'young', 'prince', 'travels', 'home', 'planet', 'earth']
[2, 1, 16, 16, 16, 16, 16, 6, 1, 16, 16, 7, 8]

['story', 'begins', 'pilot', 'stranded', 'sahara', 'desert', 'plane', 'crashes']
[9, 16, 3, 16, 16, 16, 10, 16]

['trying', 'fix', 'plane', 'meets', 'mysterious', 'young', 'boy', 'little', 'prince']
[16, 16, 10, 11, 16, 6, 16, 2, 1]

['little', 'prince', 'comes', 'small', 'asteroid', 'called', 'b-612', 'lives', 'alone', 'rose', 'loves', 'deeply']
[2, 1, 16, 16, 12, 16, 16, 16, 16, 4, 16, 16]

['recounts', 'journey', 'pilot', 'describing', 'visits', 'several', 'planets']
[16, 16, 3, 16, 16, 16, 16]

['planet', 'inhabited', 'different', 'character', 'king', 'vain', 'man', 'drunkard', 'businessman', 'geographer', 'fox']
[7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 5]

['encounters', 'prince', 'learns', 'valuable', 'lessons', 'love', 'responsibility', 'nature', 'adult', 'behavior']
[16, 1, 16, 16, 13, 14, 16, 16, 1

---
### keras Tokenizer

In [63]:
#!pip install tensorflow

In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer

# num_words 에는 n+2를 넣어줘야 (2: 패팅과 OOV 몫)
tokenizer = Tokenizer(num_words=15, oov_token='<OOV>')

tokenizer.fit_on_texts(preprocessed_sentences)

tokenizer.word_index    # corpus의 모든 단어를 대상으로 생성

{'<OOV>': 1,
 'prince': 2,
 'little': 3,
 'pilot': 4,
 'rose': 5,
 'fox': 6,
 'young': 7,
 'planet': 8,
 'earth': 9,
 'story': 10,
 'plane': 11,
 'meets': 12,
 'asteroid': 13,
 'lessons': 14,
 'love': 15,
 'importance': 16,
 'written': 17,
 'antoine': 18,
 'saint-exupéry': 19,
 'poetic': 20,
 'tale': 21,
 'travels': 22,
 'home': 23,
 'begins': 24,
 'stranded': 25,
 'sahara': 26,
 'desert': 27,
 'crashes': 28,
 'trying': 29,
 'fix': 30,
 'mysterious': 31,
 'boy': 32,
 'comes': 33,
 'small': 34,
 'called': 35,
 'b-612': 36,
 'lives': 37,
 'alone': 38,
 'loves': 39,
 'deeply': 40,
 'recounts': 41,
 'journey': 42,
 'describing': 43,
 'visits': 44,
 'several': 45,
 'planets': 46,
 'inhabited': 47,
 'different': 48,
 'character': 49,
 'king': 50,
 'vain': 51,
 'man': 52,
 'drunkard': 53,
 'businessman': 54,
 'geographer': 55,
 'encounters': 56,
 'learns': 57,
 'valuable': 58,
 'responsibility': 59,
 'nature': 60,
 'adult': 61,
 'behavior': 62,
 'various': 63,
 'creatures': 64,
 'including': 

In [65]:
tokenizer.index_word    # corpus의 모든 단어를 대상으로 생성

{1: '<OOV>',
 2: 'prince',
 3: 'little',
 4: 'pilot',
 5: 'rose',
 6: 'fox',
 7: 'young',
 8: 'planet',
 9: 'earth',
 10: 'story',
 11: 'plane',
 12: 'meets',
 13: 'asteroid',
 14: 'lessons',
 15: 'love',
 16: 'importance',
 17: 'written',
 18: 'antoine',
 19: 'saint-exupéry',
 20: 'poetic',
 21: 'tale',
 22: 'travels',
 23: 'home',
 24: 'begins',
 25: 'stranded',
 26: 'sahara',
 27: 'desert',
 28: 'crashes',
 29: 'trying',
 30: 'fix',
 31: 'mysterious',
 32: 'boy',
 33: 'comes',
 34: 'small',
 35: 'called',
 36: 'b-612',
 37: 'lives',
 38: 'alone',
 39: 'loves',
 40: 'deeply',
 41: 'recounts',
 42: 'journey',
 43: 'describing',
 44: 'visits',
 45: 'several',
 46: 'planets',
 47: 'inhabited',
 48: 'different',
 49: 'character',
 50: 'king',
 51: 'vain',
 52: 'man',
 53: 'drunkard',
 54: 'businessman',
 55: 'geographer',
 56: 'encounters',
 57: 'learns',
 58: 'valuable',
 59: 'responsibility',
 60: 'nature',
 61: 'adult',
 62: 'behavior',
 63: 'various',
 64: 'creatures',
 65: 'includin

In [66]:
tokenizer.word_counts   # corpus의 모든 단어를 대상으로 빈도수를 반환

OrderedDict([('little', 6),
             ('prince', 9),
             ('written', 1),
             ('antoine', 1),
             ('saint-exupéry', 1),
             ('poetic', 1),
             ('tale', 1),
             ('young', 2),
             ('travels', 1),
             ('home', 1),
             ('planet', 2),
             ('earth', 2),
             ('story', 2),
             ('begins', 1),
             ('pilot', 4),
             ('stranded', 1),
             ('sahara', 1),
             ('desert', 1),
             ('plane', 2),
             ('crashes', 1),
             ('trying', 1),
             ('fix', 1),
             ('meets', 2),
             ('mysterious', 1),
             ('boy', 1),
             ('comes', 1),
             ('small', 1),
             ('asteroid', 2),
             ('called', 1),
             ('b-612', 1),
             ('lives', 1),
             ('alone', 1),
             ('rose', 3),
             ('loves', 1),
             ('deeply', 1),
             ('recounts',

In [67]:
# 정수 인코딩
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[3, 2, 1, 1, 1, 1, 1, 7, 2, 1, 1, 8, 9],
 [10, 1, 4, 1, 1, 1, 11, 1],
 [1, 1, 11, 12, 1, 7, 1, 3, 2],
 [3, 2, 1, 1, 13, 1, 1, 1, 1, 5, 1, 1],
 [1, 1, 4, 1, 1, 1, 1],
 [8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6],
 [1, 2, 1, 1, 14, 1, 1, 1, 1, 1],
 [9, 3, 2, 12, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1],
 [6, 1, 1, 1, 1, 1, 1, 1, 2, 1, 5],
 [1, 3, 2, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 4, 1, 1, 13, 1, 5],
 [10, 1, 4, 1, 14, 1, 3, 2, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]