# 6) 정수 인코딩

컴퓨터는 텍스트보다 숫자를 더 잘 처리하기 때문에, 자연어 처리에서는 텍스트를 숫자로 바꾸는 여러 가지 기법을 사용한다.

이러한 기법을 적용하기 전에 먼저 각 단어를 고유한 정수에 매핑하는 작업이 필요하다.

In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Sample corpus used throughout this section; written one sentence per line
# (implicit string concatenation) so it is easier to read and diff.
raw_text = (
    "A barber is a person. "
    "a barber is good person. "
    "a barber is huge person. "
    "he Knew A Secret! "
    "The Secret He Kept is huge secret. "
    "Huge secret. "
    "His barber kept his word. "
    "a barber kept his word. "
    "His barber kept his secret. "
    "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
    "the barber went up a huge mountain."
)

In [4]:
# Split the raw text into a list of sentence strings with NLTK's sentence
# tokenizer, then show the result.
sentences = sent_tokenize(raw_text)
print(sentences)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [6]:
# Build two things in a single pass over the corpus:
#   * preprocessed_sentences: one list of cleaned tokens per sentence
#   * vocab: dict mapping each kept token to how many times it was kept
vocab = {}
preprocessed_sentences = []
stop_words = set(stopwords.words('english'))

for sentence in sentences:
    kept_tokens = []

    for token in word_tokenize(sentence):
        # Lowercase first so case variants collapse into a single vocab entry.
        token = token.lower()

        # Skip stopwords and tokens of length 2 or less.
        if token in stop_words or len(token) <= 2:
            continue

        kept_tokens.append(token)
        # Start the count at 0 for unseen tokens, then add one occurrence.
        vocab[token] = vocab.get(token, 0) + 1

    preprocessed_sentences.append(kept_tokens)
print(preprocessed_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [7]:
# vocab is a dictionary mapping each kept word to its frequency count.
print("단어 집합: ", vocab)

단어 집합:  {'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}


In [8]:
# Look up the frequency that was counted for the word "barber".
print(vocab["barber"])

8


In [9]:
# Order the (word, count) pairs from most to least frequent. The sort is
# stable, so words with equal counts keep the order they have in vocab.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [11]:
# Assign integer indices by frequency rank: the more frequent the word, the
# smaller its index (index 1 is the most frequent word).
# Words that occur only once are left out of the mapping.
frequent_words = [word for word, frequency in vocab_sorted if frequency > 1]
word_to_index = {word: rank for rank, word in enumerate(frequent_words, start=1)}

print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [13]:
vocab_size = 5

# Collect the words whose index is greater than vocab_size. They are gathered
# into a separate list first so the dict is not modified while iterating it.
words_frequency = [word for word, index in word_to_index.items() if index > vocab_size]

# Remove the index entry of each collected word, in place.
for dropped_word in words_frequency:
    word_to_index.pop(dropped_word)
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
