<a href="https://colab.research.google.com/github/ykkim77/nlp_3rd/blob/main/nlp_3rd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 파이썬을 사용하여 정수 인코딩 하기!!

In [None]:
#text 입력

text = """
There is a monkey in the jungle. The monkey is very hungry.
The monkey is looking for food. The monkey found banana on the tree.
Banana is monkey's favorite food in the jungle.
With the monkey's ability, the monkey get the banana.
The monkey is very happy with banana
"""


In [None]:
#입력된 텍스트 출력
text

"\nThere is a monkey in the jungle. The monkey is very hungry.\nThe monkey is looking for food. The monkey found banana on the tree.\nBanana is monkey's favorite food in the jungle.\nWith the monkey's ability, the monkey get the banana.\nThe monkey is very happy with banana\n"

In [None]:
from nltk.tokenize import sent_tokenize
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# 문단 ==> 문장 단위 토큰화
# 결과값은 리스트로 반환된다.

sent_text = sent_tokenize(text)
sent_text

['\nThere is a monkey in the jungle.',
 'The monkey is very hungry.',
 'The monkey is looking for food.',
 'The monkey found banana on the tree.',
 "Banana is monkey's favorite food in the jungle.",
 "With the monkey's ability, the monkey get the banana.",
 'The monkey is very happy with banana']

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# 불용어 다운 받기
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
vocabluary = {}
sentences = []
stop_words = stopwords.words('english')

# 문장 ==> 단어로 토큰화

for sent in sent_text:
    sentence = word_tokenize(sent)
    print(sentence)
    result = []

    for word in sentence:
        word = word.lower()      # sentence의  단어는 모두 소문자로 치환
        if (word not in stop_words) & (len(word)>2):      # 하나하나의 단어가 불용어가 아니며, 단어 수가 2 이상이면 result 리스트에 단어를 추가한다.
            result. append(word)
            if word not in vocabluary:
                vocabluary[word] = 0        # 기존 vocaluary 딕셔너리에 world가 없으면 0이라는 value 값을 저장 (word가 처음 등장한다면...)
            vocabluary[word] += 1
    sentences.append(result)         


['There', 'is', 'a', 'monkey', 'in', 'the', 'jungle', '.']
['The', 'monkey', 'is', 'very', 'hungry', '.']
['The', 'monkey', 'is', 'looking', 'for', 'food', '.']
['The', 'monkey', 'found', 'banana', 'on', 'the', 'tree', '.']
['Banana', 'is', 'monkey', "'s", 'favorite', 'food', 'in', 'the', 'jungle', '.']
['With', 'the', 'monkey', "'s", 'ability', ',', 'the', 'monkey', 'get', 'the', 'banana', '.']
['The', 'monkey', 'is', 'very', 'happy', 'with', 'banana']


In [None]:
sentences      # 불용어가 모두 제거된 단어를 결과값으로 얻음

[['monkey', 'jungle'],
 ['monkey', 'hungry'],
 ['monkey', 'looking', 'food'],
 ['monkey', 'found', 'banana', 'tree'],
 ['banana', 'monkey', 'favorite', 'food', 'jungle'],
 ['monkey', 'ability', 'monkey', 'get', 'banana'],
 ['monkey', 'happy', 'banana']]

In [None]:
vocabluary    # 단어와 빈도수를 나타내는 딕셔너리 결과 출력

{'ability': 1,
 'banana': 4,
 'favorite': 1,
 'food': 2,
 'found': 1,
 'get': 1,
 'happy': 1,
 'hungry': 1,
 'jungle': 2,
 'looking': 1,
 'monkey': 8,
 'tree': 1}

In [None]:
# 빈도를 내림차순으로 정렬

sorted_vocabulary = sorted(vocabluary.items(), key = lambda x:x[1], reverse=True)
sorted_vocabulary

[('monkey', 8),
 ('banana', 4),
 ('jungle', 2),
 ('food', 2),
 ('hungry', 1),
 ('looking', 1),
 ('found', 1),
 ('tree', 1),
 ('favorite', 1),
 ('ability', 1),
 ('get', 1),
 ('happy', 1)]

In [None]:
integer_embedding = {}
i = 0

# 2번 이상 등장한 단어에 대해 정수값을 부여함, 즉 정수 인코딩을 수행하는 과정

for (word, frequency) in sorted_vocabulary:
    if frequency > 1:
        i = i+1
        integer_embedding[word] = i

print(integer_embedding)

{'monkey': 1, 'banana': 2, 'jungle': 3, 'food': 4}


In [None]:
# OOV(out of vacabulary) vacabulary 에 들어있는 않은 단어들도 oov key 값으로 정수 인코딩에 추가

integer_embedding['oov'] = len(integer_embedding)+1
integer_embedding

{'banana': 2, 'food': 4, 'jungle': 3, 'monkey': 1, 'oov': 5}

In [None]:
encoded = []

for s in sentences:
    temp = []
    for w in s:
        try:
            temp.append(integer_embedding[w])
        except KeyError:
            temp.append(integer_embedding['oov'])
    encoded.append(temp)

print(encoded)


[[1, 3], [1, 5], [1, 5, 4], [1, 5, 2, 5], [2, 1, 5, 4, 3], [1, 5, 1, 5, 2], [1, 5, 2]]


# count 함수를 사용하여 정수 인코딩 하기

불용어 처리는 앞에서 한 것을 그대로 쓰기로 한다.

In [None]:
import numpy as np
from collections import Counter

sentences

[['monkey', 'jungle'],
 ['monkey', 'hungry'],
 ['monkey', 'looking', 'food'],
 ['monkey', 'found', 'banana', 'tree'],
 ['banana', 'monkey', 'favorite', 'food', 'jungle'],
 ['monkey', 'ability', 'monkey', 'get', 'banana'],
 ['monkey', 'happy', 'banana']]

In [None]:
words = np.hstack(sentences)   #numpy를 이용해 각각 문장에 들어있는 모든 단어들을 하나의 numpy array로 스텍 작업을 해줌
words

array(['monkey', 'jungle', 'monkey', 'hungry', 'monkey', 'looking',
       'food', 'monkey', 'found', 'banana', 'tree', 'banana', 'monkey',
       'favorite', 'food', 'jungle', 'monkey', 'ability', 'monkey', 'get',
       'banana', 'monkey', 'happy', 'banana'], dtype='<U8')

In [None]:
vocabluary = Counter(words)   # Counter 함수만 사용하면 간단히 vocablurary를 만들 수 있다.
vocabluary

Counter({'ability': 1,
         'banana': 4,
         'favorite': 1,
         'food': 2,
         'found': 1,
         'get': 1,
         'happy': 1,
         'hungry': 1,
         'jungle': 2,
         'looking': 1,
         'monkey': 8,
         'tree': 1})

In [None]:
vocab_size = 4
common_vocablulary = vocabluary.most_common(vocab_size)   #빈도수가 가장 많은 상위 4개의 단어만 저장한다.
common_vocablulary

[('monkey', 8), ('banana', 4), ('jungle', 2), ('food', 2)]

In [None]:
integer_embedding = {}
i = 0

for (word, frequency) in common_vocablulary:
    i = i+ 1
    integer_embedding[word] = i

integer_embedding['oov'] = len(integer_embedding)+1   # oov 추가

print(integer_embedding)

{'monkey': 1, 'banana': 2, 'jungle': 3, 'food': 4, 'oov': 5}


# nltk 를 이용한 정수 인코딩

In [None]:
from nltk import FreqDist

In [None]:
vocabluary = FreqDist(np.hstack(sentences))
vocabluary

FreqDist({'ability': 1,
          'banana': 4,
          'favorite': 1,
          'food': 2,
          'found': 1,
          'get': 1,
          'happy': 1,
          'hungry': 1,
          'jungle': 2,
          'looking': 1,
          'monkey': 8,
          'tree': 1})

In [None]:
vocab_size = 4
common_vocablulary = vocabluary.most_common(vocab_size)
common_vocablulary

[('monkey', 8), ('banana', 4), ('jungle', 2), ('food', 2)]

In [None]:
integer_embedding = {word[0] : index +1 for index, word in enumerate(common_vocablulary)}
integer_embedding

{'banana': 2, 'food': 4, 'jungle': 3, 'monkey': 1}

# 텐서플로우 케라스를 이용한 정수 인코딩

In [None]:
 from tensorflow.keras.preprocessing.text import Tokenizer   #tensorflow keras 에서 제공하는 Tokenizer 클래스 불러오기


In [None]:
 tokenizer = Tokenizer()     # tokenizer 클래스의 객체를 생성함
 tokenizer.fit_on_texts(sentences)
 tokenizer

<keras_preprocessing.text.Tokenizer at 0x7f8a132058d0>

In [None]:
tokenizer.word_index   # tokenizer 객체의 word_index 값을 확인

{'ability': 10,
 'banana': 2,
 'favorite': 9,
 'food': 4,
 'found': 7,
 'get': 11,
 'happy': 12,
 'hungry': 5,
 'jungle': 3,
 'looking': 6,
 'monkey': 1,
 'tree': 8}

In [None]:
tokenizer.word_counts   # tokenizer 객채의 word_count 값을 확인

OrderedDict([('monkey', 8),
             ('jungle', 2),
             ('hungry', 1),
             ('looking', 1),
             ('food', 2),
             ('found', 1),
             ('banana', 4),
             ('tree', 1),
             ('favorite', 1),
             ('ability', 1),
             ('get', 1),
             ('happy', 1)])

In [None]:
integer_encoding = tokenizer.texts_to_sequences(sentences)
integer_encoding

[[1, 3],
 [1, 5],
 [1, 6, 4],
 [1, 7, 2, 8],
 [2, 1, 9, 4, 3],
 [1, 10, 1, 11, 2],
 [1, 12, 2]]

In [None]:
sentences

[['monkey', 'jungle'],
 ['monkey', 'hungry'],
 ['monkey', 'looking', 'food'],
 ['monkey', 'found', 'banana', 'tree'],
 ['banana', 'monkey', 'favorite', 'food', 'jungle'],
 ['monkey', 'ability', 'monkey', 'get', 'banana'],
 ['monkey', 'happy', 'banana']]

# pedding 작업 하기

문장들의 길이를 모두 동일하게 맞추어 행렬형태의 데이터로 변환하기

## 파이썬을 활용한 pedding 작업

In [None]:
max_len = max(len(encode) for encode in integer_encoding)   # integer_encoding 안에 가장 길이가 긴 리스트 길이 저장
max_len

5

In [None]:
for encode in integer_encoding:    #  max_len 이하의 리스트에 모두 0 값을 덧붙임
    while len(encode) < max_len:
        encode.append(0)
padding = np.array(integer_encoding)

padding

array([[ 1,  3,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  6,  4,  0,  0],
       [ 1,  7,  2,  8,  0],
       [ 2,  1,  9,  4,  3],
       [ 1, 10,  1, 11,  2],
       [ 1, 12,  2,  0,  0]])

## Tensorflow의 keras를 활용한 pedding

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padding = pad_sequences(integer_encoding)   # 텐서플로우의 pad_sequence 함수를 활용한 padding
padding

array([[ 1,  3,  0,  0,  0],
       [ 1,  5,  0,  0,  0],
       [ 1,  6,  4,  0,  0],
       [ 1,  7,  2,  8,  0],
       [ 2,  1,  9,  4,  3],
       [ 1, 10,  1, 11,  2],
       [ 1, 12,  2,  0,  0]], dtype=int32)

In [None]:
padding = pad_sequences(integer_encoding, padding = 'post', maxlen=7, value = -1)    # '뒤에 padding 붙이기' , maxlen  옵션주기, padding 숫자를 -1로 주기
padding

array([[ 1,  3,  0,  0,  0, -1, -1],
       [ 1,  5,  0,  0,  0, -1, -1],
       [ 1,  6,  4,  0,  0, -1, -1],
       [ 1,  7,  2,  8,  0, -1, -1],
       [ 2,  1,  9,  4,  3, -1, -1],
       [ 1, 10,  1, 11,  2, -1, -1],
       [ 1, 12,  2,  0,  0, -1, -1]], dtype=int32)

# TF-IDF

In [24]:
documents = ["배가 너무 너무 아프다",       # 1번 문서
             "배가 너무 고프다",           # 2번 문서
             "저기 저기 배가 지나간다",    # 3번 문서
             "비가 와서 다리가 아프다"]    # 4번 문서

In [25]:
vocabulary = list(set(w for doc in documents for w in doc.split()))    # 단어들을 중복없이 리스트로 나열ocabulary = list(set(w for doc in documents for w in doc.split()))    # 단어들을 중복없이 리스트로 나열
vocabulary
vocabulary

['너무', '배가', '저기', '고프다', '비가', '다리가', '와서', '지나간다', '아프다']

TF, idf, td * idf 함수 정의

In [26]:
import math

num_documents = len(documents)


def tf(t,d):
    return d.count(t)

def idf(t):
    df = 0
    for doc in documents:
        df += t in doc
    return math.log(num_document/(df+1))

def tfidf(t,d):
    return tf(t,d) * idf(t)

**tf 구하기**

In [27]:
import pandas as pd

result = []
for i in range(num_documents):
    result.append([])
    d = documents[i]
    for j in range(len(vocabulary)):
        t = vocabulary[j]
        result[-1].append(tf(t,d))

term_frequency = pd.DataFrame(result, columns = vocabulary)
term_frequency

Unnamed: 0,너무,배가,저기,고프다,비가,다리가,와서,지나간다,아프다
0,2,1,0,0,0,0,0,0,1
1,1,1,0,1,0,0,0,0,0
2,0,1,2,0,0,0,0,1,0
3,0,0,0,0,1,1,1,0,1


**IDF 구하기**

In [28]:
result = []
for j in range(len(vocabulary)):
    t = vocabulary[j]
    result.append(idf(t))

inverse_df = pd.DataFrame(result, index=vocabulary, columns=["IDF"])
inverse_df

Unnamed: 0,IDF
너무,0.287682
배가,0.0
저기,0.693147
고프다,0.693147
비가,0.693147
다리가,0.693147
와서,0.693147
지나간다,0.693147
아프다,0.287682


<strong>TF*IDF 구하기

In [29]:
result = []
for i in range (num_documents):
    result.append([])
    d = documents[i]
    for j in range(len(vocabulary)):
        t = vocabulary[j]
        result[-1].append(tfidf(t,d))

tf_idf = pd.DataFrame(result, columns=vocabulary)
tf_idf

Unnamed: 0,너무,배가,저기,고프다,비가,다리가,와서,지나간다,아프다
0,0.575364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287682
1,0.287682,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.693147,0.0
3,0.0,0.0,0.0,0.0,0.693147,0.693147,0.693147,0.0,0.287682
