# 1. Vectorization of Sentences (문장의 vector 화)


* BOW (Bag of Words)
* TF-IDF (Term Frequency - Inverse Document Frequency)  
* Word Embedding - Keras word API 사용
* 참고 : chap17_딥러닝을 이용한 자연어처리 

In [1]:
import pandas as pd

# Corpus: four short English sentences shared by every vectorization example below.
sentences = [
    'I love my dog.',
    'I love my cat.',
    'I love my dog and love my cat',
    'Do you think my dog is amazing?',
]

## 1-1. Bag of Words (BOW)

- CountVectorizer
    - min_df : vocabulary 에 포함할 최소 발생 빈도
    - ngram_range : (1, 1) - unigram only, (1, 2) - unigram + bigram
    - max_features : top max_features 만으로 vocabulary 구성
    - token_pattern = `(?u)\b\w\w+\b` : unicode 영숫자 2 글자 이상만 포함

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer created with all default settings.
count_vectorizer = CountVectorizer()
# Bare last expression so the notebook displays the vectorizer.
count_vectorizer

## Text vs token Matrix 생성

In [3]:
# Fit the vectorizer on the corpus and encode it in one call; the result is a
# sparse document-term matrix (one row per sentence, one column per term).
features = count_vectorizer.fit_transform(sentences)
features

<4x10 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [10]:
# (number of documents, number of vocabulary terms)
features.shape

(4, 10)

In [4]:
# Rows of the document-term matrix are the documents (sentences).
print(f"document 수: {features.shape[0]}")
# Columns are the vocabulary terms, so the word count is the column count
# itself; there is no reserved extra column to subtract.
print(f"단어수: {features.shape[1]}")

document 수: 4
단어수: 9


In [5]:
# Convert the sparse matrix to a dense array so the per-sentence counts can be inspected.
vectorized_sentences = features.toarray()
vectorized_sentences

array([[0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [0, 1, 1, 0, 1, 0, 2, 2, 0, 0],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 1]], dtype=int64)

### features 의 단어 list

In [6]:
# Vocabulary terms learned by the vectorizer; used below as the column labels
# of the count matrix.
feature_names = count_vectorizer.get_feature_names_out()
feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'is', 'love', 'my', 'think',
       'you'], dtype=object)

In [7]:
# Tabular view of the count matrix: one labelled column per vocabulary term,
# with the row index named 'sentence'.
df = pd.DataFrame(data=vectorized_sentences, columns=feature_names).rename_axis('sentence')
df

Unnamed: 0_level_0,amazing,and,cat,do,dog,is,love,my,think,you
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,0,1,0,1,1,0,0
1,0,0,1,0,0,0,1,1,0,0
2,0,1,1,0,1,0,2,2,0,0
3,1,0,0,1,1,1,0,1,1,1


## 1-2. TF-IDF

- TF-IDF(Term Frequency - Inverse Document Frequency)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with all default settings.
tfidf_vectorizer = TfidfVectorizer()
# Bare last expression so the notebook displays the vectorizer.
tfidf_vectorizer

In [9]:
# After creating the vectorizer, always call fit_transform: it fits on the
# corpus and returns the encoded sentences as a sparse matrix.
tfidf_sentences = tfidf_vectorizer.fit_transform(sentences)
tfidf_sentences

<4x10 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

## Text vs tf-idf Matrix 생성

In [11]:
# Dense copy of the TF-IDF matrix so the individual weights can be inspected.
tfidf_vect_sentences = tfidf_sentences.toarray()
tfidf_vect_sentences

array([[0.        , 0.        , 0.        , 0.        , 0.61217198,
        0.        , 0.61217198, 0.5004907 , 0.        , 0.        ],
       [0.        , 0.        , 0.69113141, 0.        , 0.        ,
        0.        , 0.55953044, 0.4574528 , 0.        , 0.        ],
       [0.        , 0.45893203, 0.36182728, 0.        , 0.29293037,
        0.        , 0.58586074, 0.47897954, 0.        , 0.        ],
       [0.41960069, 0.        , 0.        , 0.41960069, 0.26782569,
        0.41960069, 0.        , 0.21896505, 0.41960069, 0.41960069]])

In [12]:
# Read the vocabulary from the TF-IDF vectorizer itself (not the separate
# CountVectorizer) so the labels always match the columns of the TF-IDF
# matrix, even if the two vectorizers are configured differently.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'is', 'love', 'my', 'think',
       'you'], dtype=object)

In [13]:
# Tabular view of the TF-IDF weights, one labelled column per vocabulary term.
# NOTE: this rebinds `df`, which previously held the bag-of-words count table.
df = pd.DataFrame(tfidf_vect_sentences, columns=tfidf_feature_names)
df

Unnamed: 0,amazing,and,cat,do,dog,is,love,my,think,you
0,0.0,0.0,0.0,0.0,0.612172,0.0,0.612172,0.500491,0.0,0.0
1,0.0,0.0,0.691131,0.0,0.0,0.0,0.55953,0.457453,0.0,0.0
2,0.0,0.458932,0.361827,0.0,0.29293,0.0,0.585861,0.47898,0.0,0.0
3,0.419601,0.0,0.0,0.419601,0.267826,0.419601,0.0,0.218965,0.419601,0.419601


## 1-3. Keras word encoding

- keras  API 이용

In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

## Tokenize

In [39]:
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')    # cap the vocabulary at the most frequent words (num_words=100); '<OOV>' is the token for out-of-vocabulary words

## Word Index Vocabulary 작성

In [40]:
# Build the tokenizer's vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# word -> integer index mapping; '<OOV>' takes index 1 and index 0 is never
# assigned to a word.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'i': 4,
 'dog': 5,
 'cat': 6,
 'and': 7,
 'do': 8,
 'you': 9,
 'think': 10,
 'is': 11,
 'amazing': 12}

## text 의 sequence 변환 및 padding

- texts_to_sequences: text list 내의 각 text 를 수열 (sequence of integers) 로 convert


    - 입력 : text (strings) list
    - 반환 : sequence list
    
- pad_sequences: 동일한 길이로 sequence 를 zero padding

In [41]:
# Encode each sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)
# Zero-pad at the end ('post') so all rows share one length; no maxlen is
# given, so the longest sequence sets the width.
padded = pad_sequences(sequences, padding='post', truncating='post')

In [42]:
# Show the ragged integer sequences and the zero-padded matrix, separated by
# one blank line.
print(sequences, padded, sep='\n\n')

[[4, 3, 2, 5], [4, 3, 2, 6], [4, 3, 2, 5, 7, 3, 2, 6], [8, 9, 10, 2, 5, 11, 12]]

[[ 4  3  2  5  0  0  0  0]
 [ 4  3  2  6  0  0  0  0]
 [ 4  3  2  5  7  3  2  6]
 [ 8  9 10  2  5 11 12  0]]


In [43]:
# Reverse mapping (integer index -> word), used below to decode sequences.
tokenizer.index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'i',
 5: 'dog',
 6: 'cat',
 7: 'and',
 8: 'do',
 9: 'you',
 10: 'think',
 11: 'is',
 12: 'amazing'}

### sequenced sentence 를 word sentence 로 환원

In [44]:
# Decode every integer sequence back into its words and print it as a sentence.
for sequence in sequences:
    sent = [tokenizer.index_word[idx] for idx in sequence]
    print(' '.join(sent))

i love my dog
i love my cat
i love my dog and love my cat
do you think my dog is amazing


### One-Hot-Encoding 표현

In [45]:
# One-hot encode every integer in the padded matrix. The padding value 0 gets
# its own class (column 0), so each one-hot vector has max index + 1 entries.
to_categorical(padded)

array([[[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

    