## Integer Encoding

- Integer Encoding
    * 단어 토큰에 고유한 정수 부여
    * ~중복 허용
    * OOV와 결합가능
    * padding : 문장마다 길이가 다른 경우 0을 이용해 보정

- 무슨 기준으로 Integer Encoding할 때 정수를 부여할까?
    * 가나다 순
    * 빈도 순

##### text 저장

In [1]:
import nltk
text = """Isn't she lovely.
Isn't she wonderful.
Isn't she precious.
Less than one minute old.
I never thought through love we'd be.
Making one as lovely as she.
But isn't she lovely made from love.
Isn't she pretty.
Truly the angel's best.
Boy, I'm so happy.
We have been heaven blessed.
I can't believe what God has done.
Through us he's given life to one.
But isn't she lovely made from love.
Isn't she lovely.
Life and love are the same.
Life is Aisha.
The meaning of her name.
Londie, it could have not been done.
Without you who conceived the one.
That's so very lovely made from love."""

### English Tokenize

#### 문장 토큰화

In [2]:
from nltk.tokenize import sent_tokenize

sent_tokens = sent_tokenize(text)
print(sent_tokens)

["Isn't she lovely.", "Isn't she wonderful.", "Isn't she precious.", 'Less than one minute old.', "I never thought through love we'd be.", 'Making one as lovely as she.', "But isn't she lovely made from love.", "Isn't she pretty.", "Truly the angel's best.", "Boy, I'm so happy.", 'We have been heaven blessed.', "I can't believe what God has done.", "Through us he's given life to one.", "But isn't she lovely made from love.", "Isn't she lovely.", 'Life and love are the same.', 'Life is Aisha.', 'The meaning of her name.', 'Londie, it could have not been done.', 'Without you who conceived the one.', "That's so very lovely made from love."]


#### 단어 토큰화

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_word = set(stopwords.words('english')) # English stopwords collected into a set

단어를 토큰화 시켜서 리스트 형태로 저장하겠다

In [4]:
# Tokenise each sentence into words and keep only the informative tokens.
sentences = []
for sent in sent_tokens:
    # Lower-case every word token of the sentence (normalisation).
    lowered = [token.lower() for token in word_tokenize(sent)]
    # Keep tokens that are not stopwords and are longer than two characters.
    result = [token for token in lowered if token not in stop_word and len(token) > 2]
    sentences.append(result)
print(sentences)
print(result) # `result` is rebound on every iteration, so only the last sentence's tokens remain

[["n't", 'lovely'], ["n't", 'wonderful'], ["n't", 'precious'], ['less', 'one', 'minute', 'old'], ['never', 'thought', 'love'], ['making', 'one', 'lovely'], ["n't", 'lovely', 'made', 'love'], ["n't", 'pretty'], ['truly', 'angel', 'best'], ['boy', 'happy'], ['heaven', 'blessed'], ["n't", 'believe', 'god', 'done'], ['given', 'life', 'one'], ["n't", 'lovely', 'made', 'love'], ["n't", 'lovely'], ['life', 'love'], ['life', 'aisha'], ['meaning', 'name'], ['londie', 'could', 'done'], ['without', 'conceived', 'one'], ['lovely', 'made', 'love']]
['lovely', 'made', 'love']


#### 단어 집합

규칙 : 빈도가 높을 수록 앞에 오도록 만들 예정

sum 응용 -> counter -> sorted 응용

In [5]:
words = sum(sentences, [])
# sum() with a list as the start value concatenates the inner lists, removing one level of nesting.
# list + list is concatenation (not append), so each sentence's tokens are attached at the end, like extend.
# e.g. sum([1,2,3], 10) starts from 10 and adds 1, 2 and 3
print(words)

["n't", 'lovely', "n't", 'wonderful', "n't", 'precious', 'less', 'one', 'minute', 'old', 'never', 'thought', 'love', 'making', 'one', 'lovely', "n't", 'lovely', 'made', 'love', "n't", 'pretty', 'truly', 'angel', 'best', 'boy', 'happy', 'heaven', 'blessed', "n't", 'believe', 'god', 'done', 'given', 'life', 'one', "n't", 'lovely', 'made', 'love', "n't", 'lovely', 'life', 'love', 'life', 'aisha', 'meaning', 'name', 'londie', 'could', 'done', 'without', 'conceived', 'one', 'lovely', 'made', 'love']


In [6]:
from collections import Counter # used to count occurrences
# Map each distinct word in `words` to the number of times it appears.
vocab = Counter(words)
vocab

Counter({"n't": 8,
         'lovely': 6,
         'wonderful': 1,
         'precious': 1,
         'less': 1,
         'one': 4,
         'minute': 1,
         'old': 1,
         'never': 1,
         'thought': 1,
         'love': 5,
         'making': 1,
         'made': 3,
         'pretty': 1,
         'truly': 1,
         'angel': 1,
         'best': 1,
         'boy': 1,
         'happy': 1,
         'heaven': 1,
         'blessed': 1,
         'believe': 1,
         'god': 1,
         'done': 2,
         'given': 1,
         'life': 3,
         'aisha': 1,
         'meaning': 1,
         'name': 1,
         'londie': 1,
         'could': 1,
         'without': 1,
         'conceived': 1})

In [7]:
vocab_sorted = sorted(vocab.items(), key=lambda x : x[1], reverse=True) # x is one (word, count) tuple; sort by count, highest first
vocab_sorted

[("n't", 8),
 ('lovely', 6),
 ('love', 5),
 ('one', 4),
 ('made', 3),
 ('life', 3),
 ('done', 2),
 ('wonderful', 1),
 ('precious', 1),
 ('less', 1),
 ('minute', 1),
 ('old', 1),
 ('never', 1),
 ('thought', 1),
 ('making', 1),
 ('pretty', 1),
 ('truly', 1),
 ('angel', 1),
 ('best', 1),
 ('boy', 1),
 ('happy', 1),
 ('heaven', 1),
 ('blessed', 1),
 ('believe', 1),
 ('god', 1),
 ('given', 1),
 ('aisha', 1),
 ('meaning', 1),
 ('name', 1),
 ('londie', 1),
 ('could', 1),
 ('without', 1),
 ('conceived', 1)]

In [8]:
# Words that occur only once are dropped; the rest get ranks 1, 2, 3, ...
# in the (frequency-sorted) order of vocab_sorted.
frequent_words = [word for (word, freq) in vocab_sorted if freq > 1]
word2idx = {word: rank for rank, word in enumerate(frequent_words, start=1)}

print(word2idx)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, 'life': 6, 'done': 7}


- 핵심: 빈도수가 가장 높은 상위 n개의 단어만 단어 집합에 남겨야 한다!

##### 중요 테크닉
- vocab_size = 5 : 단어 집합에서 사용할 단어의 개수

In [9]:
vocab_size = 5
# Collect every word whose index lies beyond the allowed vocabulary size.
del_word = []
for token, rank in word2idx.items():
    if rank > vocab_size:
        del_word.append(token)
print(del_word)

['life', 'done']


In [17]:
# Remove the out-of-range words from the vocabulary (KeyError if one is missing).
for stale in del_word:
    word2idx.pop(stale)
print(word2idx)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5}


#### Encoding 수동

In [21]:
# Reserve the next free integer for out-of-vocabulary words instead of
# hard-coding 6, so the id stays correct when vocab_size changes.
# setdefault keeps the cell idempotent: re-running it does not move '<oov>'.
word2idx.setdefault('<oov>', len(word2idx) + 1)
print(word2idx)

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, '<oov>': 6}


In [25]:
# Replace every token by its integer id; tokens that are not in the
# vocabulary are mapped to the id of '<oov>'.
encoded = []
for sent in sentences:
    temp = [word2idx[w] if w in word2idx else word2idx['<oov>'] for w in sent]
    encoded.append(temp)
print('변화 전 {} \n변화 후 {}'.format(sentences[:5], encoded[:5]))

변화 전 [["n't", 'lovely'], ["n't", 'wonderful'], ["n't", 'precious'], ['less', 'one', 'minute', 'old'], ['never', 'thought', 'love']] 
변화 후 [[1, 2], [1, 6], [1, 6], [6, 4, 6, 6], [6, 6, 3]]


#### Encoding 자동(tf)

- fit_on_texts()에 코퍼스(sentences)를 넣으면 단어 빈도수를 기준으로 단어 집합(word_index)을 만들어 줌

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [30]:
tokenizer.fit_on_texts(sentences) # sentences is a list of token lists here; NOTE(review): the original note said the input must always be 2-D, but Keras also documents a flat list of strings as valid — confirm

In [39]:
print(tokenizer.word_index) # word -> integer index
print(tokenizer.word_counts) # word -> number of occurrences

{"n't": 1, 'lovely': 2, 'love': 3, 'one': 4, 'made': 5, 'life': 6, 'done': 7, 'wonderful': 8, 'precious': 9, 'less': 10, 'minute': 11, 'old': 12, 'never': 13, 'thought': 14, 'making': 15, 'pretty': 16, 'truly': 17, 'angel': 18, 'best': 19, 'boy': 20, 'happy': 21, 'heaven': 22, 'blessed': 23, 'believe': 24, 'god': 25, 'given': 26, 'aisha': 27, 'meaning': 28, 'name': 29, 'londie': 30, 'could': 31, 'without': 32, 'conceived': 33} 
 OrderedDict([("n't", 8), ('lovely', 6), ('wonderful', 1), ('precious', 1), ('less', 1), ('one', 4), ('minute', 1), ('old', 1), ('never', 1), ('thought', 1), ('love', 5), ('making', 1), ('made', 3), ('pretty', 1), ('truly', 1), ('angel', 1), ('best', 1), ('boy', 1), ('happy', 1), ('heaven', 1), ('blessed', 1), ('believe', 1), ('god', 1), ('done', 2), ('given', 1), ('life', 3), ('aisha', 1), ('meaning', 1), ('name', 1), ('londie', 1), ('could', 1), ('without', 1), ('conceived', 1)])



##### 인코딩, 디코딩

In [40]:
tokenizer.texts_to_sequences(sentences) # encoding: tokens -> integer ids

[[1, 2],
 [1, 8],
 [1, 9],
 [10, 4, 11, 12],
 [13, 14, 3],
 [15, 4, 2],
 [1, 2, 5, 3],
 [1, 16],
 [17, 18, 19],
 [20, 21],
 [22, 23],
 [1, 24, 25, 7],
 [26, 6, 4],
 [1, 2, 5, 3],
 [1, 2],
 [6, 3],
 [6, 27],
 [28, 29],
 [30, 31, 7],
 [32, 33, 4],
 [2, 5, 3]]

In [43]:
tokenizer.sequences_to_texts([[1, 2], [1, 9]]) # decoding: integer ids -> text

["n't lovely", "n't precious"]

##### OOV 및 Padding

추가했을 시,
oov : 1번 토큰 (default)  
pad : 0번 토큰 (default)

In [49]:
vocab_size = 5

# num_words : how many entries of the vocabulary are used when encoding
tokenizer = Tokenizer(num_words=vocab_size+2, oov_token='<oov>') # why +2? room is needed for the padding index (0) and the OOV index (1)
tokenizer.fit_on_texts(sentences)
print(tokenizer.texts_to_sequences(sentences))

[[2, 3], [2, 1], [2, 1], [1, 5, 1, 1], [1, 1, 4], [1, 5, 3], [2, 3, 6, 4], [2, 1], [1, 1, 1], [1, 1], [1, 1], [2, 1, 1, 1], [1, 1, 5], [2, 3, 6, 4], [2, 3], [1, 4], [1, 1], [1, 1], [1, 1, 1], [1, 1, 5], [3, 6, 4]]


In [51]:
print('{}'.format(tokenizer.word_index['<oov>']))

1


### 한국어 Tokenize

In [52]:
!pip install konlpy



In [54]:
import pandas as pd
import numpy as np
import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x2345173fa60>)

In [118]:
# Load the tab-separated review file into a DataFrame.
# NOTE(review): the frame is called df_train but is read from ratings_test.txt — confirm this is intended.
df_train = pd.read_table('ratings_test.txt', encoding='utf-8')
df_train.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [58]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


null 값 제거  
중복 문장 제거  
특수 문자 및 영어 제거  
형태소 분리 후 Tokenizer 단어 집합 생성하기

In [122]:
import re
# Replace every character that is not a Hangul consonant jamo, a complete
# Hangul syllable or a space with a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df_train['document'] = df_train['document'].str.replace('[^ㄱ-ㅎ가-힣 ]', ' ', regex=True)
# Collapse runs of two or more spaces into a single space.
df_train['document'] = df_train['document'].str.replace('[ ]{2,}', ' ', regex=True)

In [123]:
# Drop the rows whose document consists of exactly one space.
blank_rows = df_train['document'] == ' '
df_train_idx = df_train[blank_rows].index
df_train = df_train.drop(df_train_idx)

In [126]:
# Keep only the first occurrence of each distinct document.
df_train = df_train.drop_duplicates(subset=['document'], keep='first')

In [127]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48529 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        48529 non-null  int64 
 1   document  48529 non-null  object
 2   label     48529 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [128]:
from konlpy.tag import Okt, Komoran
tokenizer = Tokenizer() # new, unfitted Keras Tokenizer bound to the name `tokenizer`
okt = Okt() # KoNLPy Okt analyzer instance
komoran = Komoran() # KoNLPy Komoran analyzer instance

In [150]:
# NOTE(review): temp3 is not defined in any cell shown here — the cell that built it appears to be missing; confirm before re-running.
tokenizer.fit_on_texts(temp3) # learn the vocabulary from temp3 (the original note claimed the input must always be 2-D — TODO confirm)
tokenizer.texts_to_sequences(temp3)

[[665],
 [87],
 [68],
 [159],
 [1],
 [23],
 [8],
 [11],
 [4677],
 [1543],
 [20],
 [929],
 [745],
 [9],
 [48],
 [848],
 [2567],
 [10406],
 [2678],
 [89],
 [361],
 [104],
 [110],
 [420],
 [142],
 [222],
 [16],
 [7846],
 [118],
 [1957],
 [105],
 [16207],
 [39],
 [19],
 [601],
 [157],
 [71717],
 [10],
 [5408],
 [6393],
 [203],
 [1],
 [3757],
 [145],
 [28],
 [3],
 [203],
 [2],
 [729],
 [73],
 [1114],
 [377],
 [281],
 [22],
 [71718],
 [13],
 [16208],
 [66],
 [1958],
 [3288],
 [2420],
 [4409],
 [10],
 [16209],
 [168],
 [440],
 [1248],
 [71719],
 [2421],
 [1028],
 [56],
 [265],
 [3289],
 [16210],
 [2837],
 [51],
 [7],
 [433],
 [2679],
 [4],
 [1249],
 [71720],
 [52],
 [2568],
 [16211],
 [84],
 [2],
 [5],
 [81],
 [383],
 [52],
 [2752],
 [41],
 [3153],
 [3],
 [1783],
 [429],
 [4],
 [12588],
 [915],
 [3154],
 [3],
 [7032],
 [202],
 [42],
 [1],
 [12589],
 [3970],
 [2288],
 [317],
 [16],
 [10407],
 [71721],
 [397],
 [65],
 [183],
 [1105],
 [3],
 [1410],
 [380],
 [649],
 [5],
 [71722],
 [534],
 [589]

#### 모범답안

In [152]:
df_train = pd.read_table('ratings_test.txt', encoding='utf-8')
df_train.head()

# Remove duplicated reviews, then rows that contain missing values.
df_train = df_train.drop_duplicates(subset=['document'])
df_train = df_train.dropna(how='any')

# Keep only Hangul (consonant jamo, vowel jamo, complete syllables) and spaces.
# regex=True is required: since pandas 2.0 Series.str.replace matches the
# pattern literally by default and would leave the text unchanged.
df_train['document'] = df_train['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)

# Reviews that became empty are turned into NaN and dropped. The result is
# assigned back instead of using inplace=True on the column selection, which
# is not guaranteed to modify df_train under pandas copy-on-write.
df_train['document'] = df_train['document'].replace('', np.nan)
df_train = df_train.dropna(how='any')

# Sanity checks
df_train['document'].nunique() # number of distinct documents
df_train.isnull().sum()

id          0
document    0
label       0
dtype: int64

##### stemming, norm -> stopword가 일반적!

In [154]:
# NOTE: this name shadows `stopwords` imported from nltk.corpus earlier in the notebook.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# Set copy of the list so the per-token membership test is O(1).
stopword_set = set(stopwords)

from konlpy.tag import Okt
okt = Okt()

X_train = []

for sent in df_train['document']:
    # Split the review into morphemes, with normalisation and stemming enabled.
    temp_X = okt.morphs(sent, stem=True, norm=True)
    # Drop the stopwords.
    temp_X = [w for w in temp_X if w not in stopword_set]

    X_train.append(temp_X)

X_train[:3]

[['굳다', 'ㅋ'],
 ['뭐',
  '야',
  '이',
  '평점',
  '들',
  '은',
  '나쁘다',
  '않다',
  '점',
  '짜다',
  '리',
  '는',
  '더',
  '더욱',
  '아니다'],
 ['지루하다', '않다', '완전', '막장', '임', '돈', '주다', '보기', '에는']]

In [156]:
# Fit the vocabulary on the full morpheme-tokenised corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Preview the encoding of the first three reviews. Slicing before encoding
# gives the same result without encoding the whole corpus just to discard it.
tokenizer.texts_to_sequences(X_train[:3])

[[637, 108],
 [76, 184, 1, 36, 11, 10, 406, 32, 33, 295, 778, 12, 56, 872, 29],
 [82, 32, 107, 390, 126, 127, 74, 164, 261]]

## TF-IDF 실습

In [291]:
def term_freq(term, document):
    """Return how many whitespace-separated tokens of *document* equal *term*."""
    return sum(1 for token in document.split() if token == term)

def document_freq(term, documents):
    """Return the number of documents that contain *term* as a whole token.

    Each document is split on whitespace, the same tokenisation term_freq
    uses, so a term is only counted for documents in which its term
    frequency is non-zero. A document is counted once regardless of how
    many times the term occurs in it.

    Args:
        term: the token to look for.
        documents: iterable of document strings.

    Returns:
        int: how many documents contain the token.
    """
    return sum(1 for document in documents if term in document.split())
    

def idf(term, documents):
    """Return the inverse document frequency log(N / (1 + df)).

    N is the number of documents and df is the value returned by
    document_freq for *term*.
    """
    from math import log

    containing = document_freq(term, documents)
    total_documents = len(documents)
    return log(total_documents / (1 + containing))



def tf_idf(term, documents, idx):
    """Return the TF-IDF score of *term* for the document at position *idx*.

    The score is the term frequency inside documents[idx] multiplied by the
    inverse document frequency computed over all *documents*.
    """
    weight = idf(term, documents)
    frequency = term_freq(term, documents[idx])
    return frequency * weight

In [175]:
doc = '동해 물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 동해 가고 싶다'

docs = [
  '동해 물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 동해 가고 싶다',
  '남산 위에 저 소나무, 철갑을 두른 듯 바람 서리 불변함은 우리 기상일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 소나무 이쁘다',
  '가을 하늘 공활한데 높고 구름 없이 밝은 달은 우리 가슴 일편단심일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 가을 하늘 보고 싶다.',
  '이 기상과 이 마음으로 충성을 다하여 괴로우나 즐거우나 나라 사랑하세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 나라를 사랑하자'
]

In [269]:
term_freq('동해', doc)

2

In [268]:
document_freq('동해', docs)

1

In [292]:
idf('동해', docs)

0.6931471805599453

In [293]:
tf_idf('하늘', docs, 2)

1.3862943611198906

### TF-IDF 모범답안

In [353]:
def term_freq(term, document):
    """Return document.count(term).

    For a string document this is the number of non-overlapping occurrences
    of *term* as a substring.
    """
    occurrences = document.count(term)
    return occurrences


def document_freq(term, documents):
    """Return how many entries of *documents* satisfy ``term in entry``.

    For string documents the ``in`` test is a substring match.
    """
    return sum(1 for document in documents if term in document)
    

def idf(term, documents):
    """Return the inverse document frequency log(N / (1 + df)).

    N is len(documents) and df is document_freq(term, documents). Because of
    the +1 in the denominator the value is negative when the term occurs in
    every document (log(N / (N + 1)) < 0).

    Args:
        term: the term to score.
        documents: sequence of documents.

    Returns:
        float: the inverse document frequency of *term*.
    """
    from math import log

    # The original also built the set of distinct corpus tokens into a local
    # that was never used; that wasted pass over the corpus is removed.
    count = document_freq(term, documents)
    return log(len(documents) / (1 + count))


def tf_idf(term, documents, idx):
    """Return the TF-IDF score of *term* for documents[idx].

    Computed as term_freq(term, documents[idx]) * idf(term, documents).
    """
    weight = idf(term, documents)
    frequency = term_freq(term, documents[idx])
    return frequency * weight

### TF-IDF 전처리 및 라이브러리

In [296]:
docs = [
  '동해 물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 동해 가고 싶다',
  '남산 위에 저 소나무, 철갑을 두른 듯 바람 서리 불변함은 우리 기상일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 소나무 이쁘다',
  '가을 하늘 공활한데 높고 구름 없이 밝은 달은 우리 가슴 일편단심일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 가을 하늘 보고 싶다.',
  '이 기상과 이 마음으로 충성을 다하여 괴로우나 즐거우나 나라 사랑하세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 나라를 사랑하자'
]

In [299]:
from konlpy.tag import Okt
okt = Okt()

In [306]:
# Gather the distinct nouns that Okt extracts from all documents.
unique_nouns = {noun for text in docs for noun in okt.nouns(text)}
vocab = list(unique_nouns)
print(vocab)

['보우', '만세', '물', '이', '대한', '소나무', '보전', '사랑', '불변', '동해', '화려', '무궁화', '남산', '사람', '일편단심', '하느님', '하사', '마르고', '함', '가슴', '우리나라', '강산', '데', '달', '활', '길이', '하늘', '우리', '충성', '마음', '저', '가을', '나라', '철갑', '바람', '서리', '보고', '듯', '위', '구름', '기상', '백두산', '삼천리']


#### 가나다 순으로 정렬

In [308]:
vocab.sort()
vocab[:3]

['가슴', '가을', '강산']

#### DTM 실습

- 하나의 sentence 마다 빈도 벡터를 구한다
- 모두 모은다

In [346]:
# One term-frequency row per document, with one entry per vocabulary word.
lst1 = [term_freq(word, docs[0]) for word in vocab]
lst2 = [term_freq(word, docs[1]) for word in vocab]
lst3 = [term_freq(word, docs[2]) for word in vocab]
lst4 = [term_freq(word, docs[3]) for word in vocab]

# Stack the four rows into a document-term matrix whose columns are the vocabulary.
tmp = pd.DataFrame([lst1, lst2, lst3, lst4], columns=vocab)
tmp

Unnamed: 0,가슴,가을,강산,구름,기상,길이,나라,남산,달,대한,...,일편단심,저,철갑,충성,하느님,하늘,하사,함,화려,활
0,0,0,1,0,0,1,1,0,0,2,...,0,0,0,0,1,0,1,0,1,0
1,0,0,1,0,1,1,0,1,0,2,...,0,1,1,0,0,0,0,1,1,0
2,1,2,1,1,0,1,0,0,1,2,...,1,0,0,0,0,2,0,0,1,1
3,0,0,1,0,1,1,2,0,0,2,...,0,0,0,1,0,0,0,0,1,0


#### DTM 모범답안

In [348]:
# list comprehension
pd.DataFrame([list(map(lambda x: term_freq(x, doc), vocab)) for doc in docs], columns=vocab)

# for문
res = []
for doc in docs:
    tf_res = []
    for i in vocab:
        tf_res.append(term_freq(i, doc))
    res.append(tf_res)
    
fin = pd.DataFrame(res, columns=vocab)

#### IDF 결과를 DataFrame으로 확인

In [356]:
# IDF of every vocabulary word, shown as a one-column table indexed by word.
res = [idf(term, docs) for term in vocab]

pd.DataFrame(res, index=vocab, columns=['IDF']).head()

Unnamed: 0,IDF
가슴,0.693147
가을,0.693147
강산,-0.223144
구름,0.693147
기상,0.287682


#### TF-IDF 결과를 DataFrame으로 확인

In [357]:
# One row per document, one TF-IDF score per vocabulary word.
res = [
    [tf_idf(term, docs, doc_idx) for term in vocab]
    for doc_idx in range(len(docs))
]

pd.DataFrame(res, columns=vocab)

Unnamed: 0,가슴,가을,강산,구름,기상,길이,나라,남산,달,대한,...,일편단심,저,철갑,충성,하느님,하늘,하사,함,화려,활
0,0.0,0.0,-0.223144,0.0,0.0,-0.223144,0.287682,0.0,0.0,-0.446287,...,0.0,0.0,0.0,0.0,0.693147,0.0,0.693147,0.0,-0.223144,0.0
1,0.0,0.0,-0.223144,0.0,0.287682,-0.223144,0.0,0.693147,0.0,-0.446287,...,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.693147,-0.223144,0.0
2,0.693147,1.386294,-0.223144,0.693147,0.0,-0.223144,0.0,0.0,0.693147,-0.446287,...,0.693147,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,-0.223144,0.693147
3,0.0,0.0,-0.223144,0.0,0.287682,-0.223144,0.575364,0.0,0.0,-0.446287,...,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,-0.223144,0.0


### Tensorflow로 BoW 구현하기

In [358]:
from tensorflow.keras.preprocessing.text import Tokenizer

t = Tokenizer()
# fit directly on the raw sentences, without morphological analysis
t.fit_on_texts(docs)

print(t.word_index)

{'무궁화': 1, '삼천리': 2, '화려': 3, '강산': 4, '대한': 5, '사람': 6, '대한으로': 7, '길이': 8, '보전하세': 9, '동해': 10, '싶다': 11, '소나무': 12, '우리': 13, '가을': 14, '하늘': 15, '이': 16, '물과': 17, '백두산이': 18, '마르고': 19, '닳도록': 20, '하느님이': 21, '보우하사': 22, '우리나라': 23, '만세': 24, '가고': 25, '남산': 26, '위에': 27, '저': 28, '철갑을': 29, '두른': 30, '듯': 31, '바람': 32, '서리': 33, '불변함은': 34, '기상일세': 35, '이쁘다': 36, '공활한데': 37, '높고': 38, '구름': 39, '없이': 40, '밝은': 41, '달은': 42, '가슴': 43, '일편단심일세': 44, '보고': 45, '기상과': 46, '마음으로': 47, '충성을': 48, '다하여': 49, '괴로우나': 50, '즐거우나': 51, '나라': 52, '사랑하세': 53, '나라를': 54, '사랑하자': 55}


#### DTM 만들기

In [359]:
# Count matrix: one row per document, one column per word index (column 0 is all zeros in the output below).
print(t.texts_to_matrix(docs, mode='count'))

[[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1.]]


#### TF-IDF 만들기

In [361]:
# Same document-by-word-index layout, computed with mode='tfidf' and wrapped in a DataFrame for display.
pd.DataFrame(t.texts_to_matrix(docs, mode='tfidf'))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,0.0,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,0.587787,...,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612
